[extractor/fosdem] Move parsing logic

This commit is contained in:
Jesse Millwood 2023-05-06 06:52:13 -04:00 committed by Jesse Millwood
parent a1a330cd9c
commit 03e4ca498a

View file

@ -44,8 +44,6 @@ class FosdemIE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
groups = self._match_valid_url(url).groupdict() groups = self._match_valid_url(url).groupdict()
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if groups['url_type'] == 'event':
print("This is an event url")
elif groups['url_type'] == 'track': elif groups['url_type'] == 'track':
print("This is a track") print("This is a track")
# Download all videos on this page # Download all videos on this page
@ -54,20 +52,19 @@ class FosdemIE(InfoExtractor):
year = groups['year'] year = groups['year']
title_rgx = r"<div id=\"pagetitles\">\n\s+<h1>(.+?)</h1>" title_rgx = r"<div id=\"pagetitles\">\n\s+<h1>(.+?)</h1>"
title = self._html_search_regex(title_rgx, webpage, 'title') title = self._html_search_regex(title_rgx, webpage, 'title')
print(f'TITLE: {title}') if groups['url_type'] == 'event':
evnt_blurb_rgx = r"<div class=\"event-blurb\">\n*(?P<blurb>(<div class=\"event-abstract\">(<p>(.+?)</p>\n*)+</div>)+\n*(<div class=\"event-description\">(<p>(.+?)</p>\n*)*</div>))+\n*</div>" evnt_blurb_rgx = r"<div class=\"event-blurb\">\n*(?P<blurb>(<div class=\"event-abstract\">(<p>(.+?)</p>\n*)+</div>)+\n*(<div class=\"event-description\">(<p>(.+?)</p>\n*)*</div>))+\n*</div>"
evnt_blurb = self._html_search_regex(evnt_blurb_rgx, evnt_blurb = self._html_search_regex(evnt_blurb_rgx,
webpage, webpage,
'event blurb', 'event blurb',
group='blurb', flags=re.DOTALL) group='blurb', flags=re.DOTALL)
description = evnt_blurb description = evnt_blurb
print(f"DESCRIPTION: {description}")
video_url_rgx = r"<li><a href=\"(https://video.fosdem.org/[0-9]{4}/.+)\">" video_url_rgx = r"<li><a href=\"(https://video.fosdem.org/[0-9]{4}/.+)\">"
video_url = self._html_search_regex(video_url_rgx, video_url = self._html_search_regex(video_url_rgx,
webpage, webpage,
'video url') 'video url')
print(f"VIDEO URL: {video_url}") cast_rgx = r"<td><a href=\"/[0-9]+/schedule/speaker/[a-z_]+/\">(?P<speaker>\w+ \w+)</a></td>"
print('\n\n___________________________') cast = re.findall(cast_rgx, webpage, flags=re.UNICODE)
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
@ -75,7 +72,7 @@ class FosdemIE(InfoExtractor):
'uploader': 'FOSDEM', 'uploader': 'FOSDEM',
'url': video_url, 'url': video_url,
'thumbnail': None, 'thumbnail': None,
# TODO more properties (see yt_dlp/extractor/common.py)
'release_date': year, 'release_date': year,
# 'presenter/author 'cast': cast,
'webpage_url': url,
} }