[extractor/fosdem] Move event parsing logic into the 'event' URL branch (drops debug prints, adds cast and webpage_url)

2025-01-01 06:21:09 +01:00 · 2023-05-06 06:52:13 -04:00 · 2023-05-06 06:52:13 -04:00 · 03e4ca498a
commit 03e4ca498a
parent a1a330cd9c
1 changed file with 24 additions and 27 deletions
--- a/yt_dlp/extractor/fosdem.py
+++ b/yt_dlp/extractor/fosdem.py
@@ -44,8 +44,6 @@ class FosdemIE(InfoExtractor):
        video_id = self._match_id(url)
        groups = self._match_valid_url(url).groupdict()
        webpage = self._download_webpage(url, video_id)
        if groups['url_type'] == 'event':
            print("This is an event url")
        elif groups['url_type'] == 'track':
            print("This is a track")
            # Download all videos on this page
@@ -54,28 +52,27 @@ class FosdemIE(InfoExtractor):
        year = groups['year']
        title_rgx = r"<div id=\"pagetitles\">\n\s+<h1>(.+?)</h1>"
        title = self._html_search_regex(title_rgx, webpage, 'title')
-        print(f'TITLE: {title}')
+        if groups['url_type'] == 'event':
-        evnt_blurb_rgx = r"<div class=\"event-blurb\">\n*(?P<blurb>(<div class=\"event-abstract\">(<p>(.+?)</p>\n*)+</div>)+\n*(<div class=\"event-description\">(<p>(.+?)</p>\n*)*</div>))+\n*</div>"
+            evnt_blurb_rgx = r"<div class=\"event-blurb\">\n*(?P<blurb>(<div class=\"event-abstract\">(<p>(.+?)</p>\n*)+</div>)+\n*(<div class=\"event-description\">(<p>(.+?)</p>\n*)*</div>))+\n*</div>"
-        evnt_blurb = self._html_search_regex(evnt_blurb_rgx,
+            evnt_blurb = self._html_search_regex(evnt_blurb_rgx,
-                                             webpage,
+                                                 webpage,
-                                             'event blurb',
+                                                 'event blurb',
-                                             group='blurb', flags=re.DOTALL)
+                                                 group='blurb', flags=re.DOTALL)
-        description = evnt_blurb
+            description = evnt_blurb
-        print(f"DESCRIPTION: {description}")
+            video_url_rgx = r"<li><a href=\"(https://video.fosdem.org/[0-9]{4}/.+)\">"
-        video_url_rgx = r"<li><a href=\"(https://video.fosdem.org/[0-9]{4}/.+)\">"
+            video_url = self._html_search_regex(video_url_rgx,
-        video_url = self._html_search_regex(video_url_rgx,
+                                                webpage,
-                                            webpage,
+                                                'video url')
-                                            'video url')
+            cast_rgx = r"<td><a href=\"/[0-9]+/schedule/speaker/[a-z_]+/\">(?P<speaker>\w+ \w+)</a></td>"
-        print(f"VIDEO URL: {video_url}")
+            cast = re.findall(cast_rgx, webpage, flags=re.UNICODE)
-        print('\n\n___________________________')
+            return {
-        return {
+                'id': video_id,
-            'id': video_id,
+                'title': title,
-            'title': title,
+                'description': description,
-            'description': description,
+                'uploader': 'FOSDEM',
-            'uploader': 'FOSDEM',
+                'url': video_url,
-            'url': video_url,
+                'thumbnail': None,
-            'thumbnail': None,
+                'release_date': year,
-            # TODO more properties (see yt_dlp/extractor/common.py)
+                'cast': cast,
-            'release_date': year,
+                'webpage_url': url,
-            # 'presenter/author
+            }
        }