From 222a230871fe4fe63f35c49590379c9a77116819 Mon Sep 17 00:00:00 2001 From: Lesmiscore Date: Sun, 29 May 2022 22:48:04 +0900 Subject: [PATCH] [extractor/common] Recognize `src` attribute from HTML5 media elements (#3899) Authored by: Lesmiscore --- test/test_InfoExtractor.py | 18 ++++++++++++++++++ yt_dlp/extractor/common.py | 7 +++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 257ea7dd35..928246668b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -502,6 +502,24 @@ class TestInfoExtractor(unittest.TestCase): }], }) + # from https://0000.studio/ + # with type attribute but without extension in URL + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://0000.studio', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 5767662ed5..a589fb7fa5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3197,7 +3197,8 @@ class InfoExtractor: return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -3215,6 +3216,7 @@ class InfoExtractor: formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats @@ -3241,7 +3243,8 @@ class InfoExtractor: media_attributes = extract_attributes(media_tag) src = strip_or_none(media_attributes.get('src')) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: