From 4115c24d157c5b5f63089d75c4e0f51d1f8b4489 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:36 -0500 Subject: [PATCH 1/8] [ie/vimeo] Always try to extract original format (#10721) Closes #9163 Authored by: bashonly --- yt_dlp/extractor/vimeo.py | 208 ++++++++++++++++++++++++++++---------- 1 file changed, 155 insertions(+), 53 deletions(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a20cf4b17d..2aaac19723 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -234,13 +234,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('is_logged_in'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = 
download_data.get('source_file') download_url = try_get(source_file, lambda x: x['download_url']) @@ -261,15 +278,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -354,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -370,6 +385,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -379,22 +395,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -418,29 +435,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 
'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -456,11 +482,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -476,15 +503,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -495,6 +526,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -511,16 +544,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 
'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ -543,28 +578,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -592,7 +622,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 
1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -606,6 +636,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -633,11 +664,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -645,11 +678,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -670,8 +711,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -701,6 +744,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # chapters must be sorted, see: 
https://github.com/yt-dlp/yt-dlp/issues/5308 @@ -735,6 +779,48 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Failed to parse XML: not well-formed'], }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 'timestamp': 1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { # user playlist alias -> https://vimeo.com/258705797 'url': 'https://vimeo.com/user26785108/newspiritualguide', @@ -768,16 +854,6 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('Wrong video password', expected=True) return checked - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): - return self._download_json( - 
join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - def _extract_from_api(self, video_id, unlisted_hash=None): viewer = self._download_json( 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') @@ -798,6 +874,11 @@ class VimeoIE(VimeoBaseInfoExtractor): info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -899,7 +980,12 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1269,6 +1355,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https?://vimeo\.com/(?P[^/?#]+)/review/(?P\d+)/(?P[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 
'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1282,6 +1382,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1316,6 +1417,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( @@ -1327,8 +1429,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, - unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) From e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a Mon Sep 17 00:00:00 2001 From: Frank Aurich <1100101@gmail.com> Date: Mon, 2 Sep 2024 01:28:51 +0200 Subject: [PATCH 2/8] 
[ie/KiKA] Add extractor (#5788) Authored by: 1100101 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kika.py | 126 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/mdr.py | 51 +------------ 3 files changed, 130 insertions(+), 48 deletions(-) create mode 100644 yt_dlp/extractor/kika.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a3610dc976..e7b162512f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -945,6 +945,7 @@ from .kick import ( ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py new file mode 100644 index 0000000000..852a4de3f2 --- /dev/null +++ b/yt_dlp/extractor/kika.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class KikaIE(InfoExtractor): + IE_DESC = 'KiKA.de' + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P[a-z-]+\d+)' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'md5': 'fbfc8da483719ef06f396e5e5b938c69', + 'info_dict': { + 'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'ext': 'mp4', + 'upload_date': '20240831', + 'timestamp': 1725126600, + 'season_number': 2024, + 'modified_date': '20240831', + 'episode': 'Episode 476', + 'episode_number': 476, + 'season': 'Season 2024', + 'duration': 634, + 'title': 'logo! vom Samstag, 31. 
August 2024', + 'modified_timestamp': 1725129983, + }, + }, { + 'url': 'https://www.kika.de/kaltstart/videos/video92498', + 'md5': '710ece827e5055094afeb474beacb7aa', + 'info_dict': { + 'id': 'video92498', + 'ext': 'mp4', + 'title': '7. Wo ist Leo?', + 'description': 'md5:fb48396a5b75068bcac1df74f1524920', + 'duration': 436, + 'timestamp': 1702926876, + 'upload_date': '20231218', + 'episode_number': 7, + 'modified_date': '20240319', + 'modified_timestamp': 1710880610, + 'episode': 'Episode 7', + 'season_number': 1, + 'season': 'Season 1', + }, + }, { + 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', + 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + 'info_dict': { + 'id': 'video90088', + 'ext': 'mp4', + 'upload_date': '20221102', + 'timestamp': 1667390580, + 'duration': 197, + 'modified_timestamp': 1711093771, + 'episode_number': 8, + 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', + 'modified_date': '20240322', + 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', + 'season_number': 1, + 'episode': 'Episode 8', + 'season': 'Season 1', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id) + video_assets = self._download_json(doc['assets']['url'], video_id) + + subtitles = {} + if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': list(self._extract_formats(video_assets, video_id)), + 'subtitles': subtitles, + **traverse_obj(doc, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('date', {parse_iso8601}), + 'modified_timestamp': ('modificationDate', {parse_iso8601}), + 'duration': (( + 
('durationInSeconds', {int_or_none}), + ('duration', {parse_duration})), any), + 'episode_number': ('episodeNumber', {int_or_none}), + 'season_number': ('season', {int_or_none}), + }), + } + + def _extract_formats(self, media_info, video_id): + for media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))): + stream_url = media['url'] + ext = determine_ext(stream_url) + if ext == 'm3u8': + yield from self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + yield { + 'url': stream_url, + 'format_id': ext, + **traverse_obj(media, { + 'width': ('frameWidth', {int_or_none}), + 'height': ('frameHeight', {int_or_none}), + # NB: filesize is 0 if unknown, bitrate is -1 if unknown + 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), + 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), + }), + } diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 46097fa20e..dfda3cc534 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -13,8 +13,8 @@ from ..utils import ( class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + IE_DESC = 'MDR.DE' + _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _GEO_COUNTRIES = ['DE'] @@ -34,30 +34,6 @@ class MDRIE(InfoExtractor): 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, }, { # audio with alternative playerURL pattern 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', @@ -68,28 +44,7 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, + 'skip': '404 not found', }, { 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', 'only_matching': True, From 7e41628ff523b3fe373b0981a5db441358980dab Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:56:50 -0500 Subject: [PATCH 3/8] [build] Pin `delocate` version for `macos` (#10901) Authored by: bashonly --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
12ec5b0d8c..4ff1cbc1dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -266,7 +266,7 @@ jobs: # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do From e8e6a982a1b659eed434d225d7922f632bac6568 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 2 Sep 2024 21:20:37 +0200 Subject: [PATCH 4/8] [ie/vimeo] Fix login detection (bugfix for 4115c24d157c5b5f63089d75c4e0f51d1f8b4489) (#10906) Authored by: seproDev --- yt_dlp/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 2aaac19723..9a03948cd9 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -249,7 +249,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): # Original/source formats are only available when logged in - if not self._get_cookies('https://vimeo.com/').get('is_logged_in'): + if not self._get_cookies('https://vimeo.com/').get('vimeo'): return query = {'action': 'load_download_config'} From b6200bdcf3a9415ae36859188f9a57e3e461c696 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 5 Sep 2024 20:06:15 +0200 Subject: [PATCH 5/8] [ci] Add comment sanitization workflow (#10915) Co-authored-by: bashonly Authored by: bashonly, Grub4K --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 7 +++++-- .../ISSUE_TEMPLATE/2_site_support_request.yml | 7 +++++-- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 7 +++++-- .github/ISSUE_TEMPLATE/4_bug_report.yml | 7 +++++-- 
.github/ISSUE_TEMPLATE/5_feature_request.yml | 7 +++++-- .github/ISSUE_TEMPLATE/6_question.yml | 7 +++++-- .../{antispam.yaml => issue-lockdown.yml} | 5 +++-- .github/workflows/sanitize-comment.yml | 17 +++++++++++++++++ devscripts/make_issue_template.py | 7 +++++-- 9 files changed, 55 insertions(+), 16 deletions(-) rename .github/workflows/{antispam.yaml => issue-lockdown.yml} (76%) create mode 100644 .github/workflows/sanitize-comment.yml diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 4a14421869..3b0ef323d7 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -80,5 +80,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 748885e850..c8702c3569 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -92,5 +92,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. 
+ > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index ac68a08c6f..5a6d2b0fbd 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -88,5 +88,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 6ae107ec1c..a17770f614 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index a2263bec52..c600a9dcb6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -67,5 +67,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 27eb98bc8e..57bc9daf51 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/workflows/antispam.yaml b/.github/workflows/issue-lockdown.yml similarity index 76% rename from .github/workflows/antispam.yaml rename to .github/workflows/issue-lockdown.yml index 0fd867072e..4b973e2e61 100644 --- a/.github/workflows/antispam.yaml +++ b/.github/workflows/issue-lockdown.yml @@ -1,4 +1,4 @@ -name: Anti-Spam +name: Issue Lockdown on: issues: types: [opened] @@ -9,6 +9,7 @@ permissions: jobs: lockdown: name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN runs-on: ubuntu-latest steps: - name: "Lock new issue" @@ -17,4 +18,4 @@ jobs: ISSUE_NUMBER: ${{ github.event.issue.number }} REPOSITORY: ${{ github.repository }} run: | - gh issue lock "${ISSUE_NUMBER}" -r too_heated -R "${REPOSITORY}" + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 0000000000..45c87cdd47 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 4f782d8c62..8135689c7e 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -49,8 +49,11 @@ VERBOSE_TMPL = ''' - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. 
+ > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' From 0fba08485b6445b72b5b63ae23ca2a73fa5d967f Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 5 Sep 2024 20:47:14 +0200 Subject: [PATCH 6/8] [ie/khanacademy] Fix extractor (#10913) Closes #10912 Authored by: seproDev --- yt_dlp/extractor/khanacademy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 3f03f9e4c4..42eef3c922 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -15,7 +15,7 @@ from ..utils import ( class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)' - _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + _PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4' def _parse_video(self, video): return { @@ -39,7 +39,7 @@ class KhanAcademyBaseIE(InfoExtractor): query={ 'fastly_cacheable': 'persist_until_publish', 'pcv': self._PUBLISHED_CONTENT_VERSION, - 'hash': '1242644265', + 'hash': '3712657851', 'variables': json.dumps({ 'path': display_id, 'countryCode': 'US', From 46f4c80bc363ee8116c33d37f65202e6c3470954 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 7 Sep 2024 17:06:12 +0200 Subject: [PATCH 7/8] [ie/SampleFocus] Fix extractor (#10947) Closes #10945 Authored by: seproDev --- yt_dlp/extractor/samplefocus.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254d..3db3ce1424 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = 
self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)', @@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, From d1c4d88b2d912e8da5e76db455562ca63b1af690 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 8 Sep 2024 19:32:44 +1200 Subject: [PATCH 8/8] [networking] Fix handler not being added to RequestError (#10955) Authored by: coletdjnz --- test/test_networking.py | 18 ++++++++++++++++++ yt_dlp/networking/_helper.py | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 826f11a561..d96624af18 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -822,6 +822,24 @@ class TestRequestHandlerMisc: rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): diff --git a/yt_dlp/networking/_helper.py 
b/yt_dlp/networking/_helper.py index fe3354ea29..b86d3606d8 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import typing import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise