From 4115c24d157c5b5f63089d75c4e0f51d1f8b4489 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:36 -0500 Subject: [PATCH 001/216] [ie/vimeo] Always try to extract original format (#10721) Closes #9163 Authored by: bashonly --- yt_dlp/extractor/vimeo.py | 208 ++++++++++++++++++++++++++++---------- 1 file changed, 155 insertions(+), 53 deletions(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a20cf4b17d..2aaac19723 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -234,13 +234,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('is_logged_in'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = download_data.get('source_file') download_url = try_get(source_file, lambda x: x['download_url']) @@ -261,15 +278,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -354,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -370,6 +385,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -379,22 +395,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -418,29 +435,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -456,11 +482,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -476,15 +503,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -495,6 +526,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -511,16 +544,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ -543,28 +578,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -592,7 +622,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -606,6 +636,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -633,11 +664,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -645,11 +678,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -670,8 +711,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -701,6 +744,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308 @@ -735,6 +779,48 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Failed to parse XML: not well-formed'], }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 'timestamp': 1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { # user playlist alias -> https://vimeo.com/258705797 'url': 'https://vimeo.com/user26785108/newspiritualguide', @@ -768,16 +854,6 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('Wrong video password', expected=True) return checked - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): - return self._download_json( - join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - def _extract_from_api(self, video_id, unlisted_hash=None): viewer = self._download_json( 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') @@ -798,6 +874,11 @@ class VimeoIE(VimeoBaseInfoExtractor): info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -899,7 +980,12 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1269,6 +1355,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https?://vimeo\.com/(?P[^/?#]+)/review/(?P\d+)/(?P[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1282,6 +1382,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1316,6 +1417,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( @@ -1327,8 +1429,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, - unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) From e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a Mon Sep 17 00:00:00 2001 From: Frank Aurich <1100101@gmail.com> Date: Mon, 2 Sep 2024 01:28:51 +0200 Subject: [PATCH 002/216] [ie/KiKA] Add extractor (#5788) Authored by: 1100101 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kika.py | 126 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/mdr.py | 51 +------------ 3 files changed, 130 insertions(+), 48 deletions(-) create mode 100644 yt_dlp/extractor/kika.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a3610dc976..e7b162512f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -945,6 +945,7 @@ from .kick import ( ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py new file mode 100644 index 0000000000..852a4de3f2 --- /dev/null +++ b/yt_dlp/extractor/kika.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class KikaIE(InfoExtractor): + IE_DESC = 'KiKA.de' + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P[a-z-]+\d+)' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'md5': 'fbfc8da483719ef06f396e5e5b938c69', + 'info_dict': { + 'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'ext': 'mp4', + 'upload_date': '20240831', + 'timestamp': 1725126600, + 'season_number': 2024, + 'modified_date': '20240831', + 'episode': 'Episode 476', + 'episode_number': 476, + 'season': 'Season 2024', + 'duration': 634, + 'title': 'logo! vom Samstag, 31. August 2024', + 'modified_timestamp': 1725129983, + }, + }, { + 'url': 'https://www.kika.de/kaltstart/videos/video92498', + 'md5': '710ece827e5055094afeb474beacb7aa', + 'info_dict': { + 'id': 'video92498', + 'ext': 'mp4', + 'title': '7. Wo ist Leo?', + 'description': 'md5:fb48396a5b75068bcac1df74f1524920', + 'duration': 436, + 'timestamp': 1702926876, + 'upload_date': '20231218', + 'episode_number': 7, + 'modified_date': '20240319', + 'modified_timestamp': 1710880610, + 'episode': 'Episode 7', + 'season_number': 1, + 'season': 'Season 1', + }, + }, { + 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', + 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + 'info_dict': { + 'id': 'video90088', + 'ext': 'mp4', + 'upload_date': '20221102', + 'timestamp': 1667390580, + 'duration': 197, + 'modified_timestamp': 1711093771, + 'episode_number': 8, + 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', + 'modified_date': '20240322', + 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', + 'season_number': 1, + 'episode': 'Episode 8', + 'season': 'Season 1', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id) + video_assets = self._download_json(doc['assets']['url'], video_id) + + subtitles = {} + if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': list(self._extract_formats(video_assets, video_id)), + 'subtitles': subtitles, + **traverse_obj(doc, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('date', {parse_iso8601}), + 'modified_timestamp': ('modificationDate', {parse_iso8601}), + 'duration': (( + ('durationInSeconds', {int_or_none}), + ('duration', {parse_duration})), any), + 'episode_number': ('episodeNumber', {int_or_none}), + 'season_number': ('season', {int_or_none}), + }), + } + + def _extract_formats(self, media_info, video_id): + for media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))): + stream_url = media['url'] + ext = determine_ext(stream_url) + if ext == 'm3u8': + yield from self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + yield { + 'url': stream_url, + 'format_id': ext, + **traverse_obj(media, { + 'width': ('frameWidth', {int_or_none}), + 'height': ('frameHeight', {int_or_none}), + # NB: filesize is 0 if unknown, bitrate is -1 if unknown + 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), + 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), + }), + } diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 46097fa20e..dfda3cc534 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -13,8 +13,8 @@ from ..utils import ( class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + IE_DESC = 'MDR.DE' + _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _GEO_COUNTRIES = ['DE'] @@ -34,30 +34,6 @@ class MDRIE(InfoExtractor): 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, }, { # audio with alternative playerURL pattern 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', @@ -68,28 +44,7 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, + 'skip': '404 not found', }, { 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', 'only_matching': True, From 7e41628ff523b3fe373b0981a5db441358980dab Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:56:50 -0500 Subject: [PATCH 003/216] [build] Pin `delocate` version for `macos` (#10901) Authored by: bashonly --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12ec5b0d8c..4ff1cbc1dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -266,7 +266,7 @@ jobs: # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do From e8e6a982a1b659eed434d225d7922f632bac6568 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 2 Sep 2024 21:20:37 +0200 Subject: [PATCH 004/216] [ie/vimeo] Fix login detection (bugfix for 4115c24d157c5b5f63089d75c4e0f51d1f8b4489) (#10906) Authored by: seproDev --- yt_dlp/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 2aaac19723..9a03948cd9 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -249,7 +249,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): # Original/source formats are only available when logged in - if not self._get_cookies('https://vimeo.com/').get('is_logged_in'): + if not self._get_cookies('https://vimeo.com/').get('vimeo'): return query = {'action': 'load_download_config'} From b6200bdcf3a9415ae36859188f9a57e3e461c696 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Thu, 5 Sep 2024 20:06:15 +0200 Subject: [PATCH 005/216] [ci] Add comment sanitization workflow (#10915) Co-authored-by: bashonly Authored by: bashonly, Grub4K --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 7 +++++-- .../ISSUE_TEMPLATE/2_site_support_request.yml | 7 +++++-- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 7 +++++-- .github/ISSUE_TEMPLATE/4_bug_report.yml | 7 +++++-- .github/ISSUE_TEMPLATE/5_feature_request.yml | 7 +++++-- .github/ISSUE_TEMPLATE/6_question.yml | 7 +++++-- .../{antispam.yaml => issue-lockdown.yml} | 5 +++-- .github/workflows/sanitize-comment.yml | 17 +++++++++++++++++ devscripts/make_issue_template.py | 7 +++++-- 9 files changed, 55 insertions(+), 16 deletions(-) rename .github/workflows/{antispam.yaml => issue-lockdown.yml} (76%) create mode 100644 .github/workflows/sanitize-comment.yml diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 4a14421869..3b0ef323d7 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -80,5 +80,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 748885e850..c8702c3569 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -92,5 +92,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index ac68a08c6f..5a6d2b0fbd 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -88,5 +88,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 6ae107ec1c..a17770f614 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index a2263bec52..c600a9dcb6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -67,5 +67,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 27eb98bc8e..57bc9daf51 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/workflows/antispam.yaml b/.github/workflows/issue-lockdown.yml similarity index 76% rename from .github/workflows/antispam.yaml rename to .github/workflows/issue-lockdown.yml index 0fd867072e..4b973e2e61 100644 --- a/.github/workflows/antispam.yaml +++ b/.github/workflows/issue-lockdown.yml @@ -1,4 +1,4 @@ -name: Anti-Spam +name: Issue Lockdown on: issues: types: [opened] @@ -9,6 +9,7 @@ permissions: jobs: lockdown: name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN runs-on: ubuntu-latest steps: - name: "Lock new issue" @@ -17,4 +18,4 @@ jobs: ISSUE_NUMBER: ${{ github.event.issue.number }} REPOSITORY: ${{ github.repository }} run: | - gh issue lock "${ISSUE_NUMBER}" -r too_heated -R "${REPOSITORY}" + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 0000000000..45c87cdd47 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 4f782d8c62..8135689c7e 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -49,8 +49,11 @@ VERBOSE_TMPL = ''' - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' From 0fba08485b6445b72b5b63ae23ca2a73fa5d967f Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 5 Sep 2024 20:47:14 +0200 Subject: [PATCH 006/216] [ie/khanacademy] Fix extractor (#10913) Closes #10912 Authored by: seproDev --- yt_dlp/extractor/khanacademy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 3f03f9e4c4..42eef3c922 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -15,7 +15,7 @@ from ..utils import ( class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' - _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + _PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4' def _parse_video(self, video): return { @@ -39,7 +39,7 @@ class KhanAcademyBaseIE(InfoExtractor): query={ 'fastly_cacheable': 'persist_until_publish', 'pcv': self._PUBLISHED_CONTENT_VERSION, - 'hash': '1242644265', + 'hash': '3712657851', 'variables': json.dumps({ 'path': display_id, 'countryCode': 'US', From 46f4c80bc363ee8116c33d37f65202e6c3470954 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 7 Sep 2024 17:06:12 +0200 Subject: [PATCH 007/216] [ie/SampleFocus] Fix extractor (#10947) Closes #10945 Authored by: seproDev --- yt_dlp/extractor/samplefocus.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254d..3db3ce1424 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', @@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, From d1c4d88b2d912e8da5e76db455562ca63b1af690 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 8 Sep 2024 19:32:44 +1200 Subject: [PATCH 008/216] [networking] Fix handler not being added to RequestError (#10955) Authored by: coletdjnz --- test/test_networking.py | 18 ++++++++++++++++++ yt_dlp/networking/_helper.py | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/test/test_networking.py b/test/test_networking.py index 826f11a561..d96624af18 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -822,6 +822,24 @@ class TestRequestHandlerMisc: rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index fe3354ea29..b86d3606d8 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import typing import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise From 3a3bd00037e9908e87da4fa9f2ad772aa34dc60e Mon Sep 17 00:00:00 2001 From: sepro Date: Fri, 13 Sep 2024 12:51:58 +0200 Subject: [PATCH 009/216] [ie/youtube] Add `po_token`, `visitor_data`, `data_sync_id` extractor args (#10648) Authored by: seproDev, coletdjnz, bashonly --- README.md | 3 + yt_dlp/extractor/youtube.py | 257 +++++++++++++++++++++++++----------- 2 files changed, 182 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index ca32e09bfb..428eb9f478 100644 --- a/README.md +++ b/README.md @@ -1777,6 +1777,9 @@ The following extractors use this feature: * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning +* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` +* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) +* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma seperated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2501398ba1..343d103f65 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -69,6 +69,8 @@ from ..utils import ( ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' +STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, 'web_embedded': { 'INNERTUBE_CONTEXT': { @@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_music': { 'INNERTUBE_CONTEXT': { @@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_creator': { 'INNERTUBE_CONTEXT': { @@ -160,6 +166,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -323,6 +330,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('REQUIRE_PO_TOKEN', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None, fatal=False) - @staticmethod - def _extract_account_syncid(*args): + def _data_sync_id_to_delegated_session_id(self, data_sync_id): + if not data_sync_id: + return + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + channel_syncid, _, user_syncid = data_sync_id.partition('||') + if user_syncid: + return channel_syncid + + def _extract_account_syncid(self, *args): """ - Extract syncId required to download private playlists of secondary channels + Extract current session ID required to download private playlists of secondary channels @params response and/or ytcfg """ - for data in args: - # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) - if delegated_sid: - return delegated_sid - sync_ids = (try_get( - data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), str) or '').split('||') - if len(sync_ids) >= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. We only want the channel_syncid - return sync_ids[0] + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid - @staticmethod - def _extract_visitor_data(*args): + data_sync_id = self._extract_data_sync_id(*args) + return self._data_sync_id_to_delegated_session_id(data_sync_id) + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. + In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): """ Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: + return visitor_data return get_first( args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _POTOKEN_EXPERIMENTS = ('51217476', '51217102') - _BROKEN_CLIENTS = { - short_client_name(client): client - for client in ('android', 'android_creator', 'android_music') - } _DEFAULT_CLIENTS = ('ios', 'web_creator') _GEO_BYPASS = False @@ -3701,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): **cls._get_checkok_params(), } + def _get_config_po_token(self, client): + po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) + for token_str in po_token_strs: + po_token_client, sep, po_token = token_str.partition('+') + if not sep: + self.report_warning( + f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True) + continue + if po_token_client == client: + return po_token + + def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. + if not visitor_data and not self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') + return + + config_po_token = self._get_config_po_token(client) + if config_po_token: + # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, + # if using first channel in an account then we don't need the data_sync_id anymore... + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + + return config_po_token + + # Require PO Token if logged in for external fetching + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. ' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + return + + return self._fetch_po_token( + client=client, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + player_url=player_url, + **kwargs, + ) + + def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + """External PO Token fetch stub""" + @staticmethod def _is_agegated(player_response): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): @@ -3717,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): - - session_index = self._extract_session_index(player_ytcfg, master_ytcfg) - syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( - ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + ytcfg=player_ytcfg, + default_client=client, + visitor_data=visitor_data, + session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + account_syncid=( + self._data_sync_id_to_delegated_session_id(data_sync_id) + or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + ), + ) yt_query = { 'videoId': video_id, @@ -3734,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: yt_query['params'] = player_params + if po_token: + yt_query['serviceIntegrityDimensions'] = {'poToken': po_token} + + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3744,7 +3818,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - broken_clients = [] excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), @@ -3758,12 +3831,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): excluded_clients.append(client[1:]) elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client "{client}"') - elif client in self._BROKEN_CLIENTS.values(): - broken_clients.append(client) else: requested_clients.append(client) - # Force deprioritization of _BROKEN_CLIENTS for format de-duplication - requested_clients.extend(broken_clients) if not requested_clients: requested_clients.extend(self._DEFAULT_CLIENTS) for excluded_client in excluded_clients: @@ -3788,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return pr_id def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): - initial_pr = ignore_initial_response = None + initial_pr = None if webpage: - if 'web' in clients: - experiments = traverse_obj(master_ytcfg, ( - 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - self.report_warning( - 'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response') - ignore_initial_response = True initial_pr = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) prs = [] + deprioritized_prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): # Android player_response does not have microFormats which are needed for # extraction of some data. So we return the initial_pr with formats @@ -3822,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return tried_iframe_fallback = False - player_url = None + player_url = visitor_data = data_sync_id = None skipped_clients = {} while clients: + deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = {} - if client == 'web': - player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg - elif 'configs' not in self._configuration_arg('player_skip'): + player_ytcfg = master_ytcfg if client == 'web' else {} + if 'configs' not in self._configuration_arg('player_skip') and client != 'web': player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) @@ -3842,34 +3905,53 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' and not ignore_initial_response else None - for retry in self.RetryManager(fatal=False): - try: - pr = pr or self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, - player_url if require_js_player else None, initial_pr, smuggled_data) - except ExtractorError as e: - self.report_warning(e) - break - experiments = traverse_obj(pr, ( - 'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK', - 'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - pr = None - retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True) - if not pr: + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + po_token = self.fetch_po_token( + client=client, visitor_data=visitor_data, + data_sync_id=data_sync_id if self.is_authenticated else None, + player_url=player_url if require_js_player else None, + ) + + require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') + if not po_token and require_po_token: + self.report_warning( + f'No PO Token provided for {client} client, ' + f'which is required for working {client} formats. ' + f'You can manually pass a PO Token for this client with ' + f'--extractor-args "youtube:po_token={client}+XXX"', + only_once=True) + deprioritize_pr = True + + pr = initial_pr if client == 'web' else None + try: + pr = pr or self._extract_player_response( + client, video_id, + master_ytcfg=player_ytcfg or master_ytcfg, + player_ytcfg=player_ytcfg, + player_url=player_url, + initial_pr=initial_pr, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + po_token=po_token) + except ExtractorError as e: + self.report_warning(e) continue if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: # Save client name for introspection later - name = short_client_name(client) sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name + sd[STREAMING_DATA_CLIENT_NAME] = client + sd[STREAMING_DATA_PO_TOKEN] = po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) + f[STREAMING_DATA_CLIENT_NAME] = client + f[STREAMING_DATA_PO_TOKEN] = po_token + if deprioritize_pr: + deprioritized_prs.append(pr) + else: + prs.append(pr) # tv_embedded can work around age-gate and age-verification IF the video is embeddable if self._is_agegated(pr) and variant != 'tv_embedded': @@ -3893,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # _producer, _testsuite, & _vr variants can also work around age-verification append_client('web_creator', 'mediaconnect') + prs.extend(deprioritized_prs) + if skipped_clients: self.report_warning( f'Skipping player responses from {"/".join(skipped_clients)} clients ' @@ -4027,13 +4111,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds - # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name in self._BROKEN_CLIENTS + po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) + + # Clients that require PO Token return videoplayback URLs that may return 403 + is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) if is_broken: self.report_warning( - f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' - 'and may yield HTTP Error 403. They will be deprioritized', only_once=True) + f'{video_id}: {client_name} client formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4109,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag): + def process_manifest_format(f, proto, client_name, itag, po_token): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False itags[itag].add(key) + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Clients that require PO Token return videoplayback URLs that may return 403 + # hls does not currently require PO Token + if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': + self.report_warning( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + f['source_preference'] -= 20 + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4126,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['language_preference'] = PREFERRED_LANG_VALUE - if f.get('source_preference') is None: - f['source_preference'] = -1 - if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['source_preference'] += 100 @@ -4149,23 +4246,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for sd in streaming_data: client_name = sd.get(STREAMING_DATA_CLIENT_NAME) - + po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + if po_token: + hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None)): + r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + if po_token: + dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id']): + if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: From 9431777b4c37129a6093080c77ca59960afbb9d7 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 14 Sep 2024 00:46:44 +0200 Subject: [PATCH 010/216] [ie/youtube:tab] Fix shorts tab extraction (#10938) Closes #10936 Authored by: seproDev --- yt_dlp/extractor/youtube.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 343d103f65..3d11c32f6e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -5088,7 +5088,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) @@ -5100,6 +5100,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=self._get_text(renderer, 'title')) return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, ('overlayMetadata', { + 'title': ('primaryText', 'content', {str}), + 'view_count': ('secondaryText', 'content', {parse_count}), + })), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') From b4760c778d0c92c6e3f2bc8346cd72c8f08595ae Mon Sep 17 00:00:00 2001 From: Deukhoofd <10423862+Deukhoofd@users.noreply.github.com> Date: Sat, 14 Sep 2024 00:50:15 +0200 Subject: [PATCH 011/216] [ie/beacon] Add extractor (#9901) Authored by: Deukhoofd --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/beacon.py | 68 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 yt_dlp/extractor/beacon.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e7b162512f..4302076f0c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -217,6 +217,7 @@ from .bbc import ( BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .beacon import BeaconTvIE from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, diff --git a/yt_dlp/extractor/beacon.py b/yt_dlp/extractor/beacon.py new file mode 100644 index 0000000000..ae47687cc8 --- /dev/null +++ b/yt_dlp/extractor/beacon.py @@ -0,0 +1,68 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, + traverse_obj, +) + + +class BeaconTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?beacon\.tv/content/(?P[\w-]+)' + + _TESTS = [{ + 'url': 'https://beacon.tv/content/welcome-to-beacon', + 'md5': 'b3f5932d437f288e662f10f3bfc5bd04', + 'info_dict': { + 'id': 'welcome-to-beacon', + 'ext': 'mp4', + 'upload_date': '20240509', + 'description': 'md5:ea2bd32e71acf3f9fca6937412cc3563', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/I4CkkEvN/poster.jpg?width=720', + 'title': 'Your home for Critical Role!', + 'timestamp': 1715227200, + 'duration': 105.494, + }, + }, { + 'url': 'https://beacon.tv/content/re-slayers-take-trailer', + 'md5': 'd879b091485dbed2245094c8152afd89', + 'info_dict': { + 'id': 're-slayers-take-trailer', + 'ext': 'mp4', + 'title': 'The Re-Slayer’s Take | Official Trailer', + 'timestamp': 1715189040, + 'upload_date': '20240508', + 'duration': 53.249, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/PW5ApIw3/poster.jpg?width=720', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + content_data = traverse_obj(self._search_nextjs_data(webpage, video_id), ( + 'props', 'pageProps', '__APOLLO_STATE__', + lambda k, v: k.startswith('Content:') and v['slug'] == video_id, any)) + if not content_data: + raise ExtractorError('Failed to extract content data') + + jwplayer_data = traverse_obj(content_data, ( + (('contentVideo', 'video', 'videoData'), + ('contentPodcast', 'podcast', 'audioData')), {json.loads}, {dict}, any)) + if not jwplayer_data: + if content_data.get('contentType') not in ('videoPodcast', 'video', 'podcast'): + raise ExtractorError('Content is not a video/podcast', expected=True) + if traverse_obj(content_data, ('contentTier', '__ref')) != 'MemberTier:65b258d178f89be87b4dc0a4': + self.raise_login_required('This video/podcast is for members only') + raise ExtractorError('Failed to extract content') + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id), + **traverse_obj(content_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('publishedAt', {parse_iso8601}), + }), + } From 409f8e9e3b4bde81ef76fc563256f876d2ff8099 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 14 Sep 2024 00:54:41 +0200 Subject: [PATCH 012/216] [ie] Fix JW Player format parsing (#10956) Authored by: seproDev --- yt_dlp/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 187f73e7b9..432db9daf8 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3489,7 +3489,7 @@ class InfoExtractor: continue urls.add(source_url) source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) + ext = determine_ext(source_url, default_ext=mimetype2ext(source_type)) if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', From 5d0176547f16a3642cd71627126e9dfc24981e20 Mon Sep 17 00:00:00 2001 From: Scott Robinson Date: Sat, 14 Sep 2024 09:02:54 +1000 Subject: [PATCH 013/216] [ie/Bandcamp:user] Fix extraction (#10328) Authored by: quad, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/bandcamp.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 61cbab5a7a..0abe059829 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -1,3 +1,5 @@ +import functools +import json import random import re import time @@ -6,7 +8,9 @@ from .common import InfoExtractor from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, + extract_attributes, float_or_none, + get_element_html_by_id, int_or_none, parse_filesize, str_or_none, @@ -17,6 +21,7 @@ from ..utils import ( url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class BandcampIE(InfoExtractor): @@ -459,7 +464,7 @@ class BandcampUserIE(InfoExtractor): }, }, { 'url': 'https://coldworldofficial.bandcamp.com/music', - 'playlist_mincount': 10, + 'playlist_mincount': 7, 'info_dict': { 'id': 'coldworldofficial', 'title': 'Discography of coldworldofficial', @@ -473,12 +478,19 @@ class BandcampUserIE(InfoExtractor): }, }] + def _yield_items(self, webpage): + yield from ( + re.findall(r'
  • ]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) + + yield from traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes}, + 'data-client-items', {json.loads}, ..., 'page_url', {str})) + def _real_extract(self, url): uploader = self._match_id(url) webpage = self._download_webpage(url, uploader) - discography_data = (re.findall(r'
  • ]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) - return self.playlist_from_matches( - discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) + self._yield_items(webpage), uploader, f'Discography of {uploader}', + getter=functools.partial(urljoin, url)) From d02df303d8e49390599db9f34482697e4d1cf5b2 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Sat, 14 Sep 2024 02:09:52 +0300 Subject: [PATCH 014/216] [ie/RTP] Support more subpages (#10787) Authored by: Demon000 --- yt_dlp/extractor/rtp.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 944e8636ab..26aec2e4cc 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -8,7 +8,7 @@ from ..utils import js_to_json class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P[0-9]+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -19,9 +19,25 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', + 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'info_dict': { + 'id': 'e757904', + 'ext': 'mp4', + 'title': '25 Curiosidades, 25 de Abril', + 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', + 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', + 'only_matching': True, }] _RX_OBFUSCATION = re.compile(r'''(?xs) @@ -49,17 +65,17 @@ class RTPIE(InfoExtractor): f, config = self._search_regex( r'''(?sx) - var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s* + (?:var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s*)? var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P{(?:(?!\*/).)+?})\);(?!\s*\*/) ''', webpage, 'player config', group=('f', 'config')) - f = self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) config = self._parse_json( config, video_id, lambda data: self.__unobfuscate(data, video_id=video_id)) + f = config['file'] if not f else self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) formats = [] if isinstance(f, dict): From 25c1cdaa2650563494d3bf00a38f72d0d9486bff Mon Sep 17 00:00:00 2001 From: hugepower Date: Sat, 14 Sep 2024 07:12:38 +0800 Subject: [PATCH 015/216] [ie/huya:video] Add extractor (#10686) Closes #10679 Authored by: hugepower --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/huya.py | 80 ++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4302076f0c..8903bf8fca 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -823,7 +823,10 @@ from .hungama import ( HungamaIE, HungamaSongIE, ) -from .huya import HuyaLiveIE +from .huya import ( + HuyaLiveIE, + HuyaVideoIE, +) from .hypem import HypemIE from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 5663a78a37..f79e032e4a 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -8,15 +8,19 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + parse_duration, str_or_none, try_get, unescapeHTML, + unified_strdate, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class HuyaLiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P[^/#?&]+)(?:\D|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P[^/#?&]+)(?:\D|$)' IE_NAME = 'huya:live' IE_DESC = 'huya.com' TESTS = [{ @@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor): 'info_dict': { 'id': '572329', 'title': str, + 'ext': 'flv', 'description': str, 'is_live': True, 'view_count': int, @@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor): fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) return fm, ss + + +class HuyaVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P\d+)\.html' + IE_NAME = 'huya:video' + IE_DESC = '虎牙视频' + + _TESTS = [{ + 'url': 'https://www.huya.com/video/play/1002412640.html', + 'info_dict': { + 'id': '1002412640', + 'ext': 'mp4', + 'title': '8月3日', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 14, + 'uploader': '虎牙-ATS欧卡车队青木', + 'uploader_id': '1564376151', + 'upload_date': '20240803', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 'https://www.huya.com/video/play/556054543.html', + 'info_dict': { + 'id': '556054543', + 'ext': 'mp4', + 'title': '我不挑事 也不怕事', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 1864, + 'uploader': '卡尔', + 'uploader_id': '367138632', + 'upload_date': '20210811', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }] + + def _real_extract(self, url: str): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://liveapi.huya.com/moment/getMomentContent', video_id, + query={'videoId': video_id})['data']['moment']['videoInfo'] + + formats = [] + for definition in traverse_obj(video_data, ('definitions', lambda _, v: url_or_none(v['url']))): + formats.append({ + 'url': definition['url'], + **traverse_obj(definition, { + 'format_id': ('defName', {str}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('videoCover', {url_or_none}), + 'duration': ('videoDuration', {parse_duration}), + 'uploader': ('nickName', {str}), + 'uploader_id': ('uid', {str_or_none}), + 'upload_date': ('videoUploadTime', {unified_strdate}), + 'view_count': ('videoPlayNum', {int_or_none}), + 'comment_count': ('videoCommentNum', {int_or_none}), + 'like_count': ('favorCount', {int_or_none}), + }), + } From 3dfd720d098b4d49d69cfc77e6376f22bcd90934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?= Date: Sat, 14 Sep 2024 07:16:34 +0800 Subject: [PATCH 016/216] [ie/ximalaya] Add VIP support (#10832) Closes #6928 Authored by: xingchensong, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/ximalaya.py | 91 +++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index e900a4ad9f..d63964a004 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -1,7 +1,17 @@ +import base64 import math +import time from .common import InfoExtractor -from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call +from .videa import VideaIE +from ..utils import ( + InAdvancePagedList, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, +) class XimalayaBaseIE(InfoExtractor): @@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE): 'like_count': int, }, }, + { + # VIP-restricted audio + 'url': 'https://www.ximalaya.com/sound/562111701', + 'only_matching': True, + }, ] + @staticmethod + def _decrypt_filename(file_id, seed): + cgstr = '' + key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890' + for _ in key: + seed = float(int(211 * seed + 30031) % 65536) + r = int(seed / 65536 * len(key)) + cgstr += key[r] + key = key.replace(key[r], '') + parts = file_id.split('*') + filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal()) + if not filename.startswith('/'): + filename = '/' + filename + return filename + + @staticmethod + def _decrypt_url_params(encrypted_params): + params = VideaIE.rc4( + base64.b64decode(encrypted_params), 'xkt3a41psizxrh9l').split('-') + # sign, token, timestamp + return params[1], params[2], params[3] + def _real_extract(self, url): scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json' audio_info = self._download_json( - audio_info_file, audio_id, - f'Downloading info json {audio_info_file}', 'Unable to download info file') + f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id, + 'Downloading info json', 'Unable to download info file') - formats = [{ + formats = [] + # NOTE: VIP-restricted audio + if audio_info.get('is_paid'): + ts = int(time.time()) + vip_info = self._download_json( + f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}', + audio_id, 'Downloading VIP info json', 'Unable to download VIP info file', + query={'device': 'pc', 'isBackend': 'true', '_': ts}) + filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed']) + sign, token, timestamp = self._decrypt_url_params(vip_info['ep']) + vip_url = update_url_query( + f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', { + 'sign': sign, + 'token': token, + 'timestamp': timestamp, + 'buy_key': vip_info['buyKey'], + 'duration': vip_info['duration'], + }) + fmt = { + 'format_id': 'vip', + 'url': vip_url, + 'vcodec': 'none', + } + if '_preview_' in vip_url: + self.report_warning( + f'This tracks requires a VIP account. Using a sample instead. {self._login_hint()}') + fmt.update({ + 'format_note': 'Sample', + 'preference': -10, + **traverse_obj(vip_info, { + 'filesize': ('sampleLength', {int_or_none}), + 'duration': ('sampleDuration', {int_or_none}), + }), + }) + else: + fmt.update(traverse_obj(vip_info, { + 'filesize': ('totalLength', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })) + + fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024) + formats.append(fmt) + + formats.extend([{ 'format_id': f'{bps}k', 'url': audio_info[k], 'abr': bps, 'vcodec': 'none', - } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)]) thumbnails = [] for k in audio_info: From 0e1b941c6b2caa688b0d3332e723d16dbafa4311 Mon Sep 17 00:00:00 2001 From: Leng Date: Sat, 14 Sep 2024 07:18:13 +0800 Subject: [PATCH 017/216] [ie/facebook:reel] Improve metadata extraction Closes #9057, Closes #10824 Authored by: lengzuo --- yt_dlp/extractor/facebook.py | 42 +++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a43ffe95e2..1adb35b5f0 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -84,7 +84,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1692346159, 'thumbnail': r're:^https?://.*', 'uploader_id': '100063551323670', - 'duration': 3132.184, + 'duration': 3133.583, 'view_count': int, 'concurrent_view_count': 0, }, @@ -112,9 +112,10 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl', 'duration': 131.03, 'concurrent_view_count': int, + 'view_count': int, }, }, { 'note': 'Video with DASH manifest', @@ -167,7 +168,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': 'ca63897a90c9452efee5f8c40d080e25', + 'md5': '1659aa21fb3dd1585874f668e81a72c8', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -180,9 +181,10 @@ class FacebookIE(InfoExtractor): 'view_count': int, 'uploader_id': '100059479812265', 'concurrent_view_count': int, - 'duration': 44.478, + 'duration': 44.181, }, }, { + # FIXME: unable to extract uploader, no formats found # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', @@ -241,9 +243,9 @@ class FacebookIE(InfoExtractor): 'timestamp': 1511548260, 'upload_date': '20171124', 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l', 'thumbnail': r're:^https?://.*', - 'duration': 148.435, + 'duration': 148.224, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -271,7 +273,7 @@ class FacebookIE(InfoExtractor): 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', 'thumbnail': r're:^https?://.*', 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl', 'upload_date': '20231228', 'timestamp': 1703804085, 'duration': 394.347, @@ -322,7 +324,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', 'uploader_id': '100066514874195', - 'duration': 4524.212, + 'duration': 4524.001, 'view_count': int, 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, @@ -339,9 +341,9 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', + 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl', 'timestamp': 1549275572, - 'duration': 3.413, + 'duration': 3.283, 'uploader': 'Josef Novak', 'description': '', 'upload_date': '20190204', @@ -396,6 +398,7 @@ class FacebookIE(InfoExtractor): 'playlist_count': 1, 'skip': 'Requires logging in', }, { + # FIXME: Cannot parse data error # data.event.cover_media_renderer.cover_video 'url': 'https://m.facebook.com/events/1509582499515440', 'info_dict': { @@ -498,7 +501,8 @@ class FacebookIE(InfoExtractor): or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) - or get_first(post, ('event', 'event_creator', {dict})) or {}) + or get_first(post, ('event', 'event_creator', {dict})) + or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {}) uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( @@ -524,6 +528,11 @@ class FacebookIE(InfoExtractor): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), + **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', { + 'like_count': ('likers', 'count', {int}), + 'comment_count': ('total_comment_count', {int}), + 'repost_count': ('share_count_reduced', {parse_count}), + }), get_all=False), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -932,18 +941,21 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'f13dd37f2633595982db5ed8765474d3', + 'md5': 'a53256d10fc2105441fe0c4212ed8cea', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$', + 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$', + 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }] From cc85596d5b59f0c14e9381b3675f619c1e12e597 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Sep 2024 18:19:18 -0500 Subject: [PATCH 018/216] [utils] `mimetype2ext`: Recognize `aacp` as `aac` (#10860) Authored by: bashonly --- yt_dlp/utils/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 0d3e707c58..04dd0f8d2c 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2919,6 +2919,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): 'audio/webm': 'webm', 'audio/x-matroska': 'mka', 'audio/x-mpegurl': 'm3u', + 'aacp': 'aac', 'midi': 'mid', 'ogg': 'ogg', 'wav': 'wav', From 325001317d97f4545d66fac44c4ba772c6f45f22 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Sep 2024 18:20:17 -0500 Subject: [PATCH 019/216] [ie] Handle decode errors when reading responses (#10868) Authored by: bashonly --- yt_dlp/extractor/common.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 432db9daf8..9501e5ec9a 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request from ..networking.exceptions import ( HTTPError, IncompleteRead, + TransportError, network_exceptions, ) from ..networking.impersonate import ImpersonateTarget @@ -965,6 +966,9 @@ class InfoExtractor: return False content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data) + if content is False: + assert not fatal + return False return (content, urlh) @staticmethod @@ -1039,7 +1043,15 @@ class InfoExtractor: def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None, data=None): - webpage_bytes = urlh.read() + try: + webpage_bytes = urlh.read() + except TransportError as err: + errmsg = f'{video_id}: Error reading response: {err.msg}' + if fatal: + raise ExtractorError(errmsg, cause=err) + self.report_warning(errmsg) + return False + if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): From c8c078fe28b0ffc15ef9646346c00c592fe71a78 Mon Sep 17 00:00:00 2001 From: Sahil Singh Date: Sat, 14 Sep 2024 04:52:14 +0530 Subject: [PATCH 020/216] [ie/pinterest] Extend `_VALID_URL` (#10867) Closes #10850 Authored by: sahilsinghss73, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/pinterest.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 07f249498c..f0b38893b2 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -109,7 +109,7 @@ class PinterestBaseIE(InfoExtractor): class PinterestIE(PinterestBaseIE): - _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P\d+)' + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE): }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', 'only_matching': True, + }, + { + 'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927', + 'info_dict': { + 'id': '2885187256207927', + 'ext': 'mp4', + 'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 💙💅', + 'description': 'md5:5da41c767d2317e42e49b663b0b2150f', + 'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs', + 'uploader_id': '1142999717836434688', + 'upload_date': '20240702', + 'timestamp': 1719939156, + 'duration': 7.967, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': ['#BlueLagoonPediNails', '#SpaExperience'], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, }] def _real_extract(self, url): From fa83d0b36bc43d30fe9241c1e923f4614864b758 Mon Sep 17 00:00:00 2001 From: naglis <827324+naglis@users.noreply.github.com> Date: Fri, 13 Sep 2024 23:23:19 +0000 Subject: [PATCH 021/216] [ie/LnkGo] Remove extractor (#10904) Authored by: naglis --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/{lnkgo.py => lnk.py} | 75 --------------------------- 2 files changed, 1 insertion(+), 79 deletions(-) rename yt_dlp/extractor/{lnkgo.py => lnk.py} (53%) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8903bf8fca..257d073081 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1041,10 +1041,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .livestreamfails import LivestreamfailsIE -from .lnkgo import ( - LnkGoIE, - LnkIE, -) +from .lnk import LnkIE from .loom import ( LoomFolderIE, LoomIE, diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnk.py similarity index 53% rename from yt_dlp/extractor/lnkgo.py rename to yt_dlp/extractor/lnk.py index 31a7cefd82..593f73410d 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnk.py @@ -1,86 +1,11 @@ from .common import InfoExtractor from ..utils import ( - clean_html, format_field, int_or_none, - parse_iso8601, unified_strdate, ) -class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P[A-Za-z0-9-]+)(?:/(?P\d+))?' - _TESTS = [{ - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', - 'info_dict': { - 'id': '10809', - 'ext': 'mp4', - 'title': "Put'ka: Trys Klausimai", - 'upload_date': '20161216', - 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.', - 'age_limit': 18, - 'duration': 117, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1481904000, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', - 'info_dict': { - 'id': '10467', - 'ext': 'mp4', - 'title': 'Nėrdas: Kompiuterio Valymas', - 'upload_date': '20150113', - 'description': 'md5:7352d113a242a808676ff17e69db6a69', - 'age_limit': 18, - 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421164800, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', - 'only_matching': True, - }] - _AGE_LIMITS = { - 'N-7': 7, - 'N-14': 14, - 'S': 18, - } - _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/{}/{}/false'.format(display_id, video_id or '0'), - display_id)['videoConfig']['videoInfo'] - - video_id = str(video_info['id']) - title = video_info['title'] - prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' - formats = self._extract_m3u8_formats( - self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), - video_id, 'mp4', 'm3u8_native') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'), - 'duration': int_or_none(video_info.get('duration')), - 'description': clean_html(video_info.get('htmlDescription')), - 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), - 'timestamp': parse_iso8601(video_info.get('airDate')), - 'view_count': int_or_none(video_info.get('viewsCount')), - } - - class LnkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P\d+)' From 7adff8caf152dcf96d03aff69ed8545c0a63567c Mon Sep 17 00:00:00 2001 From: ischmidt20 Date: Fri, 13 Sep 2024 19:25:12 -0400 Subject: [PATCH 022/216] [ie/WatchESPN] Improve auth support (#10910) Authored by: ischmidt20 --- yt_dlp/extractor/espn.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 4e9b63524e..552f9af12e 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -294,37 +294,37 @@ class ESPNCricInfoIE(InfoExtractor): class WatchESPNIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' _TESTS = [{ - 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'info_dict': { - 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'ext': 'mp4', - 'title': 'Huddersfield vs. Burnley', - 'duration': 7500, - 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + 'title': 'Abilene Chrstn vs. Texas Tech', + 'duration': 14166, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'info_dict': { - 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'ext': 'mp4', - 'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)', - 'duration': 8335, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'UC Davis vs. California', + 'duration': 9547, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'info_dict': { - 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'ext': 'mp4', - 'title': 'The Wheel - Episode 10', - 'duration': 3352, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'The College Football Show', + 'duration': 3639, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, @@ -353,6 +353,13 @@ class WatchESPNIE(AdobePassIE): if not cookie: self.raise_login_required(method='cookies') + jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt') + id_token = self._download_json( + 'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth', + None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'], + }).encode())['data']['token']['id_token'] + assertion = self._call_bamgrid_api( 'devices', video_id, headers={'Content-Type': 'application/json; charset=UTF-8'}, @@ -371,7 +378,7 @@ class WatchESPNIE(AdobePassIE): })['access_token'] assertion = self._call_bamgrid_api( - 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + 'accounts/grant', video_id, payload={'id_token': id_token}, headers={ 'Authorization': token, 'Content-Type': 'application/json; charset=UTF-8', From 36f9e602ad55679764bc75a4f67f7562b1d6adcf Mon Sep 17 00:00:00 2001 From: naglis <827324+naglis@users.noreply.github.com> Date: Fri, 13 Sep 2024 23:27:10 +0000 Subject: [PATCH 023/216] [ie/screenrec] Add extractor (#10917) Closes #9780 Authored by: naglis --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/screenrec.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 yt_dlp/extractor/screenrec.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 257d073081..597876197f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1812,6 +1812,7 @@ from .screen9 import Screen9IE from .screencast import ScreencastIE from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE +from .screenrec import ScreenRecIE from .scrippsnetworks import ( ScrippsNetworksIE, ScrippsNetworksWatchIE, diff --git a/yt_dlp/extractor/screenrec.py b/yt_dlp/extractor/screenrec.py new file mode 100644 index 0000000000..64f8d2494a --- /dev/null +++ b/yt_dlp/extractor/screenrec.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor + + +class ScreenRecIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P\w{10})' + _TESTS = [{ + 'url': 'https://screenrec.com/share/DasLtbknYo', + 'info_dict': { + 'id': 'DasLtbknYo', + 'ext': 'mp4', + 'title': '02.05.2024_03.01.25_REC', + 'description': 'Recorded with ScreenRec', + 'thumbnail': r're:^https?://.*\.gif$', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'customUrl\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'm3u8 URL', group='url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + } From d8d473002b654ab0e7b97ead869f58b4361eeae1 Mon Sep 17 00:00:00 2001 From: aarubui Date: Sat, 14 Sep 2024 10:09:15 +1000 Subject: [PATCH 024/216] [ie/tenplay] Fix extractor (#10928) Closes #10926 Authored by: aarubui --- yt_dlp/extractor/tenplay.py | 104 +++++++++++++++++------------------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index d8c556acef..07db583470 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,33 +1,31 @@ -import base64 -import datetime as dt import functools import itertools from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin +from ..utils import int_or_none, traverse_obj, url_or_none, urljoin class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?Ptpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ - 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz', 'info_dict': { - 'id': '6226844312001', + 'id': '6336940246112', 'ext': 'mp4', - 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', - 'duration': 186, - 'season': 'Season 39', - 'season_number': 39, + 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach', + 'duration': 74, + 'season': 'Season 41', + 'season_number': 41, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', 'age_limit': 15, - 'timestamp': 1611810000, - 'upload_date': '20210128', + 'timestamp': 1694386800, + 'upload_date': '20230910', 'uploader_id': '2199827728001', }, 'params': { @@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor): }, 'skip': 'Only available in Australia', }, { - 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { - 'id': '6192880312001', + 'id': '9000000000091177', 'ext': 'mp4', - 'title': "Todd Sampson's Body Hack - S4 Ep. 2", - 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'title': 'Neighbours - S42 Ep. 9107', + 'alt_title': 'Thu 05 Sep', + 'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef', + 'duration': 1388, + 'episode': 'Episode 9107', + 'episode_number': 9107, + 'season': 'Season 42', + 'season_number': 42, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', 'age_limit': 15, - 'timestamp': 1600770600, - 'upload_date': '20200922', + 'timestamp': 1725517860, + 'upload_date': '20240905', 'uploader': 'Channel 10', 'uploader_id': '2199827728001', }, 'params': { 'skip_download': True, }, + 'skip': 'Only available in Australia', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor): 'X': 18, } - def _get_bearer_token(self, video_id): - username, password = self._get_login_info() - if username is None or password is None: - self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = dt.datetime.now().strftime('%Y%m%d000000') - _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') - data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ - 'X-Network-Ten-Auth': _auth_header, - }, data=urlencode_postdata({ - 'email': username, - 'password': password, - })) - return 'Bearer ' + data['jwt']['accessToken'] - def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) - headers = {} - if data.get('memberGated') is True: - _token = self._get_bearer_token(content_id) - headers = {'Authorization': _token} - - _video_url = self._download_json( - data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers=headers).get('source') - m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).url + video_data = self._download_json( + f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', + content_id, 'Downloading video JSON') + m3u8_url = self._request_webpage( + HEADRequest(video_data['items'][0]['HLSURL']), + content_id, 'Checking stream URL').url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) + # Attempt to get a higher quality stream + m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') return { + 'id': content_id, 'formats': formats, - 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, - 'id': data.get('altId') or content_id, - 'duration': data.get('duration'), - 'title': data.get('subtitle'), - 'alt_title': data.get('title'), - 'description': data.get('description'), - 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('tvShow'), - 'season_number': int_or_none(data.get('season')), - 'episode_number': int_or_none(data.get('episode')), - 'timestamp': data.get('published'), - 'thumbnail': data.get('imageUrl'), + 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None, 'uploader': 'Channel 10', 'uploader_id': '2199827728001', + **traverse_obj(data, { + 'id': ('altId', {str}), + 'duration': ('duration', {int_or_none}), + 'title': ('subtitle', {str}), + 'alt_title': ('title', {str}), + 'description': ('description', {str}), + 'age_limit': ('classification', {self._AUS_AGES.get}), + 'series': ('tvShow', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('published', {int_or_none}), + 'thumbnail': ('imageUrl', {url_or_none}), + }), } From 300c91274f7ea5b1b0528fc5ee11cf1a61d4079e Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 14 Sep 2024 02:14:09 +0200 Subject: [PATCH 025/216] [ie/Servus] Fix extractor (#10944) Closes #10941 Authored by: seproDev --- yt_dlp/extractor/servus.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/servus.py b/yt_dlp/extractor/servus.py index 117f180814..841c7ebf33 100644 --- a/yt_dlp/extractor/servus.py +++ b/yt_dlp/extractor/servus.py @@ -27,7 +27,7 @@ class ServusIE(InfoExtractor): 'info_dict': { 'id': 'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Klettersteige in den Alpen', + 'title': 'Vie Ferrate - Klettersteige in den Alpen', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2823, @@ -38,6 +38,7 @@ class ServusIE(InfoExtractor): 'season_number': 11, 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', 'episode_number': 8, + 'categories': ['Bergwelten'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -71,8 +72,11 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id, fatal=False) + video = self._download_json( - 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + 'https://api-player.redbull.com/stv/servus-tv-playnet', video_id, 'Downloading video JSON', query={'videoId': video_id}) if not video.get('videoUrl'): self._report_errors(video) @@ -89,7 +93,7 @@ class ServusIE(InfoExtractor): return { 'id': video_id, 'title': video.get('title'), - 'description': self._get_description(video_id) or video.get('description'), + 'description': self._get_description(next_data) or video.get('description'), 'thumbnail': video.get('poster'), 'duration': float_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('currentSunrise')), @@ -100,16 +104,19 @@ class ServusIE(InfoExtractor): 'episode_number': episode_number, 'formats': formats, 'subtitles': subtitles, + **traverse_obj(next_data, ('props', 'pageProps', 'data', { + 'title': ('title', 'rendered', {str}), + 'timestamp': ('stv_date', 'raw', {int}), + 'duration': ('stv_duration', {float_or_none}), + 'categories': ('category_names', ..., {str}), + })), } - def _get_description(self, video_id): - info = self._download_json( - f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', - video_id, fatal=False) - - return join_nonempty(*traverse_obj(info, ( - ('stv_short_description', 'stv_long_description'), - {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + def _get_description(self, next_data): + return join_nonempty(*traverse_obj(next_data, ( + 'props', 'pageProps', 'data', + ('stv_short_description', 'stv_long_description'), {str}, + {lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n') def _report_errors(self, video): playability_errors = traverse_obj(video, ('playabilityErrors', ...)) From 3aa0156e05662923d130ddbc1c82596e38c01a00 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 14 Sep 2024 02:15:07 +0200 Subject: [PATCH 026/216] [ie/Xinpianchang] Fix extractor (#10950) Authored by: seproDev --- yt_dlp/extractor/xinpianchang.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 10849916b8..23ed9270da 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -3,16 +3,13 @@ from ..utils import ( int_or_none, str_or_none, try_get, - update_url_query, url_or_none, ) class XinpianchangIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.xinpianchang\.com/(?P[^/]+?)(?:\D|$)' - IE_NAME = 'xinpianchang' - IE_DESC = 'xinpianchang.com' + _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?Pa\d+)' + IE_DESC = '新片场' _TESTS = [{ 'url': 'https://www.xinpianchang.com/a11766551', 'info_dict': { @@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) - vid = self.find_value_with_regex(var='vid', webpage=webpage) - app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) - api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) - data = self._download_json(api, video_id=video_id)['data'] + video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] + + data = self._download_json( + f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id, + query={'appKey': video_data['appKey']})['data'] formats, subtitles = [], {} for k, v in data.get('resource').items(): if k in ('dash', 'hls'): @@ -72,6 +69,10 @@ class XinpianchangIE(InfoExtractor): 'width': int_or_none(prog.get('width')), 'height': int_or_none(prog.get('height')), 'ext': 'mp4', + 'http_headers': { + # NB: Server returns 403 without the Range header + 'Range': 'bytes=0-', + }, } for prog in v if prog.get('url') or []]) return { @@ -87,6 +88,3 @@ class XinpianchangIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - def find_value_with_regex(self, var, webpage): - return self._search_regex(rf'var\s{var}\s=\s\"(?P[^\"]+)\"', webpage, name=var) From 41a241ca6ffb95b3d9aaf4f42106ca8cba9af1a6 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 14 Sep 2024 02:16:34 +0200 Subject: [PATCH 027/216] [ie/Sen] Add extractor (#10952) Closes #10951 Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/sen.py | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 yt_dlp/extractor/sen.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 597876197f..d8abf0b5d3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1823,6 +1823,7 @@ from .scte import ( SCTECourseIE, ) from .sejmpl import SejmIE +from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( SenateGovIE, diff --git a/yt_dlp/extractor/sen.py b/yt_dlp/extractor/sen.py new file mode 100644 index 0000000000..d8f14ecdc0 --- /dev/null +++ b/yt_dlp/extractor/sen.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class SenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P[0-9a-f-]+)' + _TEST = { + 'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'ext': 'mp4', + 'description': 'Florida, 28 Sep 2022', + 'title': 'Hurricane Ian', + 'tags': ['North America', 'Storm', 'Weather'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_data = self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id) + m3u8_url = (traverse_obj(api_data, ( + 'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any)) + or f'https://vod.sen.com/videos/{video_id}/manifest.m3u8') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + **traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', { + 'title': ('title', 'text', {str}), + 'description': ('descriptions', 0, 'text', {str}), + 'tags': ('badges', ..., 'text', {str}), + })), + } From 4a27b8f092f7f7c10b7a334d3535c97c2af02f0a Mon Sep 17 00:00:00 2001 From: Oto Valek Date: Sat, 14 Sep 2024 02:19:03 +0200 Subject: [PATCH 028/216] [ie/IPrima] Fix zoom URL support (#10959) Closes #6100 Authored by: otovalek --- yt_dlp/extractor/iprima.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index ab26dc5efe..9b91a454b1 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor): 'id': 'p51388', 'ext': 'mp4', 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - 'upload_date': '20201103', - 'timestamp': 1604437480, + 'description': 'md5:57943f6a50d6188288c3a579d2fd5f01', + 'episode': 'Partička (92)', + 'season': 'Partička', + 'series': 'Prima Partička', + 'episode_number': 92, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne', + 'info_dict': { + 'id': 'p1412199', + 'ext': 'mp4', + 'episode_number': 3, + 'episode': 'Tenerife: V říši ohně', + 'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c', + 'duration': 3111.0, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768', + 'title': 'Tenerife: V říši ohně', + 'timestamp': 1711825800, + 'upload_date': '20240330', }, 'params': { 'skip_download': True, # m3u8 download @@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor): video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?Pp\d+)\1', r'pproduct_id\s*=\s*([\'"])(?Pp\d+)\1', + r'let\s+videos\s*=\s*([\'"])(?Pp\d+)\1', ), webpage, 'real id', group='id', default=None) if not video_id: @@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor): final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, - 'title': title, + 'title': final_result.get('title') or title, 'thumbnail': self._html_search_meta( ['thumbnail', 'og:image', 'twitter:image'], webpage, 'thumbnail', default=None), From 173d54c151b987409e3eb09552d8d89ed8fc50f7 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Sat, 14 Sep 2024 12:21:07 +1200 Subject: [PATCH 029/216] [ie/kick:vod] Support new URL format (#10988) Closes #10975 Authored by: grqz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/kick.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 1c1b2a1772..abea5280ba 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -67,7 +67,7 @@ class KickIE(KickBaseIE): @classmethod def suitable(cls, url): - return False if KickClipIE.suitable(url) else super().suitable(url) + return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): channel = self._match_id(url) @@ -98,25 +98,25 @@ class KickIE(KickBaseIE): class KickVODIE(KickBaseIE): IE_NAME = 'kick:vod' - _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', + 'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': 'e74614f4-5270-4319-90ad-32179f19a45c', + 'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'ext': 'mp4', - 'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', + 'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑', 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. LEADER OF THE JUICERS.', 'channel': 'xqc', 'channel_id': '668', 'uploader': 'xQc', 'uploader_id': '676', - 'upload_date': '20240724', - 'timestamp': 1721796562, - 'duration': 18566.0, + 'upload_date': '20240909', + 'timestamp': 1725919141, + 'duration': 10155.0, 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, - 'categories': ['VALORANT'], + 'categories': ['Just Chatting'], 'age_limit': 0, }, 'params': {'skip_download': 'm3u8'}, From a555389c9bb32e589e00b4664974423fb7b04dcd Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:23:22 -0500 Subject: [PATCH 030/216] [ie/HGTVDe] Fix extractor (#10992) Closes #10984 Authored by: bashonly, rdamas Co-authored-by: Robert Damas --- yt_dlp/extractor/dplay.py | 65 +++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 8d7707271d..86950b2445 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -319,32 +319,6 @@ class DPlayIE(DPlayBaseIE): url, display_id, host, 'dplay' + country, country, domain) -class HGTVDeIE(DPlayBaseIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', - 'info_dict': { - 'id': '151205', - 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', - 'ext': 'mp4', - 'title': 'Wer braucht schon eine Toilette', - 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', - 'duration': 1177.024, - 'timestamp': 1595705400, - 'upload_date': '20200725', - 'creator': 'HGTV', - 'series': 'Tiny House - klein, aber oho', - 'season_number': 3, - 'episode_number': 3, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') - - class DiscoveryPlusBaseIE(DPlayBaseIE): """Subclasses must set _PRODUCT, _DISCO_API_PARAMS""" @@ -373,6 +347,45 @@ class DiscoveryPlusBaseIE(DPlayBaseIE): return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS) +class HGTVDeIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'info_dict': { + 'id': '7332936', + 'ext': 'mp4', + 'display_id': 'mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'title': 'Vom Landleben ins Loft', + 'description': 'md5:e5f72c02c853970796dd3818f2e25745', + 'episode': 'Episode 7', + 'episode_number': 7, + 'season': 'Season 7', + 'season_number': 7, + 'series': 'Mein Kleinstadt-Traumhaus', + 'duration': 2645.0, + 'timestamp': 1725998100, + 'upload_date': '20240910', + 'creators': ['HGTV'], + 'tags': [], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/08/09/82a386b9-c688-32c7-b9ff-0b13865f0bae.jpeg', + }, + }] + + _PRODUCT = 'hgtv' + _DISCO_API_PARAMS = { + 'disco_host': 'eu1-prod.disco-api.com', + 'realm': 'hgtv', + 'country': 'de', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': 'Alps:HyogaPlayer:0.0.0', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class GoDiscoveryIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ From a06bb586795ebab87a2356923acfc674d6f0e152 Mon Sep 17 00:00:00 2001 From: Khaoklong51 <159044442+Khaoklong51@users.noreply.github.com> Date: Sat, 14 Sep 2024 23:19:17 +0700 Subject: [PATCH 031/216] [ie/BiliIntl] Fix referer header (#11003) Closes #10996 Authored by: Khaoklong51 --- yt_dlp/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 3163df8ab7..2fe1103cb9 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1852,7 +1852,7 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' - _HEADERS = {'Referer': 'https://www.bilibili.com/'} + _HEADERS = {'Referer': 'https://www.bilibili.tv/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) From 4a9bc8c3630378bc29f0266126b503f6190c0430 Mon Sep 17 00:00:00 2001 From: 1-Byte <1-Byte@users.noreply.github.com> Date: Tue, 17 Sep 2024 21:17:05 +0200 Subject: [PATCH 032/216] [ie/NZZ] Fix extractor (#10461) Closes #5653 Authored by: 1-Byte --- yt_dlp/extractor/nzz.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/nzz.py b/yt_dlp/extractor/nzz.py index ac3b73156e..047c4e1ac9 100644 --- a/yt_dlp/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py @@ -1,9 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - extract_attributes, -) class NZZIE(InfoExtractor): @@ -22,19 +19,14 @@ class NZZIE(InfoExtractor): 'playlist_count': 1, }] + def _entries(self, webpage, page_id): + for script in re.findall(r'(?s)]* data-hid="jw-video-jw[^>]+>(.+?)', webpage): + settings = self._search_json(r'var\s+settings\s*=[^{]*', script, 'settings', page_id, fatal=False) + if entry := self._parse_jwplayer_data(settings, page_id): + yield entry + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - entries = [] - for player_element in re.findall( - r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): - player_params = extract_attributes(player_element) - if player_params.get('data-type') not in ('kaltura_singleArticle',): - self.report_warning('Unsupported player type') - continue - entry_id = player_params['data-id'] - entries.append(self.url_result( - 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) - - return self.playlist_result(entries, page_id) + return self.playlist_result(self._entries(webpage, page_id), page_id) From 3ad0b7f422d547204df687b6d0b2d9110fff3990 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 24 Sep 2024 17:10:42 -0500 Subject: [PATCH 033/216] [ie/tiktok] Fix web formats extraction (#11074) Closes #11034 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9d823a3154..f7e103fe9f 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -542,16 +542,12 @@ class TikTokBaseIE(InfoExtractor): **COMMON_FORMAT_INFO, 'format_id': 'download', 'url': self._proto_relative_url(download_url), + 'format_note': 'watermarked', + 'preference': -2, }) self._remove_duplicate_formats(formats) - for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']): - f.update({ - 'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '), - 'preference': f.get('preference') or -2, - }) - # Is it a slideshow with only audio for download? if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})): audio_url = aweme_detail['music']['playUrl'] @@ -565,7 +561,8 @@ class TikTokBaseIE(InfoExtractor): 'vcodec': 'none', }) - return formats + # Filter out broken formats, see https://github.com/yt-dlp/yt-dlp/issues/11034 + return [f for f in formats if urllib.parse.urlparse(f['url']).hostname != 'www.tiktok.com'] def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False): author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), { From fa2be9a7c63babede07480151363e54eee5702bd Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 24 Sep 2024 17:12:02 -0500 Subject: [PATCH 034/216] [ie/youtube] Fix `format_note` (Bugfix for 3a3bd00037e9908e87da4fa9f2ad772aa34dc60e) (#11028) Authored by: bashonly --- yt_dlp/extractor/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 3d11c32f6e..fc50dbc05f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4110,7 +4110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) - client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) + client_name = fmt[STREAMING_DATA_CLIENT_NAME] po_token = fmt.get(STREAMING_DATA_PO_TOKEN) if po_token: @@ -4135,7 +4135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), is_damaged and 'DAMAGED', is_broken and 'BROKEN', - (self.get_param('verbose') or all_formats) and client_name, + (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), @@ -4234,7 +4234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) if self.get_param('verbose') or all_formats: - f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') + f['format_note'] = join_nonempty( + f.get('format_note'), short_client_name(client_name), delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] @@ -4245,7 +4246,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for sd in streaming_data: - client_name = sd.get(STREAMING_DATA_CLIENT_NAME) + client_name = sd[STREAMING_DATA_CLIENT_NAME] po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: From 5bb1aa04dafce13ba9de707ea53169fab58b5207 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:59:20 -0500 Subject: [PATCH 035/216] [networking] Pin `curl-cffi` version to < 0.7.2 (#11092) Ref: https://github.com/lexiforest/curl_cffi/issues/394 Authored by: bashonly --- pyproject.toml | 2 +- yt_dlp/networking/_curlcffi.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fe6894a428..86a8f0f7e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dependencies = [ default = [] curl-cffi = [ "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", - "curl-cffi>=0.5.10,!=0.6.*,<0.8; os_name!='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", ] secretstorage = [ "cffi", diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index e8a67b7347..0643348e7e 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -31,9 +31,9 @@ if curl_cffi is None: curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) -if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)): +if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only curl_cffi versions 0.5.10, 0.7.X are supported') + raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt From b397a64691421ace5df09457c2a764821a2dc6f2 Mon Sep 17 00:00:00 2001 From: sepro Date: Wed, 25 Sep 2024 23:13:54 +0200 Subject: [PATCH 036/216] [cookies] Improve error message for Windows `--cookies-from-browser chrome` issue (#11090) Authored by: seproDev --- yt_dlp/cookies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 070d2fcb98..cff8d74a74 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1053,8 +1053,9 @@ def _decrypt_windows_dpapi(ciphertext, logger): ctypes.byref(blob_out), # pDataOut ) if not ret: - logger.warning('failed to decrypt with DPAPI', only_once=True) - return None + message = 'Failed to decrypt with DPAPI. See https://github.com/yt-dlp/yt-dlp/issues/10927 for more info' + logger.error(message) + raise DownloadError(message) # force exit result = ctypes.string_at(blob_out.pbData, blob_out.cbData) ctypes.windll.kernel32.LocalFree(blob_out.pbData) From fb8b7f226d251e521a89b23c415e249e5b788e5c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:07:17 -0500 Subject: [PATCH 037/216] [build] Bump PyInstaller version pin to `>=6.10.0` (#10709) Authored by: bashonly --- .github/workflows/build.yml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4ff1cbc1dd..bd2e42d9af 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -409,7 +409,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | @@ -469,7 +469,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | diff --git a/pyproject.toml b/pyproject.toml index 86a8f0f7e8..18d9a0a3a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ test = [ "pytest~=8.1", ] pyinstaller = [ - "pyinstaller>=6.7.0", # for compat with setuptools>=70 + "pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0 ] py2exe = [ "py2exe>=0.12", From e2b3634e299be9c16a247ece3b1858d83889c324 Mon Sep 17 00:00:00 2001 From: szantnerb <2652078+szantnerb@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:23:26 +0200 Subject: [PATCH 038/216] [ie/mediaklikk] Fix extractor (#11083) Closes #11061 Authored by: szantnerb --- yt_dlp/extractor/mediaklikk.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index f51342060b..197e91d1d9 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -16,6 +16,15 @@ class MediaKlikkIE(InfoExtractor): (?P[^/#?_]+)''' _TESTS = [{ + 'url': 'https://mediaklikk.hu/filmajanlo/cikk/az-ajto/', + 'info_dict': { + 'id': '668177', + 'title': 'Az ajtó', + 'display_id': 'az-ajto', + 'ext': 'mp4', + 'thumbnail': 'https://cdn.cms.mtv.hu/wp-content/uploads/sites/4/2016/01/vlcsnap-2023-07-31-14h18m52s111.jpg', + }, + }, { # (old) mediaklikk. date in html. 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'info_dict': { @@ -37,6 +46,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230903', 'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) m4sport 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', @@ -59,6 +69,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230908', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg', }, + 'skip': 'Webpage redirects to 404 page', }, { # m4sport with *video/ url and no date 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', @@ -69,6 +80,7 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png', }, + 'skip': 'Webpage redirects to 404 page', }, { # (old) hirado 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', @@ -90,6 +102,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230911', 'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg', }, + 'skip': 'Webpage redirects to video list page', }, { # (old) petofilive 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', @@ -112,6 +125,7 @@ class MediaKlikkIE(InfoExtractor): 'upload_date': '20230909', 'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg', }, + 'skip': 'Webpage redirects to video list page', }] def _real_extract(self, url): @@ -143,14 +157,14 @@ class MediaKlikkIE(InfoExtractor): if not playlist_url: raise ExtractorError('Unable to extract playlist url') - formats = self._extract_wowza_formats( - playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(playlist_url, video_id) return { 'id': video_id, 'title': title, 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, 'upload_date': upload_date, 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage), } From 28b0ecba2af5b4919f198474b3d00a76ef322c31 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Fri, 27 Sep 2024 00:29:21 +0800 Subject: [PATCH 039/216] [ie/Mojevideo] Add extractor (#11019) Closes #8159 Authored by: 04-pasha-04, pzhlkj6612 Co-authored-by: pasha --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/mojevideo.py | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 yt_dlp/extractor/mojevideo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d8abf0b5d3..42607e8809 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1166,6 +1166,7 @@ from .mlb import ( ) from .mlssoccer import MLSSoccerIE from .mocha import MochaVideoIE +from .mojevideo import MojevideoIE from .mojvideo import MojvideoIE from .monstercat import MonstercatIE from .motherless import ( diff --git a/yt_dlp/extractor/mojevideo.py b/yt_dlp/extractor/mojevideo.py new file mode 100644 index 0000000000..145e306970 --- /dev/null +++ b/yt_dlp/extractor/mojevideo.py @@ -0,0 +1,121 @@ +from .common import InfoExtractor +from ..utils import js_to_json, remove_end, update_url_query + + +class MojevideoIE(InfoExtractor): + IE_DESC = 'mojevideo.sk' + _VALID_URL = r'https?://(?:www\.)?mojevideo\.sk/video/(?P\w+)/(?P[\w()]+?)\.html' + + _TESTS = [{ + 'url': 'https://www.mojevideo.sk/video/3d17c/chlapci_dobetonovali_sme_mame_hotovo.html', + 'md5': '384a4628bd2bbd261c5206cf77c38c17', + 'info_dict': { + 'id': '3d17c', + 'ext': 'mp4', + 'title': 'Chlapci dobetónovali sme, máme hotovo!', + 'display_id': 'chlapci_dobetonovali_sme_mame_hotovo', + 'description': 'md5:a0822126044050d304a9ef58c92ddb34', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/250236.jpg', + 'duration': 21.0, + 'upload_date': '20230919', + 'timestamp': 1695129706, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/14677/den_blbec.html', + 'md5': '517c3e111c53a67d10b429c1f344ba2f', + 'info_dict': { + 'id': '14677', + 'ext': 'mp4', + 'title': 'Deň blbec?', + 'display_id': 'den_blbec', + 'description': 'I maličkosť vám môže zmeniť celý deň. Nikdy nezahadzujte žuvačky na zem!', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/83575.jpg', + 'duration': 100.0, + 'upload_date': '20120515', + 'timestamp': 1337076481, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/2feb2/band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd).html', + 'md5': '64599a23d3ac31cf2fe069e4353d8162', + 'info_dict': { + 'id': '2feb2', + 'ext': 'mp4', + 'title': 'BAND-MAID - onset (Instrumental) Live - Zepp Tokyo (Full HD)', + 'display_id': 'band_maid_onset_(instrumental)_live_zepp_tokyo_(full_hd)', + 'description': 'Výborná inštrumentálna skladba od skupiny BAND-MAID.', + 'thumbnail': 'https://fs5.mojevideo.sk/imgfb/196274.jpg', + 'duration': 240.0, + 'upload_date': '20190708', + 'timestamp': 1562576592, + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/358c8/dva_nissany_skyline_strielaju_v_londyne.html', + 'only_matching': True, + }, { + # 720p + 'url': 'https://www.mojevideo.sk/video/2455d/gopro_hero4_session_nova_sportova_vodotesna_kamera.html', + 'only_matching': True, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/352ee/amd_rx_6800_xt_vs_nvidia_rtx_3080_(test_v_9_hrach).html', + 'only_matching': True, + }, { + # 1080p + 'url': 'https://www.mojevideo.sk/video/2cbeb/trailer_z_avengers_infinity_war.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + + video_id_dec = self._search_regex( + r'\bvId\s*=\s*(\d+)', webpage, 'video id', fatal=False) or str(int(video_id, 16)) + video_exp = self._search_regex(r'\bvEx\s*=\s*["\'](\d+)', webpage, 'video expiry') + video_hashes = self._search_json( + r'\bvHash\s*=', webpage, 'video hashes', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json) + + formats = [] + for video_hash, (suffix, quality, format_note) in zip(video_hashes, [ + ('', 1, 'normálna kvalita'), + ('_lq', 0, 'nízka kvalita'), + ('_hd', 2, 'HD-720p'), + ('_fhd', 3, 'FULL HD-1080p'), + ('_2k', 4, '2K-1440p'), + ]): + formats.append({ + 'format_id': f'mp4-{quality}', + 'quality': quality, + 'format_note': format_note, + 'url': update_url_query( + f'https://cache01.mojevideo.sk/securevideos69/{video_id_dec}{suffix}.mp4', { + 'md5': video_hash, + 'expires': video_exp, + }), + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': (self._og_search_title(webpage, default=None) + or remove_end(self._html_extract_title(webpage, 'title'), ' - Mojevideo')), + 'description': self._og_search_description(webpage), + **self._search_json_ld(webpage, video_id, default={}), + } From b37417e4f934fd8909788b493d017777155b0ae5 Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 26 Sep 2024 18:32:51 +0200 Subject: [PATCH 040/216] [ie/SnapchatSpotlight] Add extractor (#11030) Closes #1797 Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/snapchat.py | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 yt_dlp/extractor/snapchat.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42607e8809..fddfba81bd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1880,6 +1880,7 @@ from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .smotrim import SmotrimIE +from .snapchat import SnapchatSpotlightIE from .snotr import SnotrIE from .sohu import ( SohuIE, diff --git a/yt_dlp/extractor/snapchat.py b/yt_dlp/extractor/snapchat.py new file mode 100644 index 0000000000..732677c190 --- /dev/null +++ b/yt_dlp/extractor/snapchat.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class SnapchatSpotlightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snapchat\.com/spotlight/(?P\w+)' + + _TESTS = [{ + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'md5': '46c580f63592d0cbb76e974d2f9f0fcc', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'ext': 'mp4', + 'title': 'Views 💕', + 'description': '', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/d/kKJHIR1QAznRKK9jgYYDq\.256\.IRZXSOY', + 'duration': 4.665, + 'timestamp': 1637777831.369, + 'upload_date': '20211124', + 'repost_count': int, + 'uploader': 'shreypatel57', + 'uploader_url': 'https://www.snapchat.com/add/shreypatel57', + }, + }, { + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'md5': '4cd9626458c1a0e3e6dbe72c544a9ec2', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'ext': 'mp4', + 'title': 'Spotlight Snap', + 'description': 'How he flirt her teacher🤭🤭🤩😍 #kdrama#cdrama #dramaclips #dramaspotlight', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/i/ztfr6xFs0FOcFhwVczWfj\.256\.IRZXSOY', + 'duration': 10.91, + 'timestamp': 1722720291.307, + 'upload_date': '20240803', + 'view_count': int, + 'repost_count': int, + 'uploader': 'ganda0535', + 'uploader_url': 'https://www.snapchat.com/add/ganda0535', + 'tags': ['#dramaspotlight', '#dramaclips', '#cdrama', '#kdrama'], + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + video_data = traverse_obj(page_props, ( + 'spotlightFeed', 'spotlightStories', + lambda _, v: v['story']['storyId']['value'] == video_id, 'metadata', any), None) + + return { + 'id': video_id, + 'ext': 'mp4', + **traverse_obj(video_data, ('videoMetadata', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}), + 'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('shareCount', {int_or_none}), + 'url': ('contentUrl', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'uploader': ('creator', 'personCreator', 'username', {str}), + 'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}), + })), + **traverse_obj(video_data, { + 'description': ('description', {str}), + 'tags': ('hashtags', ..., {str}), + 'view_count': ('engagementStats', 'viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('engagementStats', 'shareCount', {int_or_none}), + }), + } From 416686ed0cf792ec44ab059f3b229dd776077e14 Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 26 Sep 2024 18:35:19 +0200 Subject: [PATCH 041/216] [ie/ertgr] Fix video extraction (#11091) Closes #8955 Authored by: seproDev --- yt_dlp/extractor/ertgr.py | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py index 864aa6dc5a..6f3f60ff43 100644 --- a/yt_dlp/extractor/ertgr.py +++ b/yt_dlp/extractor/ertgr.py @@ -17,6 +17,7 @@ from ..utils import ( url_or_none, variadic, ) +from ..utils.traversal import traverse_obj class ERTFlixBaseIE(InfoExtractor): @@ -74,29 +75,28 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): def _extract_formats_and_subs(self, video_id): media_info = self._call_api(video_id, codename=video_id) - formats, subs = [], {} - for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: - for media in try_get(media_file, lambda x: x['Formats'], list) or []: - fmt_url = url_or_none(try_get(media, lambda x: x['Url'])) - if not fmt_url: - continue - ext = determine_ext(fmt_url) - if ext == 'm3u8': - formats_, subs_ = self._extract_m3u8_formats_and_subtitles( - fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False) - elif ext == 'mpd': - formats_, subs_ = self._extract_mpd_formats_and_subtitles( - fmt_url, video_id, mpd_id='dash', fatal=False) - else: - formats.append({ - 'url': fmt_url, - 'format_id': str_or_none(media.get('Id')), - }) - continue - formats.extend(formats_) - self._merge_subtitles(subs_, target=subs) + formats, subtitles = [], {} + for media in traverse_obj(media_info, ( + 'MediaFiles', lambda _, v: v['RoleCodename'] == 'main', + 'Formats', lambda _, v: url_or_none(v['Url']))): + fmt_url = media['Url'] + ext = determine_ext(fmt_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + fmt_url, video_id, m3u8_id='hls', ext='mp4', fatal=False) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + fmt_url, video_id, mpd_id='dash', fatal=False) + else: + formats.append({ + 'url': fmt_url, + 'format_id': str_or_none(media.get('Id')), + }) + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) - return formats, subs + return formats, subtitles def _real_extract(self, url): video_id = self._match_id(url) From 124f058b546d652a359c67025bb479789bfbef0b Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Fri, 27 Sep 2024 04:39:48 +1200 Subject: [PATCH 042/216] [ie/Germanupa] Add extractor (#10538) Closes #10527 Authored by: grqz --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/germanupa.py | 91 +++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 yt_dlp/extractor/germanupa.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fddfba81bd..4b1f4c316d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -730,6 +730,7 @@ from .genius import ( GeniusIE, GeniusLyricsIE, ) +from .germanupa import GermanupaIE from .getcourseru import ( GetCourseRuIE, GetCourseRuPlayerIE, diff --git a/yt_dlp/extractor/germanupa.py b/yt_dlp/extractor/germanupa.py new file mode 100644 index 0000000000..e40f016b2f --- /dev/null +++ b/yt_dlp/extractor/germanupa.py @@ -0,0 +1,91 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + parse_qs, + traverse_obj, + url_or_none, +) + + +class GermanupaIE(InfoExtractor): + IE_DESC = 'germanupa.de' + _VALID_URL = r'https?://germanupa\.de/mediathek/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://germanupa.de/mediathek/4-figma-beratung-deine-sprechstunde-fuer-figma-fragen', + 'info_dict': { + 'id': '909179246', + 'title': 'Tutorial: #4 Figma Beratung - Deine Sprechstunde für Figma-Fragen', + 'ext': 'mp4', + 'uploader': 'German UPA', + 'uploader_id': 'germanupa', + 'thumbnail': 'https://i.vimeocdn.com/video/1792564420-7415283ccef8bf8702dab8c6b7515555ceeb7a1c11371ffcc133b8e887dbf70e-d_1280', + 'uploader_url': 'https://vimeo.com/germanupa', + 'duration': 3987, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'audio, uses GenericIE', + 'url': 'https://germanupa.de/mediathek/live-vom-ux-festival-neuigkeiten-von-figma-jobmarkt-agenturszene-interview-zu-sustainable', + 'info_dict': { + 'id': '1867346676', + 'title': 'Live vom UX Festival: Neuigkeiten von Figma, Jobmarkt, Agenturszene & Interview zu Sustainable UX', + 'ext': 'opus', + 'timestamp': 1720545088, + 'upload_date': '20240709', + 'duration': 3910.557, + 'like_count': int, + 'description': 'md5:db2aed5ff131e177a7b33901e9a8db05', + 'uploader': 'German UPA', + 'repost_count': int, + 'genres': ['Science'], + 'license': 'all-rights-reserved', + 'uploader_url': 'https://soundcloud.com/user-80097677', + 'uploader_id': '471579486', + 'view_count': int, + 'comment_count': int, + 'thumbnail': 'https://i1.sndcdn.com/artworks-oCti2e9GhaZFWBqY-48ybGw-original.jpg', + }, + }, { + 'note': 'Nur für Mitglieder/Just for members', + 'url': 'https://germanupa.de/mediathek/ux-festival-2024-usability-tests-und-ai', + 'info_dict': { + 'id': '986994430', + 'title': 'UX Festival 2024 "Usability Tests und AI" von Lennart Weber', + 'ext': 'mp4', + 'release_date': '20240719', + 'uploader_url': 'https://vimeo.com/germanupa', + 'timestamp': 1721373980, + 'license': 'by-sa', + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/1904187064-2a672630c30f9ad787bd390bff3f51d7506a3e8416763ba6dbf465732b165c5c-d_1280', + 'duration': 2146, + 'release_timestamp': 1721373980, + 'uploader': 'German UPA', + 'uploader_id': 'germanupa', + 'upload_date': '20240719', + 'comment_count': int, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + 'skip': 'login required', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + param_url = traverse_obj( + self._search_regex( + r']+data-src\s*?=\s*?([\'"])(?Phttps://germanupa\.de/media/oembed\?url=(?:(?!\1).)+)\1', + webpage, 'embedded video', default=None, group='url'), + ({parse_qs}, 'url', 0, {url_or_none})) + + if not param_url: + if self._search_regex( + r']+class\s*?=\s*?([\'"])(?:(?!\1).)*login-wrapper(?:(?!\1).)*\1', + webpage, 'login wrapper', default=None): + self.raise_login_required('This video is only available for members') + return self.url_result(url, 'Generic') # Fall back to generic to extract audio + + real_url = param_url.replace('https://vimeo.com/', 'https://player.vimeo.com/video/') + return self.url_result(VimeoIE._smuggle_referrer(real_url, url), VimeoIE, video_id) From ad0b857f459a6d390fbf124183916218c52f223a Mon Sep 17 00:00:00 2001 From: tony-hn <25278435+tony-hn@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:53:52 +0100 Subject: [PATCH 043/216] [ie/RumbleChannel] Fix extractor (#11049) Closes #10833 Authored by: tony-hn --- yt_dlp/extractor/rumble.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index db780a2cf4..74c7e4f176 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -8,14 +8,17 @@ from ..utils import ( UnsupportedError, clean_html, determine_ext, + extract_attributes, format_field, get_element_by_class, + get_elements_html_by_class, int_or_none, join_nonempty, parse_count, parse_iso8601, traverse_obj, unescapeHTML, + urljoin, ) @@ -382,8 +385,10 @@ class RumbleChannelIE(InfoExtractor): if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise - for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): - yield self.url_result('https://rumble.com' + video_url) + for video_url in traverse_obj( + get_elements_html_by_class('videostream__link', webpage), (..., {extract_attributes}, 'href'), + ): + yield self.url_result(urljoin('https://rumble.com', video_url)) def _real_extract(self, url): url, playlist_id = self._match_valid_url(url).groups() From 5a8a05aebb49693e78e1123015837ed5e961ff76 Mon Sep 17 00:00:00 2001 From: diman8 Date: Thu, 26 Sep 2024 18:57:00 +0200 Subject: [PATCH 044/216] [ie/SVTPage] Fix extractor (#11010) Authored by: diman8 --- yt_dlp/extractor/svt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38782abac7..b5df2e1a18 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -472,7 +472,7 @@ class SVTPageIE(SVTBaseIE): title = self._og_search_title(webpage) urql_state = self._search_json( - r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id) + r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} From a2000bc85730c950351d78bb818493dc39dca3cb Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:20:14 -0500 Subject: [PATCH 045/216] [ie/bilibili] Fix chapters and subtitles extraction (#11099) Closes #11089 Authored by: bashonly --- yt_dlp/extractor/bilibili.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 2fe1103cb9..62f68fbc6d 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -46,6 +46,7 @@ from ..utils import ( class BilibiliBaseIE(InfoExtractor): + _HEADERS = {'Referer': 'https://www.bilibili.com/'} _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') _WBI_KEY_CACHE_TIMEOUT = 30 # exact expire timeout is unclear, use 30s for one session _wbi_key_cache = {} @@ -192,7 +193,7 @@ class BilibiliBaseIE(InfoExtractor): video_info = self._download_json( 'https://api.bilibili.com/x/player/v2', video_id, query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, - note=f'Extracting subtitle info {cid}') + note=f'Extracting subtitle info {cid}', headers=self._HEADERS) if traverse_obj(video_info, ('data', 'need_login_subtitle')): self.report_warning( f'Subtitles are only available when logged in. {self._login_hint()}', only_once=True) @@ -207,7 +208,7 @@ class BilibiliBaseIE(InfoExtractor): def _get_chapters(self, aid, cid): chapters = aid and cid and self._download_json( 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, - note='Extracting chapters', fatal=False) + note='Extracting chapters', fatal=False, headers=self._HEADERS) return traverse_obj(chapters, ('data', 'view_points', ..., { 'title': 'content', 'start_time': 'from', @@ -1021,8 +1022,6 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE): class BilibiliCheeseBaseIE(BilibiliBaseIE): - _HEADERS = {'Referer': 'https://www.bilibili.com/'} - def _extract_episode(self, season_info, ep_id): episode_info = traverse_obj(season_info, ( 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) From 9f5c9a90898c5a1e672922d9cd799716c73cee34 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:21:03 -0500 Subject: [PATCH 046/216] [ie/wistia] Support password-protected videos (#11100) Closes #10914 Authored by: bashonly --- yt_dlp/extractor/wistia.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index fb2a8648fd..df7ecb3cdc 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -8,6 +8,7 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, parse_qs, @@ -25,16 +26,25 @@ class WistiaBaseIE(InfoExtractor): def _download_embed_config(self, config_type, config_id, referer): base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}' + video_password = self.get_param('videopassword') embed_config = self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. - }) + }, query=filter_dict({'password': video_password})) error = traverse_obj(embed_config, 'error') if error: raise ExtractorError( f'Error while getting the playlist: {error}', expected=True) + if traverse_obj(embed_config, ( + 'media', ('embed_options', 'embedOptions'), 'plugin', + 'passwordProtectedVideo', 'on', any)) == 'true': + if video_password: + raise ExtractorError('Invalid video password', expected=True) + raise ExtractorError( + 'This content is password-protected. Use the --video-password option', expected=True) + return embed_config def _get_real_ext(self, url): From 1d84b780cf33a1d84756825ac23f990a905703df Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:26:10 -0500 Subject: [PATCH 047/216] [ie/youtube:clip] Prioritize `https` formats (#11102) Closes #10856 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index fc50dbc05f..1382c01b60 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -7655,6 +7655,8 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'id': clip_id, 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, + '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang'), } From eabb4680fdb09ba1f48d174a700a2e3b43f82add Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:27:16 -0500 Subject: [PATCH 048/216] [ie/niconico] Fix m3u8 formats extraction (#11103) Closes #10724 Authored by: bashonly --- yt_dlp/extractor/niconico.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 179e7a9b16..e06740d62e 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -420,7 +420,7 @@ class NiconicoIE(InfoExtractor): 'x-request-with': 'https://www.nicovideo.jp', })['data']['contentUrl'] # Getting all audio formats results in duplicate video formats which we filter out later - dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id) + dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id, 'mp4') # m3u8 extraction does not provide audio bitrates, so extract from the API data and fix for audio_fmt in traverse_obj(dms_fmts, lambda _, v: v['vcodec'] == 'none'): @@ -432,7 +432,6 @@ class NiconicoIE(InfoExtractor): 'asr': ('samplingRate', {int_or_none}), }), get_all=False), 'acodec': 'aac', - 'ext': 'm4a', } # Sort before removing dupes to keep the format dicts with the lowest tbr From 7f909046f4dc0fba472b4963145aef6e0d42491b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:37:16 -0500 Subject: [PATCH 049/216] [ie/abc.net.au:iview:showseries] Fix extraction (#11101) Closes #10475 Authored by: bashonly --- yt_dlp/extractor/abc.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 7518ba6f0d..7296be73b3 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -387,17 +387,27 @@ class ABCIViewShowSeriesIE(InfoExtractor): 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$', }, 'playlist_count': 15, + 'skip': 'This program is not currently available in ABC iview', + }, { + 'url': 'https://iview.abc.net.au/show/inbestigators', + 'info_dict': { + 'id': '175343-1', + 'title': 'Series 1', + 'description': 'md5:b9976935a6450e5b78ce2a940a755685', + 'series': 'The Inbestigators', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.+\.jpg', + }, + 'playlist_count': 17, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - webpage_data = self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', - webpage, 'initial state') - video_data = self._parse_json( - unescapeHTML(webpage_data).encode().decode('unicode_escape'), show_id) - video_data = video_data['route']['pageData']['_embedded'] + video_data = self._search_json( + r'window\.__INITIAL_STATE__\s*=\s*[\'"]', webpage, 'initial state', show_id, + transform_source=lambda x: x.encode().decode('unicode_escape'), + end_pattern=r'[\'"]\s*;')['route']['pageData']['_embedded'] highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl']) if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'): From 48d629d461e05b1b19f5e53dc959bb9ebe95da42 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:38:08 -0500 Subject: [PATCH 050/216] [ie/YleAreena] Support podcasts (#11104) Closes #10840 Authored by: bashonly --- yt_dlp/extractor/yle_areena.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/yle_areena.py b/yt_dlp/extractor/yle_areena.py index ef9e96804c..c0a218e2fc 100644 --- a/yt_dlp/extractor/yle_areena.py +++ b/yt_dlp/extractor/yle_areena.py @@ -10,7 +10,7 @@ from ..utils import ( class YleAreenaIE(InfoExtractor): - _VALID_URL = r'https?://areena\.yle\.fi/(?P[\d-]+)' + _VALID_URL = r'https?://areena\.yle\.fi/(?Ppodcastit/)?(?P[\d-]+)' _GEO_COUNTRIES = ['FI'] _TESTS = [ { @@ -77,7 +77,7 @@ class YleAreenaIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, is_podcast = self._match_valid_url(url).group('id', 'podcast') info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', @@ -103,8 +103,11 @@ class YleAreenaIE(InfoExtractor): 'name': sub.get('kind'), }) - kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str) - if kaltura_id: + if is_podcast: + info_dict = { + 'url': video_data['data']['ongoing_ondemand']['media_url'], + } + elif kaltura_id := traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id', {str})): info_dict = { '_type': 'url_transparent', 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), @@ -114,13 +117,11 @@ class YleAreenaIE(InfoExtractor): formats, subs = self._extract_m3u8_formats_and_subtitles( video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls') self._merge_subtitles(subs, target=subtitles) - info_dict = { - 'id': video_id, - 'formats': formats, - } + info_dict = {'formats': formats} return { **info_dict, + 'id': video_id, 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) or episode or info.get('title')), 'description': description, From 0aa4426e9a35f7f8e184f1f2082b3b313c1448f7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:38:40 -0500 Subject: [PATCH 051/216] [ie/kick:clips] Support new URL format (#11107) Closes #11105 Authored by: bashonly --- yt_dlp/extractor/kick.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index abea5280ba..bd21e59501 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -148,7 +148,7 @@ class KickVODIE(KickBaseIE): class KickClipIE(KickBaseIE): IE_NAME = 'kick:clips' - _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/?\?(?:[^#]+&)?clip=(?Pclip_[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+(?:/clips/|/?\?(?:[^#]+&)?clip=)(?Pclip_[\w-]+)' _TESTS = [{ 'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X', 'info_dict': { @@ -189,6 +189,26 @@ class KickClipIE(KickBaseIE): 'age_limit': 0, }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/spreen/clips/clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'info_dict': { + 'id': 'clip_01J8RGZRKHXHXXKJEHGRM932A5', + 'ext': 'mp4', + 'title': 'KLJASLDJKLJKASDLJKDAS', + 'channel': 'spreen', + 'channel_id': '5312671', + 'uploader': 'AnormalBarraBaja', + 'uploader_id': '26518262', + 'duration': 43.0, + 'upload_date': '20240927', + 'timestamp': 1727399987, + 'thumbnail': 'https://clips.kick.com/clips/f2/clip_01J8RGZRKHXHXXKJEHGRM932A5/thumbnail.webp', + 'view_count': int, + 'like_count': int, + 'categories': ['Minecraft'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): From c08e0b20b5edd8957b8318716bc14e896d1b96f4 Mon Sep 17 00:00:00 2001 From: Kieran Date: Fri, 27 Sep 2024 13:52:41 -0700 Subject: [PATCH 052/216] Allow `none` arg to negate `--convert-subs` and `--convert-thumbnails` (#11066) Authored by: kieraneglin --- README.md | 10 +++++++--- yt_dlp/__init__.py | 5 +++++ yt_dlp/options.py | 8 +++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 428eb9f478..1d6a4a86d5 100644 --- a/README.md +++ b/README.md @@ -999,12 +999,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git be used multiple times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format - (currently supported: ass, lrc, srt, vtt) - (Alias: --convert-subtitles) + (currently supported: ass, lrc, srt, vtt). + Use "--convert-subs none" to disable + conversion (default) (Alias: --convert- + subtitles) --convert-thumbnails FORMAT Convert the thumbnails to another format (currently supported: jpg, png, webp). You can specify multiple rules using similar - syntax as --remux-video + syntax as "--remux-video". Use "--convert- + thumbnails none" to disable conversion + (default) --split-chapters Split video into multiple files based on internal chapters. The "chapter:" prefix can be used with "--paths" and "--output" to set diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c0b8e3b507..c2d19f94a0 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -235,6 +235,11 @@ def validate_options(opts): validate_regex('format sorting', f, FormatSorter.regex) # Postprocessor formats + if opts.convertsubtitles == 'none': + opts.convertsubtitles = None + if opts.convertthumbnails == 'none': + opts.convertthumbnails = None + validate_regex('merge output format', opts.merge_output_format, r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS)))) validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index ffe2463fe2..8077d5d88f 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1725,15 +1725,17 @@ def create_parser(): '--convert-subs', '--convert-sub', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, help=( - 'Convert the subtitles to another format (currently supported: {}) ' - '(Alias: --convert-subtitles)'.format(', '.join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))))) + 'Convert the subtitles to another format ' + f'(currently supported: {", ".join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))}). ' + 'Use "--convert-subs none" to disable conversion (default) (Alias: --convert-subtitles)')) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' f'(currently supported: {", ".join(sorted(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))}). ' - 'You can specify multiple rules using similar syntax as --remux-video')) + 'You can specify multiple rules using similar syntax as "--remux-video". ' + 'Use "--convert-thumbnails none" to disable conversion (default)')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, From a1b4ac2b8ed8e6eaa56044d439f1e0d00c2ba218 Mon Sep 17 00:00:00 2001 From: fireattack Date: Sat, 28 Sep 2024 04:57:57 +0800 Subject: [PATCH 053/216] [ie/vimeo] Fix HLS audio format sorting (#11082) Closes #10854 Authored by: fireattack --- yt_dlp/extractor/vimeo.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 9a03948cd9..367d5e5835 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -21,6 +21,7 @@ from ..utils import ( parse_filesize, parse_iso8601, parse_qs, + qualities, smuggle_url, str_or_none, traverse_obj, @@ -146,6 +147,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): }) # TODO: fix handling of 308 status code returned for live archive manifest requests + QUALITIES = ('low', 'medium', 'high') + quality = qualities(QUALITIES) sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): @@ -166,6 +169,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id, note=f'Downloading {cdn_name} m3u8 information', fatal=False) + # m3u8 doesn't give audio bitrates; need to prioritize based on GROUP-ID + # See: https://github.com/yt-dlp/yt-dlp/issues/10854 + for f in fmts: + if mobj := re.search(rf'audio-({"|".join(QUALITIES)})', f['format_id']): + f['quality'] = quality(mobj.group(1)) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': From 8f4ea14680c7865d8ffac10a9174205d1d84ada7 Mon Sep 17 00:00:00 2001 From: rakslice Date: Fri, 27 Sep 2024 14:32:39 -0700 Subject: [PATCH 054/216] Fix format sorting bug with vp9.2 vcodec (#10884) Authored by: rakslice --- test/test_YoutubeDL.py | 29 +++++++++++++++++++++++++++++ test/test_utils.py | 5 +++++ yt_dlp/utils/_utils.py | 2 +- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1847c4ffd8..a99e624080 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -236,6 +236,35 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_by_vcodec_sort(self): + formats = [ + {'format_id': 'av1-format', 'ext': 'mp4', 'vcodec': 'av1', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-hdr-format', 'ext': 'mp4', 'vcodec': 'vp09.02.50.10.01.09.18.09.00', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-sdr-format', 'ext': 'mp4', 'vcodec': 'vp09.00.50.08', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'h265-format', 'ext': 'mp4', 'vcodec': 'h265', 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + def test_format_selection_string_ops(self): formats = [ {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, diff --git a/test/test_utils.py b/test/test_utils.py index a2b4593527..4f5fa1e100 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -921,6 +921,11 @@ class TestUtil(unittest.TestCase): 'acodec': 'none', 'dynamic_range': 'HDR10', }) + self.assertEqual(parse_codecs('vp09.02.50.10.01.09.18.09.00'), { + 'vcodec': 'vp09.02.50.10.01.09.18.09.00', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) self.assertEqual(parse_codecs('av01.0.12M.10.0.110.09.16.09.0'), { 'vcodec': 'av01.0.12M.10.0.110.09.16.09.0', 'acodec': 'none', diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 04dd0f8d2c..184794f95a 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5281,7 +5281,7 @@ class FormatSorter: settings = { 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', From 63da31b3b29af90062d8a72a905ffe4b5e499042 Mon Sep 17 00:00:00 2001 From: ndyanx <114776171+ndyanx@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:05:22 -0500 Subject: [PATCH 055/216] [ie/dropbox] Fix password-protected video support (#10735) Also adds thumbnail extraction Closes #9864 Authored by: ndyanx --- yt_dlp/extractor/dropbox.py | 52 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index 51b40df428..c122096230 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -6,8 +6,10 @@ import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + update_url, update_url_query, url_basename, + urlencode_postdata, ) @@ -36,43 +38,58 @@ class DropboxIE(InfoExtractor): }, ] + def _yield_decoded_parts(self, webpage): + for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): + yield base64.b64decode(encoded).decode('utf-8', 'ignore') + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) fn = urllib.parse.unquote(url_basename(url)) title = os.path.splitext(fn)[0] - password = self.get_param('videopassword') - if (self._og_search_title(webpage) == 'Dropbox - Password Required' - or 'Enter the password for this link' in webpage): + for part in self._yield_decoded_parts(webpage): + if '/sm/password' in part: + webpage = self._download_webpage( + update_url('https://www.dropbox.com/sm/password', query=part.partition('?')[2]), video_id) + break + + if (self._og_search_title(webpage, default=None) == 'Dropbox - Password Required' + or 'Enter the password for this link' in webpage): if password: - content_id = self._search_regex(r'content_id=(.*?)["\']', webpage, 'content_id') - payload = f'is_xhr=true&t={self._get_cookies("https://www.dropbox.com").get("t").value}&content_id={content_id}&password={password}&url={url}' response = self._download_json( - 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', data=payload.encode(), - headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', + headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}, + data=urlencode_postdata({ + 'is_xhr': 'true', + 't': self._get_cookies('https://www.dropbox.com')['t'].value, + 'content_id': self._search_regex(r'content_id=([\w.+=/-]+)["\']', webpage, 'content id'), + 'password': password, + 'url': url, + })) if response.get('status') != 'authed': - raise ExtractorError('Authentication failed!', expected=True) - webpage = self._download_webpage(url, video_id) - elif self._get_cookies('https://dropbox.com').get('sm_auth'): - webpage = self._download_webpage(url, video_id) - else: + raise ExtractorError('Invalid password', expected=True) + elif not self._get_cookies('https://dropbox.com').get('sm_auth'): raise ExtractorError('Password protected video, use --video-password ', expected=True) + webpage = self._download_webpage(url, video_id) - formats, subtitles, has_anonymous_download = [], {}, False - for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): - decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + formats, subtitles = [], {} + has_anonymous_download = False + thumbnail = None + for part in self._yield_decoded_parts(webpage): if not has_anonymous_download: has_anonymous_download = self._search_regex( - r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) + r'(anonymous:\tanonymous)', part, 'anonymous', default=False) transcode_url = self._search_regex( - r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) + r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', part, 'transcode url', default=None) if not transcode_url: continue formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') + thumbnail = self._search_regex( + r'(https://www\.dropbox\.com/temp_thumb_from_token/[\w/?&=]+)', part, 'thumbnail', default=None) break # downloads enabled we can get the original file @@ -89,4 +106,5 @@ class DropboxIE(InfoExtractor): 'title': title, 'formats': formats, 'subtitles': subtitles, + 'thumbnail': thumbnail, } From 7509d692b37a7ec6230ea75bfe1e44a8de5eefce Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Sat, 28 Sep 2024 06:28:22 +0800 Subject: [PATCH 056/216] [ie/loom] Fix m3u8 formats extraction (#10760) Closes #10737 Authored by: kclauhk --- yt_dlp/extractor/loom.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py index 1191aa17ea..b0878c33e2 100644 --- a/yt_dlp/extractor/loom.py +++ b/yt_dlp/extractor/loom.py @@ -92,9 +92,9 @@ class LoomIE(InfoExtractor): }, 'params': {'videopassword': 'seniorinfants2'}, }, { - # embed, transcoded-url endpoint sends empty JSON response + # embed, transcoded-url endpoint sends empty JSON response, split video and audio HLS formats 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e', - 'md5': '8488817242a0db1cb2ad0ea522553cf6', + 'md5': 'b321d261656848c184a94e3b93eae28d', 'info_dict': { 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e', 'ext': 'mp4', @@ -104,6 +104,7 @@ class LoomIE(InfoExtractor): 'timestamp': 1657216459, 'duration': 181, }, + 'params': {'format': 'bestvideo'}, # Test video-only fixup 'expected_warnings': ['Failed to parse JSON'], }] _WEBPAGE_TESTS = [{ @@ -293,7 +294,11 @@ class LoomIE(InfoExtractor): format_url = format_url.replace('-split.m3u8', '.m3u8') m3u8_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality) + # Sometimes only split video/audio formats are available, need to fixup video-only formats + is_not_premerged = 'none' in traverse_obj(m3u8_formats, (..., 'vcodec')) for fmt in m3u8_formats: + if is_not_premerged and fmt.get('vcodec') != 'none': + fmt['acodec'] = 'none' yield { **fmt, 'url': update_url(fmt['url'], query=query), From cca534cd9e6850c70244f225a4a1895ef4bcdbec Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:30:31 -0500 Subject: [PATCH 057/216] Raise minimum recommended Python version to 3.9 (#11098) Authored by: bashonly --- devscripts/changelog_override.json | 5 ++++ yt_dlp/update.py | 38 +++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 5189de2d77..7be750cfbe 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -185,5 +185,10 @@ "action": "add", "when": "6075a029dba70a89675ae1250e7cdfd91f0eba41", "short": "[priority] Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)\n - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors" + }, + { + "action": "add", + "when": "fb8b7f226d251e521a89b23c415e249e5b788e5c", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.9**\nSince Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" } ] diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 72ae290844..4cf3bdc320 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -135,20 +135,42 @@ def _get_binary_name(): def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 8) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 9) if sys.version_info > MIN_RECOMMENDED: return None major, minor = sys.version_info[:2] - if sys.version_info < MIN_SUPPORTED: - msg = f'Python version {major}.{minor} is no longer supported' - else: - msg = (f'Support for Python version {major}.{minor} has been deprecated. ' - '\nYou may stop receiving updates on this version at any time') + PYTHON_MSG = f'Please update to Python {".".join(map(str, MIN_RECOMMENDED))} or above' - major, minor = MIN_RECOMMENDED - return f'{msg}! Please update to Python {major}.{minor} or above' + if sys.version_info < MIN_SUPPORTED: + return f'Python version {major}.{minor} is no longer supported! {PYTHON_MSG}' + + EXE_MSG_TMPL = ('Support for {} has been deprecated. ' + 'See https://github.com/yt-dlp/yt-dlp/{} for details.\n{}') + STOP_MSG = 'You may stop receiving updates on this version at any time!' + variant = detect_variant() + + # Temporary until Windows builds use 3.9, which will drop support for Win7 and 2008ServerR2 + if variant in ('win_exe', 'win_x86_exe', 'py2exe'): + platform_name = platform.platform() + if any(platform_name.startswith(f'Windows-{name}') for name in ('7', '2008ServerR2')): + return EXE_MSG_TMPL.format('Windows 7/Server 2008 R2', 'issues/10086', STOP_MSG) + elif variant == 'py2exe': + return EXE_MSG_TMPL.format( + 'py2exe builds (yt-dlp_min.exe)', 'issues/10087', + 'In a future update you will be migrated to the PyInstaller-bundled executable. ' + 'This will be done automatically; no action is required on your part') + return None + + # Temporary until aarch64/armv7l build flow is bumped to Ubuntu 20.04 and Python 3.9 + elif variant in ('linux_aarch64_exe', 'linux_armv7l_exe'): + libc_ver = version_tuple(os.confstr('CS_GNU_LIBC_VERSION').partition(' ')[2]) + if libc_ver < (2, 31): + return EXE_MSG_TMPL.format('system glibc version < 2.31', 'pull/8638', STOP_MSG) + return None + + return f'Support for Python version {major}.{minor} has been deprecated. {PYTHON_MSG}' def _sha256_file(path): From c6387abc1af9842bb0541288a5610abba9b1ab51 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:46:22 -0500 Subject: [PATCH 058/216] [cleanup] Misc (#10807) Closes #10751, Closes #10769, Closes #10791 Authored by: bashonly, Codenade, pzhlkj6612, seproDev, coletdjnz, grqz, Grub4K Co-authored-by: Codenade Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> Co-authored-by: coletdjnz Co-authored-by: N/Ame <173015200+grqz@users.noreply.github.com> Co-authored-by: Simon Sawicki --- .github/workflows/quick-test.yml | 2 +- .github/workflows/release.yml | 4 ++-- README.md | 30 ++++++++++++++-------------- pyproject.toml | 2 +- yt_dlp/downloader/external.py | 4 ++-- yt_dlp/extractor/academicearth.py | 2 +- yt_dlp/extractor/ard.py | 4 ++-- yt_dlp/extractor/callin.py | 2 +- yt_dlp/extractor/common.py | 8 ++++---- yt_dlp/extractor/fc2.py | 2 +- yt_dlp/extractor/generic.py | 4 ++-- yt_dlp/extractor/getcourseru.py | 2 +- yt_dlp/extractor/golem.py | 2 +- yt_dlp/extractor/hrfensehen.py | 2 +- yt_dlp/extractor/japandiet.py | 7 +++++-- yt_dlp/extractor/kaltura.py | 2 +- yt_dlp/extractor/mailru.py | 2 +- yt_dlp/extractor/mgtv.py | 2 +- yt_dlp/extractor/mit.py | 2 +- yt_dlp/extractor/nzonscreen.py | 2 +- yt_dlp/extractor/pornhub.py | 3 +-- yt_dlp/extractor/radiofrance.py | 2 +- yt_dlp/extractor/reverbnation.py | 2 +- yt_dlp/extractor/tele13.py | 2 +- yt_dlp/extractor/twitcasting.py | 2 +- yt_dlp/extractor/viu.py | 2 +- yt_dlp/extractor/ximalaya.py | 2 +- yt_dlp/networking/_websockets.py | 4 ++-- yt_dlp/options.py | 10 +++++----- yt_dlp/postprocessor/sponsorblock.py | 2 +- yt_dlp/utils/_utils.py | 6 +++--- 31 files changed, 63 insertions(+), 61 deletions(-) diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index fe2a7e9239..1571d3cab4 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -15,7 +15,7 @@ jobs: with: python-version: '3.8' - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include test + run: python3 ./devscripts/install_deps.py -o --include test - name: Run tests timeout-minutes: 15 run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fa5ad7e515..8d0bc4026a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -204,7 +204,7 @@ jobs: git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u git commit -m "Release ${{ env.version }}" \ - -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" + -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all" git push origin --force ${{ github.event.ref }}:release - name: Get target commitish @@ -325,7 +325,7 @@ jobs: "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF - #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + #### A description of the various files is in the [README](https://github.com/${{ github.repository }}#release-files) --- $(python ./devscripts/make_changelog.py -vv --collapsible) EOF diff --git a/README.md b/README.md index 1d6a4a86d5..3e76a4efbb 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. -* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) +* [**curl_cffi**](https://github.com/lexiforest/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lexiforest/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/lexiforest/curl_cffi/blob/main/LICENSE) * Can be installed with the `curl-cffi` group, e.g. `pip install "yt-dlp[default,curl-cffi]"` * Currently included in `yt-dlp.exe`, `yt-dlp_linux` and `yt-dlp_macos` builds @@ -459,17 +459,17 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git conditions. Use a "\" to escape "&" or quotes if needed. If used multiple times, the filter matches if at least one of the - conditions is met. E.g. --match-filter - !is_live --match-filter "like_count>?100 & + conditions is met. E.g. --match-filters + !is_live --match-filters "like_count>?100 & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live OR those that have a like count more than 100 (or the like field is not available) and also has a description that contains the phrase "cats & - dogs" (caseless). Use "--match-filter -" to + dogs" (caseless). Use "--match-filters -" to interactively ask whether to download each video - --no-match-filters Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filters (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -490,7 +490,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git encountering a file that is in the archive (default) --break-per-input Alters --max-downloads, --break-on-existing, - --break-match-filter, and autonumber to + --break-match-filters, and autonumber to reset per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue @@ -1771,7 +1771,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -2184,9 +2184,9 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl: * **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` -* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filter` etc +* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc -* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc +* **Improvements**: Regex and other operators in `--format`/`--match-filters`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc * **Plugins**: Extractors and PostProcessors can be loaded from an external file. See [plugins](#plugins) for details @@ -2227,7 +2227,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * ~~yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [aria2c](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is~~ -* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filters` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this * yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats. This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values * yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. * The sub-modules `swfinterp`, `casefold` are removed. @@ -2273,11 +2273,11 @@ While these options are redundant, they are still expected to be used due to the --get-thumbnail --print thumbnail -e, --get-title --print title -g, --get-url --print urls - --match-title REGEX --match-filter "title ~= (?i)REGEX" - --reject-title REGEX --match-filter "title !~= (?i)REGEX" - --min-views COUNT --match-filter "view_count >=? COUNT" - --max-views COUNT --match-filter "view_count <=? COUNT" - --break-on-reject Use --break-match-filter + --match-title REGEX --match-filters "title ~= (?i)REGEX" + --reject-title REGEX --match-filters "title !~= (?i)REGEX" + --min-views COUNT --match-filters "view_count >=? COUNT" + --max-views COUNT --match-filters "view_count <=? COUNT" + --break-on-reject Use --break-match-filters --user-agent UA --add-header "User-Agent:UA" --referer URL --add-header "Referer:URL" --playlist-start NUMBER -I NUMBER: diff --git a/pyproject.toml b/pyproject.toml index 18d9a0a3a7..f54980d576 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.5.0", + "ruff~=0.6.0", ] test = [ "pytest~=8.1", diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ae2372915b..6c1ec403c8 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -508,7 +508,7 @@ class FFmpegFD(ExternalFD): env = None proxy = self.params.get('proxy') if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): + if not re.match(r'[\da-zA-Z]+://', proxy): proxy = f'http://{proxy}' if proxy.startswith('socks'): @@ -559,7 +559,7 @@ class FFmpegFD(ExternalFD): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - is_http = re.match(r'^https?://', fmt['url']) + is_http = re.match(r'https?://', fmt['url']) cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] if cookies: args.extend(['-cookies', ''.join( diff --git a/yt_dlp/extractor/academicearth.py b/yt_dlp/extractor/academicearth.py index d9691cb5c6..b997a02885 100644 --- a/yt_dlp/extractor/academicearth.py +++ b/yt_dlp/extractor/academicearth.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' + _VALID_URL = r'https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' IE_NAME = 'AcademicEarth:Course' _TEST = { 'url': 'http://academicearth.org/playlists/laws-of-nature/', diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 6fd6413479..efc79dd141 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -231,7 +231,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(InfoExtractor): IE_NAME = 'ARDMediathek' - _VALID_URL = r'''(?x)https:// + _VALID_URL = r'''(?x)https?:// (?:(?:beta|www)\.)?ardmediathek\.de/ (?:[^/]+/)? (?:player|live|video)/ @@ -470,7 +470,7 @@ class ARDBetaMediathekIE(InfoExtractor): class ARDMediathekCollectionIE(InfoExtractor): - _VALID_URL = r'''(?x)https:// + _VALID_URL = r'''(?x)https?:// (?:(?:beta|www)\.)?ardmediathek\.de/ (?:[^/?#]+/)? (?Psendung|serie|sammlung)/ diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index b7061a7d14..ee2e56f8e0 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -3,7 +3,7 @@ from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj class CallinIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P[-a-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?P[-a-zA-Z]+)' _TESTS = [{ 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc', 'info_dict': { diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 9501e5ec9a..486a4ea3cb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2077,7 +2077,7 @@ class InfoExtractor: has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) + return url if re.match(r'https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2812,11 +2812,11 @@ class InfoExtractor: base_url_e = element.find(_add_ns('BaseURL')) if try_call(lambda: base_url_e.text) is not None: base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): + if re.match(r'https?://', base_url): break if mpd_base_url and base_url.startswith('/'): base_url = urllib.parse.urljoin(mpd_base_url, base_url) - elif mpd_base_url and not re.match(r'^https?://', base_url): + elif mpd_base_url and not re.match(r'https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' base_url = mpd_base_url + base_url @@ -2906,7 +2906,7 @@ class InfoExtractor: } def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' + return 'url' if re.match(r'https?://', location) else 'path' if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index eac70f6a96..f7b883155c 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -14,7 +14,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' + _VALID_URL = r'(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 04cffaa861..592800287a 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2340,7 +2340,7 @@ class GenericIE(InfoExtractor): default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): - if re.match(r'^[^\s/]+\.[^\s/]+/', url): + if re.match(r'[^\s/]+\.[^\s/]+/', url): self.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': @@ -2400,7 +2400,7 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = full_response.headers.get('Content-Type', '').lower() - m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) + m = re.match(r'(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: self.report_detected('direct video link') headers = filter_dict({'Referer': smuggled_data.get('referer')}) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index 53b881011c..b7581d77e2 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -52,7 +52,7 @@ class GetCourseRuIE(InfoExtractor): _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})' _VALID_URL = [ rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P[^?#]+)', - rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', + rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P\d+)', ] _TESTS = [{ 'url': 'http://academymel.online/3video_1', diff --git a/yt_dlp/extractor/golem.py b/yt_dlp/extractor/golem.py index 90d2fe6c26..964bf6519d 100644 --- a/yt_dlp/extractor/golem.py +++ b/yt_dlp/extractor/golem.py @@ -7,7 +7,7 @@ from ..utils import ( class GolemIE(InfoExtractor): - _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P.+?)/' + _VALID_URL = r'https?://video\.golem\.de/.+?/(?P.+?)/' _TEST = { 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py index 17673d5b8f..b5a7b14a58 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -13,7 +13,7 @@ from ..utils import ( class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' - _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' + _VALID_URL = r'https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', diff --git a/yt_dlp/extractor/japandiet.py b/yt_dlp/extractor/japandiet.py index 2ef091aff2..994da22ae0 100644 --- a/yt_dlp/extractor/japandiet.py +++ b/yt_dlp/extractor/japandiet.py @@ -194,11 +194,14 @@ class ShugiinItvVodIE(ShugiinItvBaseIE): class SangiinInstructionIE(InfoExtractor): - _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' + _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php' IE_DESC = False # this shouldn't be listed as a supported site def _real_extract(self, url): - raise ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True) + raise ExtractorError( + 'Copy the link from the button below the video description/player ' + 'and use that link to download. If there is no button in the frame, ' + 'get the URL of the frame showing the video.', expected=True) class SangiinIE(InfoExtractor): diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index e5737b1e9e..6d51e32f6d 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -22,7 +22,7 @@ class KalturaIE(InfoExtractor): (?: kaltura:(?P\w+):(?P\w+)(?::(?P\w+))?| https?:// - (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ + (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ (?: (?: # flash player diff --git a/yt_dlp/extractor/mailru.py b/yt_dlp/extractor/mailru.py index cca678f14a..0496a87f00 100644 --- a/yt_dlp/extractor/mailru.py +++ b/yt_dlp/extractor/mailru.py @@ -126,7 +126,7 @@ class MailRuIE(InfoExtractor): video_data = None # fix meta_url if missing the host address - if re.match(r'^\/\+\/', meta_url): + if re.match(r'\/\+\/', meta_url): meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index d5dda06f99..c793626fde 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -16,7 +16,7 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/[bv]/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' IE_NAME = 'MangoTV' diff --git a/yt_dlp/extractor/mit.py b/yt_dlp/extractor/mit.py index e75c540a23..66c3b07936 100644 --- a/yt_dlp/extractor/mit.py +++ b/yt_dlp/extractor/mit.py @@ -65,7 +65,7 @@ class TechTVMITIE(InfoExtractor): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P[a-z0-9\-]+)' + _VALID_URL = r'https?://ocw\.mit\.edu/courses/(?P[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/yt_dlp/extractor/nzonscreen.py b/yt_dlp/extractor/nzonscreen.py index 5fc516daf4..755039804e 100644 --- a/yt_dlp/extractor/nzonscreen.py +++ b/yt_dlp/extractor/nzonscreen.py @@ -10,7 +10,7 @@ from ..utils import ( class NZOnScreenIE(InfoExtractor): - _VALID_URL = r'^https?://www\.nzonscreen\.com/title/(?P[^/?#]+)' + _VALID_URL = r'https?://www\.nzonscreen\.com/title/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', 'info_dict': { diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 679dc63234..e1e9777e8e 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -628,8 +628,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - for e in page_entries: - yield e + yield from page_entries if not self._has_more(webpage): break diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index ff21963541..9d90439841 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -16,7 +16,7 @@ from ..utils import ( class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P[^?#]+)' + _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P[^?#]+)' IE_NAME = 'radiofrance' _TEST = { diff --git a/yt_dlp/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py index ddf8c3753f..f3bcc2c328 100644 --- a/yt_dlp/extractor/reverbnation.py +++ b/yt_dlp/extractor/reverbnation.py @@ -6,7 +6,7 @@ from ..utils import ( class ReverbNationIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P\d+).*?$' + _VALID_URL = r'https?://(?:www\.)?reverbnation\.com/.*?/song/(?P\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', diff --git a/yt_dlp/extractor/tele13.py b/yt_dlp/extractor/tele13.py index c5ca208fb4..0d721773ed 100644 --- a/yt_dlp/extractor/tele13.py +++ b/yt_dlp/extractor/tele13.py @@ -8,7 +8,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 53b4084694..bf9c6348cb 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -270,7 +270,7 @@ class TwitCastingLiveIE(InfoExtractor): class TwitCastingUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P[^/?#]+)/(:?show|archive)/?(?:[#?]|$)' + _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P[^/?#]+)/(?:show|archive)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://twitcasting.tv/natsuiromatsuri/archive/', 'info_dict': { diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 01e59352bf..f4ed96bf62 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -90,7 +90,7 @@ class ViuIE(ViuBaseIE): formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') for key, value in video_data.items(): - mobj = re.match(r'^subtitle_(?P[^_]+)_(?P(vtt|srt))', key) + mobj = re.match(r'subtitle_(?P[^_]+)_(?P(vtt|srt))', key) if not mobj: continue subtitles.setdefault(mobj.group('lang'), []).append({ diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index d63964a004..02bf6a7beb 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -21,7 +21,7 @@ class XimalayaBaseIE(InfoExtractor): class XimalayaIE(XimalayaBaseIE): IE_NAME = 'ximalaya' IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(:?(?P\d+)/)?sound/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:(?P\d+)/)?sound/(?P[0-9]+)' _TESTS = [ { 'url': 'http://www.ximalaya.com/sound/47740352/', diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index 21b765b91d..ec55567dae 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -33,8 +33,8 @@ if not websockets: import websockets.version websockets_version = tuple(map(int_or_none, websockets.version.version.split('.'))) -if websockets_version < (12, 0): - raise ImportError('Only websockets>=12.0 is supported') +if websockets_version < (13, 0): + raise ImportError('Only websockets>=13.0 is supported') import websockets.sync.client from websockets.uri import parse_uri diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 8077d5d88f..9980b7fc3f 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -647,16 +647,16 @@ def create_parser(): 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if at least one of the conditions is met. E.g. --match-filter ' - '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' + 'the filter matches if at least one of the conditions is met. E.g. --match-filters ' + '!is_live --match-filters "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' 'that contains the phrase "cats & dogs" (caseless). ' - 'Use "--match-filter -" to interactively ask whether to download each video')) + 'Use "--match-filters -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filters', dest='match_filter', action='store_const', const=None, - help='Do not use any --match-filter (default)') + help='Do not use any --match-filters (default)') selection.add_option( '--break-match-filters', metavar='FILTER', dest='breaking_match_filter', action='append', @@ -704,7 +704,7 @@ def create_parser(): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL') + help='Alters --max-downloads, --break-on-existing, --break-match-filters, and autonumber to reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 6cf9ab62ea..b3fc8b54a8 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -33,7 +33,7 @@ class SponsorBlockPP(FFmpegPostProcessor): def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): FFmpegPostProcessor.__init__(self, downloader) self._categories = tuple(categories or self.CATEGORIES.keys()) - self._API_URL = api if re.match('^https?://', api) else 'https://' + api + self._API_URL = api if re.match('https?://', api) else 'https://' + api def run(self, info): extractor = info['extractor_key'] diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 184794f95a..e1b3c48d63 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1954,7 +1954,7 @@ def urljoin(base, path): path = path.decode() if not isinstance(path, str) or not path: return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode() @@ -2007,7 +2007,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ -3113,7 +3113,7 @@ def is_html(first_bytes): while first_bytes.startswith(bom): encoding, first_bytes = enc, first_bytes[len(bom):] - return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) + return re.match(r'\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict): From 5945fc1945a4001537072e39f03725f944437834 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 27 Sep 2024 23:01:13 +0000 Subject: [PATCH 059/216] Release 2024.09.27 Created by: bashonly :ci skip all --- CONTRIBUTORS | 16 +++++++++ Changelog.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++ supportedsites.md | 14 ++++++-- yt_dlp/version.py | 6 ++-- 4 files changed, 120 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 489ab7da8b..c80f714055 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -657,3 +657,19 @@ luvyana szantnerb hugepower scribblemaniac +Codenade +Demon000 +Deukhoofd +grqz +hibes +Khaoklong51 +kieraneglin +lengzuo +naglis +ndyanx +otovalek +quad +rakslice +sahilsinghss73 +tony-hn +xingchensong diff --git a/Changelog.md b/Changelog.md index 0b96ab29cd..2ef28fa07a 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,96 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.09.27 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.9** +Since Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086) + +#### Core changes +- [Allow `none` arg to negate `--convert-subs` and `--convert-thumbnails`](https://github.com/yt-dlp/yt-dlp/commit/c08e0b20b5edd8957b8318716bc14e896d1b96f4) ([#11066](https://github.com/yt-dlp/yt-dlp/issues/11066)) by [kieraneglin](https://github.com/kieraneglin) +- [Fix format sorting bug with vp9.2 vcodec](https://github.com/yt-dlp/yt-dlp/commit/8f4ea14680c7865d8ffac10a9174205d1d84ada7) ([#10884](https://github.com/yt-dlp/yt-dlp/issues/10884)) by [rakslice](https://github.com/rakslice) +- [Raise minimum recommended Python version to 3.9](https://github.com/yt-dlp/yt-dlp/commit/cca534cd9e6850c70244f225a4a1895ef4bcdbec) ([#11098](https://github.com/yt-dlp/yt-dlp/issues/11098)) by [bashonly](https://github.com/bashonly) +- **cookies**: [Improve error message for Windows `--cookies-from-browser chrome` issue](https://github.com/yt-dlp/yt-dlp/commit/b397a64691421ace5df09457c2a764821a2dc6f2) ([#11090](https://github.com/yt-dlp/yt-dlp/issues/11090)) by [seproDev](https://github.com/seproDev) +- **utils**: `mimetype2ext`: [Recognize `aacp` as `aac`](https://github.com/yt-dlp/yt-dlp/commit/cc85596d5b59f0c14e9381b3675f619c1e12e597) ([#10860](https://github.com/yt-dlp/yt-dlp/issues/10860)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- [Fix JW Player format parsing](https://github.com/yt-dlp/yt-dlp/commit/409f8e9e3b4bde81ef76fc563256f876d2ff8099) ([#10956](https://github.com/yt-dlp/yt-dlp/issues/10956)) by [seproDev](https://github.com/seproDev) +- [Handle decode errors when reading responses](https://github.com/yt-dlp/yt-dlp/commit/325001317d97f4545d66fac44c4ba772c6f45f22) ([#10868](https://github.com/yt-dlp/yt-dlp/issues/10868)) by [bashonly](https://github.com/bashonly) +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7f909046f4dc0fba472b4963145aef6e0d42491b) ([#11101](https://github.com/yt-dlp/yt-dlp/issues/11101)) by [bashonly](https://github.com/bashonly) +- **adn**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/cc88a54bb1ef285154775f8a6a413335ce4c71ce) ([#10749](https://github.com/yt-dlp/yt-dlp/issues/10749)) by [infanf](https://github.com/infanf) +- **asobistage**: [Support redirected URLs](https://github.com/yt-dlp/yt-dlp/commit/a7d3235c84dac57a127cbe0ff38f7f7c2fdd8fa0) ([#10768](https://github.com/yt-dlp/yt-dlp/issues/10768)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **bandcamp**: user: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0176547f16a3642cd71627126e9dfc24981e20) ([#10328](https://github.com/yt-dlp/yt-dlp/issues/10328)) by [bashonly](https://github.com/bashonly), [quad](https://github.com/quad) +- **beacon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b4760c778d0c92c6e3f2bc8346cd72c8f08595ae) ([#9901](https://github.com/yt-dlp/yt-dlp/issues/9901)) by [Deukhoofd](https://github.com/Deukhoofd) +- **bilibili** + - [Fix chapters and subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/a2000bc85730c950351d78bb818493dc39dca3cb) ([#11099](https://github.com/yt-dlp/yt-dlp/issues/11099)) by [bashonly](https://github.com/bashonly) + - [Fix festival URL support](https://github.com/yt-dlp/yt-dlp/commit/b43bd864851f2862e26caa85461c5d825d49d463) ([#10740](https://github.com/yt-dlp/yt-dlp/issues/10740)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **biliintl**: [Fix referer header](https://github.com/yt-dlp/yt-dlp/commit/a06bb586795ebab87a2356923acfc674d6f0e152) ([#11003](https://github.com/yt-dlp/yt-dlp/issues/11003)) by [Khaoklong51](https://github.com/Khaoklong51) +- **dropbox**: [Fix password-protected video support](https://github.com/yt-dlp/yt-dlp/commit/63da31b3b29af90062d8a72a905ffe4b5e499042) ([#10735](https://github.com/yt-dlp/yt-dlp/issues/10735)) by [ndyanx](https://github.com/ndyanx) +- **ertgr**: [Fix video extraction](https://github.com/yt-dlp/yt-dlp/commit/416686ed0cf792ec44ab059f3b229dd776077e14) ([#11091](https://github.com/yt-dlp/yt-dlp/issues/11091)) by [seproDev](https://github.com/seproDev) +- **eurosport**: [Support local URL variants](https://github.com/yt-dlp/yt-dlp/commit/f0bb28504c8c2b75ee3e5796aed50de2a7f90a1b) ([#10785](https://github.com/yt-dlp/yt-dlp/issues/10785)) by [seproDev](https://github.com/seproDev) +- **facebook** + - ads: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d62fef7e07d454c0d2ba2d69fb96d691dba1ded0) ([#10704](https://github.com/yt-dlp/yt-dlp/issues/10704)) by [kclauhk](https://github.com/kclauhk) + - reel: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1b941c6b2caa688b0d3332e723d16dbafa4311) by [lengzuo](https://github.com/lengzuo) +- **germanupa**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/124f058b546d652a359c67025bb479789bfbef0b) ([#10538](https://github.com/yt-dlp/yt-dlp/issues/10538)) by [grqz](https://github.com/grqz) +- **hgtvde**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a555389c9bb32e589e00b4664974423fb7b04dcd) ([#10992](https://github.com/yt-dlp/yt-dlp/issues/10992)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) +- **huya**: video: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/25c1cdaa2650563494d3bf00a38f72d0d9486bff) ([#10686](https://github.com/yt-dlp/yt-dlp/issues/10686)) by [hugepower](https://github.com/hugepower) +- **iprima**: [Fix zoom URL support](https://github.com/yt-dlp/yt-dlp/commit/4a27b8f092f7f7c10b7a334d3535c97c2af02f0a) ([#10959](https://github.com/yt-dlp/yt-dlp/issues/10959)) by [otovalek](https://github.com/otovalek) +- **khanacademy**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0fba08485b6445b72b5b63ae23ca2a73fa5d967f) ([#10913](https://github.com/yt-dlp/yt-dlp/issues/10913)) by [seproDev](https://github.com/seproDev) +- **kick** + - clips: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/0aa4426e9a35f7f8e184f1f2082b3b313c1448f7) ([#11107](https://github.com/yt-dlp/yt-dlp/issues/11107)) by [bashonly](https://github.com/bashonly) + - vod: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/173d54c151b987409e3eb09552d8d89ed8fc50f7) ([#10988](https://github.com/yt-dlp/yt-dlp/issues/10988)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **kika**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a) ([#5788](https://github.com/yt-dlp/yt-dlp/issues/5788)) by [1100101](https://github.com/1100101) +- **lnkgo**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/fa83d0b36bc43d30fe9241c1e923f4614864b758) ([#10904](https://github.com/yt-dlp/yt-dlp/issues/10904)) by [naglis](https://github.com/naglis) +- **loom**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/7509d692b37a7ec6230ea75bfe1e44a8de5eefce) ([#10760](https://github.com/yt-dlp/yt-dlp/issues/10760)) by [kclauhk](https://github.com/kclauhk) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/e2b3634e299be9c16a247ece3b1858d83889c324) ([#11083](https://github.com/yt-dlp/yt-dlp/issues/11083)) by [szantnerb](https://github.com/szantnerb) +- **mojevideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/28b0ecba2af5b4919f198474b3d00a76ef322c31) ([#11019](https://github.com/yt-dlp/yt-dlp/issues/11019)) by [04-pasha-04](https://github.com/04-pasha-04), [pzhlkj6612](https://github.com/pzhlkj6612) +- **niconico**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/eabb4680fdb09ba1f48d174a700a2e3b43f82add) ([#11103](https://github.com/yt-dlp/yt-dlp/issues/11103)) by [bashonly](https://github.com/bashonly) +- **nzz**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4a9bc8c3630378bc29f0266126b503f6190c0430) ([#10461](https://github.com/yt-dlp/yt-dlp/issues/10461)) by [1-Byte](https://github.com/1-Byte) +- **patreoncampaign**: [Support API URLs](https://github.com/yt-dlp/yt-dlp/commit/232e6db30c474d1b387e405342f34173ceeaf832) ([#10734](https://github.com/yt-dlp/yt-dlp/issues/10734)) by [bashonly](https://github.com/bashonly), [hibes](https://github.com/hibes) +- **pinterest**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c8c078fe28b0ffc15ef9646346c00c592fe71a78) ([#10867](https://github.com/yt-dlp/yt-dlp/issues/10867)) by [bashonly](https://github.com/bashonly), [sahilsinghss73](https://github.com/sahilsinghss73) +- **radiko**: [Extract unique `id` values](https://github.com/yt-dlp/yt-dlp/commit/c8d096c5ce111411fbdbe2abb8fed54f317a6182) ([#10726](https://github.com/yt-dlp/yt-dlp/issues/10726)) by [garret1317](https://github.com/garret1317) +- **rtp**: [Support more subpages](https://github.com/yt-dlp/yt-dlp/commit/d02df303d8e49390599db9f34482697e4d1cf5b2) ([#10787](https://github.com/yt-dlp/yt-dlp/issues/10787)) by [Demon000](https://github.com/Demon000) +- **rumblechannel**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ad0b857f459a6d390fbf124183916218c52f223a) ([#11049](https://github.com/yt-dlp/yt-dlp/issues/11049)) by [tony-hn](https://github.com/tony-hn) +- **rutube**: [Support livestreams](https://github.com/yt-dlp/yt-dlp/commit/41be32e78c3845000dbac188ffb90ea3ea7c4dfa) ([#10844](https://github.com/yt-dlp/yt-dlp/issues/10844)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **samplefocus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/46f4c80bc363ee8116c33d37f65202e6c3470954) ([#10947](https://github.com/yt-dlp/yt-dlp/issues/10947)) by [seproDev](https://github.com/seproDev) +- **screenrec**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36f9e602ad55679764bc75a4f67f7562b1d6adcf) ([#10917](https://github.com/yt-dlp/yt-dlp/issues/10917)) by [naglis](https://github.com/naglis) +- **sen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/41a241ca6ffb95b3d9aaf4f42106ca8cba9af1a6) ([#10952](https://github.com/yt-dlp/yt-dlp/issues/10952)) by [seproDev](https://github.com/seproDev) +- **servus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/300c91274f7ea5b1b0528fc5ee11cf1a61d4079e) ([#10944](https://github.com/yt-dlp/yt-dlp/issues/10944)) by [seproDev](https://github.com/seproDev) +- **snapchatspotlight**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37417e4f934fd8909788b493d017777155b0ae5) ([#11030](https://github.com/yt-dlp/yt-dlp/issues/11030)) by [seproDev](https://github.com/seproDev) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5a8a05aebb49693e78e1123015837ed5e961ff76) ([#11010](https://github.com/yt-dlp/yt-dlp/issues/11010)) by [diman8](https://github.com/diman8) +- **tenplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d8d473002b654ab0e7b97ead869f58b4361eeae1) ([#10928](https://github.com/yt-dlp/yt-dlp/issues/10928)) by [aarubui](https://github.com/aarubui) +- **tiktok**: [Fix web formats extraction](https://github.com/yt-dlp/yt-dlp/commit/3ad0b7f422d547204df687b6d0b2d9110fff3990) ([#11074](https://github.com/yt-dlp/yt-dlp/issues/11074)) by [bashonly](https://github.com/bashonly) +- **twitter**: spaces: [Support video spaces](https://github.com/yt-dlp/yt-dlp/commit/bef1d4d6fc9493fda7f75e2289c07c507d10092f) ([#10789](https://github.com/yt-dlp/yt-dlp/issues/10789)) by [bashonly](https://github.com/bashonly) +- **vidflex**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e978c312d6550a6ae4c9df18001afb1b420cb72f) ([#10002](https://github.com/yt-dlp/yt-dlp/issues/10002)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **vimeo** + - [Always try to extract original format](https://github.com/yt-dlp/yt-dlp/commit/4115c24d157c5b5f63089d75c4e0f51d1f8b4489) ([#10721](https://github.com/yt-dlp/yt-dlp/issues/10721)) by [bashonly](https://github.com/bashonly) (With fixes in [e8e6a98](https://github.com/yt-dlp/yt-dlp/commit/e8e6a982a1b659eed434d225d7922f632bac6568) by [seproDev](https://github.com/seproDev)) + - [Fix HLS audio format sorting](https://github.com/yt-dlp/yt-dlp/commit/a1b4ac2b8ed8e6eaa56044d439f1e0d00c2ba218) ([#11082](https://github.com/yt-dlp/yt-dlp/issues/11082)) by [fireattack](https://github.com/fireattack) +- **watchespn**: [Improve auth support](https://github.com/yt-dlp/yt-dlp/commit/7adff8caf152dcf96d03aff69ed8545c0a63567c) ([#10910](https://github.com/yt-dlp/yt-dlp/issues/10910)) by [ischmidt20](https://github.com/ischmidt20) +- **wistia**: [Support password-protected videos](https://github.com/yt-dlp/yt-dlp/commit/9f5c9a90898c5a1e672922d9cd799716c73cee34) ([#11100](https://github.com/yt-dlp/yt-dlp/issues/11100)) by [bashonly](https://github.com/bashonly) +- **ximalaya**: [Add VIP support](https://github.com/yt-dlp/yt-dlp/commit/3dfd720d098b4d49d69cfc77e6376f22bcd90934) ([#10832](https://github.com/yt-dlp/yt-dlp/issues/10832)) by [seproDev](https://github.com/seproDev), [xingchensong](https://github.com/xingchensong) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3aa0156e05662923d130ddbc1c82596e38c01a00) ([#10950](https://github.com/yt-dlp/yt-dlp/issues/10950)) by [seproDev](https://github.com/seproDev) +- **yleareena**: [Support podcasts](https://github.com/yt-dlp/yt-dlp/commit/48d629d461e05b1b19f5e53dc959bb9ebe95da42) ([#11104](https://github.com/yt-dlp/yt-dlp/issues/11104)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `po_token`, `visitor_data`, `data_sync_id` extractor args](https://github.com/yt-dlp/yt-dlp/commit/3a3bd00037e9908e87da4fa9f2ad772aa34dc60e) ([#10648](https://github.com/yt-dlp/yt-dlp/issues/10648)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [seproDev](https://github.com/seproDev) (With fixes in [fa2be9a](https://github.com/yt-dlp/yt-dlp/commit/fa2be9a7c63babede07480151363e54eee5702bd) by [bashonly](https://github.com/bashonly)) + - [Support excluding `player_client`s in extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/49f3741a820ed142f6866317c2e7d247b130960e) ([#10710](https://github.com/yt-dlp/yt-dlp/issues/10710)) by [bashonly](https://github.com/bashonly) + - clip: [Prioritize `https` formats](https://github.com/yt-dlp/yt-dlp/commit/1d84b780cf33a1d84756825ac23f990a905703df) ([#11102](https://github.com/yt-dlp/yt-dlp/issues/11102)) by [bashonly](https://github.com/bashonly) + - tab: [Fix shorts tab extraction](https://github.com/yt-dlp/yt-dlp/commit/9431777b4c37129a6093080c77ca59960afbb9d7) ([#10938](https://github.com/yt-dlp/yt-dlp/issues/10938)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Fix handler not being added to RequestError](https://github.com/yt-dlp/yt-dlp/commit/d1c4d88b2d912e8da5e76db455562ca63b1af690) ([#10955](https://github.com/yt-dlp/yt-dlp/issues/10955)) by [coletdjnz](https://github.com/coletdjnz) +- [Pin `curl-cffi` version to < 0.7.2](https://github.com/yt-dlp/yt-dlp/commit/5bb1aa04dafce13ba9de707ea53169fab58b5207) ([#11092](https://github.com/yt-dlp/yt-dlp/issues/11092)) by [bashonly](https://github.com/bashonly) +- **Request Handler**: websockets: [Upgrade websockets to 13.0](https://github.com/yt-dlp/yt-dlp/commit/6f9e6537434562d513d0c9b68ced8a61ade94a64) ([#10815](https://github.com/yt-dlp/yt-dlp/issues/10815)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Bump PyInstaller version pin to `>=6.10.0`](https://github.com/yt-dlp/yt-dlp/commit/fb8b7f226d251e521a89b23c415e249e5b788e5c) ([#10709](https://github.com/yt-dlp/yt-dlp/issues/10709)) by [bashonly](https://github.com/bashonly) + - [Pin `delocate` version for `macos`](https://github.com/yt-dlp/yt-dlp/commit/7e41628ff523b3fe373b0981a5db441358980dab) ([#10901](https://github.com/yt-dlp/yt-dlp/issues/10901)) by [bashonly](https://github.com/bashonly) +- **ci** + - [Add comment sanitization workflow](https://github.com/yt-dlp/yt-dlp/commit/b6200bdcf3a9415ae36859188f9a57e3e461c696) ([#10915](https://github.com/yt-dlp/yt-dlp/issues/10915)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Add issue tracker anti-spam protection](https://github.com/yt-dlp/yt-dlp/commit/ad9a8115aa29a1a95c961b16fcf129a228d98f50) ([#10861](https://github.com/yt-dlp/yt-dlp/issues/10861)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [c6387ab](https://github.com/yt-dlp/yt-dlp/commit/c6387abc1af9842bb0541288a5610abba9b1ab51) by [bashonly](https://github.com/bashonly), [Codenade](https://github.com/Codenade), [coletdjnz](https://github.com/coletdjnz), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [pzhlkj6612](https://github.com/pzhlkj6612), [seproDev](https://github.com/seproDev) + ### 2024.08.06 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index e3bbe03ec7..e23d395fde 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -143,6 +143,7 @@ - **BBVTV**: [*bbvtv*](## "netrc machine") - **BBVTVLive**: [*bbvtv*](## "netrc machine") - **BBVTVRecordings**: [*bbvtv*](## "netrc machine") + - **BeaconTv** - **BeatBumpPlaylist** - **BeatBumpVideo** - **Beatport** @@ -505,6 +506,7 @@ - **gem.cbc.ca:playlist** - **Genius** - **GeniusLyrics** + - **Germanupa**: germanupa.de - **GetCourseRu**: [*getcourseru*](## "netrc machine") - **GetCourseRuPlayer** - **Gettr** @@ -580,6 +582,7 @@ - **HungamaAlbumPlaylist** - **HungamaSong** - **huya:live**: huya.com + - **huya:video**: 虎牙视频 - **Hypem** - **Hytale** - **Icareus** @@ -660,6 +663,7 @@ - **kick:vod** - **Kicker** - **KickStarter** + - **Kika**: KiKA.de - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -722,7 +726,6 @@ - **livestream:original** - **Livestreamfails** - **Lnk** - - **LnkGo** - **loc**: Library of Congress - **loom** - **loom:folder** @@ -756,7 +759,7 @@ - **Masters** - **MatchTV** - **MBN**: mbn.co.kr (매일방송) - - **MDR**: MDR.DE and KiKA + - **MDR**: MDR.DE - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** @@ -811,6 +814,7 @@ - **MNetTVLive**: [*mnettv*](## "netrc machine") - **MNetTVRecordings**: [*mnettv*](## "netrc machine") - **MochaVideo** + - **Mojevideo**: mojevideo.sk - **Mojvideo** - **Monstercat** - **MonsterSirenHypergryphMusic** @@ -1285,12 +1289,14 @@ - **Screencast** - **Screencastify** - **ScreencastOMatic** + - **ScreenRec** - **ScrippsNetworks** - **scrippsnetworks:watch** - **Scrolller** - **SCTE**: [*scte*](## "netrc machine") (**Currently broken**) - **SCTECourse**: [*scte*](## "netrc machine") (**Currently broken**) - **sejm** + - **Sen** - **SenalColombiaLive**: (**Currently broken**) - **SenateGov** - **SenateISVP** @@ -1327,6 +1333,7 @@ - **SlidesLive** - **Slutload** - **Smotrim** + - **SnapchatSpotlight** - **Snotr** - **Sohu** - **SohuV** @@ -1608,6 +1615,7 @@ - **videomore:season** - **videomore:video** - **VideoPress** + - **Vidflex** - **Vidio**: [*vidio*](## "netrc machine") - **VidioLive**: [*vidio*](## "netrc machine") - **VidioPremier**: [*vidio*](## "netrc machine") @@ -1736,7 +1744,7 @@ - **XiaoHongShu**: 小红书 - **ximalaya**: 喜马拉雅FM - **ximalaya:album**: 喜马拉雅FM 专辑 - - **xinpianchang**: xinpianchang.com (**Currently broken**) + - **Xinpianchang**: 新片场 - **XMinus**: (**Currently broken**) - **XNXX** - **Xstream** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 6633a11b91..76b8bf0ee2 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.08.06' +__version__ = '2024.09.27' -RELEASE_GIT_HEAD = '4d9231208332d4c32364b8cd814bff8b20232cae' +RELEASE_GIT_HEAD = 'c6387abc1af9842bb0541288a5610abba9b1ab51' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.08.06' +_pkg_version = '2024.09.27' From 6328e2e67a4e126e08af382e6a387073082d5c5f Mon Sep 17 00:00:00 2001 From: Corey Wright Date: Sun, 29 Sep 2024 16:03:39 -0500 Subject: [PATCH 060/216] [ie/ApplePodcasts] Fix extractor (#10903) Closes #10809 Authored by: coreywright --- yt_dlp/extractor/applepodcasts.py | 78 ++++++++++++++----------------- yt_dlp/extractor/common.py | 2 +- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index bd301e904a..b99d24e0eb 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -1,27 +1,42 @@ from .common import InfoExtractor from ..utils import ( - clean_html, clean_podcast_url, - get_element_by_class, int_or_none, parse_iso8601, - try_get, ) +from ..utils.traversal import traverse_obj class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654', + 'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172', + 'info_dict': { + 'id': '1000665010654', + 'ext': 'mp3', + 'title': 'Ferreck Dawn - To The Break of Dawn 117', + 'episode': 'Ferreck Dawn - To The Break of Dawn 117', + 'description': 'md5:1fc571102f79dbd0a77bfd71ffda23bc', + 'upload_date': '20240812', + 'timestamp': 1723449600, + 'duration': 3596, + 'series': 'Ferreck Dawn - To The Break of Dawn', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', + }, + }, { 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': '41dc31cd650143e530d9423b6b5a344f', + 'md5': 'baf8a6b8b8aa6062dbb4639ed73d0052', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', + 'episode': '207 - Whitney Webb Returns', + 'episode_number': 207, 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', 'timestamp': 1593932400, - 'duration': 6454, + 'duration': 5369, 'series': 'The Tim Dillon Show', 'thumbnail': 're:.+[.](png|jpe?g|webp)', }, @@ -39,47 +54,24 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - episode_data = {} - ember_data = {} - # new page type 2021-11 - amp_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', - webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} - amp_data = try_get(amp_data, - lambda a: self._parse_json( - next(a[x] for x in iter(a) if episode_id in x), - episode_id), - dict) or {} - amp_data = amp_data.get('d') or [] - episode_data = try_get( - amp_data, - lambda a: next(x for x in a - if x['type'] == 'podcast-episodes' and x['id'] == episode_id), - dict) - if not episode_data: - # try pre 2021-11 page type: TODO: consider deleting if no longer used - ember_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) or {} - ember_data = ember_data.get(episode_id) or ember_data - episode_data = try_get(ember_data, lambda x: x['data'], dict) - episode = episode_data['attributes'] - description = episode.get('description') or {} - - series = None - for inc in (amp_data or ember_data.get('included') or []): - if inc.get('type') == 'media/podcast': - series = try_get(inc, lambda x: x['attributes']['name']) - series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) + server_data = self._search_json( + r'', webpage), + (..., {js_to_json}, {json.loads}, ..., {self._find_json}, ...)) + meta = traverse_obj(nextjs_data, ( + ..., lambda _, v: v['meta']['path'] == urllib.parse.urlparse(url).path, 'meta', any)) + + video_id = meta['uuid'] + info_dict = traverse_obj(meta, { + 'title': ('title', {str}), + 'description': ('description', {str.strip}), + }) + + if traverse_obj(meta, ('program', 'subtype')) != 'movie': + for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)): + episode_data = traverse_obj( + season_data, ('videos', lambda _, v: v['videoId'] == video_id, any)) + if not episode_data: + continue + + episode_title = traverse_obj( + episode_data, 'contextualTitle', 'episodeTitle', expected_type=str) + info_dict.update({ + 'title': episode_title or info_dict.get('title'), + 'series': remove_end(info_dict.get('title'), f' - {episode_title}'), + 'season_number': traverse_obj(season_data, ('season', {int_or_none})), + 'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})), + }) + break api = self._download_json( f'https://api.goplay.be/web/v1/videos/long-form/{video_id}', From a9f85670d03ab993dc589f21a9ffffcad61392d5 Mon Sep 17 00:00:00 2001 From: manav_chaudhary <100396248+manavchaudhary1@users.noreply.github.com> Date: Tue, 12 Nov 2024 04:11:56 +0530 Subject: [PATCH 156/216] [ie/Chaturbate] Support alternate domains (#10595) Closes #10594 Authored by: manavchaudhary1 --- yt_dlp/extractor/chaturbate.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index b49f741efa..864d61f9c2 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -9,7 +9,7 @@ from ..utils import ( class ChaturbateIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.(?Pcom|eu|global)/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://www.chaturbate.com/siswet19/', 'info_dict': { @@ -29,15 +29,24 @@ class ChaturbateIE(InfoExtractor): }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, + }, { + 'url': 'https://chaturbate.eu/siswet19/', + 'only_matching': True, + }, { + 'url': 'https://chaturbate.eu/fullvideo/?b=caylin', + 'only_matching': True, + }, { + 'url': 'https://chaturbate.global/siswet19/', + 'only_matching': True, }] _ROOM_OFFLINE = 'Room is currently offline' def _real_extract(self, url): - video_id = self._match_id(url) + video_id, tld = self._match_valid_url(url).group('id', 'tld') webpage = self._download_webpage( - f'https://chaturbate.com/{video_id}/', video_id, + f'https://chaturbate.{tld}/{video_id}/', video_id, headers=self.geo_verification_headers()) found_m3u8_urls = [] From bacc31b05a04181b63100c481565256b14813a5e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 12 Nov 2024 23:23:10 +0000 Subject: [PATCH 157/216] [ie/facebook] Fix formats extraction (#11513) Closes #11497 Authored by: bashonly --- yt_dlp/extractor/facebook.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 2bcb5a8411..91e2f3489c 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) - def extract_dash_manifest(video, formats): + def extract_dash_manifest(vid_data, formats, mpd_url=None): dash_manifest = traverse_obj( - video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str) + vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str) if dash_manifest: formats.extend(self._parse_mpd_formats( compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), - mpd_url=url_or_none(video.get('dash_manifest_url')))) + mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url)) def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around @@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor): video = video['creation_story'] video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video.update(reel_info) - fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video + formats = [] q = qualities(['sd', 'hd']) + + # Legacy formats extraction + fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('browser_native_sd_url', 'sd')): @@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor): if not playable_url: continue if determine_ext(playable_url) == 'mpd': - formats.extend(self._extract_mpd_formats(playable_url, video_id)) + formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(fmt_data, formats) + + # New videoDeliveryResponse formats extraction + fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult')) + mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none})) + dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml'])) + for idx, dash_manifest in enumerate(dash_manifests): + extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx)) + if not dash_manifests: + # Only extract from MPD URLs if the manifests are not already provided + for mpd_url in mpd_urls: + formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False)) + for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])): + format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower})) + formats.append({ + 'format_id': format_id, + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, + 'url': prog_fmt['progressive_url'], + }) + for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls')) + if not formats: # Do not append false positive entry w/o any formats return From f2a4983df7a64c4e93b56f79dbd16a781bd90206 Mon Sep 17 00:00:00 2001 From: Jackson Humphrey Date: Tue, 12 Nov 2024 17:26:18 -0600 Subject: [PATCH 158/216] [ie/archive.org] Fix comments extraction (#11527) Closes #11526 Authored by: jshumphrey --- yt_dlp/extractor/archiveorg.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index f5a55efc4f..2849d9fd5b 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor): }, }, ], + }, { + # The reviewbody is None for one of the reviews; just need to extract data without crashing + 'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn', + 'info_dict': { + 'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn', + 'ext': 'mp3', + 'title': 'Stuck Inside of Mobile with the Memphis Blues Again', + 'creators': ['Grateful Dead'], + 'duration': 338.31, + 'track': 'Stuck Inside of Mobile with the Memphis Blues Again', + 'description': 'md5:764348a470b986f1217ffd38d6ac7b72', + 'display_id': 'gd95-04-02d1t04.shn', + 'location': 'Pyramid Arena', + 'uploader': 'jon@archive.org', + 'album': '1995-04-02 - Pyramid Arena', + 'upload_date': '20040519', + 'track_number': 4, + 'release_date': '19950402', + 'timestamp': 1084927901, + }, }] @staticmethod @@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor): info['comments'].append({ 'id': review.get('review_id'), 'author': review.get('reviewer'), - 'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), + 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'), 'timestamp': unified_timestamp(review.get('createdate')), 'parent': 'root'}) From 39d79c9b9cf23411d935910685c40aa1a2fdb409 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Fri, 15 Nov 2024 22:06:15 +0100 Subject: [PATCH 159/216] [utils] Fix `join_nonempty`, add `**kwargs` to `unpack` (#11559) Authored by: Grub4K --- test/test_traversal.py | 2 +- test/test_utils.py | 5 ----- yt_dlp/utils/_utils.py | 3 +-- yt_dlp/utils/traversal.py | 4 ++-- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index d48606e99c..52ea19fab3 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -525,7 +525,7 @@ class TestTraversalHelpers: def test_unpack(self): assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123' assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3' - assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3' + assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3' with pytest.raises(TypeError): unpack(join_nonempty)() with pytest.raises(TypeError): diff --git a/test/test_utils.py b/test/test_utils.py index b5f35736b6..835774a912 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -72,7 +72,6 @@ from yt_dlp.utils import ( intlist_to_bytes, iri_to_uri, is_html, - join_nonempty, js_to_json, limit_length, locked_file, @@ -2158,10 +2157,6 @@ Line 1 assert int_or_none(v=10) == 10, 'keyword passed positional should call function' assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function' - assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially' - assert callable(join_nonempty()), 'varargs positional should apply partially' - assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function' - if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b28bb555e1..89c53c39e7 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -216,7 +216,7 @@ def partial_application(func): sig = inspect.signature(func) required_args = [ param.name for param in sig.parameters.values() - if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL) + if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD) if param.default is inspect.Parameter.empty ] @@ -4837,7 +4837,6 @@ def number_of_digits(number): return len('%d' % number) -@partial_application def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 361f239ba6..6bb52050f2 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -452,9 +452,9 @@ def trim_str(*, start=None, end=None): return trim -def unpack(func): +def unpack(func, **kwargs): @functools.wraps(func) - def inner(items, **kwargs): + def inner(items): return func(*items, **kwargs) return inner From c014fbcddcb4c8f79d914ac5bb526758b540ea33 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Fri, 15 Nov 2024 23:25:52 +0100 Subject: [PATCH 160/216] [utils] `subs_list_to_dict`: Add `lang` default parameter (#11508) Authored by: Grub4K --- test/test_traversal.py | 50 ++++++++++++++++++++++++++++++++++++++- yt_dlp/utils/traversal.py | 22 ++++++++++------- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index 52ea19fab3..bc433029d8 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -481,7 +481,7 @@ class TestTraversalHelpers: 'id': 'name', 'data': 'content', 'url': 'url', - }, all, {subs_list_to_dict}]) == { + }, all, {subs_list_to_dict(lang=None)}]) == { 'de': [{'url': 'https://example.com/subs/de.ass'}], 'en': [{'data': 'content'}], }, 'subs with mandatory items missing should be filtered' @@ -507,6 +507,54 @@ class TestTraversalHelpers: {'url': 'https://example.com/subs/en1', 'ext': 'ext'}, {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, ]}, '`quality` key should sort subtitle list accordingly' + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.ass'}, + {'name': 'de'}, + {'name': 'en', 'content': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'data': 'content', + }, all, {subs_list_to_dict(lang='en')}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass'}], + 'en': [ + {'data': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], + }, 'optionally provided lang should be used if no id available' + assert traverse_obj([ + {'name': 1, 'url': 'https://example.com/subs/de1'}, + {'name': {}, 'url': 'https://example.com/subs/de2'}, + {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'}, + {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'ext': 'ext', + }, all, {subs_list_to_dict(lang=None)}]) == { + 'de': [ + {'url': 'https://example.com/subs/de3'}, + {'url': 'https://example.com/subs/de4'}, + ], + }, 'non str types should be ignored for id and ext' + assert traverse_obj([ + {'name': 1, 'url': 'https://example.com/subs/de1'}, + {'name': {}, 'url': 'https://example.com/subs/de2'}, + {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'}, + {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'ext': 'ext', + }, all, {subs_list_to_dict(lang='de')}]) == { + 'de': [ + {'url': 'https://example.com/subs/de1'}, + {'url': 'https://example.com/subs/de2'}, + {'url': 'https://example.com/subs/de3'}, + {'url': 'https://example.com/subs/de4'}, + ], + }, 'non str types should be replaced by default id' def test_trim_str(self): with pytest.raises(TypeError): diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 6bb52050f2..76b51f53d1 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -332,14 +332,14 @@ class _RequiredError(ExtractorError): @typing.overload -def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... +def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... @typing.overload -def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ... +def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ... -def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): +def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None): """ Convert subtitles from a traversal into a subtitle dict. The path should have an `all` immediately before this function. @@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): `quality` The sort order for each subtitle """ if subs is None: - return functools.partial(subs_list_to_dict, ext=ext) + return functools.partial(subs_list_to_dict, lang=lang, ext=ext) result = collections.defaultdict(list) @@ -360,10 +360,16 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): if not url_or_none(sub.get('url')) and not sub.get('data'): continue sub_id = sub.pop('id', None) - if sub_id is None: - continue - if ext is not None and not sub.get('ext'): - sub['ext'] = ext + if not isinstance(sub_id, str): + if not lang: + continue + sub_id = lang + sub_ext = sub.get('ext') + if not isinstance(sub_ext, str): + if not ext: + sub.pop('ext', None) + else: + sub['ext'] = ext result[sub_id].append(sub) result = dict(result) From eb64ae7d5def6df2aba74fb703e7f168fb299865 Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 14 Nov 2024 16:08:50 -0600 Subject: [PATCH 161/216] [ie] Allow `ext` override for thumbnails (#11545) Authored by: bashonly --- yt_dlp/YoutubeDL.py | 4 +++- yt_dlp/extractor/common.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3186a999de..3130deda31 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -4381,7 +4381,9 @@ class YoutubeDL: return None for idx, t in list(enumerate(thumbnails))[::-1]: - thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg') + if multiple: + thumb_ext = f'{t["id"]}.{thumb_ext}' thumb_display_id = f'{label} thumbnail {t["id"]}' thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 01915acf23..23f6fc6c46 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -279,6 +279,7 @@ class InfoExtractor: thumbnails: A list of dictionaries, with the following entries: * "id" (optional, string) - Thumbnail format ID * "url" + * "ext" (optional, string) - actual image extension if not given in URL * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) From c699bafc5038b59c9afe8c2e69175fb66424c832 Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 14 Nov 2024 16:09:11 -0600 Subject: [PATCH 162/216] [ie/soop] Fix thumbnail extraction (#11545) Closes #11537 Authored by: bashonly --- yt_dlp/extractor/afreecatv.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 6682a89817..572d1a3893 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor): extensions={'legacy_ssl': True}), display_id, 'Downloading API JSON', 'Unable to download API JSON') + @staticmethod + def _fixup_thumb(thumb_url): + if not url_or_none(thumb_url): + return None + # Core would determine_ext as 'php' from the url, so we need to provide the real ext + # See: https://github.com/yt-dlp/yt-dlp/issues/11537 + return [{'url': thumb_url, 'ext': 'jpg'}] + class AfreecaTVIE(AfreecaTVBaseIE): IE_NAME = 'soop' @@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'uploader': ('writer_nick', {str}), 'uploader_id': ('bj_id', {str}), 'duration': ('total_file_duration', {int_or_none(scale=1000)}), - 'thumbnail': ('thumb', {url_or_none}), + 'thumbnails': ('thumb', {self._fixup_thumb}), }) entries = [] @@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE): return self.playlist_result(self._entries(data), video_id) - @staticmethod - def _entries(data): + def _entries(self, data): # 'files' is always a list with 1 element yield from traverse_obj(data, ( 'data', lambda _, v: v['story_type'] == 'catch', @@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE): 'title': ('title', {str}), 'uploader': ('writer_nick', {str}), 'uploader_id': ('writer_id', {str}), - 'thumbnail': ('thumb', {url_or_none}), + 'thumbnails': ('thumb', {self._fixup_thumb}), 'timestamp': ('write_timestamp', {int_or_none}), })) From 70c55cb08f780eab687e881ef42bb5c6007d290b Mon Sep 17 00:00:00 2001 From: Alessandro Campolo Date: Sat, 16 Nov 2024 13:56:15 +0100 Subject: [PATCH 163/216] [ie/RadioRadicale] Add extractor (#5607) Authored by: a13ssandr0, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/radioradicale.py | 105 ++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 yt_dlp/extractor/radioradicale.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 51caefd4d7..d6ab610025 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1649,6 +1649,7 @@ from .radiokapital import ( RadioKapitalIE, RadioKapitalShowIE, ) +from .radioradicale import RadioRadicaleIE from .radiozet import RadioZetPodcastIE from .radlive import ( RadLiveChannelIE, diff --git a/yt_dlp/extractor/radioradicale.py b/yt_dlp/extractor/radioradicale.py new file mode 100644 index 0000000000..472e25c45f --- /dev/null +++ b/yt_dlp/extractor/radioradicale.py @@ -0,0 +1,105 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class RadioRadicaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radioradicale\.it/scheda/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.radioradicale.it/scheda/471591', + 'md5': 'eb0fbe43a601f1a361cbd00f3c45af4a', + 'info_dict': { + 'id': '471591', + 'ext': 'mp4', + 'title': 'md5:e8fbb8de57011a3255db0beca69af73d', + 'description': 'md5:5e15a789a2fe4d67da8d1366996e89ef', + 'location': 'Napoli', + 'duration': 2852.0, + 'timestamp': 1459987200, + 'upload_date': '20160407', + 'thumbnail': 'https://www.radioradicale.it/photo400/0/0/9/0/1/00901768.jpg', + }, + }, { + 'url': 'https://www.radioradicale.it/scheda/742783/parlamento-riunito-in-seduta-comune-11a-della-xix-legislatura', + 'info_dict': { + 'id': '742783', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + 'description': '-) Votazione per l\'elezione di un giudice della Corte Costituzionale (nono scrutinio)', + 'location': 'CAMERA', + 'duration': 5868.0, + 'timestamp': 1730246400, + 'upload_date': '20241030', + }, + 'playlist': [{ + 'md5': 'aa48de55dcc45478e4cd200f299aab7d', + 'info_dict': { + 'id': '742783-0', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }, { + 'md5': 'be915c189c70ad2920e5810f32260ff5', + 'info_dict': { + 'id': '742783-1', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }, { + 'md5': 'f0ee4047342baf8ed3128a8417ac5e0a', + 'info_dict': { + 'id': '742783-2', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }], + }] + + def _entries(self, videos_info, page_id): + for idx, video in enumerate(traverse_obj( + videos_info, ('playlist', lambda _, v: v['sources']))): + video_id = f'{page_id}-{idx}' + formats = [] + subtitles = {} + + for m3u8_url in traverse_obj(video, ('sources', ..., 'src', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + for sub in traverse_obj(video, ('subtitles', ..., lambda _, v: url_or_none(v['src']))): + self._merge_subtitles({sub.get('srclang') or 'und': [{ + 'url': sub['src'], + 'name': sub.get('label'), + }]}, target=subtitles) + + yield { + 'id': video_id, + 'title': video.get('title'), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + videos_info = self._search_json( + r'jQuery\.extend\(Drupal\.settings\s*,', + webpage, 'videos_info', page_id)['RRscheda'] + + entries = list(self._entries(videos_info, page_id)) + + common_info = { + 'id': page_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'location': videos_info.get('luogo'), + **self._search_json_ld(webpage, page_id), + } + + if len(entries) == 1: + return { + **entries[0], + **common_info, + } + + return self.playlist_result(entries, multi_video=True, **common_info) From 6365e92589e4bc17b8fffb0125a716d144ad2137 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 16 Nov 2024 17:56:43 +0100 Subject: [PATCH 164/216] [ie/bandlab] Add extractors (#11535) Closes #7750 Authored by: seproDev --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/bandlab.py | 438 ++++++++++++++++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 yt_dlp/extractor/bandlab.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d6ab610025..25a233a2d6 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -208,6 +208,10 @@ from .bandcamp import ( BandcampUserIE, BandcampWeeklyIE, ) +from .bandlab import ( + BandlabIE, + BandlabPlaylistIE, +) from .bannedvideo import BannedVideoIE from .bbc import ( BBCIE, diff --git a/yt_dlp/extractor/bandlab.py b/yt_dlp/extractor/bandlab.py new file mode 100644 index 0000000000..e48d5d3f76 --- /dev/null +++ b/yt_dlp/extractor/bandlab.py @@ -0,0 +1,438 @@ + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + format_field, + int_or_none, + parse_iso8601, + parse_qs, + truncate_string, + url_or_none, +) +from ..utils.traversal import traverse_obj, value + + +class BandlabBaseIE(InfoExtractor): + def _call_api(self, endpoint, asset_id, **kwargs): + headers = kwargs.pop('headers', None) or {} + return self._download_json( + f'https://www.bandlab.com/api/v1.3/{endpoint}/{asset_id}', + asset_id, headers={ + 'accept': 'application/json', + 'referer': 'https://www.bandlab.com/', + 'x-client-id': 'BandLab-Web', + 'x-client-version': '10.1.124', + **headers, + }, **kwargs) + + def _parse_revision(self, revision_data, url=None): + return { + 'vcodec': 'none', + 'media_type': 'revision', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(revision_data, { + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/revision/%s')}), filter, any), + 'id': (('revisionId', 'id'), {str}, any), + 'title': ('song', 'name', {str}), + 'track': ('song', 'name', {str}), + 'url': ('mixdown', 'file', {url_or_none}), + 'thumbnail': ('song', 'picture', 'url', {url_or_none}), + 'description': ('description', {str}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + 'duration': ('mixdown', 'duration', {float_or_none}), + 'view_count': ('counters', 'plays', {int_or_none}), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'genres': ('genres', ..., 'name', {str}), + }), + } + + def _parse_track(self, track_data, url=None): + return { + 'vcodec': 'none', + 'media_type': 'track', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(track_data, { + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any), + 'id': (('revisionId', 'id'), {str}, any), + 'url': ('track', 'sample', 'audioUrl', {url_or_none}), + 'title': ('track', 'name', {str}), + 'track': ('track', 'name', {str}), + 'description': ('caption', {str}), + 'thumbnail': ('track', 'picture', ('original', 'url'), {url_or_none}, any), + 'view_count': ('counters', 'plays', {int_or_none}), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'duration': ('track', 'sample', 'duration', {float_or_none}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + }), + } + + def _parse_video(self, video_data, url=None): + return { + 'media_type': 'video', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(video_data, { + 'id': ('id', {str}), + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any), + 'url': ('video', 'url', {url_or_none}), + 'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}), + 'description': ('caption', {str}), + 'thumbnail': ('video', 'picture', 'url', {url_or_none}), + 'view_count': ('video', 'counters', 'plays', {int_or_none}), + 'like_count': ('video', 'counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'duration': ('video', 'duration', {float_or_none}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + }), + } + + +class BandlabIE(BandlabBaseIE): + _VALID_URL = [ + r'https?://(?:www\.)?bandlab.com/(?Ptrack|post|revision)/(?P[\da-f_-]+)', + r'https?://(?:www\.)?bandlab.com/(?Pembed)/\?(?:[^#]*&)?id=(?P[\da-f-]+)', + ] + _EMBED_REGEX = [rf']+src=[\'"](?P{_VALID_URL[1]})[\'"]'] + _TESTS = [{ + 'url': 'https://www.bandlab.com/track/04b37e88dba24967b9dac8eb8567ff39_07d7f906fc96ee11b75e000d3a428fff', + 'md5': '46f7b43367dd268bbcf0bbe466753b2c', + 'info_dict': { + 'id': '02d7f906-fc96-ee11-b75e-000d3a428fff', + 'ext': 'm4a', + 'uploader_id': 'ender_milze', + 'track': 'sweet black', + 'description': 'composed by juanjn3737', + 'timestamp': 1702171963, + 'view_count': int, + 'like_count': int, + 'duration': 54.629999999999995, + 'title': 'sweet black', + 'upload_date': '20231210', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'genres': ['Lofi'], + 'uploader': 'ender milze', + 'comment_count': int, + 'media_type': 'revision', + }, + }, { + # Same track as above but post URL + 'url': 'https://www.bandlab.com/post/07d7f906-fc96-ee11-b75e-000d3a428fff', + 'md5': '46f7b43367dd268bbcf0bbe466753b2c', + 'info_dict': { + 'id': '02d7f906-fc96-ee11-b75e-000d3a428fff', + 'ext': 'm4a', + 'uploader_id': 'ender_milze', + 'track': 'sweet black', + 'description': 'composed by juanjn3737', + 'timestamp': 1702171973, + 'view_count': int, + 'like_count': int, + 'duration': 54.629999999999995, + 'title': 'sweet black', + 'upload_date': '20231210', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'genres': ['Lofi'], + 'uploader': 'ender milze', + 'comment_count': int, + 'media_type': 'revision', + }, + }, { + # SharedKey Example + 'url': 'https://www.bandlab.com/track/048916c2-c6da-ee11-85f9-6045bd2e11f9?sharedKey=0NNWX8qYAEmI38lWAzCNDA', + 'md5': '15174b57c44440e2a2008be9cae00250', + 'info_dict': { + 'id': '038916c2-c6da-ee11-85f9-6045bd2e11f9', + 'ext': 'm4a', + 'comment_count': int, + 'genres': ['Other'], + 'uploader_id': 'user8353034818103753', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/', + 'timestamp': 1709625771, + 'track': 'PodcastMaerchen4b', + 'duration': 468.14, + 'view_count': int, + 'description': 'Podcast: Neues aus der Märchenwelt', + 'like_count': int, + 'upload_date': '20240305', + 'uploader': 'Erna Wageneder', + 'title': 'PodcastMaerchen4b', + 'media_type': 'revision', + }, + }, { + # Different Revision selected + 'url': 'https://www.bandlab.com/track/130343fc-148b-ea11-96d2-0003ffd1fc09?revId=110343fc-148b-ea11-96d2-0003ffd1fc09', + 'md5': '74e055ef9325d63f37088772fbfe4454', + 'info_dict': { + 'id': '110343fc-148b-ea11-96d2-0003ffd1fc09', + 'ext': 'm4a', + 'timestamp': 1588273294, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/', + 'description': 'Final Revision.', + 'title': 'Replay ( Instrumental)', + 'uploader': 'David R Sparks', + 'uploader_id': 'davesnothome69', + 'view_count': int, + 'comment_count': int, + 'track': 'Replay ( Instrumental)', + 'genres': ['Rock'], + 'upload_date': '20200430', + 'like_count': int, + 'duration': 279.43, + 'media_type': 'revision', + }, + }, { + # Video + 'url': 'https://www.bandlab.com/post/5cdf9036-3857-ef11-991a-6045bd36e0d9', + 'md5': '8caa2ef28e86c1dacf167293cfdbeba9', + 'info_dict': { + 'id': '5cdf9036-3857-ef11-991a-6045bd36e0d9', + 'ext': 'mp4', + 'duration': 44.705, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', + 'comment_count': int, + 'title': 'backing vocals', + 'uploader_id': 'marliashya', + 'uploader': 'auraa', + 'like_count': int, + 'description': 'backing vocals', + 'media_type': 'video', + }, + }, { + # Embed Example + 'url': 'https://www.bandlab.com/embed/?blur=false&id=014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'md5': 'a4ad05cb68c54faaed9b0a8453a8cf4a', + 'info_dict': { + 'id': '014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'ext': 'm4a', + 'comment_count': int, + 'genres': ['Electronic'], + 'uploader': 'Charlie Henson', + 'timestamp': 1587328674, + 'upload_date': '20200419', + 'view_count': int, + 'track': 'Positronic Meltdown', + 'duration': 318.55, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', + 'description': 'Checkout my tracks at AOMX http://aomxsounds.com/', + 'uploader_id': 'microfreaks', + 'title': 'Positronic Meltdown', + 'like_count': int, + 'media_type': 'revision', + }, + }, { + # Track without revisions available + 'url': 'https://www.bandlab.com/track/55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5', + 'md5': 'f05d68a3769952c2d9257c473e14c15f', + 'info_dict': { + 'id': '55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5', + 'ext': 'm4a', + 'track': 'insame', + 'like_count': int, + 'duration': 84.03, + 'title': 'insame', + 'view_count': int, + 'comment_count': int, + 'uploader': 'Sorakime', + 'uploader_id': 'sorakime', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', + 'timestamp': 1691162128, + 'upload_date': '20230804', + 'media_type': 'track', + }, + }, { + 'url': 'https://www.bandlab.com/revision/014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://phantomluigi.github.io/', + 'info_dict': { + 'id': 'e14223c3-7871-ef11-bdfd-000d3a980db3', + 'ext': 'm4a', + 'view_count': int, + 'upload_date': '20240913', + 'uploader_id': 'phantommusicofficial', + 'timestamp': 1726194897, + 'uploader': 'Phantom', + 'comment_count': int, + 'genres': ['Progresive Rock'], + 'description': 'md5:a38cd668f7a2843295ef284114f18429', + 'duration': 225.23, + 'like_count': int, + 'title': 'Vermilion Pt. 2 (Cover)', + 'track': 'Vermilion Pt. 2 (Cover)', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/62b10750-7aef-4f42-ad08-1af52f577e97/', + 'media_type': 'revision', + }, + }] + + def _real_extract(self, url): + display_id, url_type = self._match_valid_url(url).group('id', 'url_type') + + qs = parse_qs(url) + revision_id = traverse_obj(qs, (('revId', 'id'), 0, any)) + if url_type == 'revision': + revision_id = display_id + + revision_data = None + if not revision_id: + post_data = self._call_api( + 'posts', display_id, note='Downloading post data', + query=traverse_obj(qs, {'sharedKey': ('sharedKey', 0)})) + + revision_id = traverse_obj(post_data, (('revisionId', ('revision', 'id')), {str}, any)) + revision_data = traverse_obj(post_data, ('revision', {dict})) + + if not revision_data and not revision_id: + post_type = post_data.get('type') + if post_type == 'Video': + return self._parse_video(post_data, url=url) + if post_type == 'Track': + return self._parse_track(post_data, url=url) + raise ExtractorError(f'Could not extract data for post type {post_type!r}') + + if not revision_data: + revision_data = self._call_api( + 'revisions', revision_id, note='Downloading revision data', query={'edit': 'false'}) + + return self._parse_revision(revision_data, url=url) + + +class BandlabPlaylistIE(BandlabBaseIE): + _VALID_URL = [ + r'https?://(?:www\.)?bandlab.com/(?:[\w]+/)?(?Palbums|collections)/(?P[\da-f-]+)', + r'https?://(?:www\.)?bandlab.com/(?Pembed)/collection/\?(?:[^#]*&)?id=(?P[\da-f-]+)', + ] + _EMBED_REGEX = [rf']+src=[\'"](?P{_VALID_URL[1]})[\'"]'] + _TESTS = [{ + 'url': 'https://www.bandlab.com/davesnothome69/albums/89b79ea6-de42-ed11-b495-00224845aac7', + 'info_dict': { + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/69507ff3-579a-45be-afca-9e87eddec944/', + 'release_date': '20221003', + 'title': 'Remnants', + 'album': 'Remnants', + 'like_count': int, + 'album_type': 'LP', + 'description': 'A collection of some feel good, rock hits.', + 'comment_count': int, + 'view_count': int, + 'id': '89b79ea6-de42-ed11-b495-00224845aac7', + 'uploader': 'David R Sparks', + 'uploader_id': 'davesnothome69', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.bandlab.com/slytheband/collections/955102d4-1040-ef11-86c3-000d3a42581b', + 'info_dict': { + 'id': '955102d4-1040-ef11-86c3-000d3a42581b', + 'timestamp': 1720762659, + 'view_count': int, + 'title': 'My Shit 🖤', + 'uploader_id': 'slytheband', + 'uploader': '𝓢𝓛𝓨', + 'upload_date': '20240712', + 'like_count': int, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/collections/2c64ca12-b180-4b76-8587-7a8da76bddc8/', + }, + 'playlist_count': 15, + }, { + # Embeds can contain both albums and collections with the same URL pattern. This is an album + 'url': 'https://www.bandlab.com/embed/collection/?id=12cc6f7f-951b-ee11-907c-00224844f303', + 'info_dict': { + 'id': '12cc6f7f-951b-ee11-907c-00224844f303', + 'release_date': '20230706', + 'description': 'This is a collection of songs I created when I had an Amiga computer.', + 'view_count': int, + 'title': 'Mark Salud The Amiga Collection', + 'uploader_id': 'mssirmooth1962', + 'comment_count': int, + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/d618bd7b-0537-40d5-bdd8-61b066e77d59/', + 'like_count': int, + 'uploader': 'Mark Salud', + 'album': 'Mark Salud The Amiga Collection', + 'album_type': 'LP', + }, + 'playlist_count': 24, + }, { + # Tracks without revision id + 'url': 'https://www.bandlab.com/embed/collection/?id=e98aafb5-d932-ee11-b8f0-00224844c719', + 'info_dict': { + 'like_count': int, + 'uploader_id': 'sorakime', + 'comment_count': int, + 'uploader': 'Sorakime', + 'view_count': int, + 'description': 'md5:4ec31c568a5f5a5a2b17572ea64c3825', + 'release_date': '20230812', + 'title': 'Art', + 'album': 'Art', + 'album_type': 'Album', + 'id': 'e98aafb5-d932-ee11-b8f0-00224844c719', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/20c890de-e94a-4422-828a-2da6377a13c8/', + }, + 'playlist_count': 13, + }, { + 'url': 'https://www.bandlab.com/albums/89b79ea6-de42-ed11-b495-00224845aac7', + 'only_matching': True, + }] + + def _entries(self, album_data): + for post in traverse_obj(album_data, ('posts', lambda _, v: v['type'])): + post_type = post['type'] + if post_type == 'Revision': + yield self._parse_revision(post.get('revision')) + elif post_type == 'Track': + yield self._parse_track(post) + elif post_type == 'Video': + yield self._parse_video(post) + else: + self.report_warning(f'Skipping unknown post type: "{post_type}"') + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + endpoints = { + 'albums': ['albums'], + 'collections': ['collections'], + 'embed': ['collections', 'albums'], + }.get(playlist_type) + for endpoint in endpoints: + playlist_data = self._call_api( + endpoint, playlist_id, note=f'Downloading {endpoint[:-1]} data', + fatal=False, expected_status=404) + if not playlist_data.get('errorCode'): + playlist_type = endpoint + break + if error_code := playlist_data.get('errorCode'): + raise ExtractorError(f'Could not find playlist data. Error code: "{error_code}"') + + return self.playlist_result( + self._entries(playlist_data), playlist_id, + **traverse_obj(playlist_data, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + 'release_date': ('releaseDate', {lambda x: x.replace('-', '')}, filter), + 'thumbnail': ('picture', ('original', 'url'), {url_or_none}, any), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'view_count': ('counters', 'plays', {int_or_none}), + }), + **(traverse_obj(playlist_data, { + 'album': ('name', {str}), + 'album_type': ('type', {str}), + }) if playlist_type == 'albums' else {})) From 8388ec256f7753b02488788e3cfa771f6e1db247 Mon Sep 17 00:00:00 2001 From: Jackson Humphrey Date: Sat, 16 Nov 2024 13:48:47 -0600 Subject: [PATCH 165/216] [ie/spankbang] Support browser impersonation (#11542) Closes #6545 Authored by: jshumphrey --- yt_dlp/extractor/spankbang.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/spankbang.py b/yt_dlp/extractor/spankbang.py index 6805a72deb..05f0bb1468 100644 --- a/yt_dlp/extractor/spankbang.py +++ b/yt_dlp/extractor/spankbang.py @@ -71,9 +71,11 @@ class SpankBangIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') or mobj.group('id_2') + country = self.get_param('geo_bypass_country') or 'US' + self._set_cookie('.spankbang.com', 'country', country.upper()) webpage = self._download_webpage( url.replace(f'/{video_id}/embed', f'/{video_id}/video'), - video_id, headers={'Cookie': 'country=US'}) + video_id, impersonate=True) if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): raise ExtractorError( From d215fba7edb69d4fa665f43663756fd260b1489f Mon Sep 17 00:00:00 2001 From: Jackson Humphrey Date: Sat, 16 Nov 2024 13:50:17 -0600 Subject: [PATCH 166/216] [ie/RedGifsUser] Fix extraction (#11531) Closes #7382, Closes #9131 Authored by: jshumphrey --- yt_dlp/extractor/redgifs.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index 50138ab12c..b11ea273de 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -213,7 +213,7 @@ class RedGifsSearchIE(RedGifsBaseInfoExtractor): class RedGifsUserIE(RedGifsBaseInfoExtractor): IE_DESC = 'Redgifs user' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P[^/?#]+)(?:\?(?P[^#]+))?' - _PAGE_SIZE = 30 + _PAGE_SIZE = 80 _TESTS = [ { 'url': 'https://www.redgifs.com/users/lamsinka89', @@ -222,7 +222,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by recent', }, - 'playlist_mincount': 100, + 'playlist_mincount': 391, }, { 'url': 'https://www.redgifs.com/users/lamsinka89?page=3', @@ -231,7 +231,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by recent', }, - 'playlist_count': 30, + 'playlist_count': 80, }, { 'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g', @@ -240,7 +240,17 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by best', }, - 'playlist_mincount': 100, + 'playlist_mincount': 391, + }, + { + 'url': 'https://www.redgifs.com/users/ignored52', + 'note': 'https://github.com/yt-dlp/yt-dlp/issues/7382', + 'info_dict': { + 'id': 'ignored52', + 'title': 'ignored52', + 'description': 'RedGifs user ignored52, ordered by recent', + }, + 'playlist_mincount': 121, }, ] From 720b3dc453c342bc2e8df7dbc0acaab4479de46c Mon Sep 17 00:00:00 2001 From: powergold1 <18133986+powergold1@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:55:40 +0100 Subject: [PATCH 167/216] [ie/chaturbate] Extract from API and support impersonation (#11555) Closes #6546, Closes #10359 Authored by: powergold1 --- yt_dlp/extractor/chaturbate.py | 51 ++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index 864d61f9c2..aa70f26a1b 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -5,6 +5,7 @@ from ..utils import ( ExtractorError, lowercase_escape, url_or_none, + urlencode_postdata, ) @@ -40,14 +41,48 @@ class ChaturbateIE(InfoExtractor): 'only_matching': True, }] - _ROOM_OFFLINE = 'Room is currently offline' + _ERROR_MAP = { + 'offline': 'Room is currently offline', + 'private': 'Room is currently in a private show', + 'away': 'Performer is currently away', + 'password protected': 'Room is password protected', + 'hidden': 'Hidden session in progress', + } - def _real_extract(self, url): - video_id, tld = self._match_valid_url(url).group('id', 'tld') + def _extract_from_api(self, video_id, tld): + response = self._download_json( + f'https://chaturbate.{tld}/get_edge_hls_url_ajax/', video_id, + data=urlencode_postdata({'room_slug': video_id}), + headers={ + **self.geo_verification_headers(), + 'X-Requested-With': 'XMLHttpRequest', + 'Accept': 'application/json', + }, fatal=False, impersonate=True) or {} + status = response.get('room_status') + if status != 'public': + if error := self._ERROR_MAP.get(status): + raise ExtractorError(error, expected=True) + self.report_warning('Falling back to webpage extraction') + return None + + m3u8_url = response.get('url') + if not m3u8_url: + self.raise_geo_restricted() + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': f'https://roomimg.stream.highwebmedia.com/ri/{video_id}.jpg', + 'is_live': True, + 'age_limit': 18, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), + } + + def _extract_from_webpage(self, video_id, tld): webpage = self._download_webpage( f'https://chaturbate.{tld}/{video_id}/', video_id, - headers=self.geo_verification_headers()) + headers=self.geo_verification_headers(), impersonate=True) found_m3u8_urls = [] @@ -85,8 +120,8 @@ class ChaturbateIE(InfoExtractor): webpage, 'error', group='error', default=None) if not error: if any(p in webpage for p in ( - self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): - error = self._ROOM_OFFLINE + self._ERROR_MAP['offline'], 'offline_tipping', 'tip_offline')): + error = self._ERROR_MAP['offline'] if error: raise ExtractorError(error, expected=True) raise ExtractorError('Unable to find stream URL') @@ -113,3 +148,7 @@ class ChaturbateIE(InfoExtractor): 'is_live': True, 'formats': formats, } + + def _real_extract(self, url): + video_id, tld = self._match_valid_url(url).group('id', 'tld') + return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) From 1d253b0a27110d174c40faf8fb1c999d099e0cde Mon Sep 17 00:00:00 2001 From: Jackson Humphrey Date: Sat, 16 Nov 2024 14:02:14 -0600 Subject: [PATCH 168/216] [ie/patreon] Fix comments extraction (#11530) Closes #11483 Authored by: jshumphrey, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/patreon.py | 51 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 4d668cd37d..6bdeaf1571 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -16,10 +16,10 @@ from ..utils import ( parse_iso8601, smuggle_url, str_or_none, - traverse_obj, url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj, value class PatreonBaseIE(InfoExtractor): @@ -252,6 +252,27 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'skip': 'Patron-only content', + }, { + # Contains a comment reply in the 'included' section + 'url': 'https://www.patreon.com/posts/114721679', + 'info_dict': { + 'id': '114721679', + 'ext': 'mp4', + 'upload_date': '20241025', + 'uploader': 'Japanalysis', + 'like_count': int, + 'thumbnail': r're:^https?://.+', + 'comment_count': int, + 'title': 'Karasawa Part 2', + 'description': 'Part 2 of this video https://www.youtube.com/watch?v=Azms2-VTASk', + 'uploader_url': 'https://www.patreon.com/japanalysis', + 'uploader_id': '80504268', + 'channel_url': 'https://www.patreon.com/japanalysis', + 'channel_follower_count': int, + 'timestamp': 1729897015, + 'channel_id': '9346307', + }, + 'params': {'getcomments': True}, }] _RETURN_TYPE = 'video' @@ -404,26 +425,24 @@ class PatreonIE(PatreonBaseIE): f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}') cursor = None - for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)): + for comment in traverse_obj(response, (('data', 'included'), lambda _, v: v['type'] == 'comment' and v['id'])): count += 1 - comment_id = comment.get('id') - attributes = comment.get('attributes') or {} - if comment_id is None: - continue author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id')) - author_info = traverse_obj( - response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'), - get_all=False, expected_type=dict, default={}) yield { - 'id': comment_id, - 'text': attributes.get('body'), - 'timestamp': parse_iso8601(attributes.get('created')), - 'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'), - 'author_is_uploader': attributes.get('is_by_creator'), + **traverse_obj(comment, { + 'id': ('id', {str_or_none}), + 'text': ('attributes', 'body', {str}), + 'timestamp': ('attributes', 'created', {parse_iso8601}), + 'parent': ('relationships', 'parent', 'data', ('id', {value('root')}), {str}, any), + 'author_is_uploader': ('attributes', 'is_by_creator', {bool}), + }), + **traverse_obj(response, ( + 'included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes', { + 'author': ('full_name', {str}), + 'author_thumbnail': ('image_url', {url_or_none}), + }), get_all=False), 'author_id': author_id, - 'author': author_info.get('full_name'), - 'author_thumbnail': author_info.get('image_url'), } if count < traverse_obj(response, ('meta', 'count')): From f95a92b3d0169a784ee15a138fbe09d82b2754a1 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 17 Nov 2024 00:24:11 +0100 Subject: [PATCH 169/216] [cleanup] Deprecate more compat functions (#11439) Authored by: seproDev --- devscripts/generate_aes_testdata.py | 3 +- pyproject.toml | 10 ++++ test/helper.py | 3 +- test/test_YoutubeDL.py | 9 ++- test/test_aes.py | 47 ++++++++------- test/test_compat.py | 40 +------------ test/test_downloader_http.py | 7 +-- test/test_utils.py | 16 ++--- yt_dlp/YoutubeDL.py | 35 ++++++----- yt_dlp/__init__.py | 8 +-- yt_dlp/aes.py | 21 ++++--- yt_dlp/compat/__init__.py | 22 +------ yt_dlp/compat/_deprecated.py | 18 +++--- yt_dlp/compat/_legacy.py | 11 +++- yt_dlp/compat/functools.py | 7 --- yt_dlp/compat/urllib/request.py | 6 +- yt_dlp/cookies.py | 3 +- yt_dlp/downloader/common.py | 18 +++--- yt_dlp/downloader/external.py | 9 ++- yt_dlp/downloader/fragment.py | 9 ++- yt_dlp/downloader/http.py | 8 +-- yt_dlp/downloader/rtmp.py | 7 +-- yt_dlp/downloader/rtsp.py | 4 +- yt_dlp/extractor/abematv.py | 9 +-- yt_dlp/extractor/adn.py | 8 +-- yt_dlp/extractor/anvato.py | 6 +- yt_dlp/extractor/common.py | 3 +- yt_dlp/extractor/shemaroome.py | 12 ++-- yt_dlp/postprocessor/common.py | 3 +- yt_dlp/postprocessor/embedthumbnail.py | 13 ++-- yt_dlp/postprocessor/ffmpeg.py | 23 ++++---- .../postprocessor/movefilesafterdownload.py | 16 +++-- yt_dlp/postprocessor/sponskrub.py | 7 +-- yt_dlp/postprocessor/xattrpp.py | 3 +- yt_dlp/update.py | 59 ++----------------- yt_dlp/utils/_deprecated.py | 36 +++++------ yt_dlp/utils/_legacy.py | 27 +++++++++ yt_dlp/utils/_utils.py | 35 +++-------- 38 files changed, 218 insertions(+), 363 deletions(-) delete mode 100644 yt_dlp/compat/functools.py diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py index 7f3c88bcfb..73cf803b8f 100644 --- a/devscripts/generate_aes_testdata.py +++ b/devscripts/generate_aes_testdata.py @@ -11,13 +11,12 @@ import codecs import subprocess from yt_dlp.aes import aes_encrypt, key_expansion -from yt_dlp.utils import intlist_to_bytes secret_msg = b'Secret message goes here' def hex_str(int_list): - return codecs.encode(intlist_to_bytes(int_list), 'hex') + return codecs.encode(bytes(int_list), 'hex') def openssl_encode(algo, key, iv): diff --git a/pyproject.toml b/pyproject.toml index ef921fed58..92d399e319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -313,6 +313,16 @@ banned-from = [ "yt_dlp.compat.compat_urllib_parse_urlparse".msg = "Use `urllib.parse.urlparse` instead." "yt_dlp.compat.compat_shlex_quote".msg = "Use `yt_dlp.utils.shell_quote` instead." "yt_dlp.utils.error_to_compat_str".msg = "Use `str` instead." +"yt_dlp.utils.bytes_to_intlist".msg = "Use `list` instead." +"yt_dlp.utils.intlist_to_bytes".msg = "Use `bytes` instead." +"yt_dlp.utils.decodeArgument".msg = "Do not use" +"yt_dlp.utils.decodeFilename".msg = "Do not use" +"yt_dlp.utils.encodeFilename".msg = "Do not use" +"yt_dlp.compat.compat_os_name".msg = "Use `os.name` instead." +"yt_dlp.compat.compat_realpath".msg = "Use `os.path.realpath` instead." +"yt_dlp.compat.functools".msg = "Use `functools` instead." +"yt_dlp.utils.decodeOption".msg = "Do not use" +"yt_dlp.utils.compiled_regex_type".msg = "Use `re.Pattern` instead." [tool.autopep8] max_line_length = 120 diff --git a/test/helper.py b/test/helper.py index 3b550d1927..c776e70b73 100644 --- a/test/helper.py +++ b/test/helper.py @@ -9,7 +9,6 @@ import types import yt_dlp.extractor from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_os_name from yt_dlp.utils import preferredencoding, try_call, write_string, find_available_port if 'pytest' in sys.modules: @@ -49,7 +48,7 @@ def report_warning(message, *args, **kwargs): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored """ - if sys.stderr.isatty() and compat_os_name != 'nt': + if sys.stderr.isatty() and os.name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a99e624080..966d27a498 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -15,7 +15,6 @@ import json from test.helper import FakeYDL, assertRegexpMatches, try_rm from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_os_name from yt_dlp.extractor import YoutubeIE from yt_dlp.extractor.common import InfoExtractor from yt_dlp.postprocessor.common import PostProcessor @@ -839,8 +838,8 @@ class TestYoutubeDL(unittest.TestCase): test('%(filesize)#D', '1Ki') test('%(height)5.2D', ' 1.08k') test('%(title4)#S', 'foo_bar_test') - test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if compat_os_name == 'nt' else ' '))) - if compat_os_name == 'nt': + test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if os.name == 'nt' else ' '))) + if os.name == 'nt': test('%(title4)q', ('"foo ""bar"" test"', None)) test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', None)) test('%(formats.0.id)#q', ('"id 1"', None)) @@ -903,9 +902,9 @@ class TestYoutubeDL(unittest.TestCase): # Environment variable expansion for prepare_filename os.environ['__yt_dlp_var'] = 'expanded' - envvar = '%__yt_dlp_var%' if compat_os_name == 'nt' else '$__yt_dlp_var' + envvar = '%__yt_dlp_var%' if os.name == 'nt' else '$__yt_dlp_var' test(envvar, (envvar, 'expanded')) - if compat_os_name == 'nt': + if os.name == 'nt': test('%s%', ('%s%', '%s%')) os.environ['s'] = 'expanded' test('%s%', ('%s%', 'expanded')) # %s% should be expanded before escaping %s diff --git a/test/test_aes.py b/test/test_aes.py index 6fe6059a17..9cd9189bcc 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -27,7 +27,6 @@ from yt_dlp.aes import ( pad_block, ) from yt_dlp.dependencies import Cryptodome -from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes # the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' @@ -40,33 +39,33 @@ class TestAES(unittest.TestCase): def test_encrypt(self): msg = b'message' key = list(range(16)) - encrypted = aes_encrypt(bytes_to_intlist(msg), key) - decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + encrypted = aes_encrypt(list(msg), key) + decrypted = bytes(aes_decrypt(encrypted, key)) self.assertEqual(decrypted, msg) def test_cbc_decrypt(self): data = b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\x27\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd' - decrypted = intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(data), self.key, self.iv)) + decrypted = bytes(aes_cbc_decrypt(list(data), self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) if Cryptodome.AES: - decrypted = aes_cbc_decrypt_bytes(data, intlist_to_bytes(self.key), intlist_to_bytes(self.iv)) + decrypted = aes_cbc_decrypt_bytes(data, bytes(self.key), bytes(self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_cbc_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_cbc_encrypt(data, self.key, self.iv)) + data = list(self.secret_msg) + encrypted = bytes(aes_cbc_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd') def test_ctr_decrypt(self): - data = bytes_to_intlist(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') - decrypted = intlist_to_bytes(aes_ctr_decrypt(data, self.key, self.iv)) + data = list(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + decrypted = bytes(aes_ctr_decrypt(data, self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_ctr_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_ctr_encrypt(data, self.key, self.iv)) + data = list(self.secret_msg) + encrypted = bytes(aes_ctr_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') @@ -75,19 +74,19 @@ class TestAES(unittest.TestCase): data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f.\x08\xb4T\xe4/\x17\xbd' authentication_tag = b'\xe8&I\x80rI\x07\x9d}YWuU@:e' - decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify( - bytes_to_intlist(data), self.key, bytes_to_intlist(authentication_tag), self.iv[:12])) + decrypted = bytes(aes_gcm_decrypt_and_verify( + list(data), self.key, list(authentication_tag), self.iv[:12])) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) if Cryptodome.AES: decrypted = aes_gcm_decrypt_and_verify_bytes( - data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12])) + data, bytes(self.key), authentication_tag, bytes(self.iv[:12])) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_gcm_aligned_decrypt(self): data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f' authentication_tag = b'\x08\xb1\x9d!&\x98\xd0\xeaRq\x90\xe6;\xb5]\xd8' - decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify( + decrypted = bytes(aes_gcm_decrypt_and_verify( list(data), self.key, list(authentication_tag), self.iv[:12])) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16]) if Cryptodome.AES: @@ -96,38 +95,38 @@ class TestAES(unittest.TestCase): self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16]) def test_decrypt_text(self): - password = intlist_to_bytes(self.key).decode() + password = bytes(self.key).decode() encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae', ).decode() decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) - password = intlist_to_bytes(self.key).decode() + password = bytes(self.key).decode() encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83', ).decode() decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) def test_ecb_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key)) + data = list(self.secret_msg) + encrypted = bytes(aes_ecb_encrypt(data, self.key)) self.assertEqual( encrypted, b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') def test_ecb_decrypt(self): - data = bytes_to_intlist(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') - decrypted = intlist_to_bytes(aes_ecb_decrypt(data, self.key, self.iv)) + data = list(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + decrypted = bytes(aes_ecb_decrypt(data, self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_key_expansion(self): key = '4f6bdaa39e2f8cb07f5e722d9edef314' - self.assertEqual(key_expansion(bytes_to_intlist(bytearray.fromhex(key))), [ + self.assertEqual(key_expansion(list(bytearray.fromhex(key))), [ 0x4F, 0x6B, 0xDA, 0xA3, 0x9E, 0x2F, 0x8C, 0xB0, 0x7F, 0x5E, 0x72, 0x2D, 0x9E, 0xDE, 0xF3, 0x14, 0x53, 0x66, 0x20, 0xA8, 0xCD, 0x49, 0xAC, 0x18, 0xB2, 0x17, 0xDE, 0x35, 0x2C, 0xC9, 0x2D, 0x21, 0x8C, 0xBE, 0xDD, 0xD9, 0x41, 0xF7, 0x71, 0xC1, 0xF3, 0xE0, 0xAF, 0xF4, 0xDF, 0x29, 0x82, 0xD5, diff --git a/test/test_compat.py b/test/test_compat.py index e7d97e3e93..b1cc2a8187 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -12,12 +12,7 @@ import struct from yt_dlp import compat from yt_dlp.compat import urllib # isort: split -from yt_dlp.compat import ( - compat_etree_fromstring, - compat_expanduser, - compat_urllib_parse_unquote, # noqa: TID251 - compat_urllib_parse_urlencode, # noqa: TID251 -) +from yt_dlp.compat import compat_etree_fromstring, compat_expanduser from yt_dlp.compat.urllib.request import getproxies @@ -43,39 +38,6 @@ class TestCompat(unittest.TestCase): finally: os.environ['HOME'] = old_home or '' - def test_compat_urllib_parse_unquote(self): - self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') - self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') - self.assertEqual(compat_urllib_parse_unquote(''), '') - self.assertEqual(compat_urllib_parse_unquote('%'), '%') - self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') - self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') - self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') - self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') - self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') - self.assertEqual( - compat_urllib_parse_unquote(''' -%%a'''), - ''' -%%a''') - self.assertEqual( - compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), - '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') - - def test_compat_urllib_parse_unquote_plus(self): - self.assertEqual(urllib.parse.unquote_plus('abc%20def'), 'abc def') - self.assertEqual(urllib.parse.unquote_plus('%7e/abc+def'), '~/abc def') - - def test_compat_urllib_parse_urlencode(self): - self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({b'abc': 'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({b'abc': b'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([('abc', 'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([('abc', b'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([(b'abc', 'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([(b'abc', b'def')]), 'abc=def') - def test_compat_etree_fromstring(self): xml = ''' diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index faba0bc9c8..cf2e3fac16 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -15,7 +15,6 @@ import threading from test.helper import http_server_port, try_rm from yt_dlp import YoutubeDL from yt_dlp.downloader.http import HttpFD -from yt_dlp.utils import encodeFilename from yt_dlp.utils._utils import _YDLLogger as FakeLogger TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -82,12 +81,12 @@ class TestHttpFD(unittest.TestCase): ydl = YoutubeDL(params) downloader = HttpFD(ydl, params) filename = 'testfile.mp4' - try_rm(encodeFilename(filename)) + try_rm(filename) self.assertTrue(downloader.real_download(filename, { 'url': f'http://127.0.0.1:{self.port}/{ep}', }), ep) - self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE, ep) - try_rm(encodeFilename(filename)) + self.assertEqual(os.path.getsize(filename), TEST_SIZE, ep) + try_rm(filename) def download_all(self, params): for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): diff --git a/test/test_utils.py b/test/test_utils.py index 835774a912..b3de14198e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,7 +21,6 @@ import xml.etree.ElementTree from yt_dlp.compat import ( compat_etree_fromstring, compat_HTMLParseError, - compat_os_name, ) from yt_dlp.utils import ( Config, @@ -49,7 +48,6 @@ from yt_dlp.utils import ( dfxp2srt, encode_base_n, encode_compat_str, - encodeFilename, expand_path, extract_attributes, extract_basic_auth, @@ -69,7 +67,6 @@ from yt_dlp.utils import ( get_elements_html_by_class, get_elements_text_and_html_by_attribute, int_or_none, - intlist_to_bytes, iri_to_uri, is_html, js_to_json, @@ -566,10 +563,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(res_data, {'a': 'b', 'c': 'd'}) def test_shell_quote(self): - args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] + args = ['ffmpeg', '-i', 'ñ€ß\'.mp4'] self.assertEqual( shell_quote(args), - """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') + """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if os.name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') def test_float_or_none(self): self.assertEqual(float_or_none('42.42'), 42.42) @@ -1309,15 +1306,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') self.assertEqual(clean_html('a
    \xa0b'), 'a\nb') - def test_intlist_to_bytes(self): - self.assertEqual( - intlist_to_bytes([0, 1, 127, 128, 255]), - b'\x00\x01\x7f\x80\xff') - def test_args_to_str(self): self.assertEqual( args_to_str(['foo', 'ba/r', '-baz', '2 be', '']), - 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""', + 'foo ba/r -baz \'2 be\' \'\'' if os.name != 'nt' else 'foo ba/r -baz "2 be" ""', ) def test_parse_filesize(self): @@ -2117,7 +2109,7 @@ Line 1 assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=') assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') - @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows') + @unittest.skipUnless(os.name == 'nt', 'Only relevant on Windows') def test_windows_escaping(self): tests = [ 'test"&', diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3130deda31..749de5d4e3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -26,7 +26,7 @@ import unicodedata from .cache import Cache from .compat import urllib # isort: split -from .compat import compat_os_name, urllib_req_to_req +from .compat import urllib_req_to_req from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version @@ -109,7 +109,6 @@ from .utils import ( determine_ext, determine_protocol, encode_compat_str, - encodeFilename, escapeHTML, expand_path, extract_basic_auth, @@ -167,7 +166,7 @@ from .utils.networking import ( ) from .version import CHANNEL, ORIGIN, RELEASE_GIT_HEAD, VARIANT, __version__ -if compat_os_name == 'nt': +if os.name == 'nt': import ctypes @@ -643,7 +642,7 @@ class YoutubeDL: out=stdout, error=sys.stderr, screen=sys.stderr if self.params.get('quiet') else stdout, - console=None if compat_os_name == 'nt' else next( + console=None if os.name == 'nt' else next( filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), ) @@ -952,7 +951,7 @@ class YoutubeDL: self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if compat_os_name == 'nt' or not self._out_files.console: + if os.name == 'nt' or not self._out_files.console: return self._write_string(code, self._out_files.console) @@ -960,7 +959,7 @@ class YoutubeDL: if not self.params.get('consoletitle', False): return message = remove_terminal_sequences(message) - if compat_os_name == 'nt': + if os.name == 'nt': if ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() @@ -3255,9 +3254,9 @@ class YoutubeDL: if full_filename is None: return - if not self._ensure_dir_exists(encodeFilename(full_filename)): + if not self._ensure_dir_exists(full_filename): return - if not self._ensure_dir_exists(encodeFilename(temp_filename)): + if not self._ensure_dir_exists(temp_filename): return if self._write_description('video', info_dict, @@ -3289,16 +3288,16 @@ class YoutubeDL: if self.params.get('writeannotations', False): annofn = self.prepare_filename(info_dict, 'annotation') if annofn: - if not self._ensure_dir_exists(encodeFilename(annofn)): + if not self._ensure_dir_exists(annofn): return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)): + if not self.params.get('overwrites', True) and os.path.exists(annofn): self.to_screen('[info] Video annotations are already present') elif not info_dict.get('annotations'): self.report_warning('There are no annotations to write.') else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) - with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + with open(annofn, 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') @@ -3314,14 +3313,14 @@ class YoutubeDL: f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown') return True linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) - if not self._ensure_dir_exists(encodeFilename(linkfn)): + if not self._ensure_dir_exists(linkfn): return False - if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): + if self.params.get('overwrites', True) and os.path.exists(linkfn): self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') return True try: self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') - with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + with open(to_high_limit_path(linkfn), 'w', encoding='utf-8', newline='\r\n' if link_type == 'url' else '\n') as linkfile: template_vars = {'url': url} if link_type == 'desktop': @@ -3352,7 +3351,7 @@ class YoutubeDL: if self.params.get('skip_download'): info_dict['filepath'] = temp_filename - info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename)) info_dict['__files_to_move'] = files_to_move replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)) info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') @@ -3482,7 +3481,7 @@ class YoutubeDL: self.report_file_already_downloaded(dl_filename) dl_filename = dl_filename or temp_filename - info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename)) except network_exceptions as err: self.report_error(f'unable to download video data: {err}') @@ -4297,7 +4296,7 @@ class YoutubeDL: else: try: self.to_screen(f'[info] Writing {label} description to: {descfn}') - with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + with open(descfn, 'w', encoding='utf-8') as descfile: descfile.write(ie_result['description']) except OSError: self.report_error(f'Cannot write {label} description file {descfn}') @@ -4399,7 +4398,7 @@ class YoutubeDL: try: uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') - with open(encodeFilename(thumb_filename), 'wb') as thumbf: + with open(thumb_filename, 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index a7665159bd..a1880bf7dc 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -14,7 +14,6 @@ import os import re import traceback -from .compat import compat_os_name from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError from .downloader.external import get_external_downloader from .extractor import list_extractor_classes @@ -44,7 +43,6 @@ from .utils import ( GeoUtils, PlaylistEntries, SameFileError, - decodeOption, download_range_func, expand_path, float_or_none, @@ -883,8 +881,8 @@ def parse_options(argv=None): 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, 'subtitleslangs': opts.subtitleslangs, - 'matchtitle': decodeOption(opts.matchtitle), - 'rejecttitle': decodeOption(opts.rejecttitle), + 'matchtitle': opts.matchtitle, + 'rejecttitle': opts.rejecttitle, 'max_downloads': opts.max_downloads, 'prefer_free_formats': opts.prefer_free_formats, 'trim_file_name': opts.trim_file_name, @@ -1053,7 +1051,7 @@ def _real_main(argv=None): ydl.warn_if_short_id(args) # Show a useful error message and wait for keypress if not launched from shell on Windows - if not args and compat_os_name == 'nt' and getattr(sys, 'frozen', False): + if not args and os.name == 'nt' and getattr(sys, 'frozen', False): import ctypes.wintypes import msvcrt diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index be67b40fe2..0930d36df9 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -3,7 +3,6 @@ from math import ceil from .compat import compat_ord from .dependencies import Cryptodome -from .utils import bytes_to_intlist, intlist_to_bytes if Cryptodome.AES: def aes_cbc_decrypt_bytes(data, key, iv): @@ -17,15 +16,15 @@ if Cryptodome.AES: else: def aes_cbc_decrypt_bytes(data, key, iv): """ Decrypt bytes with AES-CBC using native implementation since pycryptodome is unavailable """ - return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv)))) + return bytes(aes_cbc_decrypt(*map(list, (data, key, iv)))) def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): """ Decrypt bytes with AES-GCM using native implementation since pycryptodome is unavailable """ - return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) + return bytes(aes_gcm_decrypt_and_verify(*map(list, (data, key, tag, nonce)))) def aes_cbc_encrypt_bytes(data, key, iv, **kwargs): - return intlist_to_bytes(aes_cbc_encrypt(*map(bytes_to_intlist, (data, key, iv)), **kwargs)) + return bytes(aes_cbc_encrypt(*map(list, (data, key, iv)), **kwargs)) BLOCK_SIZE_BYTES = 16 @@ -221,7 +220,7 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce): j0 = [*nonce, 0, 0, 0, 1] else: fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES + 8 - ghash_in = nonce + [0] * fill + bytes_to_intlist((8 * len(nonce)).to_bytes(8, 'big')) + ghash_in = nonce + [0] * fill + list((8 * len(nonce)).to_bytes(8, 'big')) j0 = ghash(hash_subkey, ghash_in) # TODO: add nonce support to aes_ctr_decrypt @@ -234,9 +233,9 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce): s_tag = ghash( hash_subkey, data - + [0] * pad_len # pad - + bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data - + ((len(data) * 8).to_bytes(8, 'big'))), # length of data + + [0] * pad_len # pad + + list((0 * 8).to_bytes(8, 'big') # length of associated data + + ((len(data) * 8).to_bytes(8, 'big'))), # length of data ) if tag != aes_ctr_encrypt(s_tag, key, j0): @@ -300,8 +299,8 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) - password = bytes_to_intlist(password.encode()) + data = list(base64.b64decode(data)) + password = list(password.encode()) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) @@ -310,7 +309,7 @@ def aes_decrypt_text(data, password, key_size_bytes): cipher = data[NONCE_LENGTH_BYTES:] decrypted_data = aes_ctr_decrypt(cipher, key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)) - return intlist_to_bytes(decrypted_data) + return bytes(decrypted_data) RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index d820adaf1e..d779620688 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -1,5 +1,4 @@ import os -import sys import xml.etree.ElementTree as etree from .compat_utils import passthrough_module @@ -24,33 +23,14 @@ def compat_etree_fromstring(text): return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) -compat_os_name = os._name if os.name == 'java' else os.name - - -def compat_shlex_quote(s): - from ..utils import shell_quote - return shell_quote(s) - - def compat_ord(c): return c if isinstance(c, int) else ord(c) -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return os.path.realpath(path) -else: - compat_realpath = os.path.realpath - - # Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl # See https://github.com/yt-dlp/yt-dlp/issues/792 # https://docs.python.org/3/library/os.path.html#os.path.expanduser -if compat_os_name in ('nt', 'ce'): +if os.name in ('nt', 'ce'): def compat_expanduser(path): HOME = os.environ.get('HOME') if not HOME: diff --git a/yt_dlp/compat/_deprecated.py b/yt_dlp/compat/_deprecated.py index 607bae9999..445acc1a06 100644 --- a/yt_dlp/compat/_deprecated.py +++ b/yt_dlp/compat/_deprecated.py @@ -8,16 +8,14 @@ passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) del passthrough_module -import base64 -import urllib.error -import urllib.parse +import functools # noqa: F401 +import os -compat_str = str -compat_b64decode = base64.b64decode +compat_os_name = os.name +compat_realpath = os.path.realpath -compat_urlparse = urllib.parse -compat_parse_qs = urllib.parse.parse_qs -compat_urllib_parse_unquote = urllib.parse.unquote -compat_urllib_parse_urlencode = urllib.parse.urlencode -compat_urllib_parse_urlparse = urllib.parse.urlparse + +def compat_shlex_quote(s): + from ..utils import shell_quote + return shell_quote(s) diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index dfc792eae4..dae2c14592 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -30,7 +30,7 @@ from asyncio import run as compat_asyncio_run # noqa: F401 from re import Pattern as compat_Pattern # noqa: F401 from re import match as compat_Match # noqa: F401 -from . import compat_expanduser, compat_HTMLParseError, compat_realpath +from . import compat_expanduser, compat_HTMLParseError from .compat_utils import passthrough_module from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import websockets as compat_websockets # noqa: F401 @@ -78,7 +78,7 @@ compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) compat_os_path_expanduser = compat_expanduser -compat_os_path_realpath = compat_realpath +compat_os_path_realpath = os.path.realpath compat_print = print compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection @@ -104,5 +104,12 @@ compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseEr compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None +compat_str = str +compat_b64decode = base64.b64decode +compat_urlparse = urllib.parse +compat_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse legacy = [] diff --git a/yt_dlp/compat/functools.py b/yt_dlp/compat/functools.py deleted file mode 100644 index c2e9e90279..0000000000 --- a/yt_dlp/compat/functools.py +++ /dev/null @@ -1,7 +0,0 @@ -# flake8: noqa: F405 -from functools import * # noqa: F403 - -from .compat_utils import passthrough_module - -passthrough_module(__name__, 'functools') -del passthrough_module diff --git a/yt_dlp/compat/urllib/request.py b/yt_dlp/compat/urllib/request.py index ad9fa83c87..dfc7f4a2dc 100644 --- a/yt_dlp/compat/urllib/request.py +++ b/yt_dlp/compat/urllib/request.py @@ -7,9 +7,9 @@ passthrough_module(__name__, 'urllib.request') del passthrough_module -from .. import compat_os_name +import os -if compat_os_name == 'nt': +if os.name == 'nt': # On older Python versions, proxies are extracted from Windows registry erroneously. [1] # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade @@ -37,4 +37,4 @@ if compat_os_name == 'nt': def getproxies(): return getproxies_environment() or getproxies_registry_patched() -del compat_os_name +del os diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index e673498244..d5b0d3991b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -25,7 +25,6 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import compat_os_name from .dependencies import ( _SECRETSTORAGE_UNAVAILABLE_REASON, secretstorage, @@ -343,7 +342,7 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.debug(f'cookie version breakdown: {counts}') return jar except PermissionError as error: - if compat_os_name == 'nt' and error.errno == 13: + if os.name == 'nt' and error.errno == 13: message = 'Could not copy Chrome cookie database. See https://github.com/yt-dlp/yt-dlp/issues/7271 for more info' logger.error(message) raise DownloadError(message) # force exit diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 2e3ea2fc4e..e8dcb37cc3 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -20,9 +20,7 @@ from ..utils import ( Namespace, RetryManager, classproperty, - decodeArgument, deprecation_warning, - encodeFilename, format_bytes, join_nonempty, parse_bytes, @@ -219,7 +217,7 @@ class FileDownloader: def temp_name(self, filename): """Returns a temporary filename for the given filename.""" if self.params.get('nopart', False) or filename == '-' or \ - (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): + (os.path.exists(filename) and not os.path.isfile(filename)): return filename return filename + '.part' @@ -273,7 +271,7 @@ class FileDownloader: """Try to set the last-modified time of the given file.""" if last_modified_hdr is None: return - if not os.path.isfile(encodeFilename(filename)): + if not os.path.isfile(filename): return timestr = last_modified_hdr if timestr is None: @@ -432,13 +430,13 @@ class FileDownloader: """ nooverwrites_and_exists = ( not self.params.get('overwrites', True) - and os.path.exists(encodeFilename(filename)) + and os.path.exists(filename) ) if not hasattr(filename, 'write'): continuedl_and_exists = ( self.params.get('continuedl', True) - and os.path.isfile(encodeFilename(filename)) + and os.path.isfile(filename) and not self.params.get('nopart', False) ) @@ -448,7 +446,7 @@ class FileDownloader: self._hook_progress({ 'filename': filename, 'status': 'finished', - 'total_bytes': os.path.getsize(encodeFilename(filename)), + 'total_bytes': os.path.getsize(filename), }, info_dict) self._finish_multiline_status() return True, False @@ -489,9 +487,7 @@ class FileDownloader: if not self.params.get('verbose', False): return - str_args = [decodeArgument(a) for a in args] - if exe is None: - exe = os.path.basename(str_args[0]) + exe = os.path.basename(args[0]) - self.write_debug(f'{exe} command line: {shell_quote(str_args)}') + self.write_debug(f'{exe} command line: {shell_quote(args)}') diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 6c1ec403c8..7f6b5b45cc 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -23,7 +23,6 @@ from ..utils import ( cli_valueless_option, determine_ext, encodeArgument, - encodeFilename, find_available_port, remove_end, traverse_obj, @@ -67,7 +66,7 @@ class ExternalFD(FragmentFD): 'elapsed': time.time() - started, } if filename != '-': - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -184,9 +183,9 @@ class ExternalFD(FragmentFD): dest.write(decrypt_fragment(fragment, src.read())) src.close() if not self.params.get('keep_fragments', False): - self.try_remove(encodeFilename(fragment_filename)) + self.try_remove(fragment_filename) dest.close() - self.try_remove(encodeFilename(f'{tmpfilename}.frag.urls')) + self.try_remove(f'{tmpfilename}.frag.urls') return 0 def _call_process(self, cmd, info_dict): @@ -620,7 +619,7 @@ class FFmpegFD(ExternalFD): args += self._configuration_args(('_o1', '_o', '')) args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + args.append(ffpp._ffmpeg_filename_argument(tmpfilename)) self._debug_cmd(args) piped = any(fmt['url'] in ('-', 'pipe:') for fmt in selected_formats) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 0d00196e2e..98784e7039 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -9,10 +9,9 @@ import time from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import compat_os_name from ..networking import Request from ..networking.exceptions import HTTPError, IncompleteRead -from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj +from ..utils import DownloadError, RetryManager, traverse_obj from ..utils.networking import HTTPHeaderDict from ..utils.progress import ProgressCalculator @@ -152,7 +151,7 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): self._write_ytdl_file(ctx) if not self.params.get('keep_fragments', False): - self.try_remove(encodeFilename(ctx['fragment_filename_sanitized'])) + self.try_remove(ctx['fragment_filename_sanitized']) del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): @@ -188,7 +187,7 @@ class FragmentFD(FileDownloader): }) if self.__do_ytdl_file(ctx): - ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + ytdl_file_exists = os.path.isfile(self.ytdl_filename(ctx['filename'])) continuedl = self.params.get('continuedl', True) if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) @@ -390,7 +389,7 @@ class FragmentFD(FileDownloader): def __exit__(self, exc_type, exc_val, exc_tb): pass - if compat_os_name == 'nt': + if os.name == 'nt': def future_result(future): while True: try: diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index c0165790d1..9c6dd8b799 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -15,7 +15,6 @@ from ..utils import ( ThrottledDownload, XAttrMetadataError, XAttrUnavailableError, - encodeFilename, int_or_none, parse_http_range, try_call, @@ -58,9 +57,8 @@ class HttpFD(FileDownloader): if self.params.get('continuedl', True): # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) + if os.path.isfile(ctx.tmpfilename): + ctx.resume_len = os.path.getsize(ctx.tmpfilename) ctx.is_resume = ctx.resume_len > 0 @@ -241,7 +239,7 @@ class HttpFD(FileDownloader): ctx.resume_len = byte_counter else: try: - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = os.path.getsize(ctx.tmpfilename) except FileNotFoundError: ctx.resume_len = 0 raise RetryDownload(e) diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py index d7ffb3b34d..1b831e5f30 100644 --- a/yt_dlp/downloader/rtmp.py +++ b/yt_dlp/downloader/rtmp.py @@ -8,7 +8,6 @@ from ..utils import ( Popen, check_executable, encodeArgument, - encodeFilename, get_exe_version, ) @@ -179,7 +178,7 @@ class RtmpFD(FileDownloader): return False while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: - prevsize = os.path.getsize(encodeFilename(tmpfilename)) + prevsize = os.path.getsize(tmpfilename) self.to_screen(f'[rtmpdump] Downloaded {prevsize} bytes') time.sleep(5.0) # This seems to be needed args = [*basic_args, '--resume'] @@ -187,7 +186,7 @@ class RtmpFD(FileDownloader): args += ['--skip', '1'] args = [encodeArgument(a) for a in args] retval = run_rtmpdump(args) - cursize = os.path.getsize(encodeFilename(tmpfilename)) + cursize = os.path.getsize(tmpfilename) if prevsize == cursize and retval == RD_FAILED: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those @@ -196,7 +195,7 @@ class RtmpFD(FileDownloader): retval = RD_SUCCESS break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.to_screen(f'[rtmpdump] Downloaded {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ diff --git a/yt_dlp/downloader/rtsp.py b/yt_dlp/downloader/rtsp.py index e89269fed9..b4b0be7e6e 100644 --- a/yt_dlp/downloader/rtsp.py +++ b/yt_dlp/downloader/rtsp.py @@ -2,7 +2,7 @@ import os import subprocess from .common import FileDownloader -from ..utils import check_executable, encodeFilename +from ..utils import check_executable class RtspFD(FileDownloader): @@ -26,7 +26,7 @@ class RtspFD(FileDownloader): retval = subprocess.call(args) if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.to_screen(f'\r[{args[0]}] {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 66ab083fe0..b1343eed39 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -6,7 +6,6 @@ import hmac import io import json import re -import struct import time import urllib.parse import uuid @@ -18,10 +17,8 @@ from ..networking.exceptions import TransportError from ..utils import ( ExtractorError, OnDemandPagedList, - bytes_to_intlist, decode_base_n, int_or_none, - intlist_to_bytes, time_seconds, traverse_obj, update_url_query, @@ -72,15 +69,15 @@ class AbemaLicenseRH(RequestHandler): }) res = decode_base_n(license_response['k'], table=self._STRTABLE) - encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) + encvideokey = list(res.to_bytes(16, 'big')) h = hmac.new( binascii.unhexlify(self._HKEY), (license_response['cid'] + self.ie._DEVICE_ID).encode(), digestmod=hashlib.sha256) - enckey = bytes_to_intlist(h.digest()) + enckey = list(h.digest()) - return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) + return bytes(aes_ecb_decrypt(encvideokey, enckey)) class AbemaTVBaseIE(InfoExtractor): diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index c8a2613754..919e1d6af5 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -11,11 +11,9 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, ass_subtitles_timecode, - bytes_to_intlist, bytes_to_long, float_or_none, int_or_none, - intlist_to_bytes, join_nonempty, long_to_bytes, parse_iso8601, @@ -198,16 +196,16 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') self._K = ''.join(random.choices('0123456789abcdef', k=16)) - message = bytes_to_intlist(json.dumps({ + message = list(json.dumps({ 'k': self._K, 't': token, - })) + }).encode()) # Sometimes authentication fails for no good reason, retry with # a different random padding links_data = None for _ in range(3): - padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + padded_message = bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index ba1d7df372..bd3b19b133 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -8,10 +8,8 @@ import time from .common import InfoExtractor from ..aes import aes_encrypt from ..utils import ( - bytes_to_intlist, determine_ext, int_or_none, - intlist_to_bytes, join_nonempty, smuggle_url, strip_jsonp, @@ -234,8 +232,8 @@ class AnvatoIE(InfoExtractor): server_time = self._server_time(access_key, video_id) input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' - auth_secret = intlist_to_bytes(aes_encrypt( - bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + auth_secret = bytes(aes_encrypt( + list(input_data[:64].encode()), list(self._AUTH_KEY))) query = { 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), 'rtyp': 'fp', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 23f6fc6c46..2aa40a77a7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -25,7 +25,6 @@ import xml.etree.ElementTree from ..compat import ( compat_etree_fromstring, compat_expanduser, - compat_os_name, urllib_req_to_req, ) from ..cookies import LenientSimpleCookie @@ -1029,7 +1028,7 @@ class InfoExtractor: filename = sanitize_filename(f'{basen}.dump', restricted=True) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': + if os.name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = fR'\\?\{absfilepath}' diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py index 284b2f89c1..3ab322f67d 100644 --- a/yt_dlp/extractor/shemaroome.py +++ b/yt_dlp/extractor/shemaroome.py @@ -1,11 +1,9 @@ import base64 from .common import InfoExtractor -from ..aes import aes_cbc_decrypt, unpad_pkcs7 +from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..utils import ( ExtractorError, - bytes_to_intlist, - intlist_to_bytes, unified_strdate, ) @@ -68,10 +66,10 @@ class ShemarooMeIE(InfoExtractor): data_json = self._download_json('https://www.shemaroome.com/users/user_all_lists', video_id, data=data.encode()) if not data_json.get('status'): raise ExtractorError('Premium videos cannot be downloaded yet.', expected=True) - url_data = bytes_to_intlist(base64.b64decode(data_json['new_play_url'])) - key = bytes_to_intlist(base64.b64decode(data_json['key'])) - iv = [0] * 16 - m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') + url_data = base64.b64decode(data_json['new_play_url']) + key = base64.b64decode(data_json['key']) + iv = bytes(16) + m3u8_url = unpad_pkcs7(aes_cbc_decrypt_bytes(url_data, key, iv)).decode('ascii') headers = {'stream_key': data_json['stream_key']} formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers) for fmt in formats: diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index eeeece82c2..be2bb33f64 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -9,7 +9,6 @@ from ..utils import ( RetryManager, _configuration_args, deprecation_warning, - encodeFilename, ) @@ -151,7 +150,7 @@ class PostProcessor(metaclass=PostProcessorMetaClass): def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'): try: - os.utime(encodeFilename(path), (atime, mtime)) + os.utime(path, (atime, mtime)) except Exception: self.report_warning(errnote) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index 16c8bcdda7..d8ba220cab 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -12,7 +12,6 @@ from ..utils import ( PostProcessingError, check_executable, encodeArgument, - encodeFilename, prepend_extension, shell_quote, ) @@ -68,7 +67,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self.to_screen('There are no thumbnails on disk') return [], info thumbnail_filename = info['thumbnails'][idx]['filepath'] - if not os.path.exists(encodeFilename(thumbnail_filename)): + if not os.path.exists(thumbnail_filename): self.report_warning('Skipping embedding the thumbnail because the file is missing.') return [], info @@ -85,7 +84,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): thumbnail_filename = convertor.convert_thumbnail(thumbnail_filename, 'png') thumbnail_ext = 'png' - mtime = os.stat(encodeFilename(filename)).st_mtime + mtime = os.stat(filename).st_mtime success = True if info['ext'] == 'mp3': @@ -154,12 +153,12 @@ class EmbedThumbnailPP(FFmpegPostProcessor): else: if not prefer_atomicparsley: self.to_screen('mutagen was not found. Falling back to AtomicParsley') - cmd = [encodeFilename(atomicparsley, True), - encodeFilename(filename, True), + cmd = [atomicparsley, + filename, encodeArgument('--artwork'), - encodeFilename(thumbnail_filename, True), + thumbnail_filename, encodeArgument('-o'), - encodeFilename(temp_filename, True)] + temp_filename] cmd += [encodeArgument(o) for o in self._configuration_args('AtomicParsley')] self._report_run('atomicparsley', filename) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 164c46d143..d994754fd3 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -21,7 +21,6 @@ from ..utils import ( determine_ext, dfxp2srt, encodeArgument, - encodeFilename, filter_dict, float_or_none, is_outdated_version, @@ -243,13 +242,13 @@ class FFmpegPostProcessor(PostProcessor): try: if self.probe_available: cmd = [ - encodeFilename(self.probe_executable, True), + self.probe_executable, encodeArgument('-show_streams')] else: cmd = [ - encodeFilename(self.executable, True), + self.executable, encodeArgument('-i')] - cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True)) + cmd.append(self._ffmpeg_filename_argument(path)) self.write_debug(f'{self.basename} command line: {shell_quote(cmd)}') stdout, stderr, returncode = Popen.run( cmd, text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -282,7 +281,7 @@ class FFmpegPostProcessor(PostProcessor): self.check_version() cmd = [ - encodeFilename(self.probe_executable, True), + self.probe_executable, encodeArgument('-hide_banner'), encodeArgument('-show_format'), encodeArgument('-show_streams'), @@ -335,9 +334,9 @@ class FFmpegPostProcessor(PostProcessor): self.check_version() oldest_mtime = min( - os.stat(encodeFilename(path)).st_mtime for path, _ in input_path_opts if path) + os.stat(path).st_mtime for path, _ in input_path_opts if path) - cmd = [encodeFilename(self.executable, True), encodeArgument('-y')] + cmd = [self.executable, encodeArgument('-y')] # avconv does not have repeat option if self.basename == 'ffmpeg': cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')] @@ -353,7 +352,7 @@ class FFmpegPostProcessor(PostProcessor): args.append('-i') return ( [encodeArgument(arg) for arg in args] - + [encodeFilename(self._ffmpeg_filename_argument(file), True)]) + + [self._ffmpeg_filename_argument(file)]) for arg_type, path_opts in (('i', input_path_opts), ('o', output_path_opts)): cmd += itertools.chain.from_iterable( @@ -522,8 +521,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): return [], information orig_path = prepend_extension(path, 'orig') temp_path = prepend_extension(path, 'temp') - if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)) - and os.path.exists(encodeFilename(orig_path))): + if (self._nopostoverwrites and os.path.exists(new_path) + and os.path.exists(orig_path)): self.to_screen(f'Post-process file {new_path} exists, skipping') return [], information @@ -838,7 +837,7 @@ class FFmpegMergerPP(FFmpegPostProcessor): args.extend(['-map', f'{i}:v:0']) self.to_screen(f'Merging formats into "{filename}"') self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + os.rename(temp_filename, filename) return info['__files_to_merge'], info def can_merge(self): @@ -1039,7 +1038,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): def _ffmpeg_args_for_chapter(self, number, chapter, info): destination = self._prepare_filename(number, chapter, info) - if not self._downloader._ensure_dir_exists(encodeFilename(destination)): + if not self._downloader._ensure_dir_exists(destination): return chapter['filepath'] = destination diff --git a/yt_dlp/postprocessor/movefilesafterdownload.py b/yt_dlp/postprocessor/movefilesafterdownload.py index 35e87051b4..964ca1f921 100644 --- a/yt_dlp/postprocessor/movefilesafterdownload.py +++ b/yt_dlp/postprocessor/movefilesafterdownload.py @@ -4,8 +4,6 @@ from .common import PostProcessor from ..compat import shutil from ..utils import ( PostProcessingError, - decodeFilename, - encodeFilename, make_dir, ) @@ -21,25 +19,25 @@ class MoveFilesAfterDownloadPP(PostProcessor): return 'MoveFiles' def run(self, info): - dl_path, dl_name = os.path.split(encodeFilename(info['filepath'])) + dl_path, dl_name = os.path.split(info['filepath']) finaldir = info.get('__finaldir', dl_path) finalpath = os.path.join(finaldir, dl_name) if self._downloaded: - info['__files_to_move'][info['filepath']] = decodeFilename(finalpath) + info['__files_to_move'][info['filepath']] = finalpath - make_newfilename = lambda old: decodeFilename(os.path.join(finaldir, os.path.basename(encodeFilename(old)))) + make_newfilename = lambda old: os.path.join(finaldir, os.path.basename(old)) for oldfile, newfile in info['__files_to_move'].items(): if not newfile: newfile = make_newfilename(oldfile) - if os.path.abspath(encodeFilename(oldfile)) == os.path.abspath(encodeFilename(newfile)): + if os.path.abspath(oldfile) == os.path.abspath(newfile): continue - if not os.path.exists(encodeFilename(oldfile)): + if not os.path.exists(oldfile): self.report_warning(f'File "{oldfile}" cannot be found') continue - if os.path.exists(encodeFilename(newfile)): + if os.path.exists(newfile): if self.get_param('overwrites', True): self.report_warning(f'Replacing existing file "{newfile}"') - os.remove(encodeFilename(newfile)) + os.remove(newfile) else: self.report_warning( f'Cannot move file "{oldfile}" out of temporary directory since "{newfile}" already exists. ') diff --git a/yt_dlp/postprocessor/sponskrub.py b/yt_dlp/postprocessor/sponskrub.py index 525b6392a4..ac6db1bc7b 100644 --- a/yt_dlp/postprocessor/sponskrub.py +++ b/yt_dlp/postprocessor/sponskrub.py @@ -9,7 +9,6 @@ from ..utils import ( check_executable, cli_option, encodeArgument, - encodeFilename, prepend_extension, shell_quote, str_or_none, @@ -52,7 +51,7 @@ class SponSkrubPP(PostProcessor): return [], information filename = information['filepath'] - if not os.path.exists(encodeFilename(filename)): # no download + if not os.path.exists(filename): # no download return [], information if information['extractor_key'].lower() != 'youtube': @@ -71,8 +70,8 @@ class SponSkrubPP(PostProcessor): self.report_warning('If sponskrub is run multiple times, unintended parts of the video could be cut out.') temp_filename = prepend_extension(filename, self._temp_ext) - if os.path.exists(encodeFilename(temp_filename)): - os.remove(encodeFilename(temp_filename)) + if os.path.exists(temp_filename): + os.remove(temp_filename) cmd = [self.path] if not self.cutout: diff --git a/yt_dlp/postprocessor/xattrpp.py b/yt_dlp/postprocessor/xattrpp.py index 166aabaf92..e486b797b7 100644 --- a/yt_dlp/postprocessor/xattrpp.py +++ b/yt_dlp/postprocessor/xattrpp.py @@ -1,7 +1,6 @@ import os from .common import PostProcessor -from ..compat import compat_os_name from ..utils import ( PostProcessingError, XAttrMetadataError, @@ -57,7 +56,7 @@ class XAttrMetadataPP(PostProcessor): elif e.reason == 'VALUE_TOO_LONG': self.report_warning(f'Unable to write extended attribute "{xattrname}" due to too long values.') else: - tip = ('You need to use NTFS' if compat_os_name == 'nt' + tip = ('You need to use NTFS' if os.name == 'nt' else 'You may have to enable them in your "/etc/fstab"') raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. {tip}') diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 90df2509f0..ca2ec5f376 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -13,7 +13,6 @@ import sys from dataclasses import dataclass from zipimport import zipimporter -from .compat import compat_realpath from .networking import Request from .networking.exceptions import HTTPError, network_exceptions from .utils import ( @@ -201,8 +200,6 @@ class UpdateInfo: binary_name: str | None = _get_binary_name() # noqa: RUF009: Always returns the same value checksum: str | None = None - _has_update = True - class Updater: # XXX: use class variables to simplify testing @@ -523,7 +520,7 @@ class Updater: @functools.cached_property def filename(self): """Filename of the executable""" - return compat_realpath(_get_variant_and_executable_path()[1]) + return os.path.realpath(_get_variant_and_executable_path()[1]) @functools.cached_property def cmd(self): @@ -562,62 +559,14 @@ class Updater: f'Unable to {action}{delim} visit ' f'https://github.com/{self.requested_repo}/releases/{path}', True) - # XXX: Everything below this line in this class is deprecated / for compat only - @property - def _target_tag(self): - """Deprecated; requested tag with 'tags/' prepended when necessary for API calls""" - return f'tags/{self.requested_tag}' if self.requested_tag != 'latest' else self.requested_tag - - def _check_update(self): - """Deprecated; report whether there is an update available""" - return bool(self.query_update(_output=True)) - - def __getattr__(self, attribute: str): - """Compat getter function for deprecated attributes""" - deprecated_props_map = { - 'check_update': '_check_update', - 'target_tag': '_target_tag', - 'target_channel': 'requested_channel', - } - update_info_props_map = { - 'has_update': '_has_update', - 'new_version': 'version', - 'latest_version': 'requested_version', - 'release_name': 'binary_name', - 'release_hash': 'checksum', - } - - if attribute not in deprecated_props_map and attribute not in update_info_props_map: - raise AttributeError(f'{type(self).__name__!r} object has no attribute {attribute!r}') - - msg = f'{type(self).__name__}.{attribute} is deprecated and will be removed in a future version' - if attribute in deprecated_props_map: - source_name = deprecated_props_map[attribute] - if not source_name.startswith('_'): - msg += f'. Please use {source_name!r} instead' - source = self - mapping = deprecated_props_map - - else: # attribute in update_info_props_map - msg += '. Please call query_update() instead' - source = self.query_update() - if source is None: - source = UpdateInfo('', None, None, None) - source._has_update = False - mapping = update_info_props_map - - deprecation_warning(msg) - for target_name, source_name in mapping.items(): - value = getattr(source, source_name) - setattr(self, target_name, value) - - return getattr(self, attribute) - def run_update(ydl): """Update the program file with the latest version from the repository @returns Whether there was a successful update (No update = False) """ + deprecation_warning( + '"yt_dlp.update.run_update(ydl)" is deprecated and may be removed in a future version. ' + 'Use "yt_dlp.update.Updater(ydl).update()" instead') return Updater(ydl).update() diff --git a/yt_dlp/utils/_deprecated.py b/yt_dlp/utils/_deprecated.py index a8ae8ecb5d..e4762699b7 100644 --- a/yt_dlp/utils/_deprecated.py +++ b/yt_dlp/utils/_deprecated.py @@ -9,31 +9,23 @@ passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( del passthrough_module -from ._utils import preferredencoding +import re +import struct -def encodeFilename(s, for_subprocess=False): - assert isinstance(s, str) - return s +def bytes_to_intlist(bs): + if not bs: + return [] + if isinstance(bs[0], int): # Python 3 + return list(bs) + else: + return [ord(c) for c in bs] -def decodeFilename(b, for_subprocess=False): - return b +def intlist_to_bytes(xs): + if not xs: + return b'' + return struct.pack('%dB' % len(xs), *xs) -def decodeArgument(b): - return b - - -def decodeOption(optval): - if optval is None: - return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, str) - return optval - - -def error_to_compat_str(err): - return str(err) +compiled_regex_type = type(re.compile('')) diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 356e580226..d65b135d9d 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -313,3 +313,30 @@ def make_HTTPS_handler(params, **kwargs): def process_communicate_or_kill(p, *args, **kwargs): return Popen.communicate_or_kill(p, *args, **kwargs) + + +def encodeFilename(s, for_subprocess=False): + assert isinstance(s, str) + return s + + +def decodeFilename(b, for_subprocess=False): + return b + + +def decodeArgument(b): + return b + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, str) + return optval + + +def error_to_compat_str(err): + return str(err) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 89c53c39e7..8517b762ef 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -49,15 +49,11 @@ from ..compat import ( compat_etree_fromstring, compat_expanduser, compat_HTMLParseError, - compat_os_name, ) from ..dependencies import xattr __name__ = __name__.rsplit('.', 1)[0] # noqa: A001: Pretend to be the parent module -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) - class NO_DEFAULT: pass @@ -874,7 +870,7 @@ class Popen(subprocess.Popen): kwargs.setdefault('encoding', 'utf-8') kwargs.setdefault('errors', 'replace') - if shell and compat_os_name == 'nt' and kwargs.get('executable') is None: + if shell and os.name == 'nt' and kwargs.get('executable') is None: if not isinstance(args, str): args = shell_quote(args, shell=True) shell = False @@ -1457,7 +1453,7 @@ def system_identifier(): @functools.cache def get_windows_version(): """ Get Windows version. returns () if it's not running on Windows """ - if compat_os_name == 'nt': + if os.name == 'nt': return version_tuple(platform.win32_ver()[1]) else: return () @@ -1470,7 +1466,7 @@ def write_string(s, out=None, encoding=None): if not out: return - if compat_os_name == 'nt' and supports_terminal_sequences(out): + if os.name == 'nt' and supports_terminal_sequences(out): s = re.sub(r'([\r\n]+)', r' \1', s) enc, buffer = None, out @@ -1503,21 +1499,6 @@ def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): deprecation_warning._cache = set() -def bytes_to_intlist(bs): - if not bs: - return [] - if isinstance(bs[0], int): # Python 3 - return list(bs) - else: - return [ord(c) for c in bs] - - -def intlist_to_bytes(xs): - if not xs: - return b'' - return struct.pack('%dB' % len(xs), *xs) - - class LockingUnsupportedError(OSError): msg = 'File locking is not supported' @@ -1701,7 +1682,7 @@ _CMD_QUOTE_TRANS = str.maketrans({ def shell_quote(args, *, shell=False): args = list(variadic(args)) - if compat_os_name != 'nt': + if os.name != 'nt': return shlex.join(args) trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS @@ -4516,7 +4497,7 @@ def urshift(val, n): def write_xattr(path, key, value): # Windows: Write xattrs to NTFS Alternate Data Streams: # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - if compat_os_name == 'nt': + if os.name == 'nt': assert ':' not in key assert os.path.exists(path) @@ -4778,12 +4759,12 @@ def jwt_decode_hs256(jwt): return json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) -WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None +WINDOWS_VT_MODE = False if os.name == 'nt' else None @functools.cache def supports_terminal_sequences(stream): - if compat_os_name == 'nt': + if os.name == 'nt': if not WINDOWS_VT_MODE: return False elif not os.getenv('TERM'): @@ -4877,7 +4858,7 @@ def parse_http_range(range): def read_stdin(what): if what: - eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D' + eof = 'Ctrl+Z' if os.name == 'nt' else 'Ctrl+D' write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n') return sys.stdin From 637d62a3a9fc723d68632c1af25c30acdadeeb85 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 17 Nov 2024 00:31:04 +0100 Subject: [PATCH 170/216] [ie/youtube] Player client maintenance (#11528) Authored by: bashonly, seproDev Co-authored-by: bashonly --- README.md | 2 +- yt_dlp/extractor/youtube.py | 73 ++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 2df72b7498..09096218e8 100644 --- a/README.md +++ b/README.md @@ -1768,7 +1768,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `mediaconnect`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `web_creator,mediaconnect` is added as needed for age-gated videos when account age verification is required. Similarly, the `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `mediaconnect`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `web_creator` is added as needed for age-gated videos when account age verification is required. Similarly, the `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index caa99182ae..2c57ee6005 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -124,14 +124,15 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, + 'REQUIRE_AUTH': True, }, 'android': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '19.29.37', + 'clientVersion': '19.44.38', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/19.29.37 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.youtube/19.44.38 (Linux; U; Android 11) gzip', 'osName': 'Android', 'osVersion': '11', }, @@ -140,13 +141,14 @@ INNERTUBE_CLIENTS = { 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, }, + # This client now requires sign-in for every video 'android_music': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '7.11.50', + 'clientVersion': '7.27.52', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.music/7.11.50 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.music/7.27.52 (Linux; U; Android 11) gzip', 'osName': 'Android', 'osVersion': '11', }, @@ -154,15 +156,16 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, + 'REQUIRE_AUTH': True, }, # This client now requires sign-in for every video 'android_creator': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '24.30.100', + 'clientVersion': '24.45.100', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.creator/24.30.100 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.creator/24.45.100 (Linux; U; Android 11) gzip', 'osName': 'Android', 'osVersion': '11', }, @@ -170,17 +173,18 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, + 'REQUIRE_AUTH': True, }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_VR', - 'clientVersion': '1.57.29', + 'clientVersion': '1.60.19', 'deviceMake': 'Oculus', 'deviceModel': 'Quest 3', 'androidSdkVersion': 32, - 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.57.29 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', 'osName': 'Android', 'osVersion': '12L', }, @@ -188,68 +192,56 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, 'REQUIRE_JS_PLAYER': False, }, - 'android_testsuite': { - 'INNERTUBE_CONTEXT': { - 'client': { - 'clientName': 'ANDROID_TESTSUITE', - 'clientVersion': '1.9', - 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 11) gzip', - 'osName': 'Android', - 'osVersion': '11', - }, - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 30, - 'REQUIRE_JS_PLAYER': False, - 'PLAYER_PARAMS': '2AMB', - }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '19.29.1', + 'clientVersion': '19.45.4', 'deviceMake': 'Apple', 'deviceModel': 'iPhone16,2', - 'userAgent': 'com.google.ios.youtube/19.29.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'userAgent': 'com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', 'osName': 'iPhone', - 'osVersion': '17.5.1.21F90', + 'osVersion': '18.1.0.22B83', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False, }, + # This client now requires sign-in for every video 'ios_music': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '7.08.2', + 'clientVersion': '7.27.0', 'deviceMake': 'Apple', 'deviceModel': 'iPhone16,2', - 'userAgent': 'com.google.ios.youtubemusic/7.08.2 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'userAgent': 'com.google.ios.youtubemusic/7.27.0 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', 'osName': 'iPhone', - 'osVersion': '17.5.1.21F90', + 'osVersion': '18.1.0.22B83', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_AUTH': True, }, # This client now requires sign-in for every video 'ios_creator': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '24.30.100', + 'clientVersion': '24.45.100', 'deviceMake': 'Apple', 'deviceModel': 'iPhone16,2', - 'userAgent': 'com.google.ios.ytcreator/24.30.100 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'userAgent': 'com.google.ios.ytcreator/24.45.100 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', 'osName': 'iPhone', - 'osVersion': '17.5.1.21F90', + 'osVersion': '18.1.0.22B83', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_AUTH': True, }, # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 @@ -282,8 +274,10 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + 'REQUIRE_AUTH': True, }, - # This client has pre-merged video+audio 720p/1080p streams + # This client now requires sign-in for every video + # It may be able to receive pre-merged video+audio 720p/1080p streams 'mediaconnect': { 'INNERTUBE_CONTEXT': { 'client': { @@ -293,6 +287,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 95, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_AUTH': True, }, } @@ -321,6 +316,7 @@ def build_innertube_clients(): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg.setdefault('REQUIRE_PO_TOKEN', False) + ytcfg.setdefault('REQUIRE_AUTH', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -4059,9 +4055,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if smuggled_data.get('is_music_url') or self.is_music_url(url): for requested_client in requested_clients: _, base_client, variant = _split_innertube_client(requested_client) - music_client = f'{base_client}_music' + music_client = f'{base_client}_music' if base_client != 'mweb' else 'web_music' if variant != 'music' and music_client in INNERTUBE_CLIENTS: - requested_clients.append(music_client) + if not INNERTUBE_CLIENTS[music_client]['REQUIRE_AUTH'] or self.is_authenticated: + requested_clients.append(music_client) return orderedSet(requested_clients) @@ -4174,10 +4171,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) - # web_creator and mediaconnect can work around the age-verification requirement - # _testsuite & _vr variants can also work around age-verification + # web_creator can work around the age-verification requirement + # android_vr and mediaconnect may also be able to work around age-verification # tv_embedded may(?) still work around age-verification if the video is embeddable - append_client('web_creator', 'mediaconnect') + append_client('web_creator') prs.extend(deprioritized_prs) From 52c0ffe40ad6e8404d93296f575007b05b04c686 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Nov 2024 23:40:21 +0000 Subject: [PATCH 171/216] [ie/youtube] Remove broken OAuth support (#11558) Closes #11462 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 240 +++--------------------------------- 1 file changed, 18 insertions(+), 222 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2c57ee6005..a3b237bc8d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -22,7 +22,7 @@ import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..jsinterp import JSInterpreter -from ..networking.exceptions import HTTPError, TransportError, network_exceptions +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -50,12 +50,12 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, + remove_end, remove_start, smuggle_url, str_or_none, str_to_int, strftime_or_none, - time_seconds, traverse_obj, try_call, try_get, @@ -573,208 +573,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._check_login_required() def _perform_login(self, username, password): - auth_type, _, user = (username or '').partition('+') - - if auth_type != 'oauth': - raise ExtractorError(self._youtube_login_hint, expected=True) - - self._initialize_oauth(user, password) - - ''' - OAuth 2.0 Device Authorization Grant flow, used by the YouTube TV client (youtube.com/tv). - - For more information regarding OAuth 2.0 and the Device Authorization Grant flow in general, see: - - https://developers.google.com/identity/protocols/oauth2/limited-input-device - - https://accounts.google.com/.well-known/openid-configuration - - https://www.rfc-editor.org/rfc/rfc8628 - - https://www.rfc-editor.org/rfc/rfc6749 - - Note: The official client appears to use a proxied version of the oauth2 endpoints on youtube.com/o/oauth2, - which applies some modifications to the response (such as returning errors as 200 OK). - Since the client works with the standard API, we will use that as it is well-documented. - ''' - - _OAUTH_PROFILE = None - _OAUTH_ACCESS_TOKEN_CACHE = {} - _OAUTH_DISPLAY_ID = 'oauth' - - # YouTube TV (TVHTML5) client. You can find these at youtube.com/tv - _OAUTH_CLIENT_ID = '861556708454-d6dlm3lh05idd8npek18k6be8ba3oc68.apps.googleusercontent.com' - _OAUTH_CLIENT_SECRET = 'SboVhoG9s0rNafixCSGGKXAT' - _OAUTH_SCOPE = 'http://gdata.youtube.com https://www.googleapis.com/auth/youtube-paid-content' - - # From https://accounts.google.com/.well-known/openid-configuration - # Technically, these should be fetched dynamically and not hard-coded. - # However, as these endpoints rarely change, we can risk saving an extra request for every invocation. - _OAUTH_DEVICE_AUTHORIZATION_ENDPOINT = 'https://oauth2.googleapis.com/device/code' - _OAUTH_TOKEN_ENDPOINT = 'https://oauth2.googleapis.com/token' - - @property - def _oauth_cache_key(self): - return f'oauth_refresh_token_{self._OAUTH_PROFILE}' - - def _read_oauth_error_response(self, response): - return traverse_obj( - self._webpage_read_content(response, self._OAUTH_TOKEN_ENDPOINT, self._OAUTH_DISPLAY_ID, fatal=False), - ({json.loads}, 'error', {str})) - - def _set_oauth_info(self, token_response): - YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.setdefault(self._OAUTH_PROFILE, {}).update({ - 'access_token': token_response['access_token'], - 'token_type': token_response['token_type'], - 'expiry': time_seconds( - seconds=traverse_obj(token_response, ('expires_in', {float_or_none}), default=300) - 10), - }) - refresh_token = traverse_obj(token_response, ('refresh_token', {str})) - if refresh_token: - self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) - YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token - - def _initialize_oauth(self, user, refresh_token): - self._OAUTH_PROFILE = user or 'default' - - if self._OAUTH_PROFILE in YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE: - self.write_debug(f'{self._OAUTH_DISPLAY_ID}: Using cached access token for profile "{self._OAUTH_PROFILE}"') - return - - YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE] = {} - - if refresh_token: - msg = f'{self._OAUTH_DISPLAY_ID}: Using password input as refresh token' - if self.get_param('cachedir') is not False: - msg += ' and caching token to disk; you should supply an empty password next time' - self.to_screen(msg) - self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) - else: - refresh_token = self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) - - if refresh_token: - YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token - try: - token_response = self._refresh_token(refresh_token) - except ExtractorError as e: - error_msg = str(e.orig_msg).replace('Failed to refresh access token: ', '') - self.report_warning(f'{self._OAUTH_DISPLAY_ID}: Failed to refresh access token: {error_msg}') - token_response = self._oauth_authorize - else: - token_response = self._oauth_authorize - - self._set_oauth_info(token_response) - self.write_debug(f'{self._OAUTH_DISPLAY_ID}: Logged in using profile "{self._OAUTH_PROFILE}"') - - def _refresh_token(self, refresh_token): - try: - token_response = self._download_json( - self._OAUTH_TOKEN_ENDPOINT, - video_id=self._OAUTH_DISPLAY_ID, - note='Refreshing access token', - data=json.dumps({ - 'client_id': self._OAUTH_CLIENT_ID, - 'client_secret': self._OAUTH_CLIENT_SECRET, - 'refresh_token': refresh_token, - 'grant_type': 'refresh_token', - }).encode(), - headers={'Content-Type': 'application/json'}) - except ExtractorError as e: - if isinstance(e.cause, HTTPError): - error = self._read_oauth_error_response(e.cause.response) - if error == 'invalid_grant': - # RFC6749 § 5.2 - raise ExtractorError( - 'Failed to refresh access token: Refresh token is invalid, revoked, or expired (invalid_grant)', - expected=True, video_id=self._OAUTH_DISPLAY_ID) - raise ExtractorError( - f'Failed to refresh access token: Authorization server returned error {error}', - video_id=self._OAUTH_DISPLAY_ID) - raise - return token_response - - @property - def _oauth_authorize(self): - code_response = self._download_json( - self._OAUTH_DEVICE_AUTHORIZATION_ENDPOINT, - video_id=self._OAUTH_DISPLAY_ID, - note='Initializing authorization flow', - data=json.dumps({ - 'client_id': self._OAUTH_CLIENT_ID, - 'scope': self._OAUTH_SCOPE, - }).encode(), - headers={'Content-Type': 'application/json'}) - - verification_url = traverse_obj(code_response, ('verification_url', {str})) - user_code = traverse_obj(code_response, ('user_code', {str})) - if not verification_url or not user_code: + if username.startswith('oauth'): raise ExtractorError( - 'Authorization server did not provide verification_url or user_code', video_id=self._OAUTH_DISPLAY_ID) + f'Login with OAuth is no longer supported. {self._youtube_login_hint}', expected=True) - # note: The whitespace is intentional - self.to_screen( - f'{self._OAUTH_DISPLAY_ID}: To give yt-dlp access to your account, ' - f'go to {verification_url} and enter code {user_code}') - - # RFC8628 § 3.5: default poll interval is 5 seconds if not provided - poll_interval = traverse_obj(code_response, ('interval', {int}), default=5) - - for retry in self.RetryManager(): - while True: - try: - token_response = self._download_json( - self._OAUTH_TOKEN_ENDPOINT, - video_id=self._OAUTH_DISPLAY_ID, - note=False, - errnote='Failed to request access token', - data=json.dumps({ - 'client_id': self._OAUTH_CLIENT_ID, - 'client_secret': self._OAUTH_CLIENT_SECRET, - 'device_code': code_response['device_code'], - 'grant_type': 'urn:ietf:params:oauth:grant-type:device_code', - }).encode(), - headers={'Content-Type': 'application/json'}) - except ExtractorError as e: - if isinstance(e.cause, TransportError): - retry.error = e - break - elif isinstance(e.cause, HTTPError): - error = self._read_oauth_error_response(e.cause.response) - if not error: - retry.error = e - break - - if error == 'authorization_pending': - time.sleep(poll_interval) - continue - elif error == 'expired_token': - raise ExtractorError( - 'Authorization timed out', expected=True, video_id=self._OAUTH_DISPLAY_ID) - elif error == 'access_denied': - raise ExtractorError( - 'You denied access to an account', expected=True, video_id=self._OAUTH_DISPLAY_ID) - elif error == 'slow_down': - # RFC8628 § 3.5: add 5 seconds to the poll interval - poll_interval += 5 - time.sleep(poll_interval) - continue - else: - raise ExtractorError( - f'Authorization server returned an error when fetching access token: {error}', - video_id=self._OAUTH_DISPLAY_ID) - raise - - return token_response - - def _update_oauth(self): - token = YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.get(self._OAUTH_PROFILE) - if token is None or token['expiry'] > time.time(): - return - - self._set_oauth_info(self._refresh_token(token['refresh_token'])) + self.report_warning( + f'Login with password is not supported for YouTube. {self._youtube_login_hint}') @property def _youtube_login_hint(self): - return ('Use --username=oauth[+PROFILE] --password="" to log in using oauth, ' - f'or else u{self._login_hint(method="cookies")[1:]}. ' - 'See https://github.com/yt-dlp/yt-dlp/wiki/Extractors#logging-in-with-oauth for more on how to use oauth. ' - 'See https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies for help with cookies') + return (f'{self._login_hint(method="cookies")}. Also see ' + 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies ' + 'for tips on effectively exporting YouTube cookies') def _check_login_required(self): if self._LOGIN_REQUIRED and not self.is_authenticated: @@ -924,7 +734,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): @functools.cached_property def is_authenticated(self): - return self._OAUTH_PROFILE or bool(self._generate_sapisidhash_header()) + return bool(self._generate_sapisidhash_header()) def extract_ytcfg(self, video_id, webpage): if not webpage: @@ -934,16 +744,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) or {} - def _generate_oauth_headers(self): - self._update_oauth() - oauth_token = YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE.get(self._OAUTH_PROFILE) - if not oauth_token: - return {} - - return { - 'Authorization': f'{oauth_token["token_type"]} {oauth_token["access_token"]}', - } - def _generate_cookie_auth_headers(self, *, ytcfg=None, account_syncid=None, session_index=None, origin=None, **kwargs): headers = {} account_syncid = account_syncid or self._extract_account_syncid(ytcfg) @@ -973,14 +773,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'Origin': origin, 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), - **self._generate_oauth_headers(), **self._generate_cookie_auth_headers(ytcfg=ytcfg, account_syncid=account_syncid, session_index=session_index, origin=origin), } return filter_dict(headers) - def _generate_webpage_headers(self): - return self._generate_oauth_headers() - def _download_ytcfg(self, client, video_id): url = { 'web': 'https://www.youtube.com', @@ -990,8 +786,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not url: return {} webpage = self._download_webpage( - url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config', - headers=self._generate_webpage_headers()) + url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config') return self.extract_ytcfg(video_id, webpage) or {} @staticmethod @@ -3256,8 +3051,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): code = self._download_webpage( player_url, video_id, fatal=fatal, note='Downloading player ' + player_id, - errnote=f'Download of {player_url} failed', - headers=self._generate_webpage_headers()) + errnote=f'Download of {player_url} failed') if code: self._code_cache[player_id] = code return self._code_cache.get(player_id) @@ -3540,8 +3334,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._download_webpage( url, video_id, f'Marking {label}watched', - 'Unable to mark watched', fatal=False, - headers=self._generate_webpage_headers()) + 'Unable to mark watched', fatal=False) @classmethod def _extract_from_webpage(cls, url, webpage): @@ -4523,7 +4316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if pp: query['pp'] = pp webpage = self._download_webpage( - webpage_url, video_id, fatal=False, query=query, headers=self._generate_webpage_headers()) + webpage_url, video_id, fatal=False, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() @@ -4666,6 +4459,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.raise_geo_restricted(subreason, countries, metadata_available=True) reason += f'. {subreason}' if reason: + if 'sign in' in reason.lower(): + reason = remove_end(reason, 'This helps protect our community. Learn more') + reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -5811,7 +5607,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): webpage, data = None, None for retry in self.RetryManager(fatal=fatal): try: - webpage = self._download_webpage(url, item_id, note='Downloading webpage', headers=self._generate_webpage_headers()) + webpage = self._download_webpage(url, item_id, note='Downloading webpage') data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): From 7cecd299e4a5ef1f0f044b2fedc26f17e41f15e3 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 17 Nov 2024 13:32:12 +0100 Subject: [PATCH 172/216] [ie/chaturbate] Don't break embed detection (#11565) Bugfix for 720b3dc453c342bc2e8df7dbc0acaab4479de46c Authored by: seproDev --- yt_dlp/extractor/chaturbate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index aa70f26a1b..a40b7d39c7 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor): 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), } - def _extract_from_webpage(self, video_id, tld): + def _extract_from_html(self, video_id, tld): webpage = self._download_webpage( f'https://chaturbate.{tld}/{video_id}/', video_id, headers=self.geo_verification_headers(), impersonate=True) @@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor): def _real_extract(self, url): video_id, tld = self._match_valid_url(url).group('id', 'tld') - return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) + return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld) From eb15fd5a32d8b35ef515f7a3d1158c03025648ff Mon Sep 17 00:00:00 2001 From: krichbanana <77071421+krichbanana@users.noreply.github.com> Date: Sun, 17 Nov 2024 09:12:26 -0500 Subject: [PATCH 173/216] [ie/kenh14] Add extractor (#3996) Closes #3937 Authored by: krichbanana, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/kenh14.py | 160 ++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 yt_dlp/extractor/kenh14.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 25a233a2d6..af903f307b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -946,6 +946,10 @@ from .kaltura import KalturaIE from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE from .kelbyone import KelbyOneIE +from .kenh14 import ( + Kenh14PlaylistIE, + Kenh14VideoIE, +) from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, diff --git a/yt_dlp/extractor/kenh14.py b/yt_dlp/extractor/kenh14.py new file mode 100644 index 0000000000..3c46020e8b --- /dev/null +++ b/yt_dlp/extractor/kenh14.py @@ -0,0 +1,160 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_attribute, + get_elements_html_by_class, + int_or_none, + parse_duration, + parse_iso8601, + remove_start, + strip_or_none, + unescapeHTML, + update_url, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class Kenh14VideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn', + 'md5': '1ed67f9c3a1e74acf15db69590cf6210', + 'info_dict': { + 'id': '316173', + 'ext': 'mp4', + 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'uploader': 'Unbox Therapy', + 'upload_date': '20220517', + 'view_count': int, + 'duration': 722.86, + 'timestamp': 1652764468, + }, + }, { + 'url': 'https://video.kenh14.vn/video-316174.chn', + 'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd', + 'info_dict': { + 'id': '316174', + 'ext': 'mp4', + 'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu', + 'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'upload_date': '20220517', + 'view_count': int, + 'duration': 70.04, + 'timestamp': 1652766021, + }, + }, { + 'url': 'https://video.kenh14.vn/0-344740.chn', + 'md5': 'b843495d5e728142c8870c09b46df2a9', + 'info_dict': { + 'id': '344740', + 'ext': 'mov', + 'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi', + 'description': 'md5:2a2dbb4a7397169fb21ee68f09160497', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$', + 'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'], + 'uploader': 'Quang Vũ', + 'upload_date': '20241024', + 'view_count': int, + 'duration': 198.88, + 'timestamp': 1729741590, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '') + direct_url = attrs['data-vid'] + + metadata = self._download_json( + 'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format( + remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False) + + formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}] + subtitles = {} + video_data = self._download_json( + f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False) + if hls_url := traverse_obj(video_data, ('hls', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})): + fmts, subs = self._extract_mpd_formats_and_subtitles( + dash_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + **traverse_obj(metadata, { + 'duration': ('duration', {parse_duration}), + 'uploader': ('author', {strip_or_none}), + 'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}), + 'view_count': ('views', {int_or_none}), + }), + 'id': video_id, + 'title': ( + traverse_obj(metadata, ('title', {strip_or_none})) + or clean_html(self._og_search_title(webpage)) + or clean_html(get_element_by_class('vdbw-title', webpage))), + 'formats': formats, + 'subtitles': subtitles, + 'description': ( + clean_html(self._og_search_description(webpage)) + or clean_html(get_element_by_class('vdbw-sapo', webpage))), + 'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')), + 'tags': traverse_obj(self._html_search_meta('keywords', webpage), ( + {lambda x: x.split(';')}, ..., filter)), + } + + +class Kenh14PlaylistIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', + 'info_dict': { + 'id': '71', + 'title': 'Trần Tình (Naked love) mùa 2', + 'description': 'md5:e9522339304956dea931722dd72eddb2', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 9, + }, { + 'url': 'https://video.kenh14.vn/playlist/0-72.chn', + 'info_dict': { + 'id': '72', + 'title': 'Lau Lại Đầu Từ', + 'description': 'Cùng xem xưa và nay có gì khác biệt nhé!', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + category_detail = get_element_by_class('category-detail', webpage) or '' + embed_info = traverse_obj( + self._yield_json_ld(webpage, playlist_id), + (lambda _, v: v['name'] and v['alternateName'], any)) or {} + + return self.playlist_from_matches( + get_elements_html_by_class('video-item', webpage), playlist_id, + (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))), + getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']), + ie=Kenh14VideoIE, playlist_description=( + clean_html(get_element_by_class('description', category_detail)) + or unescapeHTML(embed_info.get('alternateName'))), + thumbnail=traverse_obj( + self._og_search_thumbnail(webpage), + ({url_or_none}, {update_url(query=None)}))) From 10fc719bc7f1eef469389c5219102266ef411f29 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Mon, 18 Nov 2024 01:22:40 +0900 Subject: [PATCH 174/216] [cleanup] Remove dead extractors (#11566) - Removes MildomClipIE, MildomIE, MildomUserVodIE, MildomVodIE - Removes PokemonIE, PokemonWatchIE - Removes VeohIE, VeohUserIE Closes #3373, Closes #7059 Authored by: doe1080 --- yt_dlp/extractor/_extractors.py | 14 -- yt_dlp/extractor/mildom.py | 291 -------------------------------- yt_dlp/extractor/pokemon.py | 136 --------------- yt_dlp/extractor/veoh.py | 189 --------------------- 4 files changed, 630 deletions(-) delete mode 100644 yt_dlp/extractor/mildom.py delete mode 100644 yt_dlp/extractor/pokemon.py delete mode 100644 yt_dlp/extractor/veoh.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index af903f307b..0d849c1697 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1139,12 +1139,6 @@ from .microsoftembed import ( MicrosoftMediusIE, ) from .microsoftstream import MicrosoftStreamIE -from .mildom import ( - MildomClipIE, - MildomIE, - MildomUserVodIE, - MildomVodIE, -) from .minds import ( MindsChannelIE, MindsGroupIE, @@ -1563,10 +1557,6 @@ from .podbayfm import ( ) from .podchaser import PodchaserIE from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, -) from .pokergo import ( PokerGoCollectionIE, PokerGoIE, @@ -2288,10 +2278,6 @@ from .utreon import UtreonIE from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veo import VeoIE -from .veoh import ( - VeohIE, - VeohUserIE, -) from .vesti import VestiIE from .vevo import ( VevoIE, diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py deleted file mode 100644 index 88a2b9e891..0000000000 --- a/yt_dlp/extractor/mildom.py +++ /dev/null @@ -1,291 +0,0 @@ -import functools -import json -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - OnDemandPagedList, - determine_ext, - dict_get, - float_or_none, - traverse_obj, -) - - -class MildomBaseIE(InfoExtractor): - _GUEST_ID = None - - def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): - if not self._GUEST_ID: - self._GUEST_ID = f'pc-gp-{uuid.uuid4()}' - - content = self._download_json( - url, video_id, note=note, data=json.dumps(body).encode() if body else None, - headers={'Content-Type': 'application/json'} if body else {}, - query={ - '__guest_id': self._GUEST_ID, - '__platform': 'web', - **(query or {}), - }) - - if content['code'] != 0: - raise ExtractorError( - f'Mildom says: {content["message"]} (code {content["code"]})', - expected=True) - return content['body'] - - -class MildomIE(MildomBaseIE): - IE_NAME = 'mildom' - IE_DESC = 'Record ongoing live by specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P\d+)' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) - - enterstudio = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, - note='Downloading live metadata', query={'user_id': video_id}) - result_video_id = enterstudio.get('log_id', video_id) - - servers = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, - note='Downloading live server list', query={ - 'user_id': video_id, - 'live_server_type': 'hls', - }) - - playback_token = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, - note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) - playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) - if not playback_token: - raise ExtractorError('Failed to obtain live playback token') - - formats = self._extract_m3u8_formats( - f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', - result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }) - - for fmt in formats: - fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' - - return { - 'id': result_video_id, - 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), - 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), - 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), - 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), - 'uploader_id': video_id, - 'formats': formats, - 'is_live': True, - } - - -class MildomVodIE(MildomBaseIE): - IE_NAME = 'mildom:vod' - IE_DESC = 'VOD in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P\d+)/(?P(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' - _TESTS = [{ - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', - 'info_dict': { - 'id': '10882672-1597662269', - 'ext': 'mp4', - 'title': '始めてのミルダム配信じゃぃ!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'upload_date': '20200817', - 'duration': 4138.37, - 'description': 'ゲームをしたくて!', - 'timestamp': 1597662269.0, - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477', - 'info_dict': { - 'id': '10882672-1597758589870-477', - 'ext': 'mp4', - 'title': '【kson】感染メイズ!麻酔銃で無双する', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'timestamp': 1597759093.0, - 'uploader': 'kson組長(けいそん)', - 'duration': 4302.58, - 'uploader_id': '10882672', - 'description': 'このステージ絶対乗り越えたい', - 'upload_date': '20200818', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0', - 'info_dict': { - 'id': '10882672-buha9td2lrn97fk2jme0', - 'ext': 'mp4', - 'title': '【kson組長】CART RACER!!!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - 'upload_date': '20201104', - 'timestamp': 1604494797.0, - 'duration': 4657.25, - 'description': 'WTF', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id) - - autoplay = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, - note='Downloading playback metadata', query={ - 'v_id': video_id, - })['playback'] - - formats = [{ - 'url': autoplay['audio_url'], - 'format_id': 'audio', - 'protocol': 'm3u8_native', - 'vcodec': 'none', - 'acodec': 'aac', - 'ext': 'm4a', - }] - for fmt in autoplay['video_link']: - formats.append({ - 'format_id': 'video-{}'.format(fmt['name']), - 'url': fmt['url'], - 'protocol': 'm3u8_native', - 'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'], - 'height': fmt['level'], - 'vcodec': 'h264', - 'acodec': 'aac', - 'ext': 'mp4', - }) - - return { - 'id': video_id, - 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), - 'description': traverse_obj(autoplay, 'video_intro'), - 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000), - 'duration': float_or_none(autoplay.get('video_length'), scale=1000), - 'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')), - 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')), - 'uploader_id': user_id, - 'formats': formats, - } - - -class MildomClipIE(MildomBaseIE): - IE_NAME = 'mildom:clip' - IE_DESC = 'Clip in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P(?P\d+)-[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'info_dict': { - 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'title': '全然違ったよ', - 'timestamp': 1619181890, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': 'ざきんぽ', - 'uploader_id': '10042245', - }, - }, { - 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'info_dict': { - 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'title': 'かっこいい', - 'timestamp': 1621094003, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': '(ルーキー', - 'uploader_id': '10111524', - }, - }, { - 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'info_dict': { - 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'title': 'あ', - 'timestamp': 1614769431, - 'duration': 31, - 'thumbnail': r're:https?://.+', - 'uploader': 'ドルゴルスレンギーン=ダグワドルジ', - 'uploader_id': '10660174', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id) - - clip_detail = self._call_api( - 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id, - note='Downloading playback metadata', query={ - 'clip_id': video_id, - }) - - return { - 'id': video_id, - 'title': self._html_search_meta( - ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'), - 'timestamp': float_or_none(clip_detail.get('create_time')), - 'duration': float_or_none(clip_detail.get('length')), - 'thumbnail': clip_detail.get('cover'), - 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')), - 'uploader_id': user_id, - - 'url': clip_detail['url'], - 'ext': determine_ext(clip_detail.get('url'), 'mp4'), - } - - -class MildomUserVodIE(MildomBaseIE): - IE_NAME = 'mildom:user:vod' - IE_DESC = 'Download all VODs from specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P\d+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/profile/10093333', - 'info_dict': { - 'id': '10093333', - 'title': 'Uploads from ねこばたけ', - }, - 'playlist_mincount': 732, - }, { - 'url': 'https://www.mildom.com/profile/10882672', - 'info_dict': { - 'id': '10882672', - 'title': 'Uploads from kson組長(けいそん)', - }, - 'playlist_mincount': 201, - }] - - def _fetch_page(self, user_id, page): - page += 1 - reply = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', - user_id, note=f'Downloading page {page}', query={ - 'user_id': user_id, - 'page': page, - 'limit': '30', - }) - if not reply: - return - for x in reply: - v_id = x.get('v_id') - if not v_id: - continue - yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}') - - def _real_extract(self, url): - user_id = self._match_id(url) - self.to_screen(f'This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/{user_id}" instead') - - profile = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id, - query={'user_id': user_id}, note='Downloading user profile')['user_info'] - - return self.playlist_result( - OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30), - user_id, f'Uploads from {profile["loginname"]}') diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py deleted file mode 100644 index 1769684f72..0000000000 --- a/yt_dlp/extractor/pokemon.py +++ /dev/null @@ -1,136 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - extract_attributes, - int_or_none, - js_to_json, - merge_dicts, -) - - -class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/(?:[^/]+/)+(?P[^/?#&]+))' - _TESTS = [{ - 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', - 'md5': '2fe8eaec69768b25ef898cda9c43062e', - 'info_dict': { - 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', - 'ext': 'mp4', - 'title': 'The Ol’ Raise and Switch!', - 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - }, - 'add_id': ['LimelightMedia'], - }, { - # no data-video-title - 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', - 'info_dict': { - 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', - 'ext': 'mp4', - 'title': "Pokémon : L'ascension de Darkrai", - 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', - }, - 'add_id': ['LimelightMedia'], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, video_id or display_id) - video_data = extract_attributes(self._search_regex( - r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'), - webpage, 'video data element')) - video_id = video_data['data-video-id'] - title = video_data.get('data-video-title') or self._html_search_meta( - 'pkm-title', webpage, ' title', default=None) or self._search_regex( - r']+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 'title': title, - 'description': video_data.get('data-video-summary'), - 'thumbnail': video_data.get('data-video-poster'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('data-video-season')), - 'episode': title, - 'episode_number': int_or_none(video_data.get('data-video-episode')), - 'ie_key': 'LimelightMedia', - } - - -class PokemonWatchIE(InfoExtractor): - _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P[a-z0-9]{32})' - _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' - _TESTS = [{ - 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', - 'md5': '62833938a31e61ab49ada92f524c42ff', - 'info_dict': { - 'id': '8309a40969894a8e8d5bc1311e9c5667', - 'ext': 'mp4', - 'title': 'Lillier and the Staff!', - 'description': 'md5:338841b8c21b283d24bdc9b568849f04', - }, - }, { - 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', - 'only_matching': True, - }, { - 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', - 'only_matching': True, - }] - - def _extract_media(self, channel_array, video_id): - for channel in channel_array: - for media in channel.get('media'): - if media.get('id') == video_id: - return media - return None - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = { - '_type': 'url', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 'ie_key': 'LimelightMedia', - } - - # API call can be avoided entirely if we are listing formats - if self.get_param('listformats', False): - return info - - webpage = self._download_webpage(url, video_id) - build_vars = self._parse_json(self._search_regex( - r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), - video_id, transform_source=js_to_json) - region = build_vars.get('region') - channel_array = self._download_json(self._API_URL.format(region), video_id) - video_data = self._extract_media(channel_array, video_id) - - if video_data is None: - raise ExtractorError( - f'Video {video_id} does not exist', expected=True) - - info['_type'] = 'url_transparent' - images = video_data.get('images') - - return merge_dicts(info, { - 'title': video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': images.get('medium') or images.get('small'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('season')), - 'episode': video_data.get('title'), - 'episode_number': int_or_none(video_data.get('episode')), - }) diff --git a/yt_dlp/extractor/veoh.py b/yt_dlp/extractor/veoh.py deleted file mode 100644 index aac768f3c6..0000000000 --- a/yt_dlp/extractor/veoh.py +++ /dev/null @@ -1,189 +0,0 @@ -import functools -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - OnDemandPagedList, - int_or_none, - parse_duration, - qualities, - remove_start, - strip_or_none, -) - - -class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P(?:v|e|yapi-)[\da-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '620e68e6a3cff80086df3348426c9ca3', - 'info_dict': { - 'id': 'v56314296nk7Zdmz3', - 'ext': 'mp4', - 'title': 'Straight Backs Are Stronger', - 'description': 'md5:203f976279939a6dc664d4001e13f5f4', - 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?', - 'uploader': 'LUMOback', - 'duration': 46, - 'view_count': int, - 'average_rating': int, - 'comment_count': int, - 'age_limit': 0, - 'categories': ['technology_and_gaming'], - 'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'], - }, - }, { - 'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3', - 'only_matching': True, - }, { - 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', - 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', - 'info_dict': { - 'id': '27701988', - 'ext': 'mp4', - 'title': 'Chile workers cover up to avoid skin damage', - 'description': 'md5:2bd151625a60a32822873efc246ba20d', - 'uploader': 'afp-news', - 'duration': 123, - }, - 'skip': 'This video has been deleted.', - }, { - 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', - 'md5': '4fde7b9e33577bab2f2f8f260e30e979', - 'note': 'Embedded ooyala video', - 'info_dict': { - 'id': '69525809', - 'ext': 'mp4', - 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', - 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', - 'uploader': 'newsy-videos', - }, - 'skip': 'This video has been deleted.', - }, { - 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', - 'only_matching': True, - }, { - 'url': 'https://www.veoh.com/videos/v16374379WA437rMH', - 'md5': 'cceb73f3909063d64f4b93d4defca1b3', - 'info_dict': { - 'id': 'v16374379WA437rMH', - 'ext': 'mp4', - 'title': 'Phantasmagoria 2, pt. 1-3', - 'description': 'Phantasmagoria: a Puzzle of Flesh', - 'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?', - 'uploader': 'davidspackage', - 'duration': 968, - 'view_count': int, - 'average_rating': int, - 'comment_count': int, - 'age_limit': 18, - 'categories': ['technology_and_gaming', 'gaming'], - 'tags': ['puzzle', 'of', 'flesh'], - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata = self._download_json( - 'https://www.veoh.com/watch/getVideo/' + video_id, - video_id) - video = metadata['video'] - title = video['title'] - - thumbnail_url = None - q = qualities(['Regular', 'HQ']) - formats = [] - for f_id, f_url in video.get('src', {}).items(): - if not f_url: - continue - if f_id == 'poster': - thumbnail_url = f_url - else: - formats.append({ - 'format_id': f_id, - 'quality': q(f_id), - 'url': f_url, - }) - - categories = metadata.get('categoryPath') - if not categories: - category = remove_start(strip_or_none(video.get('category')), 'category_') - categories = [category] if category else None - tags = video.get('tags') - - return { - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': thumbnail_url, - 'uploader': video.get('author', {}).get('nickname'), - 'duration': int_or_none(video.get('lengthBySec')) or parse_duration(video.get('length')), - 'view_count': int_or_none(video.get('views')), - 'formats': formats, - 'average_rating': int_or_none(video.get('rating')), - 'comment_count': int_or_none(video.get('numOfComments')), - 'age_limit': 18 if video.get('contentRatingId') == 2 else 0, - 'categories': categories, - 'tags': tags.split(', ') if tags else None, - } - - -class VeohUserIE(VeohIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?veoh\.com/users/(?P[\w-]+)' - IE_NAME = 'veoh:user' - - _TESTS = [ - { - 'url': 'https://www.veoh.com/users/valentinazoe', - 'info_dict': { - 'id': 'valentinazoe', - 'title': 'valentinazoe (Uploads)', - }, - 'playlist_mincount': 75, - }, - { - 'url': 'https://www.veoh.com/users/PiensaLibre', - 'info_dict': { - 'id': 'PiensaLibre', - 'title': 'PiensaLibre (Uploads)', - }, - 'playlist_mincount': 2, - }] - - _PAGE_SIZE = 16 - - def _fetch_page(self, uploader, page): - response = self._download_json( - 'https://www.veoh.com/users/published/videos', uploader, - note=f'Downloading videos page {page + 1}', - headers={ - 'x-csrf-token': self._TOKEN, - 'content-type': 'application/json;charset=UTF-8', - }, - data=json.dumps({ - 'username': uploader, - 'maxResults': self._PAGE_SIZE, - 'page': page + 1, - 'requestName': 'userPage', - }).encode()) - if not response.get('success'): - raise ExtractorError(response['message']) - - for video in response['videos']: - yield self.url_result(f'https://www.veoh.com/watch/{video["permalinkId"]}', VeohIE, - video['permalinkId'], video.get('title')) - - def _real_initialize(self): - webpage = self._download_webpage( - 'https://www.veoh.com', None, note='Downloading authorization token') - self._TOKEN = self._search_regex( - r'csrfToken:\s*(["\'])(?P[0-9a-zA-Z]{40})\1', webpage, - 'request token', group='token') - - def _real_extract(self, url): - uploader = self._match_id(url) - return self.playlist_result(OnDemandPagedList( - functools.partial(self._fetch_page, uploader), - self._PAGE_SIZE), uploader, f'{uploader} (Uploads)') From d867f99622ef7fba690b08da56c39d739b822bb7 Mon Sep 17 00:00:00 2001 From: ChocoLZS <61224208+ChocoLZS@users.noreply.github.com> Date: Mon, 18 Nov 2024 02:41:57 +0800 Subject: [PATCH 175/216] [ie/PiaLive] Add extractor (#10811) Authored by: ChocoLZS --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/pialive.py | 122 +++++++++++++++++++++++++++++ yt_dlp/extractor/piaulizaportal.py | 70 ----------------- yt_dlp/extractor/uliza.py | 113 ++++++++++++++++++++++++++ 4 files changed, 240 insertions(+), 71 deletions(-) create mode 100644 yt_dlp/extractor/pialive.py delete mode 100644 yt_dlp/extractor/piaulizaportal.py create mode 100644 yt_dlp/extractor/uliza.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0d849c1697..967010826e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1520,8 +1520,8 @@ from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pialive import PiaLiveIE from .piapro import PiaproIE -from .piaulizaportal import PIAULIZAPortalIE from .picarto import ( PicartoIE, PicartoVodIE, @@ -2250,6 +2250,10 @@ from .ufctv import ( ) from .ukcolumn import UkColumnIE from .uktvplay import UKTVPlayIE +from .uliza import ( + UlizaPlayerIE, + UlizaPortalIE, +) from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE diff --git a/yt_dlp/extractor/pialive.py b/yt_dlp/extractor/pialive.py new file mode 100644 index 0000000000..7469135c1b --- /dev/null +++ b/yt_dlp/extractor/pialive.py @@ -0,0 +1,122 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + multipart_encode, + str_or_none, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class PiaLiveIE(InfoExtractor): + _VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P[\w-]+)' + _PLAYER_ROOT_URL = 'https://player.pia-live.jp/' + _PIA_LIVE_API_URL = 'https://api.pia-live.jp' + _API_KEY = 'kfds)FKFps-dms9e' + _TESTS = [{ + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76', + 'info_dict': { + 'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84', + 'display_id': '2431867_001', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }, { + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ', + 'info_dict': { + 'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93', + 'display_id': '2431867_002', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }] + + def _extract_var(self, variable, html): + return self._search_regex( + rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + html, f'variable {variable}', group='value') + + def _real_extract(self, url): + video_key = self._match_id(url) + webpage = self._download_webpage(url, video_key) + + program_code = self._extract_var('programCode', webpage) + article_code = self._extract_var('articleCode', webpage) + title = self._html_extract_title(webpage) + + if get_element_html_by_class('play-end', webpage): + raise ExtractorError('The video is no longer available', expected=True, video_id=program_code) + + if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)): + date, time = self._search_regex( + r'(?P\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P