From 4b5eec0aaa7c02627f27a386591b735b90e681a8 Mon Sep 17 00:00:00 2001 From: Jakob Kruse Date: Sun, 24 Nov 2024 23:20:30 +0100 Subject: [PATCH 01/99] [ie/chaturbate] Fix support for non-public streams (#11624) Fix bug in 720b3dc453c342bc2e8df7dbc0acaab4479de46c Closes #11623 Authored by: jkruse --- yt_dlp/extractor/chaturbate.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index a40b7d39c..d031d3985 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -59,16 +59,15 @@ class ChaturbateIE(InfoExtractor): 'Accept': 'application/json', }, fatal=False, impersonate=True) or {} - status = response.get('room_status') - if status != 'public': - if error := self._ERROR_MAP.get(status): - raise ExtractorError(error, expected=True) - self.report_warning('Falling back to webpage extraction') - return None - m3u8_url = response.get('url') if not m3u8_url: - self.raise_geo_restricted() + status = response.get('room_status') + if error := self._ERROR_MAP.get(status): + raise ExtractorError(error, expected=True) + if status == 'public': + self.raise_geo_restricted() + self.report_warning(f'Got status "{status}" from API; falling back to webpage extraction') + return None return { 'id': video_id, From e0500cbf796323551bbabe5b8ed8c75a511ba47a Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Wed, 27 Nov 2024 00:05:07 +0100 Subject: [PATCH 02/99] [ie] Handle fragmented formats in `_remove_duplicate_formats` (#11637) Authored by: Grub4K --- yt_dlp/extractor/common.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 28a3adf93..ce79e0b62 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1854,12 +1854,26 @@ class InfoExtractor: @staticmethod def _remove_duplicate_formats(formats): - format_urls = set() + seen_urls = set() + 
seen_fragment_urls = set() unique_formats = [] for f in formats: - if f['url'] not in format_urls: - format_urls.add(f['url']) + fragments = f.get('fragments') + if callable(fragments): unique_formats.append(f) + + elif fragments: + fragment_urls = frozenset( + fragment.get('url') or urljoin(f['fragment_base_url'], fragment['path']) + for fragment in fragments) + if fragment_urls not in seen_fragment_urls: + seen_fragment_urls.add(fragment_urls) + unique_formats.append(f) + + elif f['url'] not in seen_urls: + seen_urls.add(f['url']) + unique_formats.append(f) + formats[:] = unique_formats def _is_valid_url(self, url, video_id, item='video', headers={}): From 0a0d80800b9350d1a4c4b18d82cfb77ffbc3c507 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 26 Nov 2024 23:18:48 +0000 Subject: [PATCH 03/99] [ie/dacast] Fix HLS AES formats extraction (#11644) Closes #11643 Authored by: bashonly --- yt_dlp/extractor/dacast.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py index 4e81aa4a7..537352e5f 100644 --- a/yt_dlp/extractor/dacast.py +++ b/yt_dlp/extractor/dacast.py @@ -1,3 +1,4 @@ +import functools import hashlib import re import time @@ -51,6 +52,15 @@ class DacastVODIE(DacastBaseIE): 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2', }, 'params': {'skip_download': 'm3u8'}, + }, { # /uspaes/ in hls_url + 'url': 'https://iframe.dacast.com/vod/f9823fc6-faba-b98f-0d00-4a7b50a58c5b/348c5c84-b6af-4859-bb9d-1d01009c795b', + 'info_dict': { + 'id': '348c5c84-b6af-4859-bb9d-1d01009c795b', + 'ext': 'mp4', + 'title': 'pl1-edyta-rubas-211124.mp4', + 'uploader_id': 'f9823fc6-faba-b98f-0d00-4a7b50a58c5b', + 'thumbnail': 'https://universe-files.dacast.com/4d0bd042-a536-752d-fc34-ad2fa44bbcbb.png', + }, }] _WEBPAGE_TESTS = [{ 'url': 
'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/', @@ -74,6 +84,15 @@ class DacastVODIE(DacastBaseIE): 'params': {'skip_download': 'm3u8'}, }] + @functools.cached_property + def _usp_signing_secret(self): + player_js = self._download_webpage( + 'https://player.dacast.com/js/player.js', None, 'Downloading player JS') + # Rotates every so often, but hardcode a fallback in case of JS change/breakage before rotation + return self._search_regex( + r'\bUSP_SIGNING_SECRET\s*=\s*(["\'])(?P(?:(?!\1).)+)', player_js, + 'usp signing secret', group='secret', fatal=False) or 'odnInCGqhvtyRTtIiddxtuRtawYYICZP' + def _real_extract(self, url): user_id, video_id = self._match_valid_url(url).group('user_id', 'id') query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} @@ -94,10 +113,10 @@ class DacastVODIE(DacastBaseIE): if 'DRM_EXT' in hls_url: self.report_drm(video_id) elif '/uspaes/' in hls_url: - # From https://player.dacast.com/js/player.js + # Ref: https://player.dacast.com/js/player.js ts = int(time.time()) signature = hashlib.sha1( - f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex() + f'{10413792000 - ts}{ts}{self._usp_signing_secret}'.encode()).digest().hex() hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}' for retry in self.RetryManager(): From 910ecc422930bca14e2abe4986f5f92359e3cea8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:45:01 +0000 Subject: [PATCH 04/99] [ie/tiktok] Deprioritize animated thumbnails (#11645) Closes #11641 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index ba15f08b6..9e53b3407 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -413,15 +413,6 @@ class TikTokBaseIE(InfoExtractor): 
for f in formats: self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value) - thumbnails = [] - for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', - 'origin_cover', 'dynamic_cover'): - for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)): - thumbnails.append({ - 'id': cover_id, - 'url': cover_url, - }) - stats_info = aweme_detail.get('statistics') or {} music_info = aweme_detail.get('music') or {} labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str) @@ -467,7 +458,17 @@ class TikTokBaseIE(InfoExtractor): 'formats': formats, 'subtitles': self.extract_subtitles( aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')), - 'thumbnails': thumbnails, + 'thumbnails': [ + { + 'id': cover_id, + 'url': cover_url, + 'preference': -1 if cover_id in ('cover', 'origin_cover') else -2, + } + for cover_id in ( + 'cover', 'ai_dynamic_cover', 'animated_cover', + 'ai_dynamic_cover_bak', 'origin_cover', 'dynamic_cover') + for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)) + ], 'duration': (traverse_obj(video_info, ( (None, 'download_addr'), 'duration', {int_or_none(scale=1000)}, any)) or traverse_obj(music_info, ('duration', {int_or_none}))), @@ -600,11 +601,15 @@ class TikTokBaseIE(InfoExtractor): 'repost_count': 'shareCount', 'comment_count': 'commentCount', }), expected_type=int_or_none), - 'thumbnails': traverse_obj(aweme_detail, ( - (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), { - 'url': ({url_or_none}, {self._proto_relative_url}), - }, - )), + 'thumbnails': [ + { + 'id': cover_id, + 'url': self._proto_relative_url(cover_url), + 'preference': -2 if cover_id == 'dynamicCover' else -1, + } + for cover_id in ('thumbnail', 'cover', 'dynamicCover', 'originCover') + for cover_url in traverse_obj(aweme_detail, ((None, 'video'), cover_id, {url_or_none})) + ], } From 
00dcde728635633eee969ad4d498b9f233c4a94e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 27 Nov 2024 01:47:28 +0000 Subject: [PATCH 05/99] [ie/dropbox] Fix password-protected video extraction (#11636) Closes #11634 Authored by: bashonly --- yt_dlp/extractor/dropbox.py | 38 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index c12209623..2bfeebc7c 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -48,32 +48,30 @@ class DropboxIE(InfoExtractor): webpage = self._download_webpage(url, video_id) fn = urllib.parse.unquote(url_basename(url)) title = os.path.splitext(fn)[0] - password = self.get_param('videopassword') + content_id = None for part in self._yield_decoded_parts(webpage): if '/sm/password' in part: - webpage = self._download_webpage( - update_url('https://www.dropbox.com/sm/password', query=part.partition('?')[2]), video_id) + content_id = self._search_regex(r'content_id=([\w.+=/-]+)', part, 'content ID') break - if (self._og_search_title(webpage, default=None) == 'Dropbox - Password Required' - or 'Enter the password for this link' in webpage): - if password: - response = self._download_json( - 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', - headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'}, - data=urlencode_postdata({ - 'is_xhr': 'true', - 't': self._get_cookies('https://www.dropbox.com')['t'].value, - 'content_id': self._search_regex(r'content_id=([\w.+=/-]+)["\']', webpage, 'content id'), - 'password': password, - 'url': url, - })) - - if response.get('status') != 'authed': - raise ExtractorError('Invalid password', expected=True) - elif not self._get_cookies('https://dropbox.com').get('sm_auth'): + if content_id: + password = self.get_param('videopassword') + if not password: raise ExtractorError('Password protected 
video, use --video-password ', expected=True) + + response = self._download_json( + 'https://www.dropbox.com/sm/auth', video_id, 'POSTing video password', + data=urlencode_postdata({ + 'is_xhr': 'true', + 't': self._get_cookies('https://www.dropbox.com')['t'].value, + 'content_id': content_id, + 'password': password, + 'url': update_url(url, scheme='', netloc=''), + })) + if response.get('status') != 'authed': + raise ExtractorError('Invalid password', expected=True) + webpage = self._download_webpage(url, video_id) formats, subtitles = [], {} From 360aed810ad85db950df586282d256516c98cd2d Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Mon, 2 Dec 2024 03:16:50 +1300 Subject: [PATCH 06/99] [ie/instagram] Support `share` URLs (#11677) Closes #11630 Authored by: grqz --- yt_dlp/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index dee8cb85d..55086d0b2 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -254,7 +254,7 @@ class InstagramIOSIE(InfoExtractor): class InstagramIE(InstagramBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reels?(?!/audio/))/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com(?:/(?!share/)[^/?#]+)?/(?:p|tv|reels?(?!/audio/))/(?P[^/?#&]+))' _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', From cd0f934604587ed793e9177f6a127e5dcf99a7dd Mon Sep 17 00:00:00 2001 From: DarkZeros Date: Sun, 1 Dec 2024 14:21:57 +0000 Subject: [PATCH 07/99] [ie/mitele] Fix extractor (#11683) Closes #11690 Authored by: DarkZeros --- yt_dlp/extractor/mitele.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index 3573a2a3f..76fef337a 100644 --- 
a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -80,9 +80,9 @@ class MiTeleIE(TelecincoBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - pre_player = self._parse_json(self._search_regex( - r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', - webpage, 'Pre Player'), display_id)['prePlayer'] + pre_player = self._search_json( + r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=', + webpage, 'Pre Player', display_id)['prePlayer'] title = pre_player['title'] video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} From 0d146c1e36f467af30e87b7af651bdee67b73500 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Dec 2024 15:25:09 +0000 Subject: [PATCH 08/99] [ie/youtube] Adjust player clients for site changes (#11663) Closes #11640 Authored by: bashonly --- README.md | 4 ++-- yt_dlp/extractor/youtube.py | 46 +++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 0a62d8e74..772395d24 100644 --- a/README.md +++ b/README.md @@ -1761,7 +1761,7 @@ $ yt-dlp --replace-in-metadata "title,uploader" "[ _]" "-" # EXTRACTOR ARGUMENTS -Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=mediaconnect,web;formats=incomplete" --extractor-args "funimation:version=uncut"` +Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=tv,mweb;formats=incomplete" --extractor-args "funimation:version=uncut"` Note: In CLI, `ARG` can use `-` instead of `_`; e.g. 
`youtube:player-client"` becomes `youtube:player_client"` @@ -1770,7 +1770,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `mediaconnect`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `web_creator` is added as needed for age-gated videos when account age verification is required. Similarly, the `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, or `web_creator,mweb` is used when authenticating with cookies. 
The `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7a9133466..a67f09e62 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -83,6 +83,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'REQUIRE_PO_TOKEN': True, + 'SUPPORTS_COOKIES': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -95,6 +96,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'REQUIRE_PO_TOKEN': True, + 'SUPPORTS_COOKIES': True, }, 'web_embedded': { 'INNERTUBE_CONTEXT': { @@ -104,6 +106,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, + 'SUPPORTS_COOKIES': True, }, 'web_music': { 'INNERTUBE_HOST': 'music.youtube.com', @@ -114,6 +117,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, + 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 
'web_creator': { @@ -125,6 +129,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, }, 'android': { 'INNERTUBE_CONTEXT': { @@ -157,6 +162,7 @@ INNERTUBE_CLIENTS = { 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'android_creator': { @@ -191,6 +197,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, 'REQUIRE_JS_PLAYER': False, + 'SUPPORTS_COOKIES': True, }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 @@ -225,6 +232,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'ios_creator': { @@ -253,6 +261,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, + 'SUPPORTS_COOKIES': True, }, 'tv': { 'INNERTUBE_CONTEXT': { @@ -262,6 +271,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, + 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video # It was previously an age-gate workaround for videos that were `playable_in_embed` @@ -275,19 +285,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, 'REQUIRE_AUTH': True, - }, - # This client now requires sign-in for every video - # It may be able to receive pre-merged video+audio 720p/1080p streams - 'mediaconnect': { - 'INNERTUBE_CONTEXT': { - 'client': { - 'clientName': 'MEDIA_CONNECT_FRONTEND', - 'clientVersion': '0.1', - }, - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 95, - 'REQUIRE_JS_PLAYER': False, - 'REQUIRE_AUTH': True, + 'SUPPORTS_COOKIES': True, }, } @@ -317,6 +315,7 @@ def build_innertube_clients(): ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg.setdefault('REQUIRE_PO_TOKEN', False) 
ytcfg.setdefault('REQUIRE_AUTH', False) + ytcfg.setdefault('SUPPORTS_COOKIES', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -1357,6 +1356,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _DEFAULT_CLIENTS = ('ios', 'mweb') + _DEFAULT_AUTHED_CLIENTS = ('web_creator', 'mweb') _GEO_BYPASS = False @@ -3823,12 +3823,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] excluded_clients = [] + default_clients = self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated else self._DEFAULT_CLIENTS allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client == 'default': - requested_clients.extend(self._DEFAULT_CLIENTS) + requested_clients.extend(default_clients) elif client == 'all': requested_clients.extend(allowed_clients) elif client.startswith('-'): @@ -3838,7 +3839,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: requested_clients.append(client) if not requested_clients: - requested_clients.extend(self._DEFAULT_CLIENTS) + requested_clients.extend(default_clients) for excluded_client in excluded_clients: if excluded_client in requested_clients: requested_clients.remove(excluded_client) @@ -3850,9 +3851,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _, base_client, variant = _split_innertube_client(requested_client) music_client = f'{base_client}_music' if base_client != 'mweb' else 'web_music' if variant != 'music' and music_client in INNERTUBE_CLIENTS: - if not INNERTUBE_CLIENTS[music_client]['REQUIRE_AUTH'] or self.is_authenticated: + client_info = INNERTUBE_CLIENTS[music_client] + if not client_info['REQUIRE_AUTH'] or (self.is_authenticated and client_info['SUPPORTS_COOKIES']): 
requested_clients.append(music_client) + if self.is_authenticated: + unsupported_clients = [ + client for client in requested_clients if not INNERTUBE_CLIENTS[client]['SUPPORTS_COOKIES'] + ] + for client in unsupported_clients: + self.report_warning(f'Skipping client "{client}" since it does not support cookies', only_once=True) + requested_clients.remove(client) + return orderedSet(requested_clients) def _invalid_player_response(self, pr, video_id): @@ -3958,6 +3968,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: prs.append(pr) + ''' This code is pointless while web_creator is in _DEFAULT_AUTHED_CLIENTS # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients if self.is_authenticated and self._is_agegated(pr): @@ -3965,9 +3976,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) # web_creator can work around the age-verification requirement - # android_vr and mediaconnect may also be able to work around age-verification + # android_vr may also be able to work around age-verification # tv_embedded may(?) 
still work around age-verification if the video is embeddable append_client('web_creator') + ''' prs.extend(deprioritized_prs) From 239f5f36fe04603bec59c8b975f6a792f10246db Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Mon, 2 Dec 2024 10:55:18 +1300 Subject: [PATCH 09/99] [ie/bilibili] Fix extractor (#11667) Closes #11665 Authored by: grqz --- yt_dlp/extractor/bilibili.py | 43 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 02ea67707..f01befcc0 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -18,7 +18,6 @@ from ..utils import ( InAdvancePagedList, OnDemandPagedList, bool_or_none, - clean_html, determine_ext, filter_dict, float_or_none, @@ -639,31 +638,27 @@ class BiliBiliIE(BilibiliBaseIE): headers['Referer'] = url initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + + if traverse_obj(initial_state, ('error', 'trueCode')) == -403: + self.raise_login_required() + if traverse_obj(initial_state, ('error', 'trueCode')) == -404: + raise ExtractorError( + 'This video may be deleted or geo-restricted. ' + 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) + is_festival = 'videoData' not in initial_state if is_festival: video_data = initial_state['videoInfo'] else: - play_info_obj = self._search_json( - r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False) - if not play_info_obj: - if traverse_obj(initial_state, ('error', 'trueCode')) == -403: - self.raise_login_required() - if traverse_obj(initial_state, ('error', 'trueCode')) == -404: - raise ExtractorError( - 'This video may be deleted or geo-restricted. 
' - 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) - play_info = traverse_obj(play_info_obj, ('data', {dict})) - if not play_info: - if traverse_obj(play_info_obj, 'code') == 87007: - toast = get_element_by_class('tips-toast', webpage) or '' - msg = clean_html( - f'{get_element_by_class("belongs-to", toast) or ""},' - + (get_element_by_class('level', toast) or '')) - raise ExtractorError( - f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True) - raise ExtractorError('Failed to extract play info') video_data = initial_state['videoData'] + if video_data.get('is_upower_exclusive'): + high_level = traverse_obj(initial_state, ('elecFullInfo', 'show_info', 'high_level', {dict})) or {} + raise ExtractorError( + 'This is a supporter-only video: ' + f'{join_nonempty("title", "sub_title", from_dict=high_level, delim=",")}. ' + f'{self._login_hint()}', expected=True) + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
@@ -689,10 +684,14 @@ class BiliBiliIE(BilibiliBaseIE): old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + play_info = ( + traverse_obj( + self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), + ('data', {dict})) + or self._download_playinfo(video_id, cid, headers=headers)) + festival_info = {} if is_festival: - play_info = self._download_playinfo(video_id, cid, headers=headers) - festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), From 62cba8a1bedbfc0ddde7267ae57b72bf5f7ea7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Mon, 2 Dec 2024 00:33:11 +0200 Subject: [PATCH 10/99] [ie/duoplay] Fix extractor (#11588) Authored by: glensc, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/duoplay.py | 60 +++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/duoplay.py b/yt_dlp/extractor/duoplay.py index 18642fea3..75650c3a6 100644 --- a/yt_dlp/extractor/duoplay.py +++ b/yt_dlp/extractor/duoplay.py @@ -5,15 +5,16 @@ from ..utils import ( get_element_text_and_html_by_tag, int_or_none, join_nonempty, + parse_qs, str_or_none, try_call, unified_timestamp, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import traverse_obj, value class DuoplayIE(InfoExtractor): - _VALID_URL = r'https?://duoplay\.ee/(?P\d+)/[\w-]+/?(?:\?(?:[^#]+&)?ep=(?P\d+))?' 
+ _VALID_URL = r'https?://duoplay\.ee/(?P\d+)(?:[/?#]|$)' _TESTS = [{ 'note': 'Siberi võmm S02E12', 'url': 'https://duoplay.ee/4312/siberi-vomm?ep=24', @@ -34,15 +35,16 @@ class DuoplayIE(InfoExtractor): 'episode_number': 12, 'episode_id': '24', }, + 'skip': 'No video found', }, { 'note': 'Empty title', 'url': 'https://duoplay.ee/17/uhikarotid?ep=14', - 'md5': '6aca68be71112314738dd17cced7f8bf', + 'md5': 'cba9f5dabf2582b224d80ac44fb80e47', 'info_dict': { 'id': '17_14', 'ext': 'mp4', - 'title': 'Ühikarotid', - 'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$', + 'title': 'Episode 14', + 'thumbnail': r're:https?://.+\.jpg', 'description': 'md5:4719b418e058c209def41d48b601276e', 'upload_date': '20100916', 'timestamp': 1284661800, @@ -52,6 +54,8 @@ class DuoplayIE(InfoExtractor): 'season_number': 2, 'episode_id': '14', 'release_year': 2010, + 'episode': 'Episode 14', + 'episode_number': 14, }, }, { 'note': 'Movie without expiry', @@ -68,10 +72,32 @@ class DuoplayIE(InfoExtractor): 'timestamp': 1671054000, 'release_year': 2018, }, + 'skip': 'No video found', + }, { + 'note': 'Episode url without show name', + 'url': 'https://duoplay.ee/9644?ep=185', + 'md5': '63f324b4fe2dbd8194dca16a6d52184a', + 'info_dict': { + 'id': '9644_185', + 'ext': 'mp4', + 'title': 'Episode 185', + 'thumbnail': r're:https?://.+\.jpg', + 'description': 'md5:ed25ba4e9e5d54bc291a4a0cdd241467', + 'upload_date': '20241120', + 'timestamp': 1732077000, + 'episode': 'Episode 63', + 'episode_id': '185', + 'episode_number': 63, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Telehommik', + 'series_id': '9644', + }, }] def _real_extract(self, url): - telecast_id, episode = self._match_valid_url(url).group('id', 'ep') + telecast_id = self._match_id(url) + episode = traverse_obj(parse_qs(url), ('ep', 0, {int_or_none}, {str_or_none})) video_id = join_nonempty(telecast_id, episode, delim='_') webpage = self._download_webpage(url, video_id) video_player = try_call(lambda: extract_attributes( @@ -79,25 
+105,33 @@ class DuoplayIE(InfoExtractor): if not video_player or not video_player.get('manifest-url'): raise ExtractorError('No video found', expected=True) + manifest_url = video_player['manifest-url'] + session_token = self._download_json( + 'https://sts.postimees.ee/session/register', video_id, 'Registering session', + 'Unable to register session', headers={ + 'Accept': 'application/json', + 'X-Original-URI': manifest_url, + })['session'] + episode_attr = self._parse_json(video_player.get(':episode') or '', video_id, fatal=False) or {} return { 'id': video_id, - 'formats': self._extract_m3u8_formats(video_player['manifest-url'], video_id, 'mp4'), + 'formats': self._extract_m3u8_formats(manifest_url, video_id, 'mp4', query={'s': session_token}), **traverse_obj(episode_attr, { - 'title': 'title', - 'description': 'synopsis', + 'title': ('title', {str}), + 'description': ('synopsis', {str}), 'thumbnail': ('images', 'original'), 'timestamp': ('airtime', {lambda x: unified_timestamp(x + ' +0200')}), - 'cast': ('cast', {lambda x: x.split(', ')}), + 'cast': ('cast', filter, {lambda x: x.split(', ')}), 'release_year': ('year', {int_or_none}), }), **(traverse_obj(episode_attr, { - 'title': (None, ('subtitle', ('episode_nr', {lambda x: f'Episode {x}' if x else None}))), - 'series': 'title', + 'title': (None, (('subtitle', {str}, filter), {value(f'Episode {episode}' if episode else None)})), + 'series': ('title', {str}), 'series_id': ('telecast_id', {str_or_none}), 'season_number': ('season_id', {int_or_none}), - 'episode': 'subtitle', + 'episode': ('subtitle', {str}, filter), 'episode_number': ('episode_nr', {int_or_none}), 'episode_id': ('episode_id', {str_or_none}), }, get_all=False) if episode_attr.get('category') != 'movies' else {}), From 2bea7936323ca4b6f3b9b1fdd892566223e30efa Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 2 Dec 2024 16:22:16 +0100 Subject: [PATCH 11/99] [ie/MicrosoftEmbed] Make format extraction non fatal (#11654) Authored by: seproDev --- 
yt_dlp/extractor/microsoftembed.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index d0135f5a9..2575d6c5e 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -26,6 +26,7 @@ class MicrosoftEmbedIE(InfoExtractor): 'timestamp': 1631658316, 'upload_date': '20210914', }, + 'expected_warnings': ['Failed to parse XML: syntax error: line 1, column 0'], }] _API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/' @@ -36,11 +37,11 @@ class MicrosoftEmbedIE(InfoExtractor): formats = [] for source_type, source in metadata['streams'].items(): if source_type == 'smooth_Streaming': - formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss')) + formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss', fatal=False)) elif source_type == 'apple_HTTP_Live_Streaming': - formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4')) + formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4', fatal=False)) elif source_type == 'mPEG_DASH': - formats.extend(self._extract_mpd_formats(source['url'], video_id)) + formats.extend(self._extract_mpd_formats(source['url'], video_id, fatal=False)) else: formats.append({ 'format_id': source_type, From d8fb3490863653182864d2a53522f350d67a9ff8 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 2 Dec 2024 16:29:30 +0100 Subject: [PATCH 12/99] [cleanup] Bump ruff to 0.8.x (#11608) Authored by: seproDev --- pyproject.toml | 6 ++---- yt_dlp/YoutubeDL.py | 10 +++++----- yt_dlp/__init__.py | 6 +++--- yt_dlp/aes.py | 14 ++++++-------- yt_dlp/cookies.py | 4 ++-- yt_dlp/downloader/hls.py | 8 ++++---- yt_dlp/downloader/youtube_live_chat.py | 4 ++-- yt_dlp/extractor/bilibili.py | 4 ++-- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/funimation.py | 6 +++--- yt_dlp/extractor/youtube.py | 14 +++++++------- yt_dlp/plugins.py | 2 +- 
yt_dlp/postprocessor/__init__.py | 2 +- yt_dlp/postprocessor/ffmpeg.py | 2 +- yt_dlp/utils/_utils.py | 4 ++-- 15 files changed, 42 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 97ea4375f..96e2d669a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.7.0", + "ruff~=0.8.0", ] test = [ "pytest~=8.1", @@ -186,6 +186,7 @@ ignore = [ "E501", # line-too-long "E731", # lambda-assignment "E741", # ambiguous-variable-name + "UP031", # printf-string-formatting "UP036", # outdated-version-block "B006", # mutable-argument-default "B008", # function-call-in-default-argument @@ -258,9 +259,6 @@ select = [ "A002", # builtin-argument-shadowing "C408", # unnecessary-collection-call ] -"yt_dlp/jsinterp.py" = [ - "UP031", # printf-string-formatting -] [tool.ruff.lint.isort] known-first-party = [ diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index a9a8e4133..65b72e026 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1116,7 +1116,7 @@ class YoutubeDL: def raise_no_formats(self, info, forced=False, *, msg=None): has_drm = info.get('_has_drm') ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg) - msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!' + msg = msg or (has_drm and 'This video is DRM protected') or 'No video formats found!' 
if forced or not ignored: raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'], expected=has_drm or ignored or expected) @@ -2196,7 +2196,7 @@ class YoutubeDL: def _default_format_spec(self, info_dict): prefer_best = ( self.params['outtmpl']['default'] == '-' - or info_dict.get('is_live') and not self.params.get('live_from_start')) + or (info_dict.get('is_live') and not self.params.get('live_from_start'))) def can_merge(): merger = FFmpegMergerPP(self) @@ -2365,7 +2365,7 @@ class YoutubeDL: vexts=[f['ext'] for f in video_fmts], aexts=[f['ext'] for f in audio_fmts], preferences=(try_call(lambda: self.params['merge_output_format'].split('/')) - or self.params.get('prefer_free_formats') and ('webm', 'mkv'))) + or (self.params.get('prefer_free_formats') and ('webm', 'mkv')))) filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) @@ -3541,8 +3541,8 @@ class YoutubeDL: and info_dict.get('container') == 'm4a_dash', 'writing DASH m4a. Only some players support this container', FFmpegFixupM4aPP) - ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') - or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, + ffmpeg_fixup((downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')) + or (info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None), 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) ffmpeg_fixup(downloader == 'dashsegments' diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index a1880bf7d..20111175b 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1062,7 +1062,7 @@ def _real_main(argv=None): # If we only have a single process attached, then the executable was double clicked # When using `pyinstaller` with `--onefile`, two processes get attached is_onefile = hasattr(sys, '_MEIPASS') and os.path.basename(sys._MEIPASS).startswith('_MEI') - if attached_processes == 1 or is_onefile and 
attached_processes == 2: + if attached_processes == 1 or (is_onefile and attached_processes == 2): print(parser._generate_error_message( 'Do not double-click the executable, instead call it from a command line.\n' 'Please read the README for further information on how to use yt-dlp: ' @@ -1109,9 +1109,9 @@ def main(argv=None): from .extractor import gen_extractors, list_extractors __all__ = [ - 'main', 'YoutubeDL', - 'parse_options', 'gen_extractors', 'list_extractors', + 'main', + 'parse_options', ] diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 0930d36df..9908434a5 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -534,19 +534,17 @@ def ghash(subkey, data): __all__ = [ 'aes_cbc_decrypt', 'aes_cbc_decrypt_bytes', - 'aes_ctr_decrypt', - 'aes_decrypt_text', - 'aes_decrypt', - 'aes_ecb_decrypt', - 'aes_gcm_decrypt_and_verify', - 'aes_gcm_decrypt_and_verify_bytes', - 'aes_cbc_encrypt', 'aes_cbc_encrypt_bytes', + 'aes_ctr_decrypt', 'aes_ctr_encrypt', + 'aes_decrypt', + 'aes_decrypt_text', + 'aes_ecb_decrypt', 'aes_ecb_encrypt', 'aes_encrypt', - + 'aes_gcm_decrypt_and_verify', + 'aes_gcm_decrypt_and_verify_bytes', 'key_expansion', 'pad_block', 'pkcs7_padding', diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index d5b0d3991..772433b0f 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1276,8 +1276,8 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): def _really_save(self, f, ignore_discard, ignore_expires): now = time.time() for cookie in self: - if (not ignore_discard and cookie.discard - or not ignore_expires and cookie.is_expired(now)): + if ((not ignore_discard and cookie.discard) + or (not ignore_expires and cookie.is_expired(now))): continue name, value = cookie.name, cookie.value if value is None: diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 0a00d5dab..da2574da7 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -119,12 +119,12 @@ class HlsFD(FragmentFD): self.to_screen(f'[{self.FD_NAME}] 
Fragment downloads will be delegated to {real_downloader.get_basename()}') def is_ad_fragment_start(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s - or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + return ((s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s) + or (s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))) def is_ad_fragment_end(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s - or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + return ((s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s) + or (s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))) fragments = [] diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index 961938d44..ddd912ca2 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -123,8 +123,8 @@ class YoutubeLiveChatFD(FragmentFD): data, lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} - func = (info_dict['protocol'] == 'youtube_live_chat' and parse_actions_live - or frag_index == 1 and try_refresh_replay_beginning + func = ((info_dict['protocol'] == 'youtube_live_chat' and parse_actions_live) + or (frag_index == 1 and try_refresh_replay_beginning) or parse_actions_replay) return (True, *func(live_chat_continuation)) except HTTPError as err: diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index f01befcc0..72d5f20cf 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -662,12 +662,12 @@ class BiliBiliIE(BilibiliBaseIE): video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
- page_list_json = not is_festival and traverse_obj( + page_list_json = (not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, note='Extracting videos in anthology', headers=headers), - 'data', expected_type=list) or [] + 'data', expected_type=list)) or [] is_anthology = len(page_list_json) > 1 part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ce79e0b62..92ddad2b7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3803,7 +3803,7 @@ class InfoExtractor: def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed: + if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed: self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index d3e61c84f..01b53bcde 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -193,9 +193,9 @@ class FunimationIE(FunimationBaseIE): for lang, version, fmt in self._get_experiences(episode): experience_id = str(fmt['experienceId']) - if (only_initial_experience and experience_id != initial_experience_id - or requested_languages and lang.lower() not in requested_languages - or requested_versions and version.lower() not in requested_versions): + if ((only_initial_experience and experience_id != initial_experience_id) + or (requested_languages and lang.lower() not in requested_languages) + or (requested_versions and version.lower() not in requested_versions)): continue thumbnails.append({'url': fmt.get('poster')}) duration = max(duration, fmt.get('duration', 0)) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 
a67f09e62..41cd90db9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2925,7 +2925,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403 + expire_fast = immediate or (last_error and isinstance(last_error, HTTPError) and last_error.status == 403) mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -3995,8 +3995,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return prs, player_url def _needs_live_processing(self, live_status, duration): - if (live_status == 'is_live' and self.get_param('live_from_start') - or live_status == 'post_live' and (duration or 0) > 2 * 3600): + if ((live_status == 'is_live' and self.get_param('live_from_start')) + or (live_status == 'post_live' and (duration or 0) > 2 * 3600)): return live_status def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): @@ -4192,7 +4192,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): skip_manifests = set(self._configuration_arg('skip')) if (not self.get_param('youtube_include_hls_manifest', True) or needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway - or needs_live_processing and skip_bad_formats): + or (needs_live_processing and skip_bad_formats)): skip_manifests.add('hls') if not self.get_param('youtube_include_dash_manifest', True): @@ -4390,14 +4390,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): expected_type=dict) translated_title = self._get_text(microformats, (..., 'title')) - video_title = (self._preferred_lang and translated_title + video_title = ((self._preferred_lang and translated_title) or get_first(video_details, 'title') # primary or translated_title or search_meta(['og:title', 
'twitter:title', 'title'])) translated_description = self._get_text(microformats, (..., 'description')) original_description = get_first(video_details, 'shortDescription') video_description = ( - self._preferred_lang and translated_description + (self._preferred_lang and translated_description) # If original description is blank, it will be an empty string. # Do not prefer translated description in this case. or original_description if original_description is not None else translated_description) @@ -6837,7 +6837,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): tab_url = urljoin(base_url, traverse_obj( tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'))) - tab_id = (tab_url and self._get_url_mobj(tab_url)['tab'][1:] + tab_id = ((tab_url and self._get_url_mobj(tab_url)['tab'][1:]) or traverse_obj(tab, 'tabIdentifier', expected_type=str)) if tab_id: return { diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 2bf55df71..94335a9a3 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -183,4 +183,4 @@ def load_plugins(name, suffix): sys.meta_path.insert(0, PluginFinder(f'{PACKAGE_NAME}.extractor', f'{PACKAGE_NAME}.postprocessor')) -__all__ = ['directories', 'load_plugins', 'PACKAGE_NAME', 'COMPAT_PACKAGE_NAME'] +__all__ = ['COMPAT_PACKAGE_NAME', 'PACKAGE_NAME', 'directories', 'load_plugins'] diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index 164540b5d..7b1620544 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -44,4 +44,4 @@ def get_postprocessor(key): globals().update(_PLUGIN_CLASSES) __all__ = [name for name in globals() if name.endswith('PP')] -__all__.extend(('PostProcessor', 'FFmpegPostProcessor')) +__all__.extend(('FFmpegPostProcessor', 'PostProcessor')) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index d994754fd..8965806ae 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -626,7 +626,7 @@ class 
FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_ext = sub_info['ext'] if sub_ext == 'json': self.report_warning('JSON subtitles cannot be embedded') - elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + elif ext != 'webm' or (ext == 'webm' and sub_ext == 'vtt'): sub_langs.append(lang) sub_names.append(sub_info.get('name')) sub_filenames.append(sub_info['filepath']) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 8517b762e..699bf1e7f 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2683,8 +2683,8 @@ def merge_dicts(*dicts): merged = {} for a_dict in dicts: for k, v in a_dict.items(): - if (v is not None and k not in merged - or isinstance(v, str) and merged[k] == ''): + if ((v is not None and k not in merged) + or (isinstance(v, str) and merged[k] == '')): merged[k] = v return merged From f05a1cd1492fc98dc8d80d2081d632a1879913d2 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:19:22 +1300 Subject: [PATCH 13/99] [ie/bilibili] Fix supporter-only video extraction (#11711) Fix bug in 239f5f36fe04603bec59c8b975f6a792f10246db Closes #11702 Authored by: grqz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/bilibili.py | 125 ++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 61 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 72d5f20cf..e538e5308 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -652,13 +652,6 @@ class BiliBiliIE(BilibiliBaseIE): else: video_data = initial_state['videoData'] - if video_data.get('is_upower_exclusive'): - high_level = traverse_obj(initial_state, ('elecFullInfo', 'show_info', 'high_level', {dict})) or {} - raise ExtractorError( - 'This is a supporter-only video: ' - f'{join_nonempty("title", "sub_title", from_dict=high_level, delim=",")}. 
' - f'{self._login_hint()}', expected=True) - video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. @@ -726,62 +719,72 @@ class BiliBiliIE(BilibiliBaseIE): self._get_interactive_entries(video_id, cid, metainfo, headers=headers), **metainfo, duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), __post_extractor=self.extract_comments(aid)) - else: - formats = self.extract_formats(play_info) - if not traverse_obj(play_info, ('dash')): - # we only have legacy formats and need additional work - has_qn = lambda x: x in traverse_obj(formats, (..., 'quality')) - for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})): - formats.extend(traverse_obj( - self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)), - lambda _, v: not has_qn(v['quality']))) - self._check_missing_formats(play_info, formats) - flv_formats = traverse_obj(formats, lambda _, v: v['fragments']) - if flv_formats and len(flv_formats) < len(formats): - # Flv and mp4 are incompatible due to `multi_video` workaround, so drop one - if not self._configuration_arg('prefer_multi_flv'): - dropped_fmts = ', '.join( - f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats) - formats = traverse_obj(formats, lambda _, v: not v.get('fragments')) - if dropped_fmts: - self.to_screen( - f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. 
' - 'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"') - else: - formats = traverse_obj( - # XXX: Filtering by extractor-arg is for testing purposes - formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]), - ) or [max(flv_formats, key=lambda x: x['quality'])] + formats = self.extract_formats(play_info) - if traverse_obj(formats, (0, 'fragments')): - # We have flv formats, which are individual short videos with their own timestamps and metainfo - # Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround - return { - **metainfo, - '_type': 'multi_video', - 'entries': [{ - 'id': f'{metainfo["id"]}_{idx}', - 'title': metainfo['title'], - 'http_headers': metainfo['http_headers'], - 'formats': [{ - **fragment, - 'format_id': formats[0].get('format_id'), - }], - 'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None, - '__post_extractor': self.extract_comments(aid) if idx == 0 else None, - } for idx, fragment in enumerate(formats[0]['fragments'])], - 'duration': float_or_none(play_info.get('timelength'), scale=1000), - } - else: - return { - **metainfo, - 'formats': formats, - 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, cid), - '__post_extractor': self.extract_comments(aid), - } + if video_data.get('is_upower_exclusive'): + high_level = traverse_obj(initial_state, ('elecFullInfo', 'show_info', 'high_level', {dict})) or {} + msg = f'{join_nonempty("title", "sub_title", from_dict=high_level, delim=",")}. 
{self._login_hint()}' + if not formats: + raise ExtractorError(f'This is a supporter-only video: {msg}', expected=True) + if '试看' in traverse_obj(play_info, ('accept_description', ..., {str})): + self.report_warning( + f'This is a supporter-only video, only the preview will be extracted: {msg}', + video_id=video_id) + + if not traverse_obj(play_info, 'dash'): + # we only have legacy formats and need additional work + has_qn = lambda x: x in traverse_obj(formats, (..., 'quality')) + for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})): + formats.extend(traverse_obj( + self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)), + lambda _, v: not has_qn(v['quality']))) + self._check_missing_formats(play_info, formats) + flv_formats = traverse_obj(formats, lambda _, v: v['fragments']) + if flv_formats and len(flv_formats) < len(formats): + # Flv and mp4 are incompatible due to `multi_video` workaround, so drop one + if not self._configuration_arg('prefer_multi_flv'): + dropped_fmts = ', '.join( + f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats) + formats = traverse_obj(formats, lambda _, v: not v.get('fragments')) + if dropped_fmts: + self.to_screen( + f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. 
' + 'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"') + else: + formats = traverse_obj( + # XXX: Filtering by extractor-arg is for testing purposes + formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]), + ) or [max(flv_formats, key=lambda x: x['quality'])] + + if traverse_obj(formats, (0, 'fragments')): + # We have flv formats, which are individual short videos with their own timestamps and metainfo + # Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround + return { + **metainfo, + '_type': 'multi_video', + 'entries': [{ + 'id': f'{metainfo["id"]}_{idx}', + 'title': metainfo['title'], + 'http_headers': metainfo['http_headers'], + 'formats': [{ + **fragment, + 'format_id': formats[0].get('format_id'), + }], + 'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None, + '__post_extractor': self.extract_comments(aid) if idx == 0 else None, + } for idx, fragment in enumerate(formats[0]['fragments'])], + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + } + + return { + **metainfo, + 'formats': formats, + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, cid), + '__post_extractor': self.extract_comments(aid), + } class BiliBiliBangumiIE(BilibiliBaseIE): From dc1687648077c5bf64863b307ecc5ab7e029bd8d Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:44:03 +1300 Subject: [PATCH 14/99] [ie/bilibili] Always try to extract HD formats (#10559) Closes #10554 Authored by: grqz --- yt_dlp/extractor/bilibili.py | 38 +++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index e538e5308..23c8255ee 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -62,7 +62,7 @@ 
class BilibiliBaseIE(InfoExtractor): 'support_formats', lambda _, v: v['quality'] not in parsed_qualities))], delim=', ') if missing_formats: self.to_screen( - f'Format(s) {missing_formats} are missing; you have to login or ' + f'Format(s) {missing_formats} are missing; you have to ' f'become a premium member to download them. {self._login_hint()}') def extract_formats(self, play_info): @@ -164,14 +164,18 @@ class BilibiliBaseIE(InfoExtractor): params['w_rid'] = hashlib.md5(f'{query}{self._get_wbi_key(video_id)}'.encode()).hexdigest() return params - def _download_playinfo(self, bvid, cid, headers=None, qn=None): - params = {'bvid': bvid, 'cid': cid, 'fnval': 4048} - if qn: - params['qn'] = qn + def _download_playinfo(self, bvid, cid, headers=None, query=None): + params = {'bvid': bvid, 'cid': cid, 'fnval': 4048, **(query or {})} + if self.is_logged_in: + params.pop('try_look', None) + if qn := params.get('qn'): + note = f'Downloading video format {qn} for cid {cid}' + else: + note = f'Downloading video formats for cid {cid}' + return self._download_json( 'https://api.bilibili.com/x/player/wbi/playurl', bvid, - query=self._sign_wbi(params, bvid), headers=headers, - note=f'Downloading video formats for cid {cid} {qn or ""}')['data'] + query=self._sign_wbi(params, bvid), headers=headers, note=note)['data'] def json2srt(self, json_data): srt_data = '' @@ -285,7 +289,7 @@ class BilibiliBaseIE(InfoExtractor): ('data', 'interaction', 'graph_version', {int_or_none})) cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) for cid, edges in cid_edges.items(): - play_info = self._download_playinfo(video_id, cid, headers=headers) + play_info = self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1}) yield { **metainfo, 'id': f'{video_id}_{cid}', @@ -681,7 +685,7 @@ class BiliBiliIE(BilibiliBaseIE): traverse_obj( self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), ('data', {dict})) - or 
self._download_playinfo(video_id, cid, headers=headers)) + or self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1})) festival_info = {} if is_festival: @@ -737,7 +741,7 @@ class BiliBiliIE(BilibiliBaseIE): has_qn = lambda x: x in traverse_obj(formats, (..., 'quality')) for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})): formats.extend(traverse_obj( - self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)), + self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, query={'qn': qn})), lambda _, v: not has_qn(v['quality']))) self._check_missing_formats(play_info, formats) flv_formats = traverse_obj(formats, lambda _, v: v['fragments']) @@ -862,10 +866,16 @@ class BiliBiliBangumiIE(BilibiliBaseIE): self.raise_login_required('This video is for premium members only') headers['Referer'] = url - play_info = self._download_json( - 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, - 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, - headers=headers) + + play_info = ( + self._search_json( + r'playurlSSRData\s*=', webpage, 'embedded page info', episode_id, + end_pattern='\n', default=None) + or self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, + 'Extracting episode', query={'fnval': 12240, 'ep_id': episode_id}, + headers=headers)) + premium_only = play_info.get('code') == -10403 play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} From a13a336aa6f906812701abec8101b73b73db8ff7 Mon Sep 17 00:00:00 2001 From: Link Date: Tue, 3 Dec 2024 12:08:46 +0800 Subject: [PATCH 15/99] [ie/bilibili] Fix subtitles and chapters extraction (#11708) Authored by: xiaomac --- yt_dlp/extractor/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 23c8255ee..91619d9d5 100644 --- 
a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -194,7 +194,7 @@ class BilibiliBaseIE(InfoExtractor): } video_info = self._download_json( - 'https://api.bilibili.com/x/player/v2', video_id, + 'https://api.bilibili.com/x/player/wbi/v2', video_id, query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, note=f'Extracting subtitle info {cid}', headers=self._HEADERS) if traverse_obj(video_info, ('data', 'need_login_subtitle')): @@ -210,7 +210,7 @@ class BilibiliBaseIE(InfoExtractor): def _get_chapters(self, aid, cid): chapters = aid and cid and self._download_json( - 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, + 'https://api.bilibili.com/x/player/wbi/v2', aid, query={'aid': aid, 'cid': cid}, note='Extracting chapters', fatal=False, headers=self._HEADERS) return traverse_obj(chapters, ('data', 'view_points', ..., { 'title': 'content', From c038a7b187ba24360f14134842a7a2cf897c33b1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:28:43 +0000 Subject: [PATCH 16/99] [ie/vk] Fix extractors (#11715) Closes #5832, Closes #11471, Closes #11646, Closes #11670 Authored by: bashonly --- yt_dlp/extractor/vk.py | 96 ++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 6ccc701a2..4b36e41ff 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -17,10 +17,10 @@ from ..utils import ( get_element_html_by_id, int_or_none, join_nonempty, + parse_qs, parse_resolution, str_or_none, str_to_int, - traverse_obj, try_call, unescapeHTML, unified_timestamp, @@ -29,6 +29,7 @@ from ..utils import ( urlencode_postdata, urljoin, ) +from ..utils.traversal import require, traverse_obj class VKBaseIE(InfoExtractor): @@ -91,17 +92,17 @@ class VKBaseIE(InfoExtractor): class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' - _EMBED_REGEX = 
[r']+?src=(["\'])(?Phttps?://vk\.com/video_ext\.php.+?)\1'] + _EMBED_REGEX = [r']+?src=(["\'])(?Phttps?://vk(?:(?:video)?\.ru|\.com)/video_ext\.php.+?)\1'] _VALID_URL = r'''(?x) https?:// (?: (?: - (?:(?:m|new)\.)?vk\.com/video_| + (?:(?:m|new)\.)?vk(?:(?:video)?\.ru|\.com)/video_| (?:www\.)?daxab\.com/ ) ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: - (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)| + (?:(?:m|new)\.)?vk(?:(?:video)?\.ru|\.com)/(?:.+?\?.*?z=)?(?:video|clip)| (?:www\.)?daxab\.com/embed/ ) (?P-?\d+_\d+)(?:.*\blist=(?P([\da-f]+)|(ln-[\da-zA-Z]+)))? @@ -110,7 +111,7 @@ class VKIE(VKBaseIE): _TESTS = [ { - 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'url': 'https://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 'info_dict': { 'id': '-77521_162222515', 'ext': 'mp4', @@ -127,7 +128,7 @@ class VKIE(VKBaseIE): 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://vk.com/video205387401_165548505', + 'url': 'https://vk.com/video205387401_165548505', 'info_dict': { 'id': '205387401_165548505', 'ext': 'mp4', @@ -182,10 +183,10 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', - 'duration': 178, + 'duration': 179, 'upload_date': '20130117', 'uploader': "Children's Joy Foundation Inc.", - 'uploader_id': 'thecjf', + 'uploader_id': '@CJFIofficial', 'view_count': int, 'channel_id': 'UCgzCNQ11TmR9V97ECnhi3gw', 'availability': 'public', @@ -193,7 +194,7 @@ class VKIE(VKBaseIE): 'live_status': 'not_live', 'playable_in_embed': True, 'channel': 'Children\'s Joy Foundation Inc.', - 'uploader_url': 'http://www.youtube.com/user/thecjf', + 'uploader_url': 'https://www.youtube.com/@CJFIofficial', 'thumbnail': r're:https?://.+\.jpg$', 'tags': 'count:27', 'start_time': 0.0, @@ -201,6 +202,7 @@ class VKIE(VKBaseIE): 'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw', 'channel_follower_count': int, 'age_limit': 0, + 'timestamp': 1358394935, }, }, { @@ -222,6 +224,7 @@ class VKIE(VKBaseIE): 'thumbnail': r're:https?://.+x1080$', 'tags': list, }, + 'skip': 'This video has been deleted and is no longer available.', }, { 'url': 'https://vk.com/clips-74006511?z=clip-74006511_456247211', @@ -235,13 +238,13 @@ class VKIE(VKBaseIE): 'timestamp': 1664995597, 'title': 'Clip by @madempress', 'upload_date': '20221005', - 'uploader': 'Шальная императрица', + 'uploader': 'Шальная Императрица', 'uploader_id': '-74006511', }, }, { # video key is extra_data not url\d+ - 'url': 'http://vk.com/video-110305615_171782105', + 'url': 'https://vk.com/video-110305615_171782105', 'md5': 'e13fcda136f99764872e739d13fac1d1', 'info_dict': { 'id': '-110305615_171782105', @@ -273,6 +276,7 @@ class VKIE(VKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No formats found', }, { # live stream, hls and rtmp links, most likely already finished live @@ -312,7 +316,16 @@ class VKIE(VKBaseIE): { 'url': 'https://vk.com/clip30014565_456240946', 'only_matching': True, - }] + }, + { + 'url': 'https://vkvideo.ru/video-127553155_456242961', + 'only_matching': 
True, + }, + { + 'url': 'https://vk.ru/video-220754053_456242564', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -338,7 +351,7 @@ class VKIE(VKBaseIE): video_id = '{}_{}'.format(mobj.group('oid'), mobj.group('id')) info_page = self._download_webpage( - 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) + 'https://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id) error_message = self._html_search_regex( [r'(?s)]+class="video_layer_message"[^>]*>(.+?)', @@ -432,7 +445,7 @@ class VKIE(VKBaseIE): if m_opts_url: opts_url = m_opts_url.group(1) if opts_url.startswith('//'): - opts_url = 'http:' + opts_url + opts_url = 'https:' + opts_url return self.url_result(opts_url) data = player['params'][0] @@ -512,8 +525,11 @@ class VKIE(VKBaseIE): class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/(?:playlist/)?(?P[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P
\w+))?|$)' - _TEMPLATE_URL = 'https://vk.com/videos' + _BASE_URL_RE = r'https?://(?:(?:m|new)\.)?vk(?:video\.ru|\.com/video)' + _VALID_URL = [ + rf'{_BASE_URL_RE}/playlist/(?P-?\d+_\d+)', + rf'{_BASE_URL_RE}/(?P@[^/?#]+)(?:/all)?/?(?!\?.*\bz=video)(?:[?#]|$)', + ] _TESTS = [{ 'url': 'https://vk.com/video/@mobidevices', 'info_dict': { @@ -527,12 +543,20 @@ class VKUserVideosIE(VKBaseIE): }, 'playlist_mincount': 182, }, { - 'url': 'https://vk.com/video/playlist/-174476437_2', + 'url': 'https://vkvideo.ru/playlist/-204353299_426', 'info_dict': { - 'id': '-174476437_playlist_2', - 'title': 'Анонсы', + 'id': '-204353299_playlist_426', }, - 'playlist_mincount': 108, + 'playlist_mincount': 33, + }, { + 'url': 'https://vk.com/video/@gorkyfilmstudio/all', + 'only_matching': True, + }, { + 'url': 'https://vkvideo.ru/@mobidevices', + 'only_matching': True, + }, { + 'url': 'https://vk.com/video/playlist/-174476437_2', + 'only_matching': True, }] _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) @@ -552,7 +576,7 @@ class VKUserVideosIE(VKBaseIE): v = self._VIDEO._make(video[:2]) video_id = '%d_%d' % (v.owner_id, v.id) yield self.url_result( - 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id) + 'https://vk.com/video' + video_id, VKIE.ie_key(), video_id) if count >= total: break video_list_json = self._download_payload('al_video', page_id, { @@ -561,23 +585,25 @@ class VKUserVideosIE(VKBaseIE): 'oid': page_id, 'section': section, })[0][section] - count += video_list_json['count'] + new_count = video_list_json['count'] + if not new_count: + self.to_screen(f'{page_id}: Skipping {total - count} unavailable videos') + break + count += new_count video_list = video_list_json['list'] def _real_extract(self, url): - u_id, section = self._match_valid_url(url).groups() + u_id = self._match_id(url) webpage = self._download_webpage(url, u_id) if u_id.startswith('@'): - page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') - elif '_' in u_id: - 
page_id, section = u_id.split('_', 1) - section = f'playlist_{section}' + page_id = traverse_obj( + self._search_json(r'\bvar newCur\s*=', webpage, 'cursor data', u_id), + ('oid', {int}, {str_or_none}, {require('page id')})) + section = traverse_obj(parse_qs(url), ('section', 0)) or 'all' else: - raise ExtractorError('Invalid URL', expected=True) - - if not section: - section = 'all' + page_id, _, section = u_id.partition('_') + section = f'playlist_{section}' playlist_title = clean_html(get_element_by_class('VideoInfoPanel__title', webpage)) return self.playlist_result(self._entries(page_id, section), f'{page_id}_{section}', playlist_title) @@ -717,7 +743,7 @@ class VKWallPostIE(VKBaseIE): class VKPlayBaseIE(InfoExtractor): - _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/' + _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vk(?:play|video)\.ru)/' _RESOLUTIONS = { 'tiny': '256x144', 'lowest': '426x240', @@ -797,6 +823,9 @@ class VKPlayIE(VKPlayBaseIE): }, { 'url': 'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', 'only_matching': True, + }, { + 'url': 'https://live.vkvideo.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', + 'only_matching': True, }] def _real_extract(self, url): @@ -839,6 +868,9 @@ class VKPlayLiveIE(VKPlayBaseIE): }, { 'url': 'https://live.vkplay.ru/lebwa', 'only_matching': True, + }, { + 'url': 'https://live.vkvideo.ru/panterka', + 'only_matching': True, }] def _real_extract(self, url): From 2b67ac300ac8b44368fb121637d1743cea8c5b6b Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 3 Dec 2024 20:22:21 +0000 Subject: [PATCH 17/99] [cleanup] Misc (#11716) Authored by: bashonly, seproDev Co-authored-by: sepro --- yt_dlp/extractor/adn.py | 2 +- yt_dlp/extractor/cultureunplugged.py | 6 ------ yt_dlp/extractor/pixivsketch.py | 5 +++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py 
index 919e1d6af..7dff40556 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -232,7 +232,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' error = self._parse_json(e.cause.response.read(), video_id) message = error.get('message') - if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': + if e.cause.status == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) raise ExtractorError(message) else: diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py index 8e6579c35..c7ccd2747 100644 --- a/yt_dlp/extractor/cultureunplugged.py +++ b/yt_dlp/extractor/cultureunplugged.py @@ -1,7 +1,4 @@ -import time - from .common import InfoExtractor -from ..networking import HEADRequest from ..utils import int_or_none @@ -31,9 +28,6 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request - self._request_webpage(HEADRequest( - 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( f'http://www.cultureunplugged.com/movie-data/cu-{video_id}.json', display_id) diff --git a/yt_dlp/extractor/pixivsketch.py b/yt_dlp/extractor/pixivsketch.py index 344cdb3d0..50b7af535 100644 --- a/yt_dlp/extractor/pixivsketch.py +++ b/yt_dlp/extractor/pixivsketch.py @@ -1,4 +1,5 @@ from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, traverse_obj, @@ -110,8 +111,8 @@ class PixivSketchUserIE(PixivSketchBaseIE): if not traverse_obj(data, 'is_broadcasting'): try: self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure') - except ExtractorError as ex: - if ex.cause and ex.cause.code == 
401: + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/1234567890', method='cookies') raise ExtractorError('This user is offline', expected=True) From cfa76f35d25eaf993437df8b833befbbf9255331 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 20:30:33 +0000 Subject: [PATCH 18/99] Release 2024.12.03 Created by: bashonly :ci skip all --- CONTRIBUTORS | 3 +++ Changelog.md | 36 ++++++++++++++++++++++++++++++++++++ yt_dlp/version.py | 6 +++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 9b8207b28..8ee7fbffa 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -707,3 +707,6 @@ Sakura286 SamDecrock stratus-ss subrat-lima +gitninja1234 +jkruse +xiaomac diff --git a/Changelog.md b/Changelog.md index 4dc032368..c04e936b5 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,42 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.12.03 + +#### Core changes +- [Add `playlist_webpage_url` field](https://github.com/yt-dlp/yt-dlp/commit/7d6c259a03bc4707a319e5e8c6eff0278707874b) ([#11613](https://github.com/yt-dlp/yt-dlp/issues/11613)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- [Handle fragmented formats in `_remove_duplicate_formats`](https://github.com/yt-dlp/yt-dlp/commit/e0500cbf796323551bbabe5b8ed8c75a511ba47a) ([#11637](https://github.com/yt-dlp/yt-dlp/issues/11637)) by [Grub4K](https://github.com/Grub4K) +- **bilibili** + - [Always try to extract HD formats](https://github.com/yt-dlp/yt-dlp/commit/dc1687648077c5bf64863b307ecc5ab7e029bd8d) ([#10559](https://github.com/yt-dlp/yt-dlp/issues/10559)) by [grqz](https://github.com/grqz) + - [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/239f5f36fe04603bec59c8b975f6a792f10246db) ([#11667](https://github.com/yt-dlp/yt-dlp/issues/11667)) by [grqz](https://github.com/grqz) (With fixes in [f05a1cd](https://github.com/yt-dlp/yt-dlp/commit/f05a1cd1492fc98dc8d80d2081d632a1879913d2) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz)) + - [Fix subtitles and chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a13a336aa6f906812701abec8101b73b73db8ff7) ([#11708](https://github.com/yt-dlp/yt-dlp/issues/11708)) by [xiaomac](https://github.com/xiaomac) +- **chaturbate**: [Fix support for non-public streams](https://github.com/yt-dlp/yt-dlp/commit/4b5eec0aaa7c02627f27a386591b735b90e681a8) ([#11624](https://github.com/yt-dlp/yt-dlp/issues/11624)) by [jkruse](https://github.com/jkruse) +- **dacast**: [Fix HLS AES formats extraction](https://github.com/yt-dlp/yt-dlp/commit/0a0d80800b9350d1a4c4b18d82cfb77ffbc3c507) ([#11644](https://github.com/yt-dlp/yt-dlp/issues/11644)) by [bashonly](https://github.com/bashonly) +- **dropbox**: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/00dcde728635633eee969ad4d498b9f233c4a94e) ([#11636](https://github.com/yt-dlp/yt-dlp/issues/11636)) by [bashonly](https://github.com/bashonly) +- **duoplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/62cba8a1bedbfc0ddde7267ae57b72bf5f7ea7b1) ([#11588](https://github.com/yt-dlp/yt-dlp/issues/11588)) by [bashonly](https://github.com/bashonly), [glensc](https://github.com/glensc) +- **facebook**: [Support more groups URLs](https://github.com/yt-dlp/yt-dlp/commit/e0f1ae813b36e783e2348ba2a1566e12f5cd8f6e) ([#11576](https://github.com/yt-dlp/yt-dlp/issues/11576)) by [grqz](https://github.com/grqz) +- **instagram**: [Support `share` URLs](https://github.com/yt-dlp/yt-dlp/commit/360aed810ad85db950df586282d256516c98cd2d) ([#11677](https://github.com/yt-dlp/yt-dlp/issues/11677)) by [grqz](https://github.com/grqz) +- 
**microsoftembed**: [Make format extraction non fatal](https://github.com/yt-dlp/yt-dlp/commit/2bea7936323ca4b6f3b9b1fdd892566223e30efa) ([#11654](https://github.com/yt-dlp/yt-dlp/issues/11654)) by [seproDev](https://github.com/seproDev) +- **mitele**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/cd0f934604587ed793e9177f6a127e5dcf99a7dd) ([#11683](https://github.com/yt-dlp/yt-dlp/issues/11683)) by [DarkZeros](https://github.com/DarkZeros) +- **stripchat**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/16336c51d0848a6868a4fa04e749fa03548b4913) ([#11596](https://github.com/yt-dlp/yt-dlp/issues/11596)) by [gitninja1234](https://github.com/gitninja1234) +- **tiktok**: [Deprioritize animated thumbnails](https://github.com/yt-dlp/yt-dlp/commit/910ecc422930bca14e2abe4986f5f92359e3cea8) ([#11645](https://github.com/yt-dlp/yt-dlp/issues/11645)) by [bashonly](https://github.com/bashonly) +- **vk**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/c038a7b187ba24360f14134842a7a2cf897c33b1) ([#11715](https://github.com/yt-dlp/yt-dlp/issues/11715)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Adjust player clients for site changes](https://github.com/yt-dlp/yt-dlp/commit/0d146c1e36f467af30e87b7af651bdee67b73500) ([#11663](https://github.com/yt-dlp/yt-dlp/issues/11663)) by [bashonly](https://github.com/bashonly) + - tab: [Fix playlists tab extraction](https://github.com/yt-dlp/yt-dlp/commit/fe70f20aedf528fdee332131bc9b6710e54e6f10) ([#11615](https://github.com/yt-dlp/yt-dlp/issues/11615)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- **Request Handler**: websockets: [Support websockets 14.0+](https://github.com/yt-dlp/yt-dlp/commit/c7316373c0a886f65a07a51e50ee147bb3294c85) ([#11616](https://github.com/yt-dlp/yt-dlp/issues/11616)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **cleanup** + - [Bump ruff to 0.8.x](https://github.com/yt-dlp/yt-dlp/commit/d8fb3490863653182864d2a53522f350d67a9ff8) ([#11608](https://github.com/yt-dlp/yt-dlp/issues/11608)) by [seproDev](https://github.com/seproDev) + - Miscellaneous + - [ccf0a6b](https://github.com/yt-dlp/yt-dlp/commit/ccf0a6b86b7f68a75463804fe485ec240b8635f0) by [bashonly](https://github.com/bashonly), [pzhlkj6612](https://github.com/pzhlkj6612) + - [2b67ac3](https://github.com/yt-dlp/yt-dlp/commit/2b67ac300ac8b44368fb121637d1743cea8c5b6b) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 2024.11.18 #### Important changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index f4b9400bc..b7d5c57bf 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.11.18' +__version__ = '2024.12.03' -RELEASE_GIT_HEAD = '7ea2787920cccc6b8ea30791993d114fbd564434' +RELEASE_GIT_HEAD = '2b67ac300ac8b44368fb121637d1743cea8c5b6b' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.11.18' +_pkg_version = '2024.12.03' From 354cb4026cf2191e1a130ec2a627b95cabfbc60a Mon Sep 17 00:00:00 2001 From: wesson09 <49007620+wesson09@users.noreply.github.com> Date: Wed, 4 Dec 2024 18:41:58 +0100 Subject: [PATCH 19/99] [cookies] Add `--cookies-from-browser` support for MS Store Firefox (#11731) Authored by: wesson09 --- yt_dlp/cookies.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 772433b0f..fad323c90 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -195,7 +195,10 @@ def _extract_firefox_cookies(profile, container, logger): def _firefox_browser_dirs(): if sys.platform in ('cygwin', 'win32'): - yield os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') + yield from map(os.path.expandvars, ( + R'%APPDATA%\Mozilla\Firefox\Profiles', + 
R'%LOCALAPPDATA%\Packages\Mozilla.Firefox_n80bbvh6b1yt2\LocalCache\Roaming\Mozilla\Firefox\Profiles', + )) elif sys.platform == 'darwin': yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') From 2e49c789d3eebc39af8910705d65a98bca0e4c4f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 4 Dec 2024 22:33:14 +0000 Subject: [PATCH 20/99] [ie/youtube] Player client maintenance (#11724) Closes #11686 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 41cd90db9..83dde7d9c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -78,7 +78,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20240726.00.00', + 'clientVersion': '2.20241126.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, @@ -90,7 +90,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20240726.00.00', + 'clientVersion': '2.20241126.01.00', 'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', }, }, @@ -102,7 +102,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20240723.01.00', + 'clientVersion': '1.20241201.00.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, @@ -113,7 +113,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20240724.00.00', + 'clientVersion': '1.20241127.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, @@ -124,7 +124,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20240723.03.00', + 'clientVersion': '1.20241203.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, @@ -257,7 
+257,8 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20240726.01.00', + 'clientVersion': '2.20241202.07.00', + 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, @@ -267,7 +268,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'TVHTML5', - 'clientVersion': '7.20240724.13.00', + 'clientVersion': '7.20241201.18.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, From fca3eb5f8be08d5fab2e18b45b7281a12e566725 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:11:55 +1300 Subject: [PATCH 21/99] [ie/bilibili] Fix HD formats extraction (#11734) Fixes dc1687648077c5bf64863b307ecc5ab7e029bd8d Closes #10554 Authored by: grqz --- yt_dlp/extractor/bilibili.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 91619d9d5..2db951a60 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -681,12 +681,6 @@ class BiliBiliIE(BilibiliBaseIE): old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - play_info = ( - traverse_obj( - self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), - ('data', {dict})) - or self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1})) - festival_info = {} if is_festival: festival_info = traverse_obj(initial_state, { @@ -724,6 +718,13 @@ class BiliBiliIE(BilibiliBaseIE): duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), __post_extractor=self.extract_comments(aid)) + play_info = None + if self.is_logged_in: + play_info = traverse_obj( + 
self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id, default=None), + ('data', {dict})) + if not play_info: + play_info = self._download_playinfo(video_id, cid, headers=headers, query={'try_look': 1}) formats = self.extract_formats(play_info) if video_data.get('is_upower_exclusive'): From 2feb28028ee48f2185d2d95076e62accb09b9e2e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:02:30 +0000 Subject: [PATCH 22/99] [ie/soundcloud] Fix formats extraction (#11742) Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/soundcloud.py | 98 +++++++++++++++------------------- 2 files changed, 43 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 772395d24..1db4ed2a5 100644 --- a/README.md +++ b/README.md @@ -1860,7 +1860,7 @@ The following extractors use this feature: * `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. `gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` #### soundcloud -* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{extension}` (omitting the bitrate), e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known extensions include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` +* `formats`: Formats to request from the API. Requested values should be in the format of `{protocol}_{codec}`, e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known codecs include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. 
Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` #### orfon (orf:on) * `prefer_segments_playlist`: Prefer a playlist of program segments instead of a single complete video when available. If individual segments are desired, use `--concat-playlist never --extractor-args "orfon:prefer_segments_playlist"` diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 03089e98e..e0dda8ff8 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -7,7 +7,6 @@ from .common import InfoExtractor, SearchInfoExtractor from ..networking import HEADRequest from ..networking.exceptions import HTTPError from ..utils import ( - KNOWN_EXTENSIONS, ExtractorError, float_or_none, int_or_none, @@ -251,50 +250,15 @@ class SoundcloudBaseIE(InfoExtractor): def invalid_url(url): return not url or url in format_urls - def add_format(f, protocol, is_preview=False): - mobj = re.search(r'\.(?P\d+)\.(?P[0-9a-z]{3,4})(?=[/?])', stream_url) - if mobj: - for k, v in mobj.groupdict().items(): - if not f.get(k): - f[k] = v - format_id_list = [] - if protocol: - format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f.update({ - 'abr': 256, - 'quality': 5, - 'format_note': 'Premium', - }) - for k in ('ext', 'abr'): - v = str_or_none(f.get(k)) - if v: - format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') - abr = f.get('abr') - if abr: - f['abr'] = int(abr) - if protocol in ('hls', 'hls-aes'): - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' - f.update({ - 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, - }) - formats.append(f) - # New API - for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))): + for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']) and 
v['preset'])): if extract_flat: break format_url = t['url'] + preset = t['preset'] + preset_base = preset.partition('_')[0] - protocol = traverse_obj(t, ('format', 'protocol', {str})) + protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http' if protocol == 'progressive': protocol = 'http' if protocol != 'hls' and '/hls' in format_url: @@ -302,32 +266,54 @@ class SoundcloudBaseIE(InfoExtractor): if protocol == 'encrypted-hls' or '/encrypted-hls' in format_url: protocol = 'hls-aes' - ext = None - if preset := traverse_obj(t, ('preset', {str_or_none})): - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(traverse_obj(t, ('format', 'mime_type', {str}))) - - identifier = join_nonempty(protocol, ext, delim='_') - if not self._is_requested(identifier): - self.write_debug(f'"{identifier}" is not a requested format, skipping') + short_identifier = f'{protocol}_{preset_base}' + if preset_base == 'abr': + self.write_debug(f'Skipping broken "{short_identifier}" format') + continue + if not self._is_requested(short_identifier): + self.write_debug(f'"{short_identifier}" is not a requested format, skipping') continue # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called stream_url = traverse_obj(self._call_api( - format_url, track_id, f'Downloading {identifier} format info JSON', + format_url, track_id, f'Downloading {short_identifier} format info JSON', query=query, headers=self._HEADERS), ('url', {url_or_none})) - if invalid_url(stream_url): continue format_urls.add(stream_url) - add_format({ + + mime_type = traverse_obj(t, ('format', 'mime_type', {str})) + codec = self._search_regex(r'codecs="([^"]+)"', mime_type, 'codec', default=None) + ext = { + 'mp4a': 'm4a', + 'opus': 'opus', + }.get(codec[:4] if codec else None) or mimetype2ext(mime_type, default=None) + if not ext or ext == 'm3u8': + ext = preset_base + + is_premium = t.get('quality') == 'hq' + abr = int_or_none( + 
self._search_regex(r'(\d+)k$', preset, 'abr', default=None) + or self._search_regex(r'\.(\d+)\.(?:opus|mp3)[/?]', stream_url, 'abr', default=None) + or (256 if (is_premium and 'aac' in preset) else None)) + + is_preview = (t.get('snipped') + or '/preview/' in format_url + or re.search(r'/(?:preview|playlist)/0/30/', stream_url)) + + formats.append({ + 'format_id': join_nonempty(protocol, preset, is_preview and 'preview', delim='_'), 'url': stream_url, 'ext': ext, - }, protocol, t.get('snipped') or '/preview/' in format_url) - - for f in formats: - f['vcodec'] = 'none' + 'acodec': codec, + 'vcodec': 'none', + 'abr': abr, + 'protocol': 'm3u8_native' if protocol in ('hls', 'hls-aes') else 'http', + 'container': 'm4a_dash' if ext == 'm4a' else None, + 'quality': 5 if is_premium else 0 if (abr and abr >= 160) else -1, + 'format_note': 'Premium' if is_premium else None, + 'preference': -10 if is_preview else None, + }) if not formats and info.get('policy') == 'BLOCK': self.raise_geo_restricted(metadata_available=True) From 4c85ccd1366c88cf93982f8350f58eed17355981 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:34:13 +0000 Subject: [PATCH 23/99] [ie/youtube] Fix signature function extraction (#11751) Closes #11748 Authored by: bashonly --- test/test_youtube_signature.py | 5 +++++ yt_dlp/extractor/youtube.py | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0f7ae34f4..56db096ca 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -68,6 +68,11 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + 
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), ] _NSIG_TESTS = [ diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 83dde7d9c..c9b831618 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3119,19 +3119,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): + # Examples where `sig` is funcname: + # sig=function(a){a=a.split(""); ... ;return a.join("")}; + # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; + # {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))} + # sig=function(J){J=J.split(""); ... ;return J.join("")}; + # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J}; + # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))} funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + (r'\b(?P[a-zA-Z0-9$]+)&&\((?P=var)=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\((?P=var)\)\)', + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*(?P[a-zA-Z0-9$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', + # Old patterns + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - 
r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', - r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'("|\')signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') From a95ee6d8803fca9157adecf63732ab58bf87fd88 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:35:18 +0000 Subject: [PATCH 24/99] [ie/youtube] Fix `n` sig extraction for player `3bb1f723` (#11750) Closes #11744 Authored by: bashonly --- test/test_youtube_signature.py | 11 +++++++++-- yt_dlp/extractor/youtube.py | 15 +++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 56db096ca..2a99436a6 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -188,6 +188,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + 'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ', + ), ] @@ -259,8 +263,11 @@ def signature(jscode, sig_input): def n_sig(jscode, sig_input): - funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) - return JSInterpreter(jscode).call_function(funcname, sig_input) + ie = YoutubeIE(FakeYDL()) + funcname = ie._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) 
+ func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname))) + return func([sig_input]) make_sig_test = t_factory( diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c9b831618..2b026ef05 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3212,6 +3212,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") # * a.D&&(b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") + # * J.J="";J.url="";J.Z&&(R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; funcname, idx = self._search_regex( r'''(?x) (?: @@ -3228,7 +3229,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): )\)&&\(c=| \b(?P[a-zA-Z0-9_$]+)= )(?P[a-zA-Z0-9_$]+)(?:\[(?P\d+)\])?\([a-zA-Z]\) - (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', + (?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''', jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) if not funcname: self.report_warning(join_nonempty( @@ -3237,7 +3238,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self._search_regex( r'''(?xs) ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) - \s*\{(?:(?!};).)+?["']enhanced_except_''', + \s*\{(?:(?!};).)+?return\s*(?P["'])[\w-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+''', jscode, 'Initial JS player n function name', group='name') elif not idx: return funcname @@ -3246,6 +3247,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] + def _fixup_n_function_code(self, argnames, code): + return argnames, re.sub( + rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(["\'])undefined\1\s*\)\s*return\s+{argnames[0]};', + ';', code) 
+ def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') @@ -3257,7 +3263,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): func_name = self._extract_n_function_name(jscode, player_url=player_url) - func_code = jsi.extract_function_code(func_name) + # XXX: Workaround for the `typeof` gotcha + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name)) self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code @@ -3273,7 +3280,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except Exception as e: raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - if ret.startswith('enhanced_except_'): + if ret.startswith('enhanced_except_') or ret.endswith(f'_w8_{s}'): raise JSInterpreter.Exception('Signature function returned an exception') return ret From 4bd2655398aed450456197a6767639114a24eac2 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:58:44 +0000 Subject: [PATCH 25/99] [ie/youtube] Raise if `n` function returns input value (#11752) Improve a95ee6d8803fca9157adecf63732ab58bf87fd88 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2b026ef05..e69373ba2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3280,7 +3280,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except Exception as e: raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - if ret.startswith('enhanced_except_') or ret.endswith(f'_w8_{s}'): + if ret.startswith('enhanced_except_') or ret.endswith(s): raise JSInterpreter.Exception('Signature function returned an exception') return ret From 6fef824025b3c2f0ca8af7ac9fa04b10d09a3591 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" 
<41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:07:07 +0000 Subject: [PATCH 26/99] Release 2024.12.06 Created by: bashonly :ci skip all --- CONTRIBUTORS | 1 + Changelog.md | 13 +++++++++++++ yt_dlp/version.py | 6 +++--- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 8ee7fbffa..240197e8a 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -710,3 +710,4 @@ subrat-lima gitninja1234 jkruse xiaomac +wesson09 diff --git a/Changelog.md b/Changelog.md index c04e936b5..9dc905309 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,19 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.12.06 + +#### Core changes +- **cookies**: [Add `--cookies-from-browser` support for MS Store Firefox](https://github.com/yt-dlp/yt-dlp/commit/354cb4026cf2191e1a130ec2a627b95cabfbc60a) ([#11731](https://github.com/yt-dlp/yt-dlp/issues/11731)) by [wesson09](https://github.com/wesson09) + +#### Extractor changes +- **bilibili**: [Fix HD formats extraction](https://github.com/yt-dlp/yt-dlp/commit/fca3eb5f8be08d5fab2e18b45b7281a12e566725) ([#11734](https://github.com/yt-dlp/yt-dlp/issues/11734)) by [grqz](https://github.com/grqz) +- **soundcloud**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/2feb28028ee48f2185d2d95076e62accb09b9e2e) ([#11742](https://github.com/yt-dlp/yt-dlp/issues/11742)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `n` sig extraction for player `3bb1f723`](https://github.com/yt-dlp/yt-dlp/commit/a95ee6d8803fca9157adecf63732ab58bf87fd88) ([#11750](https://github.com/yt-dlp/yt-dlp/issues/11750)) by [bashonly](https://github.com/bashonly) (With fixes in [4bd2655](https://github.com/yt-dlp/yt-dlp/commit/4bd2655398aed450456197a6767639114a24eac2)) + - [Fix signature function extraction](https://github.com/yt-dlp/yt-dlp/commit/4c85ccd1366c88cf93982f8350f58eed17355981) 
([#11751](https://github.com/yt-dlp/yt-dlp/issues/11751)) by [bashonly](https://github.com/bashonly) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/2e49c789d3eebc39af8910705d65a98bca0e4c4f) ([#11724](https://github.com/yt-dlp/yt-dlp/issues/11724)) by [bashonly](https://github.com/bashonly) + ### 2024.12.03 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index b7d5c57bf..3dec228d3 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.12.03' +__version__ = '2024.12.06' -RELEASE_GIT_HEAD = '2b67ac300ac8b44368fb121637d1743cea8c5b6b' +RELEASE_GIT_HEAD = '4bd2655398aed450456197a6767639114a24eac2' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.12.03' +_pkg_version = '2024.12.06' From f4d3e9e6dc25077b79849a31a2f67f93fdc01e62 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:39:38 +0000 Subject: [PATCH 27/99] [ie/soundcloud] Fix extraction (#11777) Authored by: bashonly --- yt_dlp/extractor/soundcloud.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index e0dda8ff8..66bc5f9c5 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -259,6 +259,8 @@ class SoundcloudBaseIE(InfoExtractor): preset_base = preset.partition('_')[0] protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http' + if protocol.startswith(('ctr-', 'cbc-')): + continue if protocol == 'progressive': protocol = 'http' if protocol != 'hls' and '/hls' in format_url: From bc262bcad4d3683ceadf61a7eb87e233e72adef3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:44:19 +0000 Subject: [PATCH 28/99] [ie/patreon:campaign] Support /c/ URLs (#11756) Closes #11755 Authored by: bashonly --- 
yt_dlp/extractor/patreon.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 6bdeaf157..a0e831a5c 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -457,7 +457,7 @@ class PatreonCampaignIE(PatreonBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?patreon\.com/(?: (?:m|api/campaigns)/(?P\d+)| - (?P(?!creation[?/]|posts/|rss[?/])[\w-]+) + (?:c/)?(?P(?!creation[?/]|posts/|rss[?/])[\w-]+) )(?:/posts)?/?(?:$|[?#])''' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', @@ -509,6 +509,26 @@ class PatreonCampaignIE(PatreonBaseIE): 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 201, + }, { + 'url': 'https://www.patreon.com/c/OgSog', + 'info_dict': { + 'id': '8504388', + 'title': 'OGSoG', + 'description': r're:(?s)Hello and welcome to our Patreon page. We are Mari, Lasercorn, .+', + 'channel': 'OGSoG', + 'channel_id': '8504388', + 'channel_url': 'https://www.patreon.com/OgSog', + 'uploader_url': 'https://www.patreon.com/OgSog', + 'uploader_id': '72323575', + 'uploader': 'David Moss', + 'thumbnail': r're:https?://.+/.+', + 'channel_follower_count': int, + 'age_limit': 0, + }, + 'playlist_mincount': 331, + }, { + 'url': 'https://www.patreon.com/c/OgSog/posts', + 'only_matching': True, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': True, From d5e2a379f2adcb28bc48c7d9e90716d7278f89d2 Mon Sep 17 00:00:00 2001 From: Pew <65320806+MutantPiggieGolem1@users.noreply.github.com> Date: Thu, 12 Dec 2024 05:46:52 -0800 Subject: [PATCH 29/99] [ie/youtube] Fix `release_date` extraction (#11759) Authored by: MutantPiggieGolem1 --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e69373ba2..5ba8a62aa 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4689,7 +4689,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): (?=(?P<artist>[^\n]+))(?P=artist)\n+ (?=(?P<album>[^\n]+))(?P=album)\n (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? - (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? + (?:.+?Released\ on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? (.+?\nArtist\s*:\s* (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n )?.+\nAuto-generated\ by\ YouTube\.\s*$ From f6c73aad5f1a67544bea137ebd9d1e22e0e56567 Mon Sep 17 00:00:00 2001 From: Crypto90 Date: Thu, 12 Dec 2024 14:54:11 +0100 Subject: [PATCH 30/99] [ie/youtube:search_url] Fix playlist searches (#11782) Closes #11666 Authored by: Crypto90 --- yt_dlp/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5ba8a62aa..7eb522a47 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -5282,6 +5282,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)], 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list), + 'lockupViewModel': lambda x: [self._extract_lockup_view_model(x)], } for key, renderer in isr_content.items(): if key not in known_renderers: From 5460cd91891bf613a2065e2fc278d9903c37a127 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:43:08 +0000 Subject: [PATCH 31/99] [ie/youtube] Fix signature function extraction for `2f1832d2` (#11801) Closes #11798 Authored by: bashonly --- test/test_youtube_signature.py | 9 +++++++++ yt_dlp/extractor/youtube.py | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 2a99436a6..13436f088 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -73,6 +73,11 @@ _SIG_TESTS = [
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', ), + ( + 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', + ), ] _NSIG_TESTS = [ @@ -192,6 +197,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', 'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ', ), + ( + 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', + 'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7eb522a47..e4904965d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3127,9 +3127,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J}; # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))} funcname = self._search_regex( - (r'\b(?P<var>[a-zA-Z0-9$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\((?P=var)\)\)', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', + (r'\b(?P<var>[a-zA-Z0-9_$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9_$]{2,})\(decodeURIComponent\((?P=var)\)\)', + r'(?P<sig>[a-zA-Z0-9_$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9_$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', +
r'(?:\b|[^a-zA-Z0-9_$])(?P<sig>[a-zA-Z0-9_$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9_$]{2}\.[a-zA-Z0-9_$]{2}\(a,\d+\))?', # Old patterns r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', From dc3c4fddcc653989dae71fc563d82a308fc898cc Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:21:48 +0000 Subject: [PATCH 32/99] [ie/youtube] Prioritize original language over auto-dubbed audio (#11803) Closes #11753 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e4904965d..fd9c7107c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4067,10 +4067,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if height: res_qualities[height] = quality + display_name = audio_track.get('displayName') or '' + is_original = 'original' in display_name.lower() + is_descriptive = 'descriptive' in display_name.lower() is_default = audio_track.get('audioIsDefault') - is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower() language_code = audio_track.get('id', '').split('.')[0] - if language_code and is_default: + if language_code and (is_original or (is_default and not original_language)): original_language = language_code # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment @@ -4151,7 +4153,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', 'format_note': join_nonempty( - join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''), + join_nonempty(display_name, is_default and ' (default)', delim=''), name, 
fmt.get('isDrc')
and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), @@ -4170,7 +4172,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, - 'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1, + 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } From 54216696261bc07cacd9a837c501d9e0b7fed09e Mon Sep 17 00:00:00 2001 From: sepro Date: Fri, 13 Dec 2024 11:25:29 +0100 Subject: [PATCH 33/99] [cleanup] Make more playlist entries lazy (#11763) Authored by: seproDev --- yt_dlp/extractor/brightcove.py | 5 +++-- yt_dlp/extractor/dvtv.py | 2 +- yt_dlp/extractor/nytimes.py | 2 +- yt_dlp/extractor/vidyard.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2526f25da..3ada1fd5d 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -31,6 +31,7 @@ from ..utils import ( update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj class BrightcoveLegacyIE(InfoExtractor): @@ -935,8 +936,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): if content_type == 'playlist': return self.playlist_result( - [self._parse_brightcove_metadata(vid, vid.get('id'), headers) - for vid in json_data.get('videos', []) if vid.get('id')], + (self._parse_brightcove_metadata(vid, vid['id'], headers) + for vid in traverse_obj(json_data, ('videos', lambda _, v: v['id']))), json_data.get('id'), json_data.get('name'), json_data.get('description')) diff --git a/yt_dlp/extractor/dvtv.py 
b/yt_dlp/extractor/dvtv.py index 3e442b339..52d67d2bd 100644 --- a/yt_dlp/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py @@ -162,7 +162,7 @@ class DVTVIE(InfoExtractor): items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage) if items: return self.playlist_result( - [self._parse_video_metadata(i, video_id, timestamp) for i in items], + (self._parse_video_metadata(i, video_id, timestamp) for i in items), video_id, self._html_search_meta('twitter:title', webpage)) item = self._search_regex( diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 9ef57410a..a97add71a 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -343,7 +343,7 @@ class NYTimesCookingIE(NYTimesBaseIE): if media_ids: media_ids.append(lead_video_id) return self.playlist_result( - [self._extract_video(media_id) for media_id in media_ids], page_id, title, description) + map(self._extract_video, media_ids), page_id, title, description) return { **self._extract_video(lead_video_id), diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py index 2f6d1f4c5..89a89b13f 100644 --- a/yt_dlp/extractor/vidyard.py +++ b/yt_dlp/extractor/vidyard.py @@ -421,5 +421,5 @@ class VidyardIE(VidyardBaseIE): return self._process_video_json(video_json['chapters'][0], video_id) return self.playlist_result( - [self._process_video_json(chapter, video_id) for chapter in video_json['chapters']], + (self._process_video_json(chapter, video_id) for chapter in video_json['chapters']), str(video_json['playerUuid']), video_json.get('name')) From 2037a6414f81db8080ca724dca506fde91974c5d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:35:40 +0000 Subject: [PATCH 34/99] Release 2024.12.13 Created by: bashonly :ci skip all --- CONTRIBUTORS | 2 ++ Changelog.md | 14 ++++++++++++++ yt_dlp/version.py | 6 +++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS 
b/CONTRIBUTORS index 240197e8a..4b6964260 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -711,3 +711,5 @@ gitninja1234 jkruse xiaomac wesson09 +Crypto90 +MutantPiggieGolem1 diff --git a/Changelog.md b/Changelog.md index 9dc905309..75e824033 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,20 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.12.13 + +#### Extractor changes +- **patreon**: campaign: [Support /c/ URLs](https://github.com/yt-dlp/yt-dlp/commit/bc262bcad4d3683ceadf61a7eb87e233e72adef3) ([#11756](https://github.com/yt-dlp/yt-dlp/issues/11756)) by [bashonly](https://github.com/bashonly) +- **soundcloud**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/f4d3e9e6dc25077b79849a31a2f67f93fdc01e62) ([#11777](https://github.com/yt-dlp/yt-dlp/issues/11777)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `release_date` extraction](https://github.com/yt-dlp/yt-dlp/commit/d5e2a379f2adcb28bc48c7d9e90716d7278f89d2) ([#11759](https://github.com/yt-dlp/yt-dlp/issues/11759)) by [MutantPiggieGolem1](https://github.com/MutantPiggieGolem1) + - [Fix signature function extraction for `2f1832d2`](https://github.com/yt-dlp/yt-dlp/commit/5460cd91891bf613a2065e2fc278d9903c37a127) ([#11801](https://github.com/yt-dlp/yt-dlp/issues/11801)) by [bashonly](https://github.com/bashonly) + - [Prioritize original language over auto-dubbed audio](https://github.com/yt-dlp/yt-dlp/commit/dc3c4fddcc653989dae71fc563d82a308fc898cc) ([#11803](https://github.com/yt-dlp/yt-dlp/issues/11803)) by [bashonly](https://github.com/bashonly) + - search_url: [Fix playlist searches](https://github.com/yt-dlp/yt-dlp/commit/f6c73aad5f1a67544bea137ebd9d1e22e0e56567) ([#11782](https://github.com/yt-dlp/yt-dlp/issues/11782)) by [Crypto90](https://github.com/Crypto90) + +#### Misc. 
changes +- **cleanup**: [Make more playlist entries lazy](https://github.com/yt-dlp/yt-dlp/commit/54216696261bc07cacd9a837c501d9e0b7fed09e) ([#11763](https://github.com/yt-dlp/yt-dlp/issues/11763)) by [seproDev](https://github.com/seproDev) + ### 2024.12.06 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 3dec228d3..f696e1e9d 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.12.06' +__version__ = '2024.12.13' -RELEASE_GIT_HEAD = '4bd2655398aed450456197a6767639114a24eac2' +RELEASE_GIT_HEAD = '54216696261bc07cacd9a837c501d9e0b7fed09e' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.12.06' +_pkg_version = '2024.12.13' From 3d3ee458c1fe49dd5ebd7651a092119d23eb7000 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 15 Dec 2024 19:47:50 +0000 Subject: [PATCH 35/99] [update] Fix endless update loop for `linux_exe` builds (#11827) Closes #11808 Authored by: bashonly --- yt_dlp/update.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index ca2ec5f37..dfab132af 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -525,11 +525,16 @@ class Updater: @functools.cached_property def cmd(self): """The command-line to run the executable, if known""" + argv = None # There is no sys.orig_argv in py < 3.10. 
Also, it can be [] when frozen if getattr(sys, 'orig_argv', None): - return sys.orig_argv + argv = sys.orig_argv elif getattr(sys, 'frozen', False): - return sys.argv + argv = sys.argv + # linux_static exe's argv[0] will be /tmp/staticx-NNNN/yt-dlp_linux if we don't fixup here + if argv and os.getenv('STATICX_PROG_PATH'): + argv = [self.filename, *argv[1:]] + return argv def restart(self): """Restart the executable""" From b91c3925c2059970daa801cb131c0c2f4f302e72 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 15 Dec 2024 19:55:30 +0000 Subject: [PATCH 36/99] [update] Check 64-bitness when upgrading ARM builds (#11819) Closes #11813 Authored by: bashonly --- yt_dlp/update.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/update.py b/yt_dlp/update.py index dfab132af..360f5ad58 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -65,9 +65,14 @@ def _get_variant_and_executable_path(): machine = '_legacy' if version_tuple(platform.mac_ver()[0]) < (10, 15) else '' else: machine = f'_{platform.machine().lower()}' + is_64bits = sys.maxsize > 2**32 # Ref: https://en.wikipedia.org/wiki/Uname#Examples if machine[1:] in ('x86', 'x86_64', 'amd64', 'i386', 'i686'): - machine = '_x86' if platform.architecture()[0][:2] == '32' else '' + machine = '_x86' if not is_64bits else '' + # platform.machine() on 32-bit raspbian OS may return 'aarch64', so check "64-bitness" + # See: https://github.com/yt-dlp/yt-dlp/issues/11813 + elif machine[1:] == 'aarch64' and not is_64bits: + machine = '_armv7l' # sys.executable returns a /tmp/ path for staticx builds (linux_static) # Ref: https://staticx.readthedocs.io/en/latest/usage.html#run-time-information if static_exe_path := os.getenv('STATICX_PROG_PATH'): From 1a8851b689763e5173b96f70f8a71df0e4a44b66 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 15 Dec 2024 20:07:18 +0000 Subject: [PATCH 37/99] [ie/youtube] Fix 
`uploader_id` extraction (#11818) Closes #11816 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index fd9c7107c..e12f728ea 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -518,11 +518,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) def handle_or_none(self, handle): - return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''), + '@-handle', default=None) def handle_from_url(self, url): return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', - url, 'channel handle', default=None) + urllib.parse.unquote(url or ''), 'channel handle', default=None) def ucid_from_url(self, url): return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', @@ -2801,6 +2802,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, }, }, + { + # uploader_id has non-ASCII characters that are percent-encoded in YT's JSON + 'url': 'https://www.youtube.com/shorts/18NGQq7p3LY', + 'info_dict': { + 'id': '18NGQq7p3LY', + 'ext': 'mp4', + 'title': '아이브 이서 장원영 리즈 삐끼삐끼 챌린지', + 'description': '', + 'uploader': 'ㅇㅇ', + 'uploader_id': '@으아-v1k', + 'uploader_url': 'https://www.youtube.com/@으아-v1k', + 'channel': 'ㅇㅇ', + 'channel_id': 'UCC25oTm2J7ZVoi5TngOHg9g', + 'channel_url': 'https://www.youtube.com/channel/UCC25oTm2J7ZVoi5TngOHg9g', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'playable_in_embed': True, + 'age_limit': 0, + 'duration': 3, + 'timestamp': 1724306170, + 'upload_date': '20240822', + 'availability': 'public', + 'live_status': 
'not_live', + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'categories': ['People & Blogs'], + 'tags': [], + }, + }, ] _WEBPAGE_TESTS = [ From 09a6c687126f04e243fcb105a828787efddd1030 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 15 Dec 2024 20:09:48 +0000 Subject: [PATCH 38/99] [ie/youtube] Add age-gate workaround for some embeddable videos (#11821) Closes #11296 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e12f728ea..0d3963116 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1496,7 +1496,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, # Age-gate videos. See https://github.com/yt-dlp/yt-dlp/pull/575#issuecomment-888837000 { - 'note': 'Embed allowed age-gate video', + 'note': 'Embed allowed age-gate video; works with web_embedded', 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', @@ -1526,7 +1526,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'heatmap': 'count:100', 'timestamp': 1401991663, }, - 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video with embed allowed in public site', @@ -4013,10 +4012,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: prs.append(pr) + # web_embedded can work around age-gate and age-verification for some embeddable videos + if self._is_agegated(pr) and variant != 'web_embedded': + append_client(f'web_embedded.{base_client}') + # Unauthenticated users will only get web_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. 
{self._login_hint()}', only_once=True) + ''' This code is pointless while web_creator is in _DEFAULT_AUTHED_CLIENTS # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - if self.is_authenticated and self._is_agegated(pr): + embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) From d298693b1b266d198e8eeecb90ea17c4a031268f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 15 Dec 2024 20:16:04 +0000 Subject: [PATCH 39/99] [ie/soundcloud] Various fixes (#11820) - Fix original/download formats so that they are considered bestaudio - Raise appropriate error if track is DRM-protected Authored by: bashonly --- yt_dlp/extractor/soundcloud.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 66bc5f9c5..85779e91a 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -210,6 +210,7 @@ class SoundcloudBaseIE(InfoExtractor): format_urls = set() formats = [] + has_drm = False query = {'client_id': self._CLIENT_ID} if secret_token: query['secret_token'] = secret_token @@ -245,6 +246,7 @@ class SoundcloudBaseIE(InfoExtractor): 'url': format_url, 'quality': 10, 'format_note': 'Original', + 'vcodec': 'none', }) def invalid_url(url): @@ -260,6 +262,7 @@ class SoundcloudBaseIE(InfoExtractor): protocol = traverse_obj(t, ('format', 'protocol', {str})) or 'http' if protocol.startswith(('ctr-', 'cbc-')): + has_drm = True continue if protocol == 'progressive': protocol = 'http' @@ -317,8 +320,11 @@ class SoundcloudBaseIE(InfoExtractor): 
'preference': -10 if is_preview else None, }) - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted(metadata_available=True) + if not formats: + if has_drm: + self.report_drm(track_id) + if info.get('policy') == 'BLOCK': + self.raise_geo_restricted(metadata_available=True) user = info.get('user') or {} From 6fc85f617a5850307fd5b258477070e6ee177796 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 23 Dec 2024 15:57:25 +0530 Subject: [PATCH 40/99] Don't sanitize filename on Unix when `--no-windows-filenames` (#9591) Closes #4547, Closes #8464 Authored by: pukkandan --- README.md | 3 +-- test/test_YoutubeDL.py | 7 +++++++ yt_dlp/YoutubeDL.py | 26 ++++++++++++++++++-------- yt_dlp/options.py | 4 ++-- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1db4ed2a5..f8c99ace4 100644 --- a/README.md +++ b/README.md @@ -613,8 +613,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-restrict-filenames Allow Unicode characters, "&" and spaces in filenames (default) --windows-filenames Force filenames to be Windows-compatible - --no-windows-filenames Make filenames Windows-compatible only if - using Windows (default) + --no-windows-filenames Sanitize filenames only minimally --trim-filenames LENGTH Limit the filename length (excluding extension) to the specified number of characters diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 966d27a49..6b022a7ea 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -761,6 +761,13 @@ class TestYoutubeDL(unittest.TestCase): test('%(width)06d.%%(ext)s', 'NA.%(ext)s') test('%%(width)06d.%(ext)s', '%(width)06d.mp4') + # Sanitization options + test('%(title3)s', (None, 'foo⧸bar⧹test')) + test('%(title5)s', (None, 'aei_A'), restrictfilenames=True) + test('%(title3)s', (None, 'foo_bar_test'), windowsfilenames=False, restrictfilenames=True) + if sys.platform != 'win32': + test('%(title3)s', (None, 
'foo⧸bar\\test'), windowsfilenames=False) + # ID sanitization test('%(id)s', '_abcd', info={'id': '_abcd'}) test('%(some_id)s', '_abcd', info={'some_id': '_abcd'}) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 65b72e026..764baf3a0 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -266,7 +266,9 @@ class YoutubeDL: outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names trim_file_name: Limit length of filename (extension excluded) - windowsfilenames: Force the filenames to be windows compatible + windowsfilenames: True: Force filenames to be Windows compatible + False: Sanitize filenames only minimally + This option has no effect when running on Windows ignoreerrors: Do not stop on download/postprocessing errors. Can be 'only_download' to ignore only download errors. Default is 'only_download' for CLI, but False for API @@ -1192,8 +1194,7 @@ class YoutubeDL: def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict - @param sanitize Whether to sanitize the output as a filename. 
- For backward compatibility, a function can also be passed + @param sanitize Whether to sanitize the output as a filename """ info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set @@ -1309,14 +1310,23 @@ class YoutubeDL: na = self.params.get('outtmpl_na_placeholder', 'NA') - def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + def filename_sanitizer(key, value, restricted): return sanitize_filename(str(value), restricted=restricted, is_id=( bool(re.search(r'(^|[_.])id(\.|$)', key)) if 'filename-sanitization' in self.params['compat_opts'] else NO_DEFAULT)) - sanitizer = sanitize if callable(sanitize) else filename_sanitizer - sanitize = bool(sanitize) + if callable(sanitize): + self.deprecation_warning('Passing a callable "sanitize" to YoutubeDL.prepare_outtmpl is deprecated') + elif not sanitize: + pass + elif (sys.platform != 'win32' and not self.params.get('restrictfilenames') + and self.params.get('windowsfilenames') is False): + def sanitize(key, value): + return value.replace('/', '\u29F8').replace('\0', '') + else: + def sanitize(key, value): + return filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')) def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): @@ -1399,13 +1409,13 @@ class YoutubeDL: if sanitize: # If value is an object, sanitize might convert it to a string - # So we convert it to repr first + # So we manually convert it before sanitizing if fmt[-1] == 'r': value, fmt = repr(value), str_fmt elif fmt[-1] == 'a': value, fmt = ascii(value), str_fmt if fmt[-1] in 'csra': - value = sanitizer(last_field, value) + value = sanitize(last_field, value) key = '{}\0{}'.format(key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 930d9d4be..06b65e0ea 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1370,12 +1370,12 @@ def create_parser(): help='Allow Unicode characters, 
"&" and spaces in filenames (default)') filesystem.add_option( '--windows-filenames', - action='store_true', dest='windowsfilenames', default=False, + action='store_true', dest='windowsfilenames', default=None, help='Force filenames to be Windows-compatible') filesystem.add_option( '--no-windows-filenames', action='store_false', dest='windowsfilenames', - help='Make filenames Windows-compatible only if using Windows (default)') + help='Sanitize filenames only minimally') filesystem.add_option( '--trim-filenames', '--trim-file-names', metavar='LENGTH', dest='trim_file_name', default=0, type=int, From 9f42e68a74f3f00b0253fe70763abd57cac4237b Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Tue, 24 Dec 2024 12:03:28 +1300 Subject: [PATCH 41/99] [ie/youtube] Skip iOS formats that require PO Token (#11890) Partial fix for https://github.com/yt-dlp/yt-dlp/issues/11868 Authored by: coletdjnz --- README.md | 2 +- yt_dlp/extractor/youtube.py | 44 +++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f8c99ace4..1c628d025 100644 --- a/README.md +++ b/README.md @@ -1775,7 +1775,7 @@ The following extractors use this feature: * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) +* `formats`: Change the types of formats to return. 
`dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 0d3963116..2638eaa5d 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -214,6 +214,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_PO_TOKEN': True, 'REQUIRE_JS_PLAYER': False, }, # This client now requires sign-in for every video @@ -3973,13 +3974,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') - if not po_token and require_po_token: + if not po_token and require_po_token and 'missing_pot' in self._configuration_arg('formats'): self.report_warning( f'No PO Token provided for {client} client, ' - f'which is required for working {client} formats. ' - f'You can manually pass a PO Token for this client with ' - f'--extractor-args "youtube:po_token={client}+XXX"', - only_once=True) + f'which may be required for working {client} formats. 
This client will be deprioritized', only_once=True) deprioritize_pr = True pr = initial_pr if client == 'web' else None @@ -4053,6 +4051,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or (live_status == 'post_live' and (duration or 0) > 2 * 3600)): return live_status + def _report_pot_format_skipped(self, video_id, client_name, proto): + msg = ( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' + 'They will be skipped as they may yield HTTP Error 403. ' + f'You can manually pass a PO Token for this client with --extractor-args "youtube:po_token={client_name}+XXX. ' + 'For more information, refer to https://github.com/yt-dlp/yt-dlp/wiki/Extractors#po-token-guide . ' + 'To enable these broken formats anyway, pass --extractor-args "youtube:formats=missing_pot"') + + # Only raise a warning for non-default clients, to not confuse users. + # iOS HLS formats still work without PO Token, so we don't need to warn about them. + if client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -4179,11 +4192,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt_url = update_url_query(fmt_url, {'pot': po_token}) # Clients that require PO Token return videoplayback URLs that may return 403 - is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) - if is_broken: - self.report_warning( - f'{video_id}: {client_name} client formats require a PO Token which was not provided. 
' - 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + require_po_token = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) + if require_po_token and 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, 'https') + continue name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4196,7 +4208,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - is_damaged and 'DAMAGED', is_broken and 'BROKEN', + is_damaged and 'DAMAGED', require_po_token and 'MISSING POT', (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 @@ -4213,7 +4225,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, # Strictly de-prioritize broken, damaged and 3gp formats - 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, + 'preference': -20 if require_po_token else -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -4271,10 +4283,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Clients that require PO Token return videoplayback URLs that may return 403 # hls does not currently require PO Token if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': - self.report_warning( - f'{video_id}: {client_name} client {proto} 
formats require a PO Token which was not provided. ' - 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) - f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + if 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, proto) + return False + f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') f['source_preference'] -= 20 if itag and all_formats: From 65cf46cddd873fd229dbb0fc0689bca4c201c6b6 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 23 Dec 2024 17:26:35 -0600 Subject: [PATCH 42/99] [ie/youtube] Player client maintenance (#11893) Closes #11867 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2638eaa5d..33a93c5c9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -162,7 +162,6 @@ INNERTUBE_CLIENTS = { 'REQUIRE_JS_PLAYER': False, 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, - 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'android_creator': { @@ -197,7 +196,6 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, 'REQUIRE_JS_PLAYER': False, - 'SUPPORTS_COOKIES': True, }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. 
# See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 @@ -233,7 +231,6 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, 'REQUIRE_AUTH': True, - 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video 'ios_creator': { @@ -4028,7 +4025,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) # web_creator can work around the age-verification requirement - # android_vr may also be able to work around age-verification # tv_embedded may(?) still work around age-verification if the video is embeddable append_client('web_creator') ''' From 3905f64920ed078d9eeb5640884f5854e01d744d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 23:47:20 +0000 Subject: [PATCH 43/99] Release 2024.12.23 Created by: bashonly :ci skip all --- Changelog.md | 16 ++++++++++++++++ yt_dlp/version.py | 6 +++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index 75e824033..22a9a6e4b 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,22 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.12.23 + +#### Core changes +- [Don't sanitize filename on Unix when `--no-windows-filenames`](https://github.com/yt-dlp/yt-dlp/commit/6fc85f617a5850307fd5b258477070e6ee177796) ([#9591](https://github.com/yt-dlp/yt-dlp/issues/9591)) by [pukkandan](https://github.com/pukkandan) +- **update** + - [Check 64-bitness when upgrading ARM builds](https://github.com/yt-dlp/yt-dlp/commit/b91c3925c2059970daa801cb131c0c2f4f302e72) ([#11819](https://github.com/yt-dlp/yt-dlp/issues/11819)) by [bashonly](https://github.com/bashonly) + - [Fix endless update loop for `linux_exe` 
builds](https://github.com/yt-dlp/yt-dlp/commit/3d3ee458c1fe49dd5ebd7651a092119d23eb7000) ([#11827](https://github.com/yt-dlp/yt-dlp/issues/11827)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **soundcloud**: [Various fixes](https://github.com/yt-dlp/yt-dlp/commit/d298693b1b266d198e8eeecb90ea17c4a031268f) ([#11820](https://github.com/yt-dlp/yt-dlp/issues/11820)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add age-gate workaround for some embeddable videos](https://github.com/yt-dlp/yt-dlp/commit/09a6c687126f04e243fcb105a828787efddd1030) ([#11821](https://github.com/yt-dlp/yt-dlp/issues/11821)) by [bashonly](https://github.com/bashonly) + - [Fix `uploader_id` extraction](https://github.com/yt-dlp/yt-dlp/commit/1a8851b689763e5173b96f70f8a71df0e4a44b66) ([#11818](https://github.com/yt-dlp/yt-dlp/issues/11818)) by [bashonly](https://github.com/bashonly) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/65cf46cddd873fd229dbb0fc0689bca4c201c6b6) ([#11893](https://github.com/yt-dlp/yt-dlp/issues/11893)) by [bashonly](https://github.com/bashonly) + - [Skip iOS formats that require PO Token](https://github.com/yt-dlp/yt-dlp/commit/9f42e68a74f3f00b0253fe70763abd57cac4237b) ([#11890](https://github.com/yt-dlp/yt-dlp/issues/11890)) by [coletdjnz](https://github.com/coletdjnz) + ### 2024.12.13 #### Extractor changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index f696e1e9d..1ff43c611 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.12.13' +__version__ = '2024.12.23' -RELEASE_GIT_HEAD = '54216696261bc07cacd9a837c501d9e0b7fed09e' +RELEASE_GIT_HEAD = '65cf46cddd873fd229dbb0fc0689bca4c201c6b6' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.12.13' +_pkg_version = '2024.12.23' From 0b6b7742c2e7f2a1fcb0b54ef3dd484bab404b3f Mon Sep 17 00:00:00 2001 From: coletdjnz Date: 
Thu, 26 Dec 2024 14:19:17 +1300 Subject: [PATCH 44/99] [ie/youtube] Fix DASH formats incorrectly skipped in some situations (#11910) Closes https://github.com/yt-dlp/yt-dlp/issues/11907 Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 33a93c5c9..1e83e41b8 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -256,6 +256,7 @@ INNERTUBE_CLIENTS = { 'client': { 'clientName': 'MWEB', 'clientVersion': '2.20241202.07.00', + # mweb does not require PO Token with this UA 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', }, }, @@ -4051,7 +4052,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): msg = ( f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' 'They will be skipped as they may yield HTTP Error 403. ' - f'You can manually pass a PO Token for this client with --extractor-args "youtube:po_token={client_name}+XXX. ' + f'You can manually pass a PO Token for this client with --extractor-args "youtube:po_token={client_name}+XXX". ' 'For more information, refer to https://github.com/yt-dlp/yt-dlp/wiki/Extractors#po-token-guide . 
' 'To enable these broken formats anyway, pass --extractor-args "youtube:formats=missing_pot"') @@ -4271,7 +4272,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False - itags[itag].add(key) if f.get('source_preference') is None: f['source_preference'] = -1 @@ -4285,6 +4285,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') f['source_preference'] -= 20 + itags[itag].add(key) + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): From 3c14e9191f3035b9a729d1d87bc0381f42de57cf Mon Sep 17 00:00:00 2001 From: voidptr_t Date: Sat, 11 Jan 2025 17:39:31 +0300 Subject: [PATCH 45/99] [ie/PlVideo] Add extractor (#10657) Closes #10311 Authored by: Sanceilaks, seproDev Co-authored-by: sepro --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/plvideo.py | 130 ++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 yt_dlp/extractor/plvideo.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 967010826..bbd6d21bd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1551,6 +1551,7 @@ from .pluralsight import ( PluralsightIE, ) from .plutotv import PlutoTVIE +from .plvideo import PlVideoIE from .podbayfm import ( PodbayFMChannelIE, PodbayFMIE, diff --git a/yt_dlp/extractor/plvideo.py b/yt_dlp/extractor/plvideo.py new file mode 100644 index 000000000..9351af10a --- /dev/null +++ b/yt_dlp/extractor/plvideo.py @@ -0,0 +1,130 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + parse_resolution, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class PlVideoIE(InfoExtractor): + IE_DESC = 'Платформа' + _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P<id>[\w-]+)' + _TESTS
= [{ + 'url': 'https://plvideo.ru/watch?v=Y5JzUzkcQTMK', + 'md5': 'fe8e18aca892b3b31f3bf492169f8a26', + 'info_dict': { + 'id': 'Y5JzUzkcQTMK', + 'ext': 'mp4', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/37/dd/37dd00a4c96c77436ab737e85947abd7/original663a4a3bb713e5.33151959.jpg', + 'title': 'Presidente de Cuba llega a Moscú en una visita de trabajo', + 'channel': 'RT en Español', + 'channel_id': 'ZH4EKqunVDvo', + 'media_type': 'video', + 'comment_count': int, + 'tags': ['rusia', 'cuba', 'russia', 'miguel díaz-canel'], + 'description': 'md5:a1a395d900d77a86542a91ee0826c115', + 'released_timestamp': 1715096124, + 'channel_is_verified': True, + 'like_count': int, + 'timestamp': 1715095911, + 'duration': 44320, + 'view_count': int, + 'dislike_count': int, + 'upload_date': '20240507', + 'modified_date': '20240701', + 'channel_follower_count': int, + 'modified_timestamp': 1719824073, + }, + }, { + 'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX', + 'md5': '7d8fa2279406c69d2fd2a6fc548a9805', + 'info_dict': { + 'id': 'S3Uo9c-VLwFX', + 'ext': 'mp4', + 'channel': 'Romaatom', + 'tags': 'count:22', + 'dislike_count': int, + 'upload_date': '20241130', + 'description': 'md5:452e6de219bf2f32bb95806c51c3b364', + 'duration': 58433, + 'modified_date': '20241130', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg', + 'media_type': 'shorts', + 'like_count': int, + 'modified_timestamp': 1732961458, + 'channel_is_verified': True, + 'channel_id': 'erJyyTIbmUd1', + 'timestamp': 1732961355, + 'comment_count': int, + 'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе', + 'channel_follower_count': int, + 'view_count': int, + 'released_timestamp': 1732961458, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._download_json( + f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id) + + is_live = False + formats = [] + 
subtitles = {} + automatic_captions = {} + for quality, data in traverse_obj(video_data, ('item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))): + formats.append({ + 'format_id': quality, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + **traverse_obj(data, { + 'url': 'hls', + 'fps': ('fps', {float_or_none}), + 'aspect_ratio': ('aspectRatio', {float_or_none}), + }), + **parse_resolution(quality), + }) + if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})): + is_live = True + formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True)) + for lang, url in traverse_obj(video_data, ('item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))): + if lang.endswith('-auto'): + automatic_captions.setdefault(lang[:-5], []).append({ + 'url': url, + }) + else: + subtitles.setdefault(lang, []).append({ + 'url': url, + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + 'is_live': is_live, + **traverse_obj(video_data, ('item', { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}), + 'duration': ('uploadFile', 'videoDuration', {int_or_none}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'id', {str}), + 'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}), + 'channel_is_verified': ('channel', 'verified', {bool}), + 'tags': ('tags', ..., {str}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'released_timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 'view_count': ('stats', 'viewTotalCount', {int_or_none}), + 'like_count': ('stats', 'likeCount', {int_or_none}), + 'dislike_count': ('stats', 'dislikeCount', {int_or_none}), + 'comment_count': ('stats', 'commentCount', {int_or_none}), + 'media_type': ('type', 
{str}), + })), + } From 763ed06ee69f13949397897bd42ff2ec3dc3d384 Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Sun, 12 Jan 2025 01:25:18 +0800 Subject: [PATCH 46/99] [ie/XiaoHongShu] Extend `_VALID_URL` (#11806) Closes #11797 Authored by: HobbyistDev --- yt_dlp/extractor/xiaohongshu.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/xiaohongshu.py b/yt_dlp/extractor/xiaohongshu.py index 1280ca6a9..46543b823 100644 --- a/yt_dlp/extractor/xiaohongshu.py +++ b/yt_dlp/extractor/xiaohongshu.py @@ -10,7 +10,7 @@ from ..utils.traversal import traverse_obj class XiaoHongShuIE(InfoExtractor): - _VALID_URL = r'https?://www\.xiaohongshu\.com/explore/(?P<id>[\da-f]+)' + _VALID_URL = r'https?://www\.xiaohongshu\.com/(?:explore|discovery/item)/(?P<id>[\da-f]+)' IE_DESC = '小红书' _TESTS = [{ 'url': 'https://www.xiaohongshu.com/explore/6411cf99000000001300b6d9', @@ -25,6 +25,18 @@ class XiaoHongShuIE(InfoExtractor): 'duration': 101.726, 'thumbnail': r're:https?://sns-webpic-qc\.xhscdn\.com/\d+/[a-z0-9]+/[\w]+', }, + }, { + 'url': 'https://www.xiaohongshu.com/discovery/item/674051740000000007027a15?xsec_token=CBgeL8Dxd1ZWBhwqRd568gAZ_iwG-9JIf9tnApNmteU2E=', + 'info_dict': { + 'id': '674051740000000007027a15', + 'ext': 'mp4', + 'title': '相互喜欢就可以了', + 'uploader_id': '63439913000000001901f49a', + 'duration': 28.073, + 'description': '#广州[话题]# #深圳[话题]# #香港[话题]# #街头采访[话题]# #是你喜欢的类型[话题]#', + 'thumbnail': r're:https?://sns-webpic-qc\.xhscdn\.com/\d+/[\da-f]+/[^/]+', + 'tags': ['广州', '深圳', '香港', '街头采访', '是你喜欢的类型'], + }, }] def _real_extract(self, url): From 1f4e1e85a27c5b43e34d7706cfd88ffce1b56a4a Mon Sep 17 00:00:00 2001 From: Paul Storkman <111140294+Strkmn@users.noreply.github.com> Date: Sat, 11 Jan 2025 19:51:16 +0100 Subject: [PATCH 47/99] [core] Validate retries values are non-negative (#11927) Closes #11926 Authored by: Strkmn --- yt_dlp/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1
deletion(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 20111175b..c76fe2748 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -261,9 +261,11 @@ def validate_options(opts): elif value in ('inf', 'infinite'): return float('inf') try: - return int(value) + int_value = int(value) except (TypeError, ValueError): validate(False, f'{name} retry count', value) + validate_positive(f'{name} retry count', int_value) + return int_value opts.retries = parse_retries('download', opts.retries) opts.fragment_retries = parse_retries('fragment', opts.fragment_retries) From 8346b549150003df988538e54c9d8bc4de568979 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 11 Jan 2025 13:05:23 -0600 Subject: [PATCH 48/99] Fix filename sanitization with `--no-windows-filenames` (#11988) Fix bug in 6fc85f617a5850307fd5b258477070e6ee177796 Closes #11987 Authored by: bashonly --- yt_dlp/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 764baf3a0..178c5b951 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1323,7 +1323,7 @@ class YoutubeDL: elif (sys.platform != 'win32' and not self.params.get('restrictfilenames') and self.params.get('windowsfilenames') is False): def sanitize(key, value): - return value.replace('/', '\u29F8').replace('\0', '') + return str(value).replace('/', '\u29F8').replace('\0', '') else: def sanitize(key, value): return filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')) From 712d2abb32f59b2d246be2901255f84f1a4c30b3 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 12 Jan 2025 15:01:13 +1300 Subject: [PATCH 49/99] [ie/youtube] Use `tv` instead of `mweb` client by default (#12059) Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1e83e41b8..f414d9d03 
100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -256,11 +256,12 @@ INNERTUBE_CLIENTS = { 'client': { 'clientName': 'MWEB', 'clientVersion': '2.20241202.07.00', - # mweb does not require PO Token with this UA + # mweb previously did not require PO Token with this UA 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, + 'REQUIRE_PO_TOKEN': True, 'SUPPORTS_COOKIES': True, }, 'tv': { @@ -1356,8 +1357,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _DEFAULT_CLIENTS = ('ios', 'mweb') - _DEFAULT_AUTHED_CLIENTS = ('web_creator', 'mweb') + _DEFAULT_CLIENTS = ('ios', 'tv') + _DEFAULT_AUTHED_CLIENTS = ('web_creator', 'tv') _GEO_BYPASS = False From 75079f4e3f7dce49b61ef01da7adcd9876a0ca3b Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sun, 12 Jan 2025 15:02:57 +1300 Subject: [PATCH 50/99] [ie/youtube] Refactor cookie auth (#11989) Authored by: coletdjnz --- yt_dlp/extractor/youtube.py | 178 +++++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 54 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index f414d9d03..e16ec43ed 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -32,7 +32,6 @@ from ..utils import ( classproperty, clean_html, datetime_from_str, - dict_get, filesize_from_tbr, filter_dict, float_or_none, @@ -568,9 +567,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'}) self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) + def _initialize_cookie_auth(self): + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + if yt_sapisid or yt_1psapisid or 
yt_3psapisid: + self.write_debug('Found YouTube account cookies') + def _real_initialize(self): self._initialize_pref() self._initialize_consent() + self._initialize_cookie_auth() self._check_login_required() def _perform_login(self, username, password): @@ -628,32 +633,63 @@ class YoutubeBaseInfoExtractor(InfoExtractor): client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context - _SAPISID = None + @staticmethod + def _make_sid_authorization(scheme, sid, origin, additional_parts): + timestamp = str(round(time.time())) - def _generate_sapisidhash_header(self, origin='https://www.youtube.com'): - time_now = round(time.time()) - if self._SAPISID is None: - yt_cookies = self._get_cookies('https://www.youtube.com') - # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is. - # See: https://github.com/yt-dlp/yt-dlp/issues/393 - sapisid_cookie = dict_get( - yt_cookies, ('__Secure-3PAPISID', 'SAPISID')) - if sapisid_cookie and sapisid_cookie.value: - self._SAPISID = sapisid_cookie.value - self.write_debug('Extracted SAPISID cookie') - # SAPISID cookie is required if not already present - if not yt_cookies.get('SAPISID'): - self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie') - self._set_cookie( - '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600) - else: - self._SAPISID = False - if not self._SAPISID: + hash_parts = [] + if additional_parts: + hash_parts.append(':'.join(additional_parts.values())) + hash_parts.extend([timestamp, sid, origin]) + sidhash = hashlib.sha1(' '.join(hash_parts).encode()).hexdigest() + + parts = [timestamp, sidhash] + if additional_parts: + parts.append(''.join(additional_parts)) + + return f'{scheme} {"_".join(parts)}' + + def _get_sid_cookies(self): + """ + Get SAPISID, 1PSAPISID, 3PSAPISID cookie values + @returns sapisid, 1psapisid, 3psapisid + """ + yt_cookies = self._get_cookies('https://www.youtube.com') + yt_sapisid = 
try_call(lambda: yt_cookies['SAPISID'].value) + yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value) + yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value) + + # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is. + # YouTube also falls back to __Secure-3PAPISID if SAPISID is missing. + # See: https://github.com/yt-dlp/yt-dlp/issues/393 + + return yt_sapisid or yt_3papisid, yt_1papisid, yt_3papisid + + def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_session_id=None): + """ + Generate API Session ID Authorization for Innertube requests. Assumes all requests are secure (https). + @param origin: Origin URL + @param user_session_id: Optional User Session ID + @return: Authorization header value + """ + + authorizations = [] + additional_parts = {} + if user_session_id: + additional_parts['u'] = user_session_id + + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + + for scheme, sid in (('SAPISIDHASH', yt_sapisid), + ('SAPISID1PHASH', yt_1psapisid), + ('SAPISID3PHASH', yt_3psapisid)): + if sid: + authorizations.append(self._make_sid_authorization(scheme, sid, origin, additional_parts)) + + if not authorizations: return None - # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323 - sapisidhash = hashlib.sha1( - f'{time_now} {self._SAPISID} {origin}'.encode()).hexdigest() - return f'SAPISIDHASH {time_now}_{sapisidhash}' + + return ' '.join(authorizations) def _call_api(self, ep, query, video_id, fatal=True, headers=None, note='Downloading API JSON', errnote='Unable to download API page', @@ -689,26 +725,48 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if session_index is not None: return session_index - def _data_sync_id_to_delegated_session_id(self, data_sync_id): - if not data_sync_id: - return - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. 
We only want the channel_syncid - channel_syncid, _, user_syncid = data_sync_id.partition('||') - if user_syncid: - return channel_syncid - - def _extract_account_syncid(self, *args): + @staticmethod + def _parse_data_sync_id(data_sync_id): """ - Extract current session ID required to download private playlists of secondary channels + Parse data_sync_id into delegated_session_id and user_session_id. + + data_sync_id is of the form "delegated_session_id||user_session_id" for secondary channel + and just "user_session_id||" for primary channel. + + @param data_sync_id: data_sync_id string + @return: Tuple of (delegated_session_id, user_session_id) + """ + if not data_sync_id: + return None, None + first, _, second = data_sync_id.partition('||') + if second: + return first, second + return None, first + + def _extract_delegated_session_id(self, *args): + """ + Extract current delegated session ID required to download private playlists of secondary channels @params response and/or ytcfg + @return: delegated session ID """ # ytcfg includes channel_syncid if on secondary channel if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): return delegated_sid data_sync_id = self._extract_data_sync_id(*args) - return self._data_sync_id_to_delegated_session_id(data_sync_id) + return self._parse_data_sync_id(data_sync_id)[0] + + def _extract_user_session_id(self, *args): + """ + Extract current user session ID + @params response and/or ytcfg + @return: user session ID + """ + if user_sid := traverse_obj(args, (..., 'USER_SESSION_ID', {str}, any)): + return user_sid + + data_sync_id = self._extract_data_sync_id(*args) + return self._parse_data_sync_id(data_sync_id)[1] def _extract_data_sync_id(self, *args): """ @@ -735,7 +793,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): @functools.cached_property def is_authenticated(self): - return bool(self._generate_sapisidhash_header()) + return bool(self._get_sid_authorization_header()) def 
extract_ytcfg(self, video_id, webpage): if not webpage: @@ -745,25 +803,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) or {} - def _generate_cookie_auth_headers(self, *, ytcfg=None, account_syncid=None, session_index=None, origin=None, **kwargs): + def _generate_cookie_auth_headers(self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, origin=None, **kwargs): headers = {} - account_syncid = account_syncid or self._extract_account_syncid(ytcfg) - if account_syncid: - headers['X-Goog-PageId'] = account_syncid + delegated_session_id = delegated_session_id or self._extract_delegated_session_id(ytcfg) + if delegated_session_id: + headers['X-Goog-PageId'] = delegated_session_id if session_index is None: session_index = self._extract_session_index(ytcfg) - if account_syncid or session_index is not None: + if delegated_session_id or session_index is not None: headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 - auth = self._generate_sapisidhash_header(origin) + auth = self._get_sid_authorization_header(origin, user_session_id=user_session_id or self._extract_user_session_id(ytcfg)) if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin + if traverse_obj(ytcfg, 'LOGGED_IN', expected_type=bool): + headers['X-Youtube-Bootstrap-Logged-In'] = 'true' + return headers def generate_api_headers( - self, *, ytcfg=None, account_syncid=None, session_index=None, + self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, visitor_data=None, api_hostname=None, default_client='web', **kwargs): origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) @@ -774,7 +835,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'Origin': origin, 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), 'User-Agent': self._ytcfg_get_safe(ytcfg, 
lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), - **self._generate_cookie_auth_headers(ytcfg=ytcfg, account_syncid=account_syncid, session_index=session_index, origin=origin), + **self._generate_cookie_auth_headers( + ytcfg=ytcfg, + delegated_session_id=delegated_session_id, + user_session_id=user_session_id, + session_index=session_index, + origin=origin), } return filter_dict(headers) @@ -3837,9 +3903,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): default_client=client, visitor_data=visitor_data, session_index=self._extract_session_index(master_ytcfg, player_ytcfg), - account_syncid=( - self._data_sync_id_to_delegated_session_id(data_sync_id) - or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + delegated_session_id=( + self._parse_data_sync_id(data_sync_id)[0] + or self._extract_delegated_session_id(master_ytcfg, initial_pr, player_ytcfg) + ), + user_session_id=( + self._parse_data_sync_id(data_sync_id)[1] + or self._extract_user_session_id(master_ytcfg, initial_pr, player_ytcfg) ), ) @@ -5351,7 +5421,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if not continuation_list[0]: continuation_list[0] = self._extract_continuation(parent_renderer) - def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): + def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): continuation_list = [None] extract_entries = lambda x: self._extract_entries(x, continuation_list) tab_content = try_get(tab, lambda x: x['content'], dict) @@ -5372,7 +5442,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): break seen_continuations.add(continuation_token) headers = self.generate_api_headers( - ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data) + ytcfg=ytcfg, delegated_session_id=delegated_session_id, visitor_data=visitor_data) response = self._extract_response( item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, 
ytcfg=ytcfg, @@ -5442,7 +5512,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): return self.playlist_result( self._entries( selected_tab, metadata['id'], ytcfg, - self._extract_account_syncid(ytcfg, data), + self._extract_delegated_session_id(ytcfg, data), self._extract_visitor_data(data, ytcfg)), **metadata) @@ -5594,7 +5664,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): watch_endpoint = try_get( playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint']) headers = self.generate_api_headers( - ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), + ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), visitor_data=self._extract_visitor_data(response, data, ytcfg)) query = { 'playlistId': playlist_id, @@ -5692,7 +5762,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if not is_playlist: return headers = self.generate_api_headers( - ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), + ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), visitor_data=self._extract_visitor_data(data, ytcfg)) query = { 'params': 'wgYCCAA=', From 1f489f4a45691cac3f9e787d22a3a8a086229ba6 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Sun, 12 Jan 2025 18:42:02 +0000 Subject: [PATCH 51/99] [ie/DrTalks] Add extractor (#10831) Closes #6390 Authored by: pzhlkj6612, seproDev Co-authored-by: sepro --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/drtalks.py | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 yt_dlp/extractor/drtalks.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bbd6d21bd..e3947dfb5 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -555,6 +555,7 @@ from .dropout import ( DropoutIE, DropoutSeasonIE, ) +from .drtalks import 
DrTalksIE from .drtuber import DrTuberIE from .drtv import ( DRTVIE, diff --git a/yt_dlp/extractor/drtalks.py b/yt_dlp/extractor/drtalks.py new file mode 100644 index 000000000..5ea7f7580 --- /dev/null +++ b/yt_dlp/extractor/drtalks.py @@ -0,0 +1,51 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class DrTalksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?drtalks\.com/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://drtalks.com/videos/six-pillars-of-resilience-tools-for-managing-stress-and-flourishing/', + 'info_dict': { + 'id': '6366193757112', + 'ext': 'mp4', + 'uploader_id': '6314452011001', + 'tags': ['resilience'], + 'description': 'md5:9c6805aee237ee6de8052461855b9dda', + 'timestamp': 1734546659, + 'thumbnail': 'https://drtalks.com/wp-content/uploads/2024/12/Episode-82-Eva-Selhub-DrTalks-Thumbs.jpg', + 'title': 'Six Pillars of Resilience: Tools for Managing Stress and Flourishing', + 'duration': 2800.682, + 'upload_date': '20241218', + }, + }, { + 'url': 'https://drtalks.com/videos/the-pcos-puzzle-mastering-metabolic-health-with-marcelle-pick/', + 'info_dict': { + 'id': '6364699891112', + 'ext': 'mp4', + 'title': 'The PCOS Puzzle: Mastering Metabolic Health with Marcelle Pick', + 'description': 'md5:e87cbe00ca50135d5702787fc4043aaa', + 'thumbnail': 'https://drtalks.com/wp-content/uploads/2024/11/Episode-34-Marcelle-Pick-OBGYN-NP-DrTalks.jpg', + 'duration': 3515.2, + 'tags': ['pcos'], + 'upload_date': '20241114', + 'timestamp': 1731592119, + 'uploader_id': '6314452011001', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']['video'] + + return self.url_result( + next_data['videos']['brightcoveVideoLink'], BrightcoveNewIE, video_id, + url_transparent=True, +
**traverse_obj(next_data, { + 'title': ('title', {str}), + 'description': ('videos', 'summury', {str}), + 'thumbnail': ('featuredImage', 'node', 'sourceUrl', {url_or_none}), + })) From e2ef4fece6c9742d1733e3bae408c4787765f78c Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Sun, 12 Jan 2025 19:43:16 +0100 Subject: [PATCH 52/99] [ie/vine] Remove extractors (#11700) Authored by: allendema --- yt_dlp/extractor/_extractors.py | 4 - yt_dlp/extractor/tumblr.py | 21 ----- yt_dlp/extractor/twitter.py | 39 --------- yt_dlp/extractor/vine.py | 150 -------------------------------- 4 files changed, 214 deletions(-) delete mode 100644 yt_dlp/extractor/vine.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e3947dfb5..d42bce21b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2356,10 +2356,6 @@ from .vimm import ( VimmIE, VimmRecordingIE, ) -from .vine import ( - VineIE, - VineUserIE, -) from .viously import ViouslyIE from .viqeo import ViqeoIE from .viu import ( diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py index d6d436883..1f2c9b19c 100644 --- a/yt_dlp/extractor/tumblr.py +++ b/yt_dlp/extractor/tumblr.py @@ -189,26 +189,6 @@ class TumblrIE(InfoExtractor): 'release_date': '20140227', }, 'add_ie': ['Vimeo'], - }, { - 'url': 'http://sutiblr.tumblr.com/post/139638707273', - 'md5': '2dd184b3669e049ba40563a7d423f95c', - 'info_dict': { - 'id': 'ir7qBEIKqvq', - 'ext': 'mp4', - 'title': 'Vine by sutiblr', - 'alt_title': 'Vine by sutiblr', - 'uploader': 'sutiblr', - 'uploader_id': '1198993975374495744', - 'upload_date': '20160220', - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1455940159, - 'view_count': int, - }, - 'add_ie': ['Vine'], - 'skip': 'Vine is unavailable', }, { 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine', 
'md5': '3c92d7c3d867f14ccbeefa2119022277', @@ -366,7 +346,6 @@ class TumblrIE(InfoExtractor): _providers = { 'instagram': 'Instagram', 'vimeo': 'Vimeo', - 'vine': 'Vine', 'youtube': 'Youtube', 'dailymotion': 'Dailymotion', 'tiktok': 'TikTok', diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 8196ce6c3..c05b5bf9c 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -409,26 +409,6 @@ class TwitterCardIE(InfoExtractor): }, 'add_ie': ['Youtube'], }, - { - 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', - 'info_dict': { - 'id': 'iBb2x00UVlv', - 'ext': 'mp4', - 'upload_date': '20151113', - 'uploader_id': '1189339351084113920', - 'uploader': 'ArsenalTerje', - 'title': 'Vine by ArsenalTerje', - 'timestamp': 1447451307, - 'alt_title': 'Vine by ArsenalTerje', - 'comment_count': int, - 'like_count': int, - 'thumbnail': r're:^https?://[^?#]+\.jpg', - 'view_count': int, - 'repost_count': int, - }, - 'add_ie': ['Vine'], - 'params': {'skip_download': 'm3u8'}, - }, { 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', @@ -567,25 +547,6 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, '_old_archive_ids': ['twitter 700207533655363584'], }, - }, { - 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', - 'md5': '89a15ed345d13b86e9a5a5e051fa308a', - 'info_dict': { - 'id': 'MIOxnrUteUd', - 'ext': 'mp4', - 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', - 'uploader': 'TAKUMA', - 'uploader_id': '1004126642786242560', - 'timestamp': 1402826626, - 'upload_date': '20140615', - 'thumbnail': r're:^https?://.*\.jpg', - 'alt_title': 'Vine by TAKUMA', - 'comment_count': int, - 'repost_count': int, - 'like_count': int, - 'view_count': int, - }, - 'add_ie': ['Vine'], }, { 'url': 'https://twitter.com/captainamerica/status/719944021058060289', 'info_dict': { diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py deleted file mode 100644 index 
eed4bfeeb..000000000 --- a/yt_dlp/extractor/vine.py +++ /dev/null @@ -1,150 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - format_field, - int_or_none, - unified_timestamp, -) - - -class VineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)' - _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))'] - _TESTS = [{ - 'url': 'https://vine.co/v/b9KOOWX7HUx', - 'md5': '2f36fed6235b16da96ce9b4dc890940d', - 'info_dict': { - 'id': 'b9KOOWX7HUx', - 'ext': 'mp4', - 'title': 'Chicken.', - 'alt_title': 'Vine by Jack', - 'timestamp': 1368997951, - 'upload_date': '20130519', - 'uploader': 'Jack', - 'uploader_id': '76', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/v/e192BnZnZ9V', - 'info_dict': { - 'id': 'e192BnZnZ9V', - 'ext': 'mp4', - 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', - 'alt_title': 'Vine by Pimry_zaa', - 'timestamp': 1436057405, - 'upload_date': '20150705', - 'uploader': 'Pimry_zaa', - 'uploader_id': '1135760698325307392', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://vine.co/v/MYxVapFvz2z', - 'only_matching': True, - }, { - 'url': 'https://vine.co/v/bxVjBbZlPUH', - 'only_matching': True, - }, { - 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_json( - f'https://archive.vine.co/posts/{video_id}.json', video_id) - - def video_url(kind): - for url_suffix in ('Url', 'URL'): - format_url = data.get(f'video{kind}{url_suffix}') - if format_url: - return format_url - - formats = [] - for quality, format_id in enumerate(('low', '', 'dash')): - format_url =
video_url(format_id.capitalize()) - if not format_url: - continue - # DASH link returns plain mp4 - if format_id == 'dash' and determine_ext(format_url) == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id or 'standard', - 'quality': quality, - }) - self._check_formats(formats, video_id) - - username = data.get('username') - - alt_title = format_field(username, None, 'Vine by %s') - - return { - 'id': video_id, - 'title': data.get('description') or alt_title or 'Vine video', - 'alt_title': alt_title, - 'thumbnail': data.get('thumbnailUrl'), - 'timestamp': unified_timestamp(data.get('created')), - 'uploader': username, - 'uploader_id': data.get('userIdStr'), - 'view_count': int_or_none(data.get('loops')), - 'like_count': int_or_none(data.get('likes')), - 'comment_count': int_or_none(data.get('comments')), - 'repost_count': int_or_none(data.get('reposts')), - 'formats': formats, - } - - -class VineUserIE(InfoExtractor): - IE_NAME = 'vine:user' - _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)' - _VINE_BASE_URL = 'https://vine.co/' - _TESTS = [{ - 'url': 'https://vine.co/itsruthb', - 'info_dict': { - 'id': 'itsruthb', - 'title': 'Ruth B', - 'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland', - }, - 'playlist_mincount': 611, - }, { - 'url': 'https://vine.co/u/942914934646415360', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if VineIE.suitable(url) else super().suitable(url) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - user = mobj.group('user') - u = mobj.group('u') - - profile_url = '{}api/users/profiles/{}{}'.format( - self._VINE_BASE_URL, 'vanity/' if not u else '', user) - profile_data = self._download_json( - profile_url, user, note='Downloading user profile data') - - data = profile_data['data'] - user_id = data.get('userId') or
data['userIdStr'] - profile = self._download_json( - f'https://archive.vine.co/profiles/{user_id}.json', user_id) - entries = [ - self.url_result( - f'https://vine.co/v/{post_id}', ie='Vine', video_id=post_id) - for post_id in profile['posts'] - if post_id and isinstance(post_id, str)] - return self.playlist_result( - entries, user, profile.get('username'), profile.get('description')) From dade5e35c89adaad04408bfef766820dbca06ebe Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Mon, 13 Jan 2025 00:24:22 +0100 Subject: [PATCH 53/99] [cleanup] Misc (#11915) Authored by: grqz, Grub4K, seproDev Co-authored-by: sepro Co-authored-by: N/Ame <173015200+grqz@users.noreply.github.com> --- README.md | 2 +- pyproject.toml | 3 ++- yt_dlp/YoutubeDL.py | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1c628d025..2f848bd13 100644 --- a/README.md +++ b/README.md @@ -1769,7 +1769,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, or `web_creator,mweb` is used when authenticating with cookies. 
The `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,tv` is used, or `web_creator,tv` is used when authenticating with cookies. The `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. 
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/pyproject.toml b/pyproject.toml index 96e2d669a..5eb9a9644 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.8.0", + "ruff~=0.9.0", ] test = [ "pytest~=8.1", @@ -195,6 +195,7 @@ ignore = [ "B023", # function-uses-loop-variable (false positives) "B028", # no-explicit-stacklevel "B904", # raise-without-from-inside-except + "A005", # stdlib-module-shadowing "C401", # unnecessary-generator-set "C402", # unnecessary-generator-dict "PIE790", # unnecessary-placeholder diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 178c5b951..f6155dd2e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -283,7 +283,10 @@ class YoutubeDL: lazy_playlist: Process playlist entries as they are received. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. - logger: Log messages to a logging.Logger instance. + logger: A class having a `debug`, `warning` and `error` function where + each has a single string parameter, the message to be logged. + For compatibility reasons, both debug and info messages are passed to `debug`. + A debug message will have a prefix of `[debug] ` to discern it from info messages. logtostderr: Print everything to stderr instead of stdout. consoletitle: Display progress in the console window's titlebar. 
writedescription: Write the video description to a .description file From a3c0321825110d7eb447a6e6f393cec2bade34f9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 12 Jan 2025 23:35:35 +0000 Subject: [PATCH 54/99] Release 2025.01.12 Created by: bashonly :ci skip all --- CONTRIBUTORS | 2 ++ Changelog.md | 19 +++++++++++++++++++ supportedsites.md | 4 ++-- yt_dlp/version.py | 6 +++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 4b6964260..010226418 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -713,3 +713,5 @@ xiaomac wesson09 Crypto90 MutantPiggieGolem1 +Sanceilaks +Strkmn diff --git a/Changelog.md b/Changelog.md index 22a9a6e4b..cf9806ea1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,25 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.01.12 + +#### Core changes +- [Fix filename sanitization with `--no-windows-filenames`](https://github.com/yt-dlp/yt-dlp/commit/8346b549150003df988538e54c9d8bc4de568979) ([#11988](https://github.com/yt-dlp/yt-dlp/issues/11988)) by [bashonly](https://github.com/bashonly) +- [Validate retries values are non-negative](https://github.com/yt-dlp/yt-dlp/commit/1f4e1e85a27c5b43e34d7706cfd88ffce1b56a4a) ([#11927](https://github.com/yt-dlp/yt-dlp/issues/11927)) by [Strkmn](https://github.com/Strkmn) + +#### Extractor changes +- **drtalks**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1f489f4a45691cac3f9e787d22a3a8a086229ba6) ([#10831](https://github.com/yt-dlp/yt-dlp/issues/10831)) by [pzhlkj6612](https://github.com/pzhlkj6612), [seproDev](https://github.com/seproDev) +- **plvideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3c14e9191f3035b9a729d1d87bc0381f42de57cf) ([#10657](https://github.com/yt-dlp/yt-dlp/issues/10657)) by [Sanceilaks](https://github.com/Sanceilaks), 
[seproDev](https://github.com/seproDev) +- **vine**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/e2ef4fece6c9742d1733e3bae408c4787765f78c) ([#11700](https://github.com/yt-dlp/yt-dlp/issues/11700)) by [allendema](https://github.com/allendema) +- **xiaohongshu**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/763ed06ee69f13949397897bd42ff2ec3dc3d384) ([#11806](https://github.com/yt-dlp/yt-dlp/issues/11806)) by [HobbyistDev](https://github.com/HobbyistDev) +- **youtube** + - [Fix DASH formats incorrectly skipped in some situations](https://github.com/yt-dlp/yt-dlp/commit/0b6b7742c2e7f2a1fcb0b54ef3dd484bab404b3f) ([#11910](https://github.com/yt-dlp/yt-dlp/issues/11910)) by [coletdjnz](https://github.com/coletdjnz) + - [Refactor cookie auth](https://github.com/yt-dlp/yt-dlp/commit/75079f4e3f7dce49b61ef01da7adcd9876a0ca3b) ([#11989](https://github.com/yt-dlp/yt-dlp/issues/11989)) by [coletdjnz](https://github.com/coletdjnz) + - [Use `tv` instead of `mweb` client by default](https://github.com/yt-dlp/yt-dlp/commit/712d2abb32f59b2d246be2901255f84f1a4c30b3) ([#12059](https://github.com/yt-dlp/yt-dlp/issues/12059)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **cleanup**: Miscellaneous: [dade5e3](https://github.com/yt-dlp/yt-dlp/commit/dade5e35c89adaad04408bfef766820dbca06ebe) by [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) + ### 2024.12.23 #### Core changes diff --git a/supportedsites.md b/supportedsites.md index 916735e08..1420742d1 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -374,6 +374,7 @@ - **Dropbox** - **Dropout**: [*dropout*](## "netrc machine") - **DropoutSeason** + - **DrTalks** - **DrTuber** - **drtv** - **drtv:live** @@ -1086,6 +1087,7 @@ - **pluralsight**: [*pluralsight*](## "netrc machine") - **pluralsight:course** - **PlutoTV**: (**Currently broken**) + - **PlVideo**: Платформа - **PodbayFM** - **PodbayFMChannel** - **Podchaser** @@ -1641,8 +1643,6 @@ - **Vimm:stream** - **ViMP** - **ViMP:Playlist** - - **Vine** - - **vine:user** - **Viously** - **Viqeo**: (**Currently broken**) - **Viu** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 1ff43c611..97a3b1b26 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.12.23' +__version__ = '2025.01.12' -RELEASE_GIT_HEAD = '65cf46cddd873fd229dbb0fc0689bca4c201c6b6' +RELEASE_GIT_HEAD = 'dade5e35c89adaad04408bfef766820dbca06ebe' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.12.23' +_pkg_version = '2025.01.12' From c8541f8b13e743fcfa06667530d13fee8686e22a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:21:56 -0600 Subject: [PATCH 55/99] [ie/youtube] Do not use `web_creator` as a default client (#12087) Closes #12085 Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/youtube.py | 41 +++++++------------------------------ 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 2f848bd13..56e4458dc 100644 --- a/README.md +++ 
b/README.md @@ -1769,7 +1769,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,tv` is used, or `web_creator,tv` is used when authenticating with cookies. The `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `_music` variants may be added for `music.youtube.com` URLs. 
Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e16ec43ed..c23e65cc5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -116,6 +116,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, + 'REQUIRE_PO_TOKEN': True, 'SUPPORTS_COOKIES': True, }, # This client now requires sign-in for every video @@ -127,6 +128,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, + 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, 'SUPPORTS_COOKIES': True, }, @@ -211,8 +213,8 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, - 'REQUIRE_PO_TOKEN': True, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, # This client now requires sign-in for every video 'ios_music': { @@ -229,6 +231,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, }, # This client now requires 
sign-in for every video @@ -246,6 +249,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, 'REQUIRE_AUTH': True, }, # mweb has 'ultralow' formats @@ -1423,8 +1427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _DEFAULT_CLIENTS = ('ios', 'tv') - _DEFAULT_AUTHED_CLIENTS = ('web_creator', 'tv') + _DEFAULT_CLIENTS = ('tv', 'ios', 'web') + _DEFAULT_AUTHED_CLIENTS = ('tv', 'web') _GEO_BYPASS = False @@ -3960,15 +3964,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not requested_clients: raise ExtractorError('No player clients have been requested', expected=True) - if smuggled_data.get('is_music_url') or self.is_music_url(url): - for requested_client in requested_clients: - _, base_client, variant = _split_innertube_client(requested_client) - music_client = f'{base_client}_music' if base_client != 'mweb' else 'web_music' - if variant != 'music' and music_client in INNERTUBE_CLIENTS: - client_info = INNERTUBE_CLIENTS[music_client] - if not client_info['REQUIRE_AUTH'] or (self.is_authenticated and client_info['SUPPORTS_COOKIES']): - requested_clients.append(music_client) - if self.is_authenticated: unsupported_clients = [ client for client in requested_clients if not INNERTUBE_CLIENTS[client]['SUPPORTS_COOKIES'] @@ -4079,28 +4074,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: prs.append(pr) - # web_embedded can work around age-gate and age-verification for some embeddable videos - if self._is_agegated(pr) and variant != 'web_embedded': - append_client(f'web_embedded.{base_client}') - # Unauthenticated users will only get web_embedded client formats if age-gated - if self._is_agegated(pr) and not self.is_authenticated: - self.to_screen( - f'{video_id}: This video is age-restricted; some formats may be missing ' - f'without 
authentication. {self._login_hint()}', only_once=True) - - ''' This code is pointless while web_creator is in _DEFAULT_AUTHED_CLIENTS - # EU countries require age-verification for accounts to access age-restricted videos - # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) - if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): - self.to_screen( - f'{video_id}: This video is age-restricted and YouTube is requiring ' - 'account age-verification; some formats may be missing', only_once=True) - # web_creator can work around the age-verification requirement - # tv_embedded may(?) still work around age-verification if the video is embeddable - append_client('web_creator') - ''' - prs.extend(deprioritized_prs) if skipped_clients: From bbc7591d3bb650f96cd1f1584055888cc919f14a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 15 Jan 2025 23:50:41 +0000 Subject: [PATCH 56/99] Release 2025.01.15 Created by: bashonly :ci skip all --- Changelog.md | 5 +++++ yt_dlp/version.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index cf9806ea1..b996d35f7 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,11 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.01.15 + +#### Extractor changes +- **youtube**: [Do not use `web_creator` as a default client](https://github.com/yt-dlp/yt-dlp/commit/c8541f8b13e743fcfa06667530d13fee8686e22a) ([#12087](https://github.com/yt-dlp/yt-dlp/issues/12087)) by [bashonly](https://github.com/bashonly) + ### 2025.01.12 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 97a3b1b26..e7588aebb 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by 
devscripts/update-version.py -__version__ = '2025.01.12' +__version__ = '2025.01.15' -RELEASE_GIT_HEAD = 'dade5e35c89adaad04408bfef766820dbca06ebe' +RELEASE_GIT_HEAD = 'c8541f8b13e743fcfa06667530d13fee8686e22a' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.01.12' +_pkg_version = '2025.01.15' From 164368610456e2d96b279f8b120dea08f7b1d74f Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 15 Jan 2025 20:40:13 -0600 Subject: [PATCH 57/99] [ie/dropout] Fix extraction (#12102) Closes #12103 Authored by: bashonly --- yt_dlp/extractor/dropout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index 7e97c4d40..a0d8aacdb 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -135,7 +135,7 @@ class DropoutIE(InfoExtractor): self.raise_login_required(method='any') raise ExtractorError(login_err, expected=True) - embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') + embed_url = self._html_search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) watch_info = get_element_by_id('watch-info', webpage) or '' From a567f97b62ae9f6d6f5a9376c361512ab8dceda2 Mon Sep 17 00:00:00 2001 From: 4ft35t <4ft35t@users.noreply.github.com> Date: Sun, 19 Jan 2025 21:10:36 +0800 Subject: [PATCH 58/99] [ie/Weibo] Extend `_VALID_URL` (#12088) Closes #12086 Authored by: 4ft35t --- yt_dlp/extractor/weibo.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index e632858e5..6e57446e9 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -124,7 +124,7 @@ class WeiboBaseIE(InfoExtractor): class WeiboIE(WeiboBaseIE): - _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)' + 
_VALID_URL = r'https?://(?:m\.weibo\.cn/(?:status|detail)|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://weibo.com/7827771738/N4xlMvjhI', 'info_dict': { @@ -164,6 +164,25 @@ class WeiboIE(WeiboBaseIE): 'like_count': int, 'repost_count': int, }, + }, { + 'url': 'https://m.weibo.cn/detail/4189191225395228', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'display_id': 'FBqgOmDxO', + 'title': '柴犬柴犬的秒拍视频', + 'description': '午睡当然是要甜甜蜜蜜的啦![坏笑] Instagram:shibainu.gaku http://t.cn/RHbmjzW ', + 'duration': 53, + 'timestamp': 1514264429, + 'upload_date': '20171226', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '柴犬柴犬', + 'uploader_id': '5926682210', + 'uploader_url': 'https://weibo.com/u/5926682210', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + }, }, { 'url': 'https://weibo.com/0/4224132150961381', 'note': 'no playback_list example', From 89198bb23b4d03e0473ac408bfb50d67c2f71165 Mon Sep 17 00:00:00 2001 From: Boof <97455552+hexahigh@users.noreply.github.com> Date: Sun, 19 Jan 2025 14:13:40 +0100 Subject: [PATCH 59/99] [ie/nrk] Extract more formats (#12069) Closes #12053 Authored by: hexahigh --- yt_dlp/extractor/nrk.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index 658ae5f91..efc4a1734 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( parse_iso8601, str_or_none, try_get, + update_url_query, url_or_none, urljoin, ) @@ -171,6 +172,8 @@ class NRKIE(NRKBaseIE): format_url = url_or_none(asset.get('url')) if not format_url: continue + # Remove the 'adap' query parameter + format_url = update_url_query(format_url, {'adap': []}) asset_format = (asset.get('format') or '').lower() if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': formats.extend(self._extract_nrk_formats(format_url, video_id)) From de30f652ffb7623500215f5906844f2ae0d92c7b Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 
19 Jan 2025 17:52:31 +0100 Subject: [PATCH 60/99] [ie/LBRY] Support signed URLs (#12138) Authored by: seproDev --- yt_dlp/extractor/lbry.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 0445b7cbf..7b22f90e9 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -310,7 +310,13 @@ class LBRYIE(LBRYBaseIE): if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( - 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + 'get', claim_id, { + 'uri': uri, + **traverse_obj(parse_qs(url), { + 'signature': ('signature', 0), + 'signature_ts': ('signature_ts', 0), + }), + }, 'streaming url')['streaming_url'] # GET request to v3 API returns original video/audio file if available direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url) From 68221ecc87c6a3f3515757bac2a0f9674a38e3f2 Mon Sep 17 00:00:00 2001 From: Grabien <60237587+Grabien@users.noreply.github.com> Date: Mon, 20 Jan 2025 01:01:22 +0200 Subject: [PATCH 61/99] [ie/senategov] Fix extractors (#9361) Authored by: Grabien, seproDev Co-authored-by: sepro --- yt_dlp/extractor/senategov.py | 219 ++++++++++++++++++++-------------- 1 file changed, 132 insertions(+), 87 deletions(-) diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index cddca09d0..efcdb79d0 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -4,43 +4,12 @@ import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_qs, - unsmuggle_url, + UnsupportedError, + make_archive_id, + remove_end, + url_or_none, ) - -_COMMITTEES = { - 'ag': ('76440', 'http://ag-f.akamaihd.net'), - 'aging': ('76442', 'http://aging-f.akamaihd.net'), - 'approps': ('76441', 'http://approps-f.akamaihd.net'), - 'arch': ('', 'http://ussenate-f.akamaihd.net'), - 'armed': ('76445', 
'http://armed-f.akamaihd.net'), - 'banking': ('76446', 'http://banking-f.akamaihd.net'), - 'budget': ('76447', 'http://budget-f.akamaihd.net'), - 'cecc': ('76486', 'http://srs-f.akamaihd.net'), - 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), - 'csce': ('75229', 'http://srs-f.akamaihd.net'), - 'dpc': ('76590', 'http://dpc-f.akamaihd.net'), - 'energy': ('76448', 'http://energy-f.akamaihd.net'), - 'epw': ('76478', 'http://epw-f.akamaihd.net'), - 'ethics': ('76449', 'http://ethics-f.akamaihd.net'), - 'finance': ('76450', 'http://finance-f.akamaihd.net'), - 'foreign': ('76451', 'http://foreign-f.akamaihd.net'), - 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), - 'help': ('76452', 'http://help-f.akamaihd.net'), - 'indian': ('76455', 'http://indian-f.akamaihd.net'), - 'intel': ('76456', 'http://intel-f.akamaihd.net'), - 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), - 'jccic': ('85180', 'http://jccic-f.akamaihd.net'), - 'jec': ('76458', 'http://jec-f.akamaihd.net'), - 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), - 'rpc': ('76591', 'http://rpc-f.akamaihd.net'), - 'rules': ('76460', 'http://rules-f.akamaihd.net'), - 'saa': ('76489', 'http://srs-f.akamaihd.net'), - 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), - 'srs': ('75229', 'http://srs-f.akamaihd.net'), - 'uscc': ('76487', 'http://srs-f.akamaihd.net'), - 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), -} +from ..utils.traversal import traverse_obj class SenateISVPIE(InfoExtractor): @@ -53,31 +22,46 @@ class SenateISVPIE(InfoExtractor): 'info_dict': { 'id': 'judiciary031715', 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', + 'title': 'ISVP', 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + '_old_archive_ids': ['senategov judiciary031715'], }, 'params': { # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 
'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', 'info_dict': { 'id': 'commerce011514', 'ext': 'mp4', 'title': 'Integrated Senate Video Player', + '_old_archive_ids': ['senategov commerce011514'], }, 'params': { # m3u8 download 'skip_download': True, }, + 'skip': 'This video is not available.', }, { 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', # checksum differs each time 'info_dict': { 'id': 'intel090613', 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', + 'title': 'ISVP', + '_old_archive_ids': ['senategov intel090613'], + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.senate.gov/isvp/?auto_play=false&comm=help&filename=help090920&poster=https://www.help.senate.gov/assets/images/video-poster.png&stt=950', + 'info_dict': { + 'id': 'help090920', + 'ext': 'mp4', + 'title': 'ISVP', + 'thumbnail': 'https://www.help.senate.gov/assets/images/video-poster.png', + '_old_archive_ids': ['senategov help090920'], }, }, { # From http://www.c-span.org/video/?96791-1 @@ -85,60 +69,81 @@ class SenateISVPIE(InfoExtractor): 'only_matching': True, }] + _COMMITTEES = { + 'ag': ('76440', 'https://ag-f.akamaihd.net', '2036803', 'agriculture'), + 'aging': ('76442', 'https://aging-f.akamaihd.net', '2036801', 'aging'), + 'approps': ('76441', 'https://approps-f.akamaihd.net', '2036802', 'appropriations'), + 'arch': ('', 'https://ussenate-f.akamaihd.net', '', 'arch'), + 'armed': ('76445', 'https://armed-f.akamaihd.net', '2036800', 'armedservices'), + 'banking': ('76446', 'https://banking-f.akamaihd.net', '2036799', 'banking'), + 'budget': ('76447', 'https://budget-f.akamaihd.net', '2036798', 'budget'), + 'cecc': ('76486', 'https://srs-f.akamaihd.net', '2036782', 'srs_cecc'), + 'commerce': ('80177', 'https://commerce1-f.akamaihd.net', '2036779', 'commerce'), + 'csce': ('75229', 'https://srs-f.akamaihd.net', '2036777', 'srs_srs'), + 
'dpc': ('76590', 'https://dpc-f.akamaihd.net', '', 'dpc'), + 'energy': ('76448', 'https://energy-f.akamaihd.net', '2036797', 'energy'), + 'epw': ('76478', 'https://epw-f.akamaihd.net', '2036783', 'environment'), + 'ethics': ('76449', 'https://ethics-f.akamaihd.net', '2036796', 'ethics'), + 'finance': ('76450', 'https://finance-f.akamaihd.net', '2036795', 'finance_finance'), + 'foreign': ('76451', 'https://foreign-f.akamaihd.net', '2036794', 'foreignrelations'), + 'govtaff': ('76453', 'https://govtaff-f.akamaihd.net', '2036792', 'hsgac'), + 'help': ('76452', 'https://help-f.akamaihd.net', '2036793', 'help'), + 'indian': ('76455', 'https://indian-f.akamaihd.net', '2036791', 'indianaffairs'), + 'intel': ('76456', 'https://intel-f.akamaihd.net', '2036790', 'intelligence'), + 'intlnarc': ('76457', 'https://intlnarc-f.akamaihd.net', '', 'internationalnarcoticscaucus'), + 'jccic': ('85180', 'https://jccic-f.akamaihd.net', '2036778', 'jccic'), + 'jec': ('76458', 'https://jec-f.akamaihd.net', '2036789', 'jointeconomic'), + 'judiciary': ('76459', 'https://judiciary-f.akamaihd.net', '2036788', 'judiciary'), + 'rpc': ('76591', 'https://rpc-f.akamaihd.net', '', 'rpc'), + 'rules': ('76460', 'https://rules-f.akamaihd.net', '2036787', 'rules'), + 'saa': ('76489', 'https://srs-f.akamaihd.net', '2036780', 'srs_saa'), + 'smbiz': ('76461', 'https://smbiz-f.akamaihd.net', '2036786', 'smallbusiness'), + 'srs': ('75229', 'https://srs-f.akamaihd.net', '2031966', 'srs_srs'), + 'uscc': ('76487', 'https://srs-f.akamaihd.net', '2036781', 'srs_uscc'), + 'vetaff': ('76462', 'https://vetaff-f.akamaihd.net', '2036785', 'veteransaffairs'), + } + def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - qs = urllib.parse.parse_qs(self._match_valid_url(url).group('qs')) - if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + if not qs.get('filename') or not qs.get('comm'): raise ExtractorError('Invalid URL', expected=True) - - video_id = re.sub(r'.mp4$', '', 
qs['filename'][0]) + filename = qs['filename'][0] + video_id = remove_end(filename, '.mp4') webpage = self._download_webpage(url, video_id) + committee = qs['comm'][0] - if smuggled_data.get('force_title'): - title = smuggled_data['force_title'] - else: - title = self._html_extract_title(webpage) - poster = qs.get('poster') - thumbnail = poster[0] if poster else None - - video_type = qs['type'][0] - committee = video_type if video_type == 'arch' else qs['comm'][0] - - stream_num, domain = _COMMITTEES[committee] + stream_num, stream_domain, stream_id, msl3 = self._COMMITTEES[committee] + urls_alternatives = [f'https://www-senate-gov-media-srs.akamaized.net/hls/live/{stream_id}/{committee}/{filename}/master.m3u8', + f'https://www-senate-gov-msl3archive.akamaized.net/{msl3}/{filename}_1/master.m3u8', + f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', + f'{stream_domain}/i/{filename}.mp4/master.m3u8'] formats = [] - if video_type == 'arch': - filename = video_id if '.' in video_id else video_id + '.mp4' - m3u8_url = urllib.parse.urljoin(domain, 'i/' + filename + '/master.m3u8') - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8') - else: - hdcore_sign = 'hdcore=3.1.0' - url_params = (domain, video_id, stream_num) - f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params - m3u8_url = '{}/i/{}_1@{}/master.m3u8'.format(*url_params) - for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): - mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) - if mobj: - entry['format_id'] += mobj.group('tag') - formats.append(entry) + subtitles = {} + for video_url in urls_alternatives: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', fatal=False) 
+ if formats: + break return { 'id': video_id, - 'title': title, + 'title': self._html_extract_title(webpage), 'formats': formats, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'thumbnail': traverse_obj(qs, ('poster', 0, {url_or_none})), + '_old_archive_ids': [make_archive_id(SenateGovIE, video_id)], } class SenateGovIE(InfoExtractor): _IE_NAME = 'senate.gov' - _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' + _SUBDOMAIN_RE = '|'.join(map(re.escape, ( + 'agriculture', 'aging', 'appropriations', 'armed-services', 'banking', + 'budget', 'commerce', 'energy', 'epw', 'finance', 'foreign', 'help', + 'intelligence', 'inaugural', 'judiciary', 'rules', 'sbc', 'veterans', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?:{_SUBDOMAIN_RE})\.senate\.gov' _TESTS = [{ 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', 'info_dict': { @@ -147,6 +152,9 @@ class SenateGovIE(InfoExtractor): 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions', 'ext': 'mp4', + 'age_limit': 0, + 'thumbnail': 'https://www.help.senate.gov/assets/images/sharelogo.jpg', + '_old_archive_ids': ['senategov help090920'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -156,8 +164,12 @@ class SenateGovIE(InfoExtractor): 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', 'title': 'Review of the FY2019 Budget Request for the U.S. 
Army', 'ext': 'mp4', + 'age_limit': 0, + 'thumbnail': 'https://www.appropriations.senate.gov/themes/appropriations/images/video-poster-flash-fit.png', + '_old_archive_ids': ['senategov appropsA051518'], }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', 'info_dict': { @@ -166,32 +178,65 @@ class SenateGovIE(InfoExtractor): 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', 'ext': 'mp4', + 'thumbnail': 'https://www.banking.senate.gov/themes/banking/images/sharelogo.jpg', + 'age_limit': 0, + '_old_archive_ids': ['senategov banking041521'], }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.agriculture.senate.gov/hearings/hemp-production-and-the-2018-farm-bill', + 'only_matching': True, + }, { + 'url': 'https://www.aging.senate.gov/hearings/the-older-americans-act-the-local-impact-of-the-law-and-the-upcoming-reauthorization', + 'only_matching': True, + }, { + 'url': 'https://www.budget.senate.gov/hearings/improving-care-lowering-costs-achieving-health-care-efficiency', + 'only_matching': True, + }, { + 'url': 'https://www.commerce.senate.gov/2024/12/communications-networks-safety-and-security', + 'only_matching': True, + }, { + 'url': 'https://www.energy.senate.gov/hearings/2024/2/full-committee-hearing-to-examine', + 'only_matching': True, + }, { + 'url': 'https://www.epw.senate.gov/public/index.cfm/hearings?ID=F63083EA-2C13-498C-B548-341BED68C209', + 'only_matching': True, + }, { + 'url': 'https://www.foreign.senate.gov/hearings/american-diplomacy-and-global-leadership-review-of-the-fy25-state-department-budget-request', + 'only_matching': True, + }, { + 'url': 
'https://www.intelligence.senate.gov/hearings/foreign-threats-elections-2024-%E2%80%93-roles-and-responsibilities-us-tech-providers', + 'only_matching': True, + }, { + 'url': 'https://www.inaugural.senate.gov/52nd-inaugural-ceremonies/', + 'only_matching': True, + }, { + 'url': 'https://www.rules.senate.gov/hearings/02/07/2023/business-meeting', + 'only_matching': True, + }, { + 'url': 'https://www.sbc.senate.gov/public/index.cfm/hearings?ID=5B13AA6B-8279-45AF-B54B-94156DC7A2AB', + 'only_matching': True, + }, { + 'url': 'https://www.veterans.senate.gov/2024/5/frontier-health-care-ensuring-veterans-access-no-matter-where-they-live', + 'only_matching': True, }] def _real_extract(self, url): display_id = self._generic_id(url) webpage = self._download_webpage(url, display_id) - parse_info = parse_qs(self._search_regex( - r'