diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 4a14421869..3b0ef323d7 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -80,5 +80,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 748885e850..c8702c3569 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -92,5 +92,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index ac68a08c6f..5a6d2b0fbd 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -88,5 +88,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 6ae107ec1c..a17770f614 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index a2263bec52..c600a9dcb6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -67,5 +67,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 27eb98bc8e..57bc9daf51 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -73,5 +73,8 @@ body: - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12ec5b0d8c..4ff1cbc1dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -266,7 +266,7 @@ jobs: # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do diff --git a/.github/workflows/antispam.yaml b/.github/workflows/issue-lockdown.yml similarity index 76% rename from .github/workflows/antispam.yaml rename to .github/workflows/issue-lockdown.yml index 0fd867072e..4b973e2e61 100644 --- a/.github/workflows/antispam.yaml +++ b/.github/workflows/issue-lockdown.yml @@ -1,4 +1,4 @@ -name: Anti-Spam +name: Issue Lockdown on: issues: types: [opened] @@ -9,6 +9,7 @@ permissions: jobs: lockdown: name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN runs-on: ubuntu-latest steps: - name: "Lock new issue" @@ -17,4 +18,4 @@ jobs: ISSUE_NUMBER: ${{ github.event.issue.number }} REPOSITORY: ${{ github.repository }} run: | - gh issue lock "${ISSUE_NUMBER}" -r too_heated -R "${REPOSITORY}" + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 0000000000..45c87cdd47 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/README.md 
b/README.md index ca32e09bfb..428eb9f478 100644 --- a/README.md +++ b/README.md @@ -1777,6 +1777,9 @@ The following extractors use this feature: * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning +* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` +* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) +* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma-separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. 
This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 4f782d8c62..8135689c7e 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -49,8 +49,11 @@ VERBOSE_TMPL = ''' - type: markdown attributes: value: | - ### NOTE: Due to a recent increase in malicious spam activity, this issue will be automatically locked until it is triaged by a maintainer. - ### If you receive any replies asking you download a file, do NOT follow the download links! + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' diff --git a/test/test_networking.py b/test/test_networking.py index 826f11a561..d96624af18 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -822,6 +822,24 @@ class TestRequestHandlerMisc: rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): diff --git 
a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a3610dc976..d8abf0b5d3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -217,6 +217,7 @@ from .bbc import ( BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .beacon import BeaconTvIE from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, @@ -822,7 +823,10 @@ from .hungama import ( HungamaIE, HungamaSongIE, ) -from .huya import HuyaLiveIE +from .huya import ( + HuyaLiveIE, + HuyaVideoIE, +) from .hypem import HypemIE from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE @@ -945,6 +949,7 @@ from .kick import ( ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1036,10 +1041,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .livestreamfails import LivestreamfailsIE -from .lnkgo import ( - LnkGoIE, - LnkIE, -) +from .lnk import LnkIE from .loom import ( LoomFolderIE, LoomIE, @@ -1810,6 +1812,7 @@ from .screen9 import Screen9IE from .screencast import ScreencastIE from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE +from .screenrec import ScreenRecIE from .scrippsnetworks import ( ScrippsNetworksIE, ScrippsNetworksWatchIE, @@ -1820,6 +1823,7 @@ from .scte import ( SCTECourseIE, ) from .sejmpl import SejmIE +from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( SenateGovIE, diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 61cbab5a7a..0abe059829 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -1,3 +1,5 @@ +import functools +import json import random import re import time @@ -6,7 +8,9 @@ from .common import InfoExtractor from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, + extract_attributes, float_or_none, + 
get_element_html_by_id, int_or_none, parse_filesize, str_or_none, @@ -17,6 +21,7 @@ from ..utils import ( url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class BandcampIE(InfoExtractor): @@ -459,7 +464,7 @@ class BandcampUserIE(InfoExtractor): }, }, { 'url': 'https://coldworldofficial.bandcamp.com/music', - 'playlist_mincount': 10, + 'playlist_mincount': 7, 'info_dict': { 'id': 'coldworldofficial', 'title': 'Discography of coldworldofficial', @@ -473,12 +478,19 @@ class BandcampUserIE(InfoExtractor): }, }] + def _yield_items(self, webpage): + yield from ( + re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage) + or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) + + yield from traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes}, + 'data-client-items', {json.loads}, ..., 'page_url', {str})) + def _real_extract(self, url): uploader = self._match_id(url) webpage = self._download_webpage(url, uploader) - discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage) - or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) - return self.playlist_from_matches( - discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) + self._yield_items(webpage), uploader, f'Discography of {uploader}', + getter=functools.partial(urljoin, url)) diff --git a/yt_dlp/extractor/beacon.py b/yt_dlp/extractor/beacon.py new file mode 100644 index 0000000000..ae47687cc8 --- /dev/null +++ b/yt_dlp/extractor/beacon.py @@ -0,0 +1,68 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, + traverse_obj, +) + + +class BeaconTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?beacon\.tv/content/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://beacon.tv/content/welcome-to-beacon', + 'md5': 'b3f5932d437f288e662f10f3bfc5bd04', + 'info_dict': { + 'id': 'welcome-to-beacon', + 'ext': 'mp4', + 'upload_date': '20240509', + 'description': 'md5:ea2bd32e71acf3f9fca6937412cc3563', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/I4CkkEvN/poster.jpg?width=720', + 'title': 'Your home for Critical Role!', + 'timestamp': 1715227200, + 'duration': 105.494, + }, + }, { + 'url': 'https://beacon.tv/content/re-slayers-take-trailer', + 'md5': 'd879b091485dbed2245094c8152afd89', + 'info_dict': { + 'id': 're-slayers-take-trailer', + 'ext': 'mp4', + 'title': 'The Re-Slayer’s Take | Official Trailer', + 'timestamp': 1715189040, + 'upload_date': '20240508', + 'duration': 53.249, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/PW5ApIw3/poster.jpg?width=720', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + content_data = traverse_obj(self._search_nextjs_data(webpage, video_id), ( + 'props', 'pageProps', '__APOLLO_STATE__', + lambda k, v: k.startswith('Content:') and v['slug'] == video_id, any)) + if not content_data: + raise ExtractorError('Failed to extract content data') + + jwplayer_data = 
traverse_obj(content_data, ( + (('contentVideo', 'video', 'videoData'), + ('contentPodcast', 'podcast', 'audioData')), {json.loads}, {dict}, any)) + if not jwplayer_data: + if content_data.get('contentType') not in ('videoPodcast', 'video', 'podcast'): + raise ExtractorError('Content is not a video/podcast', expected=True) + if traverse_obj(content_data, ('contentTier', '__ref')) != 'MemberTier:65b258d178f89be87b4dc0a4': + self.raise_login_required('This video/podcast is for members only') + raise ExtractorError('Failed to extract content') + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id), + **traverse_obj(content_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('publishedAt', {parse_iso8601}), + }), + } diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 3163df8ab7..2fe1103cb9 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1852,7 +1852,7 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' - _HEADERS = {'Referer': 'https://www.bilibili.com/'} + _HEADERS = {'Referer': 'https://www.bilibili.tv/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 187f73e7b9..9501e5ec9a 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request from ..networking.exceptions import ( HTTPError, IncompleteRead, + TransportError, network_exceptions, ) from ..networking.impersonate import ImpersonateTarget @@ -965,6 +966,9 @@ class InfoExtractor: return False content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data) + if content is False: + assert not fatal + return False return 
(content, urlh) @staticmethod @@ -1039,7 +1043,15 @@ class InfoExtractor: def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None, data=None): - webpage_bytes = urlh.read() + try: + webpage_bytes = urlh.read() + except TransportError as err: + errmsg = f'{video_id}: Error reading response: {err.msg}' + if fatal: + raise ExtractorError(errmsg, cause=err) + self.report_warning(errmsg) + return False + if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): @@ -3489,7 +3501,7 @@ class InfoExtractor: continue urls.add(source_url) source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) + ext = determine_ext(source_url, default_ext=mimetype2ext(source_type)) if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 8d7707271d..86950b2445 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -319,32 +319,6 @@ class DPlayIE(DPlayBaseIE): url, display_id, host, 'dplay' + country, country, domain) -class HGTVDeIE(DPlayBaseIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', - 'info_dict': { - 'id': '151205', - 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', - 'ext': 'mp4', - 'title': 'Wer braucht schon eine Toilette', - 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', - 'duration': 1177.024, - 'timestamp': 1595705400, - 'upload_date': '20200725', - 'creator': 'HGTV', - 'series': 'Tiny House - klein, aber oho', - 'season_number': 3, - 'episode_number': 3, - }, - }] - - def _real_extract(self, url): - display_id = 
self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') - - class DiscoveryPlusBaseIE(DPlayBaseIE): """Subclasses must set _PRODUCT, _DISCO_API_PARAMS""" @@ -373,6 +347,45 @@ class DiscoveryPlusBaseIE(DPlayBaseIE): return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS) +class HGTVDeIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'info_dict': { + 'id': '7332936', + 'ext': 'mp4', + 'display_id': 'mein-kleinstadt-traumhaus/vom-landleben-ins-loft', + 'title': 'Vom Landleben ins Loft', + 'description': 'md5:e5f72c02c853970796dd3818f2e25745', + 'episode': 'Episode 7', + 'episode_number': 7, + 'season': 'Season 7', + 'season_number': 7, + 'series': 'Mein Kleinstadt-Traumhaus', + 'duration': 2645.0, + 'timestamp': 1725998100, + 'upload_date': '20240910', + 'creators': ['HGTV'], + 'tags': [], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/08/09/82a386b9-c688-32c7-b9ff-0b13865f0bae.jpeg', + }, + }] + + _PRODUCT = 'hgtv' + _DISCO_API_PARAMS = { + 'disco_host': 'eu1-prod.disco-api.com', + 'realm': 'hgtv', + 'country': 'de', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': 'Alps:HyogaPlayer:0.0.0', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class GoDiscoveryIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 4e9b63524e..552f9af12e 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -294,37 +294,37 @@ class ESPNCricInfoIE(InfoExtractor): class WatchESPNIE(AdobePassIE): _VALID_URL = 
r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' _TESTS = [{ - 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'info_dict': { - 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710', 'ext': 'mp4', - 'title': 'Huddersfield vs. Burnley', - 'duration': 7500, - 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + 'title': 'Abilene Chrstn vs. Texas Tech', + 'duration': 14166, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'info_dict': { - 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3', 'ext': 'mp4', - 'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)', - 'duration': 8335, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'UC Davis vs. 
California', + 'duration': 9547, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'info_dict': { - 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54', 'ext': 'mp4', - 'title': 'The Wheel - Episode 10', - 'duration': 3352, - 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + 'title': 'The College Football Show', + 'duration': 3639, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', }, 'params': { 'skip_download': True, @@ -353,6 +353,13 @@ class WatchESPNIE(AdobePassIE): if not cookie: self.raise_login_required(method='cookies') + jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt') + id_token = self._download_json( + 'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth', + None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'], + }).encode())['data']['token']['id_token'] + assertion = self._call_bamgrid_api( 'devices', video_id, headers={'Content-Type': 'application/json; charset=UTF-8'}, @@ -371,7 +378,7 @@ class WatchESPNIE(AdobePassIE): })['access_token'] assertion = self._call_bamgrid_api( - 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + 'accounts/grant', video_id, payload={'id_token': id_token}, headers={ 'Authorization': token, 'Content-Type': 
'application/json; charset=UTF-8', diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index a43ffe95e2..1adb35b5f0 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -84,7 +84,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1692346159, 'thumbnail': r're:^https?://.*', 'uploader_id': '100063551323670', - 'duration': 3132.184, + 'duration': 3133.583, 'view_count': int, 'concurrent_view_count': 0, }, @@ -112,9 +112,10 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl', 'duration': 131.03, 'concurrent_view_count': int, + 'view_count': int, }, }, { 'note': 'Video with DASH manifest', @@ -167,7 +168,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': 'ca63897a90c9452efee5f8c40d080e25', + 'md5': '1659aa21fb3dd1585874f668e81a72c8', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -180,9 +181,10 @@ class FacebookIE(InfoExtractor): 'view_count': int, 'uploader_id': '100059479812265', 'concurrent_view_count': int, - 'duration': 44.478, + 'duration': 44.181, }, }, { + # FIXME: unable to extract uploader, no formats found # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', @@ -241,9 +243,9 @@ class FacebookIE(InfoExtractor): 'timestamp': 1511548260, 'upload_date': '20171124', 'uploader': 'Vickie Gentry', - 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l', 'thumbnail': r're:^https?://.*', - 'duration': 148.435, + 'duration': 148.224, }, }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media @@ -271,7 +273,7 @@ class FacebookIE(InfoExtractor): 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', 'thumbnail': r're:^https?://.*', 'uploader': 'Lela Evans', - 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl', 'upload_date': '20231228', 'timestamp': 1703804085, 'duration': 394.347, @@ -322,7 +324,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', 'uploader_id': '100066514874195', - 'duration': 4524.212, + 'duration': 4524.001, 'view_count': int, 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, @@ -339,9 +341,9 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', + 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl', 'timestamp': 1549275572, - 'duration': 3.413, + 'duration': 3.283, 'uploader': 'Josef Novak', 'description': '', 'upload_date': '20190204', @@ -396,6 +398,7 @@ class 
FacebookIE(InfoExtractor): 'playlist_count': 1, 'skip': 'Requires logging in', }, { + # FIXME: Cannot parse data error # data.event.cover_media_renderer.cover_video 'url': 'https://m.facebook.com/events/1509582499515440', 'info_dict': { @@ -498,7 +501,8 @@ class FacebookIE(InfoExtractor): or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('node', 'actors', ..., {dict})) - or get_first(post, ('event', 'event_creator', {dict})) or {}) + or get_first(post, ('event', 'event_creator', {dict})) + or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {}) uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( @@ -524,6 +528,11 @@ class FacebookIE(InfoExtractor): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), + **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', { + 'like_count': ('likers', 'count', {int}), + 'comment_count': ('total_comment_count', {int}), + 'repost_count': ('share_count_reduced', {parse_count}), + }), get_all=False), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -932,18 +941,21 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'f13dd37f2633595982db5ed8765474d3', + 'md5': 'a53256d10fc2105441fe0c4212ed8cea', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', - 'description': 'md5:22f03309b216ac84720183961441d8db', - 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! 
.+ LL COOL J · Mama Said Knock You Out$', 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$', 'uploader': 'Beast Camp Training', 'uploader_id': '100040874179269', 'duration': 9.579, 'timestamp': 1637502609, 'upload_date': '20211121', 'thumbnail': r're:^https?://.*', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }] diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index 5663a78a37..f79e032e4a 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -8,15 +8,19 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + parse_duration, str_or_none, try_get, unescapeHTML, + unified_strdate, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class HuyaLiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)' + _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P<id>[^/#?&]+)(?:\D|$)' IE_NAME = 'huya:live' IE_DESC = 'huya.com' TESTS = [{ @@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor): 'info_dict': { 'id': '572329', 'title': str, + 'ext': 'flv', 'description': str, 'is_live': True, 'view_count': int, @@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor): fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) return fm, ss + + +class HuyaVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P<id>\d+)\.html' + IE_NAME = 'huya:video' + IE_DESC = '虎牙视频' + + _TESTS = [{ + 'url': 'https://www.huya.com/video/play/1002412640.html', + 'info_dict': { + 'id': '1002412640', + 'ext': 'mp4', + 'title': '8月3日', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 14, + 'uploader': '虎牙-ATS欧卡车队青木', + 'uploader_id': '1564376151', + 'upload_date': '20240803', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 
'https://www.huya.com/video/play/556054543.html', + 'info_dict': { + 'id': '556054543', + 'ext': 'mp4', + 'title': '我不挑事 也不怕事', + 'thumbnail': r're:https?://.*\.jpg', + 'duration': 1864, + 'uploader': '卡尔', + 'uploader_id': '367138632', + 'upload_date': '20210811', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + }, + }] + + def _real_extract(self, url: str): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://liveapi.huya.com/moment/getMomentContent', video_id, + query={'videoId': video_id})['data']['moment']['videoInfo'] + + formats = [] + for definition in traverse_obj(video_data, ('definitions', lambda _, v: url_or_none(v['url']))): + formats.append({ + 'url': definition['url'], + **traverse_obj(definition, { + 'format_id': ('defName', {str}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('videoCover', {url_or_none}), + 'duration': ('videoDuration', {parse_duration}), + 'uploader': ('nickName', {str}), + 'uploader_id': ('uid', {str_or_none}), + 'upload_date': ('videoUploadTime', {unified_strdate}), + 'view_count': ('videoPlayNum', {int_or_none}), + 'comment_count': ('videoCommentNum', {int_or_none}), + 'like_count': ('favorCount', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index ab26dc5efe..9b91a454b1 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor): 'id': 'p51388', 'ext': 'mp4', 'title': 'Partička (92)', - 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - 'upload_date': '20201103', - 'timestamp': 1604437480, + 'description': 'md5:57943f6a50d6188288c3a579d2fd5f01', + 'episode': 'Partička (92)', + 'season': 'Partička', + 'series': 'Prima Partička', + 'episode_number': 92, + 
'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + 'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne', + 'info_dict': { + 'id': 'p1412199', + 'ext': 'mp4', + 'episode_number': 3, + 'episode': 'Tenerife: V říši ohně', + 'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c', + 'duration': 3111.0, + 'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768', + 'title': 'Tenerife: V říši ohně', + 'timestamp': 1711825800, + 'upload_date': '20240330', }, 'params': { 'skip_download': True, # m3u8 download @@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor): video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?Pp\d+)\1', r'pproduct_id\s*=\s*([\'"])(?Pp\d+)\1', + r'let\s+videos\s*=\s*([\'"])(?Pp\d+)\1', ), webpage, 'real id', group='id', default=None) if not video_id: @@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor): final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, - 'title': title, + 'title': final_result.get('title') or title, 'thumbnail': self._html_search_meta( ['thumbnail', 'og:image', 'twitter:image'], webpage, 'thumbnail', default=None), diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py index 3f03f9e4c4..42eef3c922 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -15,7 +15,7 @@ from ..utils import ( class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' - _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + _PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4' def _parse_video(self, video): return { @@ -39,7 +39,7 @@ class KhanAcademyBaseIE(InfoExtractor): 
query={ 'fastly_cacheable': 'persist_until_publish', 'pcv': self._PUBLISHED_CONTENT_VERSION, - 'hash': '1242644265', + 'hash': '3712657851', 'variables': json.dumps({ 'path': display_id, 'countryCode': 'US', diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 1c1b2a1772..abea5280ba 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -67,7 +67,7 @@ class KickIE(KickBaseIE): @classmethod def suitable(cls, url): - return False if KickClipIE.suitable(url) else super().suitable(url) + return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): channel = self._match_id(url) @@ -98,25 +98,25 @@ class KickIE(KickBaseIE): class KickVODIE(KickBaseIE): IE_NAME = 'kick:vod' - _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', + 'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': 'e74614f4-5270-4319-90ad-32179f19a45c', + 'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea', 'ext': 'mp4', - 'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', + 'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑', 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. 
LEADER OF THE JUICERS.', 'channel': 'xqc', 'channel_id': '668', 'uploader': 'xQc', 'uploader_id': '676', - 'upload_date': '20240724', - 'timestamp': 1721796562, - 'duration': 18566.0, + 'upload_date': '20240909', + 'timestamp': 1725919141, + 'duration': 10155.0, 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, - 'categories': ['VALORANT'], + 'categories': ['Just Chatting'], 'age_limit': 0, }, 'params': {'skip_download': 'm3u8'}, diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py new file mode 100644 index 0000000000..852a4de3f2 --- /dev/null +++ b/yt_dlp/extractor/kika.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class KikaIE(InfoExtractor): + IE_DESC = 'KiKA.de' + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w/-]+/videos/(?P[a-z-]+\d+)' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'md5': 'fbfc8da483719ef06f396e5e5b938c69', + 'info_dict': { + 'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100', + 'ext': 'mp4', + 'upload_date': '20240831', + 'timestamp': 1725126600, + 'season_number': 2024, + 'modified_date': '20240831', + 'episode': 'Episode 476', + 'episode_number': 476, + 'season': 'Season 2024', + 'duration': 634, + 'title': 'logo! vom Samstag, 31. August 2024', + 'modified_timestamp': 1725129983, + }, + }, { + 'url': 'https://www.kika.de/kaltstart/videos/video92498', + 'md5': '710ece827e5055094afeb474beacb7aa', + 'info_dict': { + 'id': 'video92498', + 'ext': 'mp4', + 'title': '7. 
Wo ist Leo?', + 'description': 'md5:fb48396a5b75068bcac1df74f1524920', + 'duration': 436, + 'timestamp': 1702926876, + 'upload_date': '20231218', + 'episode_number': 7, + 'modified_date': '20240319', + 'modified_timestamp': 1710880610, + 'episode': 'Episode 7', + 'season_number': 1, + 'season': 'Season 1', + }, + }, { + 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', + 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + 'info_dict': { + 'id': 'video90088', + 'ext': 'mp4', + 'upload_date': '20221102', + 'timestamp': 1667390580, + 'duration': 197, + 'modified_timestamp': 1711093771, + 'episode_number': 8, + 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', + 'modified_date': '20240322', + 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', + 'season_number': 1, + 'episode': 'Episode 8', + 'season': 'Season 1', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id) + video_assets = self._download_json(doc['assets']['url'], video_id) + + subtitles = {} + if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': list(self._extract_formats(video_assets, video_id)), + 'subtitles': subtitles, + **traverse_obj(doc, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('date', {parse_iso8601}), + 'modified_timestamp': ('modificationDate', {parse_iso8601}), + 'duration': (( + ('durationInSeconds', {int_or_none}), + ('duration', {parse_duration})), any), + 'episode_number': ('episodeNumber', {int_or_none}), + 'season_number': ('season', {int_or_none}), + }), + } + + def _extract_formats(self, media_info, video_id): + for 
media in traverse_obj(media_info, ('assets', lambda _, v: url_or_none(v['url']))): + stream_url = media['url'] + ext = determine_ext(stream_url) + if ext == 'm3u8': + yield from self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + yield { + 'url': stream_url, + 'format_id': ext, + **traverse_obj(media, { + 'width': ('frameWidth', {int_or_none}), + 'height': ('frameHeight', {int_or_none}), + # NB: filesize is 0 if unknown, bitrate is -1 if unknown + 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), + 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), + }), + } diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnk.py similarity index 53% rename from yt_dlp/extractor/lnkgo.py rename to yt_dlp/extractor/lnk.py index 31a7cefd82..593f73410d 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnk.py @@ -1,86 +1,11 @@ from .common import InfoExtractor from ..utils import ( - clean_html, format_field, int_or_none, - parse_iso8601, unified_strdate, ) -class LnkGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P[A-Za-z0-9-]+)(?:/(?P\d+))?' - _TESTS = [{ - 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai', - 'info_dict': { - 'id': '10809', - 'ext': 'mp4', - 'title': "Put'ka: Trys Klausimai", - 'upload_date': '20161216', - 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. 
Pabandykime surasti atsakymus.', - 'age_limit': 18, - 'duration': 117, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1481904000, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2', - 'info_dict': { - 'id': '10467', - 'ext': 'mp4', - 'title': 'Nėrdas: Kompiuterio Valymas', - 'upload_date': '20150113', - 'description': 'md5:7352d113a242a808676ff17e69db6a69', - 'age_limit': 18, - 'duration': 346, - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1421164800, - }, - 'params': { - 'skip_download': True, # HLS download - }, - }, { - 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413', - 'only_matching': True, - }] - _AGE_LIMITS = { - 'N-7': 7, - 'N-14': 14, - 'S': 18, - } - _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s' - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/{}/{}/false'.format(display_id, video_id or '0'), - display_id)['videoConfig']['videoInfo'] - - video_id = str(video_info['id']) - title = video_info['title'] - prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' - formats = self._extract_m3u8_formats( - self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), - video_id, 'mp4', 'm3u8_native') - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': format_field(video_info, 'posterImage', 'https://lnk.lt/all-images/%s'), - 'duration': int_or_none(video_info.get('duration')), - 'description': clean_html(video_info.get('htmlDescription')), - 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0), - 'timestamp': parse_iso8601(video_info.get('airDate')), - 'view_count': int_or_none(video_info.get('viewsCount')), - } - - class LnkIE(InfoExtractor): _VALID_URL 
= r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P\d+)' diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 46097fa20e..dfda3cc534 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -13,8 +13,8 @@ from ..utils import ( class MDRIE(InfoExtractor): - IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + IE_DESC = 'MDR.DE' + _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _GEO_COUNTRIES = ['DE'] @@ -34,30 +34,6 @@ class MDRIE(InfoExtractor): 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'duration': 134, - 'uploader': 'KIKA', - }, - 'skip': '404 not found', - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1482541200, - 'upload_date': '20161224', - 'duration': 4628, - 'uploader': 'KIKA', - }, }, { # audio with alternative playerURL pattern 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', @@ -68,28 +44,7 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, - }, { - # empty bitrateVideo and bitrateAudio - 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', - 'info_dict': { - 'id': '128372', - 'ext': 'mp4', - 'title': 'Der kleine Wichtel kehrt zurück', - 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', - 'duration': 4876, - 'timestamp': 1607823300, - 'upload_date': '20201213', - 'uploader': 'ZDF', - }, - 'params': { 
- 'skip_download': True, - }, - }, { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'only_matching': True, - }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'only_matching': True, + 'skip': '404 not found', }, { 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', 'only_matching': True, diff --git a/yt_dlp/extractor/nzz.py b/yt_dlp/extractor/nzz.py index ac3b73156e..047c4e1ac9 100644 --- a/yt_dlp/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py @@ -1,9 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - extract_attributes, -) class NZZIE(InfoExtractor): @@ -22,19 +19,14 @@ class NZZIE(InfoExtractor): 'playlist_count': 1, }] + def _entries(self, webpage, page_id): + for script in re.findall(r'(?s)]* data-hid="jw-video-jw[^>]+>(.+?)', webpage): + settings = self._search_json(r'var\s+settings\s*=[^{]*', script, 'settings', page_id, fatal=False) + if entry := self._parse_jwplayer_data(settings, page_id): + yield entry + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - entries = [] - for player_element in re.findall( - r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): - player_params = extract_attributes(player_element) - if player_params.get('data-type') not in ('kaltura_singleArticle',): - self.report_warning('Unsupported player type') - continue - entry_id = player_params['data-id'] - entries.append(self.url_result( - 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) - - return self.playlist_result(entries, page_id) + return self.playlist_result(self._entries(webpage, page_id), page_id) diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 07f249498c..f0b38893b2 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -109,7 +109,7 @@ class PinterestBaseIE(InfoExtractor): class PinterestIE(PinterestBaseIE): - 
_VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P\d+)' + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE): }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', 'only_matching': True, + }, + { + 'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927', + 'info_dict': { + 'id': '2885187256207927', + 'ext': 'mp4', + 'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 💙💅', + 'description': 'md5:5da41c767d2317e42e49b663b0b2150f', + 'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs', + 'uploader_id': '1142999717836434688', + 'upload_date': '20240702', + 'timestamp': 1719939156, + 'duration': 7.967, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': ['#BlueLagoonPediNails', '#SpaExperience'], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 944e8636ab..26aec2e4cc 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -8,7 +8,7 @@ from ..utils import js_to_json class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' 
+ _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P[0-9]+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -19,9 +19,25 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', + 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'info_dict': { + 'id': 'e757904', + 'ext': 'mp4', + 'title': '25 Curiosidades, 25 de Abril', + 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', + 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', + 'only_matching': True, }] _RX_OBFUSCATION = re.compile(r'''(?xs) @@ -49,17 +65,17 @@ class RTPIE(InfoExtractor): f, config = self._search_regex( r'''(?sx) - var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s* + (?:var\s+f\s*=\s*(?P".*?"|{[^;]+?});\s*)? 
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P{(?:(?!\*/).)+?})\);(?!\s*\*/) ''', webpage, 'player config', group=('f', 'config')) - f = self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) config = self._parse_json( config, video_id, lambda data: self.__unobfuscate(data, video_id=video_id)) + f = config['file'] if not f else self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) formats = [] if isinstance(f, dict): diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254d..3db3ce1424 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', @@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, diff --git a/yt_dlp/extractor/screenrec.py b/yt_dlp/extractor/screenrec.py new file mode 100644 index 0000000000..64f8d2494a --- /dev/null +++ b/yt_dlp/extractor/screenrec.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor + + +class ScreenRecIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P\w{10})' + _TESTS = [{ + 'url': 'https://screenrec.com/share/DasLtbknYo', + 'info_dict': { + 'id': 'DasLtbknYo', + 'ext': 'mp4', + 'title': '02.05.2024_03.01.25_REC', + 'description': 'Recorded with ScreenRec', + 'thumbnail': r're:^https?://.*\.gif$', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, 
url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'customUrl\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'm3u8 URL', group='url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + } diff --git a/yt_dlp/extractor/sen.py b/yt_dlp/extractor/sen.py new file mode 100644 index 0000000000..d8f14ecdc0 --- /dev/null +++ b/yt_dlp/extractor/sen.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class SenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P[0-9a-f-]+)' + _TEST = { + 'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'ext': 'mp4', + 'description': 'Florida, 28 Sep 2022', + 'title': 'Hurricane Ian', + 'tags': ['North America', 'Storm', 'Weather'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_data = self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id) + m3u8_url = (traverse_obj(api_data, ( + 'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any)) + or f'https://vod.sen.com/videos/{video_id}/manifest.m3u8') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + **traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', { + 'title': ('title', 'text', {str}), + 'description': ('descriptions', 0, 'text', {str}), + 'tags': ('badges', ..., 'text', {str}), + })), + } diff --git a/yt_dlp/extractor/servus.py 
b/yt_dlp/extractor/servus.py index 117f180814..841c7ebf33 100644 --- a/yt_dlp/extractor/servus.py +++ b/yt_dlp/extractor/servus.py @@ -27,7 +27,7 @@ class ServusIE(InfoExtractor): 'info_dict': { 'id': 'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Klettersteige in den Alpen', + 'title': 'Vie Ferrate - Klettersteige in den Alpen', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2823, @@ -38,6 +38,7 @@ class ServusIE(InfoExtractor): 'season_number': 11, 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', 'episode_number': 8, + 'categories': ['Bergwelten'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -71,8 +72,11 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id, fatal=False) + video = self._download_json( - 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + 'https://api-player.redbull.com/stv/servus-tv-playnet', video_id, 'Downloading video JSON', query={'videoId': video_id}) if not video.get('videoUrl'): self._report_errors(video) @@ -89,7 +93,7 @@ class ServusIE(InfoExtractor): return { 'id': video_id, 'title': video.get('title'), - 'description': self._get_description(video_id) or video.get('description'), + 'description': self._get_description(next_data) or video.get('description'), 'thumbnail': video.get('poster'), 'duration': float_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('currentSunrise')), @@ -100,16 +104,19 @@ class ServusIE(InfoExtractor): 'episode_number': episode_number, 'formats': formats, 'subtitles': subtitles, + **traverse_obj(next_data, ('props', 'pageProps', 'data', { + 'title': ('title', 'rendered', {str}), + 'timestamp': ('stv_date', 'raw', {int}), + 'duration': ('stv_duration', {float_or_none}), + 'categories': ('category_names', ..., {str}), + })), } - def 
_get_description(self, video_id): - info = self._download_json( - f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', - video_id, fatal=False) - - return join_nonempty(*traverse_obj(info, ( - ('stv_short_description', 'stv_long_description'), - {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + def _get_description(self, next_data): + return join_nonempty(*traverse_obj(next_data, ( + 'props', 'pageProps', 'data', + ('stv_short_description', 'stv_long_description'), {str}, + {lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n') def _report_errors(self, video): playability_errors = traverse_obj(video, ('playabilityErrors', ...)) diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index d8c556acef..07db583470 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,33 +1,31 @@ -import base64 -import datetime as dt import functools import itertools from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin +from ..utils import int_or_none, traverse_obj, url_or_none, urljoin class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?Ptpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ - 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz', 'info_dict': { - 'id': '6226844312001', + 'id': '6336940246112', 'ext': 'mp4', - 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', - 'duration': 186, - 'season': 
'Season 39', - 'season_number': 39, + 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach', + 'duration': 74, + 'season': 'Season 41', + 'season_number': 41, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', 'age_limit': 15, - 'timestamp': 1611810000, - 'upload_date': '20210128', + 'timestamp': 1694386800, + 'upload_date': '20230910', 'uploader_id': '2199827728001', }, 'params': { @@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor): }, 'skip': 'Only available in Australia', }, { - 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { - 'id': '6192880312001', + 'id': '9000000000091177', 'ext': 'mp4', - 'title': "Todd Sampson's Body Hack - S4 Ep. 2", - 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'title': 'Neighbours - S42 Ep. 
9107', + 'alt_title': 'Thu 05 Sep', + 'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef', + 'duration': 1388, + 'episode': 'Episode 9107', + 'episode_number': 9107, + 'season': 'Season 42', + 'season_number': 42, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', 'age_limit': 15, - 'timestamp': 1600770600, - 'upload_date': '20200922', + 'timestamp': 1725517860, + 'upload_date': '20240905', 'uploader': 'Channel 10', 'uploader_id': '2199827728001', }, 'params': { 'skip_download': True, }, + 'skip': 'Only available in Australia', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor): 'X': 18, } - def _get_bearer_token(self, video_id): - username, password = self._get_login_info() - if username is None or password is None: - self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = dt.datetime.now().strftime('%Y%m%d000000') - _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') - data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ - 'X-Network-Ten-Auth': _auth_header, - }, data=urlencode_postdata({ - 'email': username, - 'password': password, - })) - return 'Bearer ' + data['jwt']['accessToken'] - def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) - headers = {} - if data.get('memberGated') is True: - _token = self._get_bearer_token(content_id) - headers = {'Authorization': _token} - - _video_url = self._download_json( - data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers=headers).get('source') - m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).url + video_data = self._download_json( + 
f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', + content_id, 'Downloading video JSON') + m3u8_url = self._request_webpage( + HEADRequest(video_data['items'][0]['HLSURL']), + content_id, 'Checking stream URL').url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) + # Attempt to get a higher quality stream + m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') return { + 'id': content_id, 'formats': formats, - 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, - 'id': data.get('altId') or content_id, - 'duration': data.get('duration'), - 'title': data.get('subtitle'), - 'alt_title': data.get('title'), - 'description': data.get('description'), - 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('tvShow'), - 'season_number': int_or_none(data.get('season')), - 'episode_number': int_or_none(data.get('episode')), - 'timestamp': data.get('published'), - 'thumbnail': data.get('imageUrl'), + 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None, 'uploader': 'Channel 10', 'uploader_id': '2199827728001', + **traverse_obj(data, { + 'id': ('altId', {str}), + 'duration': ('duration', {int_or_none}), + 'title': ('subtitle', {str}), + 'alt_title': ('title', {str}), + 'description': ('description', {str}), + 'age_limit': ('classification', {self._AUS_AGES.get}), + 'series': ('tvShow', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('published', {int_or_none}), + 'thumbnail': ('imageUrl', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a20cf4b17d..9a03948cd9 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -234,13 +234,30 @@ class 
VimeoBaseInfoExtractor(InfoExtractor): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('vimeo'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = download_data.get('source_file') download_url = try_get(source_file, lambda x: x['download_url']) @@ -261,15 +278,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - 
original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -354,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -370,6 +385,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -379,22 +395,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 
'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -418,29 +435,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -456,11 +482,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': 
'53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -476,15 +503,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -495,6 +526,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -511,16 +544,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ 
-543,28 +578,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -592,7 +622,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -606,6 +636,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -633,11 +664,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 
Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -645,11 +678,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -670,8 +711,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -701,6 +744,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308 @@ -735,6 +779,48 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Failed to parse XML: not well-formed'], }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 
'timestamp': 1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { # user playlist alias -> https://vimeo.com/258705797 'url': 'https://vimeo.com/user26785108/newspiritualguide', @@ -768,16 +854,6 @@ class VimeoIE(VimeoBaseInfoExtractor): raise ExtractorError('Wrong video password', expected=True) return checked - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): - return self._download_json( - join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - def _extract_from_api(self, video_id, unlisted_hash=None): viewer = self._download_json( 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') @@ -798,6 +874,11 @@ class VimeoIE(VimeoBaseInfoExtractor): info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = 
self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -899,7 +980,12 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1269,6 +1355,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https?://vimeo\.com/(?P[^/?#]+)/review/(?P\d+)/(?P[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1282,6 +1382,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 
'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1316,6 +1417,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( @@ -1327,8 +1429,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, - unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index e900a4ad9f..d63964a004 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -1,7 +1,17 @@ +import base64 import math +import time from .common import InfoExtractor -from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call +from .videa import VideaIE +from ..utils import ( + InAdvancePagedList, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, +) class XimalayaBaseIE(InfoExtractor): @@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE): 'like_count': int, }, }, + { + # VIP-restricted audio + 'url': 'https://www.ximalaya.com/sound/562111701', + 'only_matching': True, + }, ] + @staticmethod + def _decrypt_filename(file_id, seed): + cgstr = '' + key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890' + for _ in key: + seed = 
float(int(211 * seed + 30031) % 65536) + r = int(seed / 65536 * len(key)) + cgstr += key[r] + key = key.replace(key[r], '') + parts = file_id.split('*') + filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal()) + if not filename.startswith('/'): + filename = '/' + filename + return filename + + @staticmethod + def _decrypt_url_params(encrypted_params): + params = VideaIE.rc4( + base64.b64decode(encrypted_params), 'xkt3a41psizxrh9l').split('-') + # sign, token, timestamp + return params[1], params[2], params[3] + def _real_extract(self, url): scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json' audio_info = self._download_json( - audio_info_file, audio_id, - f'Downloading info json {audio_info_file}', 'Unable to download info file') + f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id, + 'Downloading info json', 'Unable to download info file') - formats = [{ + formats = [] + # NOTE: VIP-restricted audio + if audio_info.get('is_paid'): + ts = int(time.time()) + vip_info = self._download_json( + f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}', + audio_id, 'Downloading VIP info json', 'Unable to download VIP info file', + query={'device': 'pc', 'isBackend': 'true', '_': ts}) + filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed']) + sign, token, timestamp = self._decrypt_url_params(vip_info['ep']) + vip_url = update_url_query( + f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', { + 'sign': sign, + 'token': token, + 'timestamp': timestamp, + 'buy_key': vip_info['buyKey'], + 'duration': vip_info['duration'], + }) + fmt = { + 'format_id': 'vip', + 'url': vip_url, + 'vcodec': 'none', + } + if '_preview_' in vip_url: + self.report_warning( + f'This tracks requires a VIP account. Using a sample instead. 
{self._login_hint()}') + fmt.update({ + 'format_note': 'Sample', + 'preference': -10, + **traverse_obj(vip_info, { + 'filesize': ('sampleLength', {int_or_none}), + 'duration': ('sampleDuration', {int_or_none}), + }), + }) + else: + fmt.update(traverse_obj(vip_info, { + 'filesize': ('totalLength', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })) + + fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024) + formats.append(fmt) + + formats.extend([{ 'format_id': f'{bps}k', 'url': audio_info[k], 'abr': bps, 'vcodec': 'none', - } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)]) thumbnails = [] for k in audio_info: diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 10849916b8..23ed9270da 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -3,16 +3,13 @@ from ..utils import ( int_or_none, str_or_none, try_get, - update_url_query, url_or_none, ) class XinpianchangIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.xinpianchang\.com/(?P[^/]+?)(?:\D|$)' - IE_NAME = 'xinpianchang' - IE_DESC = 'xinpianchang.com' + _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?Pa\d+)' + IE_DESC = '新片场' _TESTS = [{ 'url': 'https://www.xinpianchang.com/a11766551', 'info_dict': { @@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) - vid = self.find_value_with_regex(var='vid', webpage=webpage) - app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) - api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) - data = self._download_json(api, video_id=video_id)['data'] + video_data = 
self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] + + data = self._download_json( + f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id, + query={'appKey': video_data['appKey']})['data'] formats, subtitles = [], {} for k, v in data.get('resource').items(): if k in ('dash', 'hls'): @@ -72,6 +69,10 @@ class XinpianchangIE(InfoExtractor): 'width': int_or_none(prog.get('width')), 'height': int_or_none(prog.get('height')), 'ext': 'mp4', + 'http_headers': { + # NB: Server returns 403 without the Range header + 'Range': 'bytes=0-', + }, } for prog in v if prog.get('url') or []]) return { @@ -87,6 +88,3 @@ class XinpianchangIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - def find_value_with_regex(self, var, webpage): - return self._search_regex(rf'var\s{var}\s=\s\"(?P[^\"]+)\"', webpage, name=var) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2501398ba1..3d11c32f6e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -69,6 +69,8 @@ from ..utils import ( ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' +STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = { }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, 'web_embedded': { 'INNERTUBE_CONTEXT': { @@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_music': { 'INNERTUBE_CONTEXT': { @@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 
'android_creator': { 'INNERTUBE_CONTEXT': { @@ -160,6 +166,7 @@ INNERTUBE_CLIENTS = { }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -323,6 +330,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('REQUIRE_PO_TOKEN', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None, fatal=False) - @staticmethod - def _extract_account_syncid(*args): + def _data_sync_id_to_delegated_session_id(self, data_sync_id): + if not data_sync_id: + return + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + channel_syncid, _, user_syncid = data_sync_id.partition('||') + if user_syncid: + return channel_syncid + + def _extract_account_syncid(self, *args): """ - Extract syncId required to download private playlists of secondary channels + Extract current session ID required to download private playlists of secondary channels @params response and/or ytcfg """ - for data in args: - # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) - if delegated_sid: - return delegated_sid - sync_ids = (try_get( - data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), str) or '').split('||') - if len(sync_ids) >= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. 
We only want the channel_syncid - return sync_ids[0] + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid - @staticmethod - def _extract_visitor_data(*args): + data_sync_id = self._extract_data_sync_id(*args) + return self._data_sync_id_to_delegated_session_id(data_sync_id) + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. + In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): """ Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: + return visitor_data return get_first( args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _POTOKEN_EXPERIMENTS = ('51217476', '51217102') - _BROKEN_CLIENTS = { - short_client_name(client): client - for client in ('android', 'android_creator', 'android_music') - } _DEFAULT_CLIENTS = ('ios', 'web_creator') _GEO_BYPASS = False @@ -3701,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): **cls._get_checkok_params(), } + def _get_config_po_token(self, client): + po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) + for token_str in 
po_token_strs: + po_token_client, sep, po_token = token_str.partition('+') + if not sep: + self.report_warning( + f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True) + continue + if po_token_client == client: + return po_token + + def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. + if not visitor_data and not self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') + return + + config_po_token = self._get_config_po_token(client) + if config_po_token: + # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, + # if using first channel in an account then we don't need the data_sync_id anymore... + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + + return config_po_token + + # Require PO Token if logged in for external fetching + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. 
' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + return + + return self._fetch_po_token( + client=client, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + player_url=player_url, + **kwargs, + ) + + def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + """External PO Token fetch stub""" + @staticmethod def _is_agegated(player_response): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): @@ -3717,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): - - session_index = self._extract_session_index(player_ytcfg, master_ytcfg) - syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( - ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + ytcfg=player_ytcfg, + default_client=client, + visitor_data=visitor_data, + session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + account_syncid=( + self._data_sync_id_to_delegated_session_id(data_sync_id) + or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + ), + ) yt_query = { 'videoId': video_id, @@ -3734,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: yt_query['params'] = player_params + if po_token: + yt_query['serviceIntegrityDimensions'] = 
{'poToken': po_token} + + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3744,7 +3818,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - broken_clients = [] excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), @@ -3758,12 +3831,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): excluded_clients.append(client[1:]) elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client "{client}"') - elif client in self._BROKEN_CLIENTS.values(): - broken_clients.append(client) else: requested_clients.append(client) - # Force deprioritization of _BROKEN_CLIENTS for format de-duplication - requested_clients.extend(broken_clients) if not requested_clients: requested_clients.extend(self._DEFAULT_CLIENTS) for excluded_client in excluded_clients: @@ -3788,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return pr_id def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): - initial_pr = ignore_initial_response = None + initial_pr = None if webpage: - if 'web' in clients: - experiments = traverse_obj(master_ytcfg, ( - 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - self.report_warning( - 'Webpage contains broken formats (poToken experiment detected). 
Ignoring initial player response') - ignore_initial_response = True initial_pr = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) prs = [] + deprioritized_prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): # Android player_response does not have microFormats which are needed for # extraction of some data. So we return the initial_pr with formats @@ -3822,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return tried_iframe_fallback = False - player_url = None + player_url = visitor_data = data_sync_id = None skipped_clients = {} while clients: + deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = {} - if client == 'web': - player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg - elif 'configs' not in self._configuration_arg('player_skip'): + player_ytcfg = master_ytcfg if client == 'web' else {} + if 'configs' not in self._configuration_arg('player_skip') and client != 'web': player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) @@ -3842,34 +3905,53 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' and not ignore_initial_response else None - for retry in self.RetryManager(fatal=False): - try: - pr = pr or self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, - player_url if require_js_player else None, initial_pr, smuggled_data) - except ExtractorError as e: - self.report_warning(e) - break - experiments = traverse_obj(pr, ( - 'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK', - 'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...)) - if all(x in 
experiments for x in self._POTOKEN_EXPERIMENTS): - pr = None - retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True) - if not pr: + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + po_token = self.fetch_po_token( + client=client, visitor_data=visitor_data, + data_sync_id=data_sync_id if self.is_authenticated else None, + player_url=player_url if require_js_player else None, + ) + + require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') + if not po_token and require_po_token: + self.report_warning( + f'No PO Token provided for {client} client, ' + f'which is required for working {client} formats. ' + f'You can manually pass a PO Token for this client with ' + f'--extractor-args "youtube:po_token={client}+XXX"', + only_once=True) + deprioritize_pr = True + + pr = initial_pr if client == 'web' else None + try: + pr = pr or self._extract_player_response( + client, video_id, + master_ytcfg=player_ytcfg or master_ytcfg, + player_ytcfg=player_ytcfg, + player_url=player_url, + initial_pr=initial_pr, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + po_token=po_token) + except ExtractorError as e: + self.report_warning(e) continue if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: # Save client name for introspection later - name = short_client_name(client) sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name + sd[STREAMING_DATA_CLIENT_NAME] = client + sd[STREAMING_DATA_PO_TOKEN] = po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) + f[STREAMING_DATA_CLIENT_NAME] = client + f[STREAMING_DATA_PO_TOKEN] = po_token + if deprioritize_pr: + deprioritized_prs.append(pr) + else: + 
prs.append(pr) # tv_embedded can work around age-gate and age-verification IF the video is embeddable if self._is_agegated(pr) and variant != 'tv_embedded': @@ -3893,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # _producer, _testsuite, & _vr variants can also work around age-verification append_client('web_creator', 'mediaconnect') + prs.extend(deprioritized_prs) + if skipped_clients: self.report_warning( f'Skipping player responses from {"/".join(skipped_clients)} clients ' @@ -4027,13 +4111,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds - # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name in self._BROKEN_CLIENTS + po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) + + # Clients that require PO Token return videoplayback URLs that may return 403 + is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) if is_broken: self.report_warning( - f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' - 'and may yield HTTP Error 403. They will be deprioritized', only_once=True) + f'{video_id}: {client_name} client formats require a PO Token which was not provided. 
' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4109,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag): + def process_manifest_format(f, proto, client_name, itag, po_token): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False itags[itag].add(key) + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Clients that require PO Token return videoplayback URLs that may return 403 + # hls does not currently require PO Token + if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': + self.report_warning( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. 
' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + f['source_preference'] -= 20 + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4126,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['language_preference'] = PREFERRED_LANG_VALUE - if f.get('source_preference') is None: - f['source_preference'] = -1 - if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['source_preference'] += 100 @@ -4149,23 +4246,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for sd in streaming_data: client_name = sd.get(STREAMING_DATA_CLIENT_NAME) - + po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + if po_token: + hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None)): + r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + if po_token: + dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id']): + if process_manifest_format(f, 'dash', client_name, 
f['format_id'], po_token): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: @@ -4987,7 +5088,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) @@ -4999,6 +5100,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=self._get_text(renderer, 'title')) return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, ('overlayMetadata', { + 'title': ('primaryText', 'content', {str}), + 'view_count': ('secondaryText', 'content', {parse_count}), + })), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index fe3354ea29..b86d3606d8 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import typing import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ 
def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 0d3e707c58..04dd0f8d2c 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2919,6 +2919,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): 'audio/webm': 'webm', 'audio/x-matroska': 'mka', 'audio/x-mpegurl': 'm3u', + 'aacp': 'aac', 'midi': 'mid', 'ogg': 'ogg', 'wav': 'wav',