Merge branch 'master' into ie-afl

This commit is contained in:
subrat-lima 2024-09-18 13:32:07 +05:30
commit c386fc0d43
24 changed files with 789 additions and 344 deletions

View file

@ -1777,6 +1777,9 @@ The following extractors use this feature:
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning
* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma-separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY`
#### youtubetab (YouTube playlists, channels, feeds, etc.) #### youtubetab (YouTube playlists, channels, feeds, etc.)
* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)

View file

@ -221,6 +221,7 @@ from .bbc import (
BBCCoUkIPlayerGroupIE, BBCCoUkIPlayerGroupIE,
BBCCoUkPlaylistIE, BBCCoUkPlaylistIE,
) )
from .beacon import BeaconTvIE
from .beatbump import ( from .beatbump import (
BeatBumpPlaylistIE, BeatBumpPlaylistIE,
BeatBumpVideoIE, BeatBumpVideoIE,
@ -826,7 +827,10 @@ from .hungama import (
HungamaIE, HungamaIE,
HungamaSongIE, HungamaSongIE,
) )
from .huya import HuyaLiveIE from .huya import (
HuyaLiveIE,
HuyaVideoIE,
)
from .hypem import HypemIE from .hypem import HypemIE
from .hypergryph import MonsterSirenHypergryphMusicIE from .hypergryph import MonsterSirenHypergryphMusicIE
from .hytale import HytaleIE from .hytale import HytaleIE
@ -1041,10 +1045,7 @@ from .livestream import (
LivestreamShortenerIE, LivestreamShortenerIE,
) )
from .livestreamfails import LivestreamfailsIE from .livestreamfails import LivestreamfailsIE
from .lnkgo import ( from .lnk import LnkIE
LnkGoIE,
LnkIE,
)
from .loom import ( from .loom import (
LoomFolderIE, LoomFolderIE,
LoomIE, LoomIE,
@ -1816,6 +1817,7 @@ from .screen9 import Screen9IE
from .screencast import ScreencastIE from .screencast import ScreencastIE
from .screencastify import ScreencastifyIE from .screencastify import ScreencastifyIE
from .screencastomatic import ScreencastOMaticIE from .screencastomatic import ScreencastOMaticIE
from .screenrec import ScreenRecIE
from .scrippsnetworks import ( from .scrippsnetworks import (
ScrippsNetworksIE, ScrippsNetworksIE,
ScrippsNetworksWatchIE, ScrippsNetworksWatchIE,
@ -1826,6 +1828,7 @@ from .scte import (
SCTECourseIE, SCTECourseIE,
) )
from .sejmpl import SejmIE from .sejmpl import SejmIE
from .sen import SenIE
from .senalcolombia import SenalColombiaLiveIE from .senalcolombia import SenalColombiaLiveIE
from .senategov import ( from .senategov import (
SenateGovIE, SenateGovIE,

View file

@ -1,3 +1,5 @@
import functools
import json
import random import random
import re import re
import time import time
@ -6,7 +8,9 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
ExtractorError, ExtractorError,
extract_attributes,
float_or_none, float_or_none,
get_element_html_by_id,
int_or_none, int_or_none,
parse_filesize, parse_filesize,
str_or_none, str_or_none,
@ -17,6 +21,7 @@ from ..utils import (
url_or_none, url_or_none,
urljoin, urljoin,
) )
from ..utils.traversal import traverse_obj
class BandcampIE(InfoExtractor): class BandcampIE(InfoExtractor):
@ -459,7 +464,7 @@ class BandcampUserIE(InfoExtractor):
}, },
}, { }, {
'url': 'https://coldworldofficial.bandcamp.com/music', 'url': 'https://coldworldofficial.bandcamp.com/music',
'playlist_mincount': 10, 'playlist_mincount': 7,
'info_dict': { 'info_dict': {
'id': 'coldworldofficial', 'id': 'coldworldofficial',
'title': 'Discography of coldworldofficial', 'title': 'Discography of coldworldofficial',
@ -473,12 +478,19 @@ class BandcampUserIE(InfoExtractor):
}, },
}] }]
def _yield_items(self, webpage):
    """Yield the discography item links found in *webpage*.

    Links scraped from the page markup come first, followed by the
    `page_url` values read from the JSON held in the `data-client-items`
    attribute of the element with id `music-grid`.
    """
    # Primary pattern: <li data-item-id=...><a href=...>, skipping hrefs
    # matched by the "/merch" negative lookahead.
    markup_links = re.findall(
        r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
    if not markup_links:
        # Only consulted when the primary pattern matched nothing
        markup_links = re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)
    yield from markup_links

    # element html -> its attributes -> JSON list -> each entry's string `page_url`
    client_items_path = (
        {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes},
        'data-client-items', {json.loads}, ..., 'page_url', {str})
    yield from traverse_obj(webpage, client_items_path)
def _real_extract(self, url): def _real_extract(self, url):
uploader = self._match_id(url) uploader = self._match_id(url)
webpage = self._download_webpage(url, uploader) webpage = self._download_webpage(url, uploader)
discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage)
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
return self.playlist_from_matches( return self.playlist_from_matches(
discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) self._yield_items(webpage), uploader, f'Discography of {uploader}',
getter=functools.partial(urljoin, url))

View file

@ -0,0 +1,68 @@
import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_iso8601,
traverse_obj,
)
class BeaconTvIE(InfoExtractor):
    # Handles https://beacon.tv/content/<slug> pages; the media description is
    # a JSON string embedded in the page's Next.js data and is handed to the
    # JW Player parser.
    _VALID_URL = r'https?://(?:www\.)?beacon\.tv/content/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://beacon.tv/content/welcome-to-beacon',
        'md5': 'b3f5932d437f288e662f10f3bfc5bd04',
        'info_dict': {
            'id': 'welcome-to-beacon',
            'ext': 'mp4',
            'upload_date': '20240509',
            'description': 'md5:ea2bd32e71acf3f9fca6937412cc3563',
            'thumbnail': 'https://cdn.jwplayer.com/v2/media/I4CkkEvN/poster.jpg?width=720',
            'title': 'Your home for Critical Role!',
            'timestamp': 1715227200,
            'duration': 105.494,
        },
    }, {
        'url': 'https://beacon.tv/content/re-slayers-take-trailer',
        'md5': 'd879b091485dbed2245094c8152afd89',
        'info_dict': {
            'id': 're-slayers-take-trailer',
            'ext': 'mp4',
            'title': 'The Re-Slayers Take | Official Trailer',
            'timestamp': 1715189040,
            'upload_date': '20240508',
            'duration': 53.249,
            'thumbnail': 'https://cdn.jwplayer.com/v2/media/PW5ApIw3/poster.jpg?width=720',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        nextjs_data = self._search_nextjs_data(webpage, video_id)

        # First Apollo state entry keyed 'Content:...' whose slug equals the URL slug
        content_data = traverse_obj(nextjs_data, (
            'props', 'pageProps', '__APOLLO_STATE__',
            lambda k, v: k.startswith('Content:') and v['slug'] == video_id, any))
        if not content_data:
            raise ExtractorError('Failed to extract content data')

        # Video data is tried before podcast audio data; each candidate is a
        # JSON string and is kept only if it decodes to a dict
        media_paths = (
            ('contentVideo', 'video', 'videoData'),
            ('contentPodcast', 'podcast', 'audioData'))
        jwplayer_data = traverse_obj(content_data, (media_paths, {json.loads}, {dict}, any))

        if not jwplayer_data:
            # Work out why there is no media, from most to least specific cause
            if content_data.get('contentType') not in ('videoPodcast', 'video', 'podcast'):
                raise ExtractorError('Content is not a video/podcast', expected=True)
            # NOTE(review): presumably this ref identifies the tier that needs no membership - confirm
            tier_ref = traverse_obj(content_data, ('contentTier', '__ref'))
            if tier_ref != 'MemberTier:65b258d178f89be87b4dc0a4':
                self.raise_login_required('This video/podcast is for members only')
            raise ExtractorError('Failed to extract content')

        player_info = self._parse_jwplayer_data(jwplayer_data, video_id)
        page_metadata = traverse_obj(content_data, {
            'title': ('title', {str}),
            'description': ('description', {str}),
            'timestamp': ('publishedAt', {parse_iso8601}),
        })
        # Page metadata overrides whatever the JW Player parser returned
        return {**player_info, **page_metadata}

View file

@ -1852,7 +1852,7 @@ class BiliBiliPlayerIE(InfoExtractor):
class BiliIntlBaseIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor):
_API_URL = 'https://api.bilibili.tv/intl/gateway' _API_URL = 'https://api.bilibili.tv/intl/gateway'
_NETRC_MACHINE = 'biliintl' _NETRC_MACHINE = 'biliintl'
_HEADERS = {'Referer': 'https://www.bilibili.com/'} _HEADERS = {'Referer': 'https://www.bilibili.tv/'}
def _call_api(self, endpoint, *args, **kwargs): def _call_api(self, endpoint, *args, **kwargs):
json = self._download_json(self._API_URL + endpoint, *args, **kwargs) json = self._download_json(self._API_URL + endpoint, *args, **kwargs)

View file

@ -35,6 +35,7 @@ from ..networking import HEADRequest, Request
from ..networking.exceptions import ( from ..networking.exceptions import (
HTTPError, HTTPError,
IncompleteRead, IncompleteRead,
TransportError,
network_exceptions, network_exceptions,
) )
from ..networking.impersonate import ImpersonateTarget from ..networking.impersonate import ImpersonateTarget
@ -965,6 +966,9 @@ class InfoExtractor:
return False return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data) encoding=encoding, data=data)
if content is False:
assert not fatal
return False
return (content, urlh) return (content, urlh)
@staticmethod @staticmethod
@ -1039,7 +1043,15 @@ class InfoExtractor:
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
prefix=None, encoding=None, data=None): prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read() try:
webpage_bytes = urlh.read()
except TransportError as err:
errmsg = f'{video_id}: Error reading response: {err.msg}'
if fatal:
raise ExtractorError(errmsg, cause=err)
self.report_warning(errmsg)
return False
if prefix is not None: if prefix is not None:
webpage_bytes = prefix + webpage_bytes webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False): if self.get_param('dump_intermediate_pages', False):
@ -3489,7 +3501,7 @@ class InfoExtractor:
continue continue
urls.add(source_url) urls.add(source_url)
source_type = source.get('type') or '' source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url) ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native', source_url, video_id, 'mp4', entry_protocol='m3u8_native',

View file

@ -319,32 +319,6 @@ class DPlayIE(DPlayBaseIE):
url, display_id, host, 'dplay' + country, country, domain) url, display_id, host, 'dplay' + country, country, domain)
class HGTVDeIE(DPlayBaseIE):
_VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX
_TESTS = [{
'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
'info_dict': {
'id': '151205',
'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
'ext': 'mp4',
'title': 'Wer braucht schon eine Toilette',
'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
'duration': 1177.024,
'timestamp': 1595705400,
'upload_date': '20200725',
'creator': 'HGTV',
'series': 'Tiny House - klein, aber oho',
'season_number': 3,
'episode_number': 3,
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
return self._get_disco_api_info(
url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
class DiscoveryPlusBaseIE(DPlayBaseIE): class DiscoveryPlusBaseIE(DPlayBaseIE):
"""Subclasses must set _PRODUCT, _DISCO_API_PARAMS""" """Subclasses must set _PRODUCT, _DISCO_API_PARAMS"""
@ -373,6 +347,45 @@ class DiscoveryPlusBaseIE(DPlayBaseIE):
return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS) return self._get_disco_api_info(url, self._match_id(url), **self._DISCO_API_PARAMS)
class HGTVDeIE(DiscoveryPlusBaseIE):
    # de.hgtv.com "sendungen" pages, served through the eu1-prod disco API
    _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX
    _PRODUCT = 'hgtv'
    _DISCO_API_PARAMS = {
        'disco_host': 'eu1-prod.disco-api.com',
        'realm': 'hgtv',
        'country': 'de',
    }
    _TESTS = [{
        'url': 'https://de.hgtv.com/sendungen/mein-kleinstadt-traumhaus/vom-landleben-ins-loft',
        'info_dict': {
            'id': '7332936',
            'ext': 'mp4',
            'display_id': 'mein-kleinstadt-traumhaus/vom-landleben-ins-loft',
            'title': 'Vom Landleben ins Loft',
            'description': 'md5:e5f72c02c853970796dd3818f2e25745',
            'episode': 'Episode 7',
            'episode_number': 7,
            'season': 'Season 7',
            'season_number': 7,
            'series': 'Mein Kleinstadt-Traumhaus',
            'duration': 2645.0,
            'timestamp': 1725998100,
            'upload_date': '20240910',
            'creators': ['HGTV'],
            'tags': [],
            'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/08/09/82a386b9-c688-32c7-b9ff-0b13865f0bae.jpeg',
        },
    }]

    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
        # Build every value first so `headers` is only mutated once all of
        # them (including the auth lookup) have been produced.
        disco_headers = {
            'x-disco-params': f'realm={realm}',
            'x-disco-client': 'Alps:HyogaPlayer:0.0.0',
            'Authorization': self._get_auth(disco_base, display_id, realm),
        }
        headers.update(disco_headers)
class GoDiscoveryIE(DiscoveryPlusBaseIE): class GoDiscoveryIE(DiscoveryPlusBaseIE):
_VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX _VALID_URL = r'https?://(?:go\.)?discovery\.com/video' + DPlayBaseIE._PATH_REGEX
_TESTS = [{ _TESTS = [{

View file

@ -294,37 +294,37 @@ class ESPNCricInfoIE(InfoExtractor):
class WatchESPNIE(AdobePassIE): class WatchESPNIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
_TESTS = [{ _TESTS = [{
'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', 'url': 'https://www.espn.com/watch/player/_/id/11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
'info_dict': { 'info_dict': {
'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', 'id': '11ce417a-6ac9-42b6-8a15-46aeb9ad5710',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Huddersfield vs. Burnley', 'title': 'Abilene Chrstn vs. Texas Tech',
'duration': 7500, 'duration': 14166,
'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/11ce417a-6ac9-42b6-8a15-46aeb9ad5710/16x9.jpg?timestamp=202407252343&showBadge=true&cb=12&package=ESPN_PLUS',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', 'url': 'https://www.espn.com/watch/player/_/id/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
'info_dict': { 'info_dict': {
'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', 'id': '90a2c85d-75e0-4b1e-a878-8e428a3cb2f3',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Dynamo Dresden vs. VfB Stuttgart (Round #1) (German Cup)', 'title': 'UC Davis vs. California',
'duration': 8335, 'duration': 9547,
'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/90a2c85d-75e0-4b1e-a878-8e428a3cb2f3/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', 'url': 'https://www.espn.com/watch/player/_/id/c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
'info_dict': { 'info_dict': {
'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', 'id': 'c4313bbe-95b5-4bb8-b251-ac143ea0fc54',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The Wheel - Episode 10', 'title': 'The College Football Show',
'duration': 3352, 'duration': 3639,
'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/c4313bbe-95b5-4bb8-b251-ac143ea0fc54/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -353,6 +353,13 @@ class WatchESPNIE(AdobePassIE):
if not cookie: if not cookie:
self.raise_login_required(method='cookies') self.raise_login_required(method='cookies')
jwt = self._search_regex(r'=([^|]+)\|', cookie.value, 'cookie jwt')
id_token = self._download_json(
'https://registerdisney.go.com/jgc/v6/client/ESPN-ONESITE.WEB-PROD/guest/refresh-auth',
None, 'Refreshing token', headers={'Content-Type': 'application/json'}, data=json.dumps({
'refreshToken': json.loads(base64.urlsafe_b64decode(f'{jwt}==='))['refresh_token'],
}).encode())['data']['token']['id_token']
assertion = self._call_bamgrid_api( assertion = self._call_bamgrid_api(
'devices', video_id, 'devices', video_id,
headers={'Content-Type': 'application/json; charset=UTF-8'}, headers={'Content-Type': 'application/json; charset=UTF-8'},
@ -371,7 +378,7 @@ class WatchESPNIE(AdobePassIE):
})['access_token'] })['access_token']
assertion = self._call_bamgrid_api( assertion = self._call_bamgrid_api(
'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, 'accounts/grant', video_id, payload={'id_token': id_token},
headers={ headers={
'Authorization': token, 'Authorization': token,
'Content-Type': 'application/json; charset=UTF-8', 'Content-Type': 'application/json; charset=UTF-8',

View file

@ -84,7 +84,7 @@ class FacebookIE(InfoExtractor):
'timestamp': 1692346159, 'timestamp': 1692346159,
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670', 'uploader_id': '100063551323670',
'duration': 3132.184, 'duration': 3133.583,
'view_count': int, 'view_count': int,
'concurrent_view_count': 0, 'concurrent_view_count': 0,
}, },
@ -112,9 +112,10 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506', 'upload_date': '20140506',
'timestamp': 1399398998, 'timestamp': 1399398998,
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', 'uploader_id': 'pfbid05AzrFTXgY37tqwaSgbFTTEpCLBjjEJHkigogwGiRPtKEpAsJYJpzE94H1RxYXWEtl',
'duration': 131.03, 'duration': 131.03,
'concurrent_view_count': int, 'concurrent_view_count': int,
'view_count': int,
}, },
}, { }, {
'note': 'Video with DASH manifest', 'note': 'Video with DASH manifest',
@ -167,7 +168,7 @@ class FacebookIE(InfoExtractor):
# have 1080P, but only up to 720p in swf params # have 1080P, but only up to 720p in swf params
# data.video.story.attachments[].media # data.video.story.attachments[].media
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
'md5': 'ca63897a90c9452efee5f8c40d080e25', 'md5': '1659aa21fb3dd1585874f668e81a72c8',
'info_dict': { 'info_dict': {
'id': '10155529876156509', 'id': '10155529876156509',
'ext': 'mp4', 'ext': 'mp4',
@ -180,9 +181,10 @@ class FacebookIE(InfoExtractor):
'view_count': int, 'view_count': int,
'uploader_id': '100059479812265', 'uploader_id': '100059479812265',
'concurrent_view_count': int, 'concurrent_view_count': int,
'duration': 44.478, 'duration': 44.181,
}, },
}, { }, {
# FIXME: unable to extract uploader, no formats found
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
# data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
@ -241,9 +243,9 @@ class FacebookIE(InfoExtractor):
'timestamp': 1511548260, 'timestamp': 1511548260,
'upload_date': '20171124', 'upload_date': '20171124',
'uploader': 'Vickie Gentry', 'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', 'uploader_id': 'pfbid0FkkycT95ySNNyfCw4Cho6u5G7WbbZEcxT496Hq8rtx1K3LcTCATpR3wnyYhmyGC5l',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'duration': 148.435, 'duration': 148.224,
}, },
}, { }, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media # data.node.comet_sections.content.story.attachments[].styles.attachment.media
@ -271,7 +273,7 @@ class FacebookIE(InfoExtractor):
'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...', 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'uploader': 'Lela Evans', 'uploader': 'Lela Evans',
'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', 'uploader_id': 'pfbid0swT2y7t6TAsZVBvcyeYPdhTMefGaS26mzUwML3vd1ma6ndGZKxsyS4Ssu3jitZLXl',
'upload_date': '20231228', 'upload_date': '20231228',
'timestamp': 1703804085, 'timestamp': 1703804085,
'duration': 394.347, 'duration': 394.347,
@ -322,7 +324,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20180523', 'upload_date': '20180523',
'uploader': 'ESL One Dota 2', 'uploader': 'ESL One Dota 2',
'uploader_id': '100066514874195', 'uploader_id': '100066514874195',
'duration': 4524.212, 'duration': 4524.001,
'view_count': int, 'view_count': int,
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'concurrent_view_count': int, 'concurrent_view_count': int,
@ -339,9 +341,9 @@ class FacebookIE(InfoExtractor):
'title': 'Josef', 'title': 'Josef',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'concurrent_view_count': int, 'concurrent_view_count': int,
'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', 'uploader_id': 'pfbid02gpfwRM2XvdEJfsERupwQiNmBiDArc38RMRYZnap372q6Vs7MtFTVy72mmFWpJBTKl',
'timestamp': 1549275572, 'timestamp': 1549275572,
'duration': 3.413, 'duration': 3.283,
'uploader': 'Josef Novak', 'uploader': 'Josef Novak',
'description': '', 'description': '',
'upload_date': '20190204', 'upload_date': '20190204',
@ -396,6 +398,7 @@ class FacebookIE(InfoExtractor):
'playlist_count': 1, 'playlist_count': 1,
'skip': 'Requires logging in', 'skip': 'Requires logging in',
}, { }, {
# FIXME: Cannot parse data error
# data.event.cover_media_renderer.cover_video # data.event.cover_media_renderer.cover_video
'url': 'https://m.facebook.com/events/1509582499515440', 'url': 'https://m.facebook.com/events/1509582499515440',
'info_dict': { 'info_dict': {
@ -498,7 +501,8 @@ class FacebookIE(InfoExtractor):
or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict})) or get_first(post, ('node', 'actors', ..., {dict}))
or get_first(post, ('event', 'event_creator', {dict})) or {}) or get_first(post, ('event', 'event_creator', {dict}))
or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
uploader = uploader_data.get('name') or ( uploader = uploader_data.get('name') or (
clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
or self._search_regex( or self._search_regex(
@ -524,6 +528,11 @@ class FacebookIE(InfoExtractor):
webpage, 'view count', default=None)), webpage, 'view count', default=None)),
'concurrent_view_count': get_first(post, ( 'concurrent_view_count': get_first(post, (
('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
**traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
'like_count': ('likers', 'count', {int}),
'comment_count': ('total_comment_count', {int}),
'repost_count': ('share_count_reduced', {parse_count}),
}), get_all=False),
} }
info_json_ld = self._search_json_ld(webpage, video_id, default={}) info_json_ld = self._search_json_ld(webpage, video_id, default={})
@ -932,18 +941,21 @@ class FacebookReelIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://www.facebook.com/reel/1195289147628387', 'url': 'https://www.facebook.com/reel/1195289147628387',
'md5': 'f13dd37f2633595982db5ed8765474d3', 'md5': 'a53256d10fc2105441fe0c4212ed8cea',
'info_dict': { 'info_dict': {
'id': '1195289147628387', 'id': '1195289147628387',
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', 'title': r're:9\.6K views · 355 reactions .+ Let the “Slapathon” commence!! .+ LL COOL J · Mama Said Knock You Out$',
'description': 'md5:22f03309b216ac84720183961441d8db', 'description': r're:When your trying to help your partner .+ LL COOL J · Mama Said Knock You Out$',
'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', 'uploader': 'Beast Camp Training',
'uploader_id': '100040874179269', 'uploader_id': '100040874179269',
'duration': 9.579, 'duration': 9.579,
'timestamp': 1637502609, 'timestamp': 1637502609,
'upload_date': '20211121', 'upload_date': '20211121',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'like_count': int,
'comment_count': int,
'repost_count': int,
}, },
}] }]

View file

@ -8,15 +8,19 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
parse_duration,
str_or_none, str_or_none,
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_strdate,
update_url_query, update_url_query,
url_or_none,
) )
from ..utils.traversal import traverse_obj
class HuyaLiveIE(InfoExtractor): class HuyaLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)' _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?!(?:video/play/))(?P<id>[^/#?&]+)(?:\D|$)'
IE_NAME = 'huya:live' IE_NAME = 'huya:live'
IE_DESC = 'huya.com' IE_DESC = 'huya.com'
TESTS = [{ TESTS = [{
@ -24,6 +28,7 @@ class HuyaLiveIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '572329', 'id': '572329',
'title': str, 'title': str,
'ext': 'flv',
'description': str, 'description': str,
'is_live': True, 'is_live': True,
'view_count': int, 'view_count': int,
@ -131,3 +136,76 @@ class HuyaLiveIE(InfoExtractor):
fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] fm = base64.b64decode(params['fm']).decode().split('_', 1)[0]
ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']]))
return fm, ss return fm, ss
class HuyaVideoIE(InfoExtractor):
    # Recorded videos at huya.com/video/play/<numeric id>.html
    _VALID_URL = r'https?://(?:www\.)?huya\.com/video/play/(?P<id>\d+)\.html'
    IE_NAME = 'huya:video'
    IE_DESC = '虎牙视频'
    _TESTS = [{
        'url': 'https://www.huya.com/video/play/1002412640.html',
        'info_dict': {
            'id': '1002412640',
            'ext': 'mp4',
            'title': '8月3日',
            'thumbnail': r're:https?://.*\.jpg',
            'duration': 14,
            'uploader': '虎牙-ATS欧卡车队青木',
            'uploader_id': '1564376151',
            'upload_date': '20240803',
            'view_count': int,
            'comment_count': int,
            'like_count': int,
        },
    }, {
        'url': 'https://www.huya.com/video/play/556054543.html',
        'info_dict': {
            'id': '556054543',
            'ext': 'mp4',
            'title': '我不挑事 也不怕事',
            'thumbnail': r're:https?://.*\.jpg',
            'duration': 1864,
            'uploader': '卡尔',
            'uploader_id': '367138632',
            'upload_date': '20210811',
            'view_count': int,
            'comment_count': int,
            'like_count': int,
        },
    }]

    def _real_extract(self, url: str):
        video_id = self._match_id(url)
        moment = self._download_json(
            'https://liveapi.huya.com/moment/getMomentContent', video_id,
            query={'videoId': video_id})
        video_data = moment['data']['moment']['videoInfo']

        # One format per entry of `definitions` that carries a valid URL
        usable_definitions = traverse_obj(
            video_data, ('definitions', lambda _, v: url_or_none(v['url'])))
        formats = [{
            'url': definition['url'],
            **traverse_obj(definition, {
                'format_id': ('defName', {str}),
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
                'filesize': ('size', {int_or_none}),
            }),
        } for definition in usable_definitions]

        metadata = traverse_obj(video_data, {
            'title': ('videoTitle', {str}),
            'thumbnail': ('videoCover', {url_or_none}),
            'duration': ('videoDuration', {parse_duration}),
            'uploader': ('nickName', {str}),
            'uploader_id': ('uid', {str_or_none}),
            'upload_date': ('videoUploadTime', {unified_strdate}),
            'view_count': ('videoPlayNum', {int_or_none}),
            'comment_count': ('videoCommentNum', {int_or_none}),
            'like_count': ('favorCount', {int_or_none}),
        })
        return {
            'id': video_id,
            'formats': formats,
            **metadata,
        }

View file

@ -25,9 +25,29 @@ class IPrimaIE(InfoExtractor):
'id': 'p51388', 'id': 'p51388',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Partička (92)', 'title': 'Partička (92)',
'description': 'md5:859d53beae4609e6dd7796413f1b6cac', 'description': 'md5:57943f6a50d6188288c3a579d2fd5f01',
'upload_date': '20201103', 'episode': 'Partička (92)',
'timestamp': 1604437480, 'season': 'Partička',
'series': 'Prima Partička',
'episode_number': 92,
'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-ef6cf9de-c980-4443-92e4-17fe8bccd45c-16x9.jpeg',
},
'params': {
'skip_download': True, # m3u8 download
},
}, {
'url': 'https://zoom.iprima.cz/porady/krasy-kanarskych-ostrovu/tenerife-v-risi-ohne',
'info_dict': {
'id': 'p1412199',
'ext': 'mp4',
'episode_number': 3,
'episode': 'Tenerife: V říši ohně',
'description': 'md5:4b4a05c574b5eaef130e68d4811c3f2c',
'duration': 3111.0,
'thumbnail': 'https://d31b9s05ygj54s.cloudfront.net/prima-plus/image/video-f66dd7fb-c1a0-47d1-b3bc-7db328d566c5-16x9-1711636518.jpg/t_16x9_medium_1366_768',
'title': 'Tenerife: V říši ohně',
'timestamp': 1711825800,
'upload_date': '20240330',
}, },
'params': { 'params': {
'skip_download': True, # m3u8 download 'skip_download': True, # m3u8 download
@ -131,6 +151,7 @@ class IPrimaIE(InfoExtractor):
video_id = self._search_regex(( video_id = self._search_regex((
r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', r'productId\s*=\s*([\'"])(?P<id>p\d+)\1',
r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1', r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1',
r'let\s+videos\s*=\s*([\'"])(?P<id>p\d+)\1',
), webpage, 'real id', group='id', default=None) ), webpage, 'real id', group='id', default=None)
if not video_id: if not video_id:
@ -176,7 +197,7 @@ class IPrimaIE(InfoExtractor):
final_result = self._search_json_ld(webpage, video_id, default={}) final_result = self._search_json_ld(webpage, video_id, default={})
final_result.update({ final_result.update({
'id': video_id, 'id': video_id,
'title': title, 'title': final_result.get('title') or title,
'thumbnail': self._html_search_meta( 'thumbnail': self._html_search_meta(
['thumbnail', 'og:image', 'twitter:image'], ['thumbnail', 'og:image', 'twitter:image'],
webpage, 'thumbnail', default=None), webpage, 'thumbnail', default=None),

View file

@ -67,7 +67,7 @@ class KickIE(KickBaseIE):
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
return False if KickClipIE.suitable(url) else super().suitable(url) return False if (KickVODIE.suitable(url) or KickClipIE.suitable(url)) else super().suitable(url)
def _real_extract(self, url): def _real_extract(self, url):
channel = self._match_id(url) channel = self._match_id(url)
@ -98,25 +98,25 @@ class KickIE(KickBaseIE):
class KickVODIE(KickBaseIE): class KickVODIE(KickBaseIE):
IE_NAME = 'kick:vod' IE_NAME = 'kick:vod'
_VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/videos/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{ _TESTS = [{
'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', 'url': 'https://kick.com/xqc/videos/8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea',
'md5': '3870f94153e40e7121a6e46c068b70cb', 'md5': '3870f94153e40e7121a6e46c068b70cb',
'info_dict': { 'info_dict': {
'id': 'e74614f4-5270-4319-90ad-32179f19a45c', 'id': '8dd97a8d-e17f-48fb-8bc3-565f88dbc9ea',
'ext': 'mp4', 'ext': 'mp4',
'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', 'title': '18+ #ad 🛑LIVE🛑CLICK🛑DRAMA🛑NEWS🛑STUFF🛑REACT🛑GET IN HHERE🛑BOP BOP🛑WEEEE WOOOO🛑',
'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. LEADER OF THE JUICERS.', 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. LEADER OF THE JUICERS.',
'channel': 'xqc', 'channel': 'xqc',
'channel_id': '668', 'channel_id': '668',
'uploader': 'xQc', 'uploader': 'xQc',
'uploader_id': '676', 'uploader_id': '676',
'upload_date': '20240724', 'upload_date': '20240909',
'timestamp': 1721796562, 'timestamp': 1725919141,
'duration': 18566.0, 'duration': 10155.0,
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'view_count': int, 'view_count': int,
'categories': ['VALORANT'], 'categories': ['Just Chatting'],
'age_limit': 0, 'age_limit': 0,
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},

View file

@ -1,86 +1,11 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html,
format_field, format_field,
int_or_none, int_or_none,
parse_iso8601,
unified_strdate, unified_strdate,
) )
class LnkGoIE(InfoExtractor):
    """Extractor for lnkgo.lt / lnk.lt video pages backed by the lnk.lt video-page API."""

    _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?'
    _TESTS = [{
        'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai',
        'info_dict': {
            'id': '10809',
            'ext': 'mp4',
            'title': "Put'ka: Trys Klausimai",
            'upload_date': '20161216',
            'description': 'Seniai matytas Putka užduoda tris klausimėlius. Pabandykime surasti atsakymus.',
            'age_limit': 18,
            'duration': 117,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1481904000,
        },
        'params': {
            'skip_download': True,  # HLS download
        },
    }, {
        'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
        'info_dict': {
            'id': '10467',
            'ext': 'mp4',
            'title': 'Nėrdas: Kompiuterio Valymas',
            'upload_date': '20150113',
            'description': 'md5:7352d113a242a808676ff17e69db6a69',
            'age_limit': 18,
            'duration': 346,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1421164800,
        },
        'params': {
            'skip_download': True,  # HLS download
        },
    }, {
        'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413',
        'only_matching': True,
    }]
    # Maps the API's "pgRating" values to numeric age limits
    _AGE_LIMITS = {
        'N-7': 7,
        'N-14': 14,
        'S': 18,
    }
    # Filled with (stream prefix, video URL, secure token query string)
    _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s'

    def _real_extract(self, url):
        """Fetch the video-page JSON for the URL and build the info dict from its videoInfo entry."""
        display_id, episode_id = self._match_valid_url(url).group('id', 'episode_id')

        # The API expects '0' in the episode slot when the URL carries no numeric id
        api_url = f'https://lnk.lt/api/main/video-page/{display_id}/{episode_id or "0"}/false'
        info = self._download_json(api_url, display_id)['videoConfig']['videoInfo']

        video_id = str(info['id'])
        title = info['title']

        # 'smil' is requested when quality switching is advertised, plain 'mp4' otherwise
        if info.get('isQualityChangeAvailable'):
            prefix = 'smil'
        else:
            prefix = 'mp4'
        token_params = info.get('secureTokenParams') or ''
        m3u8_url = self._M3U8_TEMPL % (prefix, info['videoUrl'], token_params)
        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native')

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': format_field(info, 'posterImage', 'https://lnk.lt/all-images/%s'),
            'duration': int_or_none(info.get('duration')),
            'description': clean_html(info.get('htmlDescription')),
            'age_limit': self._AGE_LIMITS.get(info.get('pgRating'), 0),
            'timestamp': parse_iso8601(info.get('airDate')),
            'view_count': int_or_none(info.get('viewsCount')),
        }
class LnkIE(InfoExtractor): class LnkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?lnk\.lt/[^/]+/(?P<id>\d+)'

View file

@ -1,9 +1,6 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
extract_attributes,
)
class NZZIE(InfoExtractor): class NZZIE(InfoExtractor):
@ -22,19 +19,14 @@ class NZZIE(InfoExtractor):
'playlist_count': 1, 'playlist_count': 1,
}] }]
def _entries(self, webpage, page_id):
    """Yield one parsed JW Player entry per matching <script data-hid="jw-video-jw..."> tag in the page."""
    script_bodies = re.findall(
        r'(?s)<script[^>]* data-hid="jw-video-jw[^>]+>(.+?)</script>', webpage)
    for script_body in script_bodies:
        # Non-fatal: a script without a parsable "var settings = {...}" is simply skipped below
        jw_settings = self._search_json(
            r'var\s+settings\s*=[^{]*', script_body, 'settings', page_id, fatal=False)
        parsed = self._parse_jwplayer_data(jw_settings, page_id)
        if parsed:
            yield parsed
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) webpage = self._download_webpage(url, page_id)
entries = [] return self.playlist_result(self._entries(webpage, page_id), page_id)
for player_element in re.findall(
r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage):
player_params = extract_attributes(player_element)
if player_params.get('data-type') not in ('kaltura_singleArticle',):
self.report_warning('Unsupported player type')
continue
entry_id = player_params['data-id']
entries.append(self.url_result(
'kaltura:1750922:' + entry_id, 'Kaltura', entry_id))
return self.playlist_result(entries, page_id)

View file

@ -109,7 +109,7 @@ class PinterestBaseIE(InfoExtractor):
class PinterestIE(PinterestBaseIE): class PinterestIE(PinterestBaseIE):
_VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P<id>\d+)' _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
# formats found in data['videos'] # formats found in data['videos']
'url': 'https://www.pinterest.com/pin/664281013778109217/', 'url': 'https://www.pinterest.com/pin/664281013778109217/',
@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE):
}, { }, {
'url': 'https://co.pinterest.com/pin/824721750502199491/', 'url': 'https://co.pinterest.com/pin/824721750502199491/',
'only_matching': True, 'only_matching': True,
},
{
'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927',
'info_dict': {
'id': '2885187256207927',
'ext': 'mp4',
'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 💙💅',
'description': 'md5:5da41c767d2317e42e49b663b0b2150f',
'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs',
'uploader_id': '1142999717836434688',
'upload_date': '20240702',
'timestamp': 1719939156,
'duration': 7.967,
'comment_count': int,
'repost_count': int,
'categories': 'count:9',
'tags': ['#BlueLagoonPediNails', '#SpaExperience'],
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View file

@ -8,7 +8,7 @@ from ..utils import js_to_json
class RTPIE(InfoExtractor): class RTPIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
'md5': 'e736ce0c665e459ddb818546220b4ef8', 'md5': 'e736ce0c665e459ddb818546220b4ef8',
@ -19,9 +19,25 @@ class RTPIE(InfoExtractor):
'description': 'As paixões musicais de António Cartaxo e António Macedo', 'description': 'As paixões musicais de António Cartaxo e António Macedo',
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
}, },
}, {
'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril',
'md5': '9a81ed53f2b2197cfa7ed455b12f8ade',
'info_dict': {
'id': 'e757904',
'ext': 'mp4',
'title': '25 Curiosidades, 25 de Abril',
'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr',
'thumbnail': r're:^https?://.*\.jpg',
},
}, { }, {
'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano',
'only_matching': True,
}, {
'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon',
'only_matching': True,
}] }]
_RX_OBFUSCATION = re.compile(r'''(?xs) _RX_OBFUSCATION = re.compile(r'''(?xs)
@ -49,17 +65,17 @@ class RTPIE(InfoExtractor):
f, config = self._search_regex( f, config = self._search_regex(
r'''(?sx) r'''(?sx)
var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s* (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)?
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
''', webpage, ''', webpage,
'player config', group=('f', 'config')) 'player config', group=('f', 'config'))
f = self._parse_json(
f, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
config = self._parse_json( config = self._parse_json(
config, video_id, config, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id)) lambda data: self.__unobfuscate(data, video_id=video_id))
f = config['file'] if not f else self._parse_json(
f, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
formats = [] formats = []
if isinstance(f, dict): if isinstance(f, dict):

View file

@ -0,0 +1,33 @@
from .common import InfoExtractor
class ScreenRecIE(InfoExtractor):
    """Extractor for recordings published under screenrec.com/share/<id>."""

    _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P<id>\w{10})'
    _TESTS = [{
        'url': 'https://screenrec.com/share/DasLtbknYo',
        'info_dict': {
            'id': 'DasLtbknYo',
            'ext': 'mp4',
            'title': '02.05.2024_03.01.25_REC',
            'description': 'Recorded with ScreenRec',
            'thumbnail': r're:^https?://.*\.gif$',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Download the share page, read the HLS URL from its `customUrl` setting and return the info dict."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The quoted value of `customUrl: '...'` in the page is the HLS manifest
        m3u8_url = self._search_regex(
            r'customUrl\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 URL', group='url')

        # Prefer the OpenGraph title; fall back to the HTML <title>
        title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)
        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'formats': formats,
        }

36
yt_dlp/extractor/sen.py Normal file
View file

@ -0,0 +1,36 @@
from .common import InfoExtractor
from ..utils import url_or_none
from ..utils.traversal import traverse_obj
class SenIE(InfoExtractor):
    """Extractor for sen.com video pages, driven by the public content API."""

    _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P<id>[0-9a-f-]+)'
    _TEST = {
        'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4',
        'md5': 'ff615aca9691053c94f8f10d96cd7884',
        'info_dict': {
            'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4',
            'ext': 'mp4',
            'description': 'Florida, 28 Sep 2022',
            'title': 'Hurricane Ian',
            'tags': ['North America', 'Storm', 'Weather'],
        },
    }

    def _real_extract(self, url):
        """Query the content API for the video and assemble formats plus title/description/tags."""
        video_id = self._match_id(url)
        api_data = self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id)

        # Manifest URL advertised by the node whose id is "player", if there is one
        m3u8_url = traverse_obj(api_data, (
            'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any))
        if not m3u8_url:
            # No usable URL in the API response: use the manifest path derived from the video id
            m3u8_url = f'https://vod.sen.com/videos/{video_id}/manifest.m3u8'
        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')

        # Textual metadata is read from the node whose id is "details"
        metadata = traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', {
            'title': ('title', 'text', {str}),
            'description': ('descriptions', 0, 'text', {str}),
            'tags': ('badges', ..., 'text', {str}),
        }))

        return {
            'id': video_id,
            'formats': formats,
            **metadata,
        }

View file

@ -27,7 +27,7 @@ class ServusIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'AA-28BYCQNH92111', 'id': 'AA-28BYCQNH92111',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Klettersteige in den Alpen', 'title': 'Vie Ferrate - Klettersteige in den Alpen',
'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce',
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'duration': 2823, 'duration': 2823,
@ -38,6 +38,7 @@ class ServusIE(InfoExtractor):
'season_number': 11, 'season_number': 11,
'episode': 'Episode 8 - Vie Ferrate Klettersteige in den Alpen', 'episode': 'Episode 8 - Vie Ferrate Klettersteige in den Alpen',
'episode_number': 8, 'episode_number': 8,
'categories': ['Bergwelten'],
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, { }, {
@ -71,8 +72,11 @@ class ServusIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url).upper() video_id = self._match_id(url).upper()
webpage = self._download_webpage(url, video_id)
next_data = self._search_nextjs_data(webpage, video_id, fatal=False)
video = self._download_json( video = self._download_json(
'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', 'https://api-player.redbull.com/stv/servus-tv-playnet',
video_id, 'Downloading video JSON', query={'videoId': video_id}) video_id, 'Downloading video JSON', query={'videoId': video_id})
if not video.get('videoUrl'): if not video.get('videoUrl'):
self._report_errors(video) self._report_errors(video)
@ -89,7 +93,7 @@ class ServusIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'title': video.get('title'), 'title': video.get('title'),
'description': self._get_description(video_id) or video.get('description'), 'description': self._get_description(next_data) or video.get('description'),
'thumbnail': video.get('poster'), 'thumbnail': video.get('poster'),
'duration': float_or_none(video.get('duration')), 'duration': float_or_none(video.get('duration')),
'timestamp': unified_timestamp(video.get('currentSunrise')), 'timestamp': unified_timestamp(video.get('currentSunrise')),
@ -100,16 +104,19 @@ class ServusIE(InfoExtractor):
'episode_number': episode_number, 'episode_number': episode_number,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
**traverse_obj(next_data, ('props', 'pageProps', 'data', {
'title': ('title', 'rendered', {str}),
'timestamp': ('stv_date', 'raw', {int}),
'duration': ('stv_duration', {float_or_none}),
'categories': ('category_names', ..., {str}),
})),
} }
def _get_description(self, video_id): def _get_description(self, next_data):
info = self._download_json( return join_nonempty(*traverse_obj(next_data, (
f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', 'props', 'pageProps', 'data',
video_id, fatal=False) ('stv_short_description', 'stv_long_description'), {str},
{lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n')
return join_nonempty(*traverse_obj(info, (
('stv_short_description', 'stv_long_description'),
{lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n')
def _report_errors(self, video): def _report_errors(self, video):
playability_errors = traverse_obj(video, ('playabilityErrors', ...)) playability_errors = traverse_obj(video, ('playabilityErrors', ...))

View file

@ -1,33 +1,31 @@
import base64
import datetime as dt
import functools import functools
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..networking import HEADRequest from ..networking import HEADRequest
from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin from ..utils import int_or_none, traverse_obj, url_or_none, urljoin
class TenPlayIE(InfoExtractor): class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})'
_NETRC_MACHINE = '10play' _NETRC_MACHINE = '10play'
_TESTS = [{ _TESTS = [{
'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz',
'info_dict': { 'info_dict': {
'id': '6226844312001', 'id': '6336940246112',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut',
'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut',
'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach',
'duration': 186, 'duration': 74,
'season': 'Season 39', 'season': 'Season 41',
'season_number': 39, 'season_number': 41,
'series': 'Neighbours', 'series': 'Neighbours',
'thumbnail': r're:https://.*\.jpg', 'thumbnail': r're:https://.*\.jpg',
'uploader': 'Channel 10', 'uploader': 'Channel 10',
'age_limit': 15, 'age_limit': 15,
'timestamp': 1611810000, 'timestamp': 1694386800,
'upload_date': '20210128', 'upload_date': '20230910',
'uploader_id': '2199827728001', 'uploader_id': '2199827728001',
}, },
'params': { 'params': {
@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor):
}, },
'skip': 'Only available in Australia', 'skip': 'Only available in Australia',
}, { }, {
'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp',
'info_dict': { 'info_dict': {
'id': '6192880312001', 'id': '9000000000091177',
'ext': 'mp4', 'ext': 'mp4',
'title': "Todd Sampson's Body Hack - S4 Ep. 2", 'title': 'Neighbours - S42 Ep. 9107',
'description': 'md5:fa278820ad90f08ea187f9458316ac74', 'alt_title': 'Thu 05 Sep',
'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef',
'duration': 1388,
'episode': 'Episode 9107',
'episode_number': 9107,
'season': 'Season 42',
'season_number': 42,
'series': 'Neighbours',
'thumbnail': r're:https://.*\.jpg',
'age_limit': 15, 'age_limit': 15,
'timestamp': 1600770600, 'timestamp': 1725517860,
'upload_date': '20200922', 'upload_date': '20240905',
'uploader': 'Channel 10', 'uploader': 'Channel 10',
'uploader_id': '2199827728001', 'uploader_id': '2199827728001',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'skip': 'Only available in Australia',
}, { }, {
'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
'only_matching': True, 'only_matching': True,
@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor):
'X': 18, 'X': 18,
} }
def _get_bearer_token(self, video_id):
    """Log in with the configured credentials and return the value for an Authorization header.

    Raises a login-required error when either the username or the password is missing.
    """
    username, password = self._get_login_info()
    if any(credential is None for credential in (username, password)):
        self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.')

    # The auth header is today's local date (time part zeroed), base64-encoded
    day_stamp = dt.datetime.now().strftime('%Y%m%d000000')
    auth_header = base64.b64encode(day_stamp.encode('ascii')).decode('ascii')

    login_form = urlencode_postdata({
        'email': username,
        'password': password,
    })
    response = self._download_json(
        'https://10play.com.au/api/user/auth', video_id, 'Getting bearer token',
        headers={'X-Network-Ten-Auth': auth_header}, data=login_form)
    access_token = response['jwt']['accessToken']
    return 'Bearer ' + access_token
def _real_extract(self, url): def _real_extract(self, url):
content_id = self._match_id(url) content_id = self._match_id(url)
data = self._download_json( data = self._download_json(
'https://10play.com.au/api/v1/videos/' + content_id, content_id) 'https://10play.com.au/api/v1/videos/' + content_id, content_id)
headers = {}
if data.get('memberGated') is True: video_data = self._download_json(
_token = self._get_bearer_token(content_id) f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}',
headers = {'Authorization': _token} content_id, 'Downloading video JSON')
m3u8_url = self._request_webpage(
_video_url = self._download_json( HEADRequest(video_data['items'][0]['HLSURL']),
data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', content_id, 'Checking stream URL').url
headers=headers).get('source')
m3u8_url = self._request_webpage(HEADRequest(
_video_url), content_id).url
if '10play-not-in-oz' in m3u8_url: if '10play-not-in-oz' in m3u8_url:
self.raise_geo_restricted(countries=['AU']) self.raise_geo_restricted(countries=['AU'])
# Attempt to get a higher quality stream
m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000')
formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4')
return { return {
'id': content_id,
'formats': formats, 'formats': formats,
'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None,
'id': data.get('altId') or content_id,
'duration': data.get('duration'),
'title': data.get('subtitle'),
'alt_title': data.get('title'),
'description': data.get('description'),
'age_limit': self._AUS_AGES.get(data.get('classification')),
'series': data.get('tvShow'),
'season_number': int_or_none(data.get('season')),
'episode_number': int_or_none(data.get('episode')),
'timestamp': data.get('published'),
'thumbnail': data.get('imageUrl'),
'uploader': 'Channel 10', 'uploader': 'Channel 10',
'uploader_id': '2199827728001', 'uploader_id': '2199827728001',
**traverse_obj(data, {
'id': ('altId', {str}),
'duration': ('duration', {int_or_none}),
'title': ('subtitle', {str}),
'alt_title': ('title', {str}),
'description': ('description', {str}),
'age_limit': ('classification', {self._AUS_AGES.get}),
'series': ('tvShow', {str}),
'season_number': ('season', {int_or_none}),
'episode_number': ('episode', {int_or_none}),
'timestamp': ('published', {int_or_none}),
'thumbnail': ('imageUrl', {url_or_none}),
}),
} }

View file

@ -1,7 +1,17 @@
import base64
import math import math
import time
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call from .videa import VideaIE
from ..utils import (
InAdvancePagedList,
int_or_none,
str_or_none,
traverse_obj,
try_call,
update_url_query,
)
class XimalayaBaseIE(InfoExtractor): class XimalayaBaseIE(InfoExtractor):
@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE):
'like_count': int, 'like_count': int,
}, },
}, },
{
# VIP-restricted audio
'url': 'https://www.ximalaya.com/sound/562111701',
'only_matching': True,
},
] ]
@staticmethod
def _decrypt_filename(file_id, seed):
cgstr = ''
key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890'
for _ in key:
seed = float(int(211 * seed + 30031) % 65536)
r = int(seed / 65536 * len(key))
cgstr += key[r]
key = key.replace(key[r], '')
parts = file_id.split('*')
filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal())
if not filename.startswith('/'):
filename = '/' + filename
return filename
@staticmethod
def _decrypt_url_params(encrypted_params):
    """Base64-decode and RC4-decrypt `encrypted_params`, returning (sign, token, timestamp).

    The decrypted text is split on '-'; the values are taken from positions 1, 2 and 3.
    """
    ciphertext = base64.b64decode(encrypted_params)
    fields = VideaIE.rc4(ciphertext, 'xkt3a41psizxrh9l').split('-')
    # sign, token, timestamp
    sign, token, timestamp = fields[1], fields[2], fields[3]
    return sign, token, timestamp
def _real_extract(self, url): def _real_extract(self, url):
scheme = 'https' if url.startswith('https') else 'http' scheme = 'https' if url.startswith('https') else 'http'
audio_id = self._match_id(url) audio_id = self._match_id(url)
audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json'
audio_info = self._download_json( audio_info = self._download_json(
audio_info_file, audio_id, f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id,
f'Downloading info json {audio_info_file}', 'Unable to download info file') 'Downloading info json', 'Unable to download info file')
formats = [{ formats = []
# NOTE: VIP-restricted audio
if audio_info.get('is_paid'):
ts = int(time.time())
vip_info = self._download_json(
f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}',
audio_id, 'Downloading VIP info json', 'Unable to download VIP info file',
query={'device': 'pc', 'isBackend': 'true', '_': ts})
filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed'])
sign, token, timestamp = self._decrypt_url_params(vip_info['ep'])
vip_url = update_url_query(
f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', {
'sign': sign,
'token': token,
'timestamp': timestamp,
'buy_key': vip_info['buyKey'],
'duration': vip_info['duration'],
})
fmt = {
'format_id': 'vip',
'url': vip_url,
'vcodec': 'none',
}
if '_preview_' in vip_url:
self.report_warning(
f'This tracks requires a VIP account. Using a sample instead. {self._login_hint()}')
fmt.update({
'format_note': 'Sample',
'preference': -10,
**traverse_obj(vip_info, {
'filesize': ('sampleLength', {int_or_none}),
'duration': ('sampleDuration', {int_or_none}),
}),
})
else:
fmt.update(traverse_obj(vip_info, {
'filesize': ('totalLength', {int_or_none}),
'duration': ('duration', {int_or_none}),
}))
fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024)
formats.append(fmt)
formats.extend([{
'format_id': f'{bps}k', 'format_id': f'{bps}k',
'url': audio_info[k], 'url': audio_info[k],
'abr': bps, 'abr': bps,
'vcodec': 'none', 'vcodec': 'none',
} for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)])
thumbnails = [] thumbnails = []
for k in audio_info: for k in audio_info:

View file

@ -3,16 +3,13 @@ from ..utils import (
int_or_none, int_or_none,
str_or_none, str_or_none,
try_get, try_get,
update_url_query,
url_or_none, url_or_none,
) )
class XinpianchangIE(InfoExtractor): class XinpianchangIE(InfoExtractor):
_WORKING = False _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?P<id>a\d+)'
_VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)' IE_DESC = '新片场'
IE_NAME = 'xinpianchang'
IE_DESC = 'xinpianchang.com'
_TESTS = [{ _TESTS = [{
'url': 'https://www.xinpianchang.com/a11766551', 'url': 'https://www.xinpianchang.com/a11766551',
'info_dict': { 'info_dict': {
@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id=video_id) webpage = self._download_webpage(url, video_id=video_id)
domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video']
vid = self.find_value_with_regex(var='vid', webpage=webpage)
app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) data = self._download_json(
api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id,
data = self._download_json(api, video_id=video_id)['data'] query={'appKey': video_data['appKey']})['data']
formats, subtitles = [], {} formats, subtitles = [], {}
for k, v in data.get('resource').items(): for k, v in data.get('resource').items():
if k in ('dash', 'hls'): if k in ('dash', 'hls'):
@ -72,6 +69,10 @@ class XinpianchangIE(InfoExtractor):
'width': int_or_none(prog.get('width')), 'width': int_or_none(prog.get('width')),
'height': int_or_none(prog.get('height')), 'height': int_or_none(prog.get('height')),
'ext': 'mp4', 'ext': 'mp4',
'http_headers': {
# NB: Server returns 403 without the Range header
'Range': 'bytes=0-',
},
} for prog in v if prog.get('url') or []]) } for prog in v if prog.get('url') or []])
return { return {
@ -87,6 +88,3 @@ class XinpianchangIE(InfoExtractor):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }
def find_value_with_regex(self, var, webpage):
    """Return the double-quoted string assigned to the JavaScript variable `var` in `webpage`."""
    # Matches: var <name> = "<value>" (exactly one whitespace character between tokens)
    assignment_re = rf'var\s{var}\s=\s\"(?P<vid>[^\"]+)\"'
    return self._search_regex(assignment_re, webpage, name=var)

View file

@ -69,6 +69,8 @@ from ..utils import (
) )
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token'
# any clients starting with _ cannot be explicitly requested by the user # any clients starting with _ cannot be explicitly requested by the user
INNERTUBE_CLIENTS = { INNERTUBE_CLIENTS = {
'web': { 'web': {
@ -79,6 +81,7 @@ INNERTUBE_CLIENTS = {
}, },
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
'REQUIRE_PO_TOKEN': True,
}, },
# Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
'web_safari': { 'web_safari': {
@ -90,6 +93,7 @@ INNERTUBE_CLIENTS = {
}, },
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 1, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
'REQUIRE_PO_TOKEN': True,
}, },
'web_embedded': { 'web_embedded': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -132,6 +136,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
'android_music': { 'android_music': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -146,6 +151,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
'android_creator': { 'android_creator': {
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
@ -160,6 +166,7 @@ INNERTUBE_CLIENTS = {
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
'REQUIRE_JS_PLAYER': False, 'REQUIRE_JS_PLAYER': False,
'REQUIRE_PO_TOKEN': True,
}, },
# YouTube Kids videos aren't returned on this client for some reason # YouTube Kids videos aren't returned on this client for some reason
'android_vr': { 'android_vr': {
@ -323,6 +330,7 @@ def build_innertube_clients():
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
ytcfg.setdefault('REQUIRE_PO_TOKEN', False)
ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg.setdefault('PLAYER_PARAMS', None)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
@ -688,31 +696,46 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None, fatal=False) 'identity token', default=None, fatal=False)
@staticmethod def _data_sync_id_to_delegated_session_id(self, data_sync_id):
def _extract_account_syncid(*args): if not data_sync_id:
return
# datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
# and just "user_syncid||" for primary channel. We only want the channel_syncid
channel_syncid, _, user_syncid = data_sync_id.partition('||')
if user_syncid:
return channel_syncid
def _extract_account_syncid(self, *args):
""" """
Extract syncId required to download private playlists of secondary channels Extract current session ID required to download private playlists of secondary channels
@params response and/or ytcfg @params response and/or ytcfg
""" """
for data in args: # ytcfg includes channel_syncid if on secondary channel
# ytcfg includes channel_syncid if on secondary channel if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)):
delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) return delegated_sid
if delegated_sid:
return delegated_sid
sync_ids = (try_get(
data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
lambda x: x['DATASYNC_ID']), str) or '').split('||')
if len(sync_ids) >= 2 and sync_ids[1]:
# datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
# and just "user_syncid||" for primary channel. We only want the channel_syncid
return sync_ids[0]
@staticmethod data_sync_id = self._extract_data_sync_id(*args)
def _extract_visitor_data(*args): return self._data_sync_id_to_delegated_session_id(data_sync_id)
def _extract_data_sync_id(self, *args):
    """
    Look up the dataSyncId of the current account.
    Its format is DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID||
    A non-empty `data_sync_id` extractor argument is returned as-is and
    takes precedence over anything found in the passed objects.
    @params response and/or ytcfg
    """
    configured = self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)
    override = configured[0]
    if override:
        return override

    # Each argument may carry the ID either as a top-level ytcfg key
    # or nested inside the response context of an API response
    candidate_keys = ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId'))
    return traverse_obj(args, (..., candidate_keys, {str}, any))
def _extract_visitor_data(self, *args):
""" """
Extracts visitorData from an API response or ytcfg Extracts visitorData from an API response or ytcfg
Appears to be used to track session state Appears to be used to track session state
""" """
if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]:
return visitor_data
return get_first( return get_first(
args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))],
expected_type=str) expected_type=str)
@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
} }
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_POTOKEN_EXPERIMENTS = ('51217476', '51217102')
_BROKEN_CLIENTS = {
short_client_name(client): client
for client in ('android', 'android_creator', 'android_music')
}
_DEFAULT_CLIENTS = ('ios', 'web_creator') _DEFAULT_CLIENTS = ('ios', 'web_creator')
_GEO_BYPASS = False _GEO_BYPASS = False
@ -3701,6 +3719,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
**cls._get_checkok_params(), **cls._get_checkok_params(),
} }
def _get_config_po_token(self, client):
    """
    Return the PO Token configured for `client` through the `po_token`
    extractor argument, or None when no entry matches.
    Entries are expected in the form "client+po_token"; an entry without
    a "+" separator is reported once as a warning and ignored.
    """
    configured_entries = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True)
    for token_str in configured_entries:
        # Split on the first "+" only; everything after it is the token
        entry_client, separator, entry_token = token_str.partition('+')
        if separator:
            if entry_client == client:
                return entry_token
        else:
            self.report_warning(
                f'Invalid po_token configuration format. Expected "client+po_token", got "{token_str}"', only_once=True)
    return None
def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
    """
    Return a PO Token for the given client, or None if none could be obtained.

    A token configured through the `po_token` extractor argument takes
    precedence; otherwise the lookup is delegated to `_fetch_po_token`.
    The "missing Visitor Data / Data Sync ID" checks below are only applied
    when `player_url` is set.

    @param client        name of the client the token is requested for
    @param visitor_data  Visitor Data the token is bound to when logged out
    @param data_sync_id  account Data Sync ID the token is bound to when logged in
    @param player_url    player URL; when falsy the checks above are skipped
    @param kwargs        passed through unchanged to `_fetch_po_token`
    @returns             the PO Token, or None
    """
    # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function.
    if not visitor_data and not self.is_authenticated and player_url:
        self.report_warning(
            f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. '
            'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"')
        return None

    config_po_token = self._get_config_po_token(client)
    if config_po_token:
        # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token,
        # if using first channel in an account then we don't need the data_sync_id anymore...
        if not data_sync_id and self.is_authenticated and player_url:
            # NOTE: adjacent literals are concatenated verbatim, so the first
            # one must end with a space to keep the two sentences apart
            self.report_warning(
                f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work. '
                'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')

        # The configured token is still returned; the warning above is advisory only
        return config_po_token

    # Require PO Token if logged in for external fetching
    if not data_sync_id and self.is_authenticated and player_url:
        self.report_warning(
            f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. '
            'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
        return None

    return self._fetch_po_token(
        client=client,
        visitor_data=visitor_data,
        data_sync_id=data_sync_id,
        player_url=player_url,
        **kwargs,
    )
def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
"""External PO Token fetch stub"""
@staticmethod @staticmethod
def _is_agegated(player_response): def _is_agegated(player_response):
if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
@ -3717,13 +3783,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _is_unplayable(player_response): def _is_unplayable(player_response):
return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token):
session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
headers = self.generate_api_headers( headers = self.generate_api_headers(
ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) ytcfg=player_ytcfg,
default_client=client,
visitor_data=visitor_data,
session_index=self._extract_session_index(master_ytcfg, player_ytcfg),
account_syncid=(
self._data_sync_id_to_delegated_session_id(data_sync_id)
or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg)
),
)
yt_query = { yt_query = {
'videoId': video_id, 'videoId': video_id,
@ -3734,6 +3804,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]:
yt_query['params'] = player_params yt_query['params'] = player_params
if po_token:
yt_query['serviceIntegrityDimensions'] = {'poToken': po_token}
sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
yt_query.update(self._generate_player_context(sts)) yt_query.update(self._generate_player_context(sts))
return self._extract_response( return self._extract_response(
item_id=video_id, ep='player', query=yt_query, item_id=video_id, ep='player', query=yt_query,
@ -3744,7 +3818,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _get_requested_clients(self, url, smuggled_data): def _get_requested_clients(self, url, smuggled_data):
requested_clients = [] requested_clients = []
broken_clients = []
excluded_clients = [] excluded_clients = []
allowed_clients = sorted( allowed_clients = sorted(
(client for client in INNERTUBE_CLIENTS if client[:1] != '_'), (client for client in INNERTUBE_CLIENTS if client[:1] != '_'),
@ -3758,12 +3831,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
excluded_clients.append(client[1:]) excluded_clients.append(client[1:])
elif client not in allowed_clients: elif client not in allowed_clients:
self.report_warning(f'Skipping unsupported client "{client}"') self.report_warning(f'Skipping unsupported client "{client}"')
elif client in self._BROKEN_CLIENTS.values():
broken_clients.append(client)
else: else:
requested_clients.append(client) requested_clients.append(client)
# Force deprioritization of _BROKEN_CLIENTS for format de-duplication
requested_clients.extend(broken_clients)
if not requested_clients: if not requested_clients:
requested_clients.extend(self._DEFAULT_CLIENTS) requested_clients.extend(self._DEFAULT_CLIENTS)
for excluded_client in excluded_clients: for excluded_client in excluded_clients:
@ -3788,19 +3857,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return pr_id return pr_id
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
initial_pr = ignore_initial_response = None initial_pr = None
if webpage: if webpage:
if 'web' in clients:
experiments = traverse_obj(master_ytcfg, (
'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...))
if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
self.report_warning(
'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response')
ignore_initial_response = True
initial_pr = self._search_json( initial_pr = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
prs = [] prs = []
deprioritized_prs = []
if initial_pr and not self._invalid_player_response(initial_pr, video_id): if initial_pr and not self._invalid_player_response(initial_pr, video_id):
# Android player_response does not have microFormats which are needed for # Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats # extraction of some data. So we return the initial_pr with formats
@ -3822,14 +3886,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return return
tried_iframe_fallback = False tried_iframe_fallback = False
player_url = None player_url = visitor_data = data_sync_id = None
skipped_clients = {} skipped_clients = {}
while clients: while clients:
deprioritize_pr = False
client, base_client, variant = _split_innertube_client(clients.pop()) client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = {} player_ytcfg = master_ytcfg if client == 'web' else {}
if client == 'web': if 'configs' not in self._configuration_arg('player_skip') and client != 'web':
player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg
elif 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg
player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
@ -3842,34 +3905,53 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_url = self._download_player_url(video_id) player_url = self._download_player_url(video_id)
tried_iframe_fallback = True tried_iframe_fallback = True
pr = initial_pr if client == 'web' and not ignore_initial_response else None visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg)
for retry in self.RetryManager(fatal=False): data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg)
try: po_token = self.fetch_po_token(
pr = pr or self._extract_player_response( client=client, visitor_data=visitor_data,
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, data_sync_id=data_sync_id if self.is_authenticated else None,
player_url if require_js_player else None, initial_pr, smuggled_data) player_url=player_url if require_js_player else None,
except ExtractorError as e: )
self.report_warning(e)
break require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN')
experiments = traverse_obj(pr, ( if not po_token and require_po_token:
'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK', self.report_warning(
'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...)) f'No PO Token provided for {client} client, '
if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): f'which is required for working {client} formats. '
pr = None f'You can manually pass a PO Token for this client with '
retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True) f'--extractor-args "youtube:po_token={client}+XXX"',
if not pr: only_once=True)
deprioritize_pr = True
pr = initial_pr if client == 'web' else None
try:
pr = pr or self._extract_player_response(
client, video_id,
master_ytcfg=player_ytcfg or master_ytcfg,
player_ytcfg=player_ytcfg,
player_url=player_url,
initial_pr=initial_pr,
visitor_data=visitor_data,
data_sync_id=data_sync_id,
po_token=po_token)
except ExtractorError as e:
self.report_warning(e)
continue continue
if pr_id := self._invalid_player_response(pr, video_id): if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id skipped_clients[client] = pr_id
elif pr: elif pr:
# Save client name for introspection later # Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {} sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name sd[STREAMING_DATA_CLIENT_NAME] = client
sd[STREAMING_DATA_PO_TOKEN] = po_token
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name f[STREAMING_DATA_CLIENT_NAME] = client
prs.append(pr) f[STREAMING_DATA_PO_TOKEN] = po_token
if deprioritize_pr:
deprioritized_prs.append(pr)
else:
prs.append(pr)
# tv_embedded can work around age-gate and age-verification IF the video is embeddable # tv_embedded can work around age-gate and age-verification IF the video is embeddable
if self._is_agegated(pr) and variant != 'tv_embedded': if self._is_agegated(pr) and variant != 'tv_embedded':
@ -3893,6 +3975,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# _producer, _testsuite, & _vr variants can also work around age-verification # _producer, _testsuite, & _vr variants can also work around age-verification
append_client('web_creator', 'mediaconnect') append_client('web_creator', 'mediaconnect')
prs.extend(deprioritized_prs)
if skipped_clients: if skipped_clients:
self.report_warning( self.report_warning(
f'Skipping player responses from {"/".join(skipped_clients)} clients ' f'Skipping player responses from {"/".join(skipped_clients)} clients '
@ -4027,13 +4111,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
# _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds po_token = fmt.get(STREAMING_DATA_PO_TOKEN)
# Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
is_broken = client_name in self._BROKEN_CLIENTS if po_token:
fmt_url = update_url_query(fmt_url, {'pot': po_token})
# Clients that require PO Token return videoplayback URLs that may return 403
is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN'))
if is_broken: if is_broken:
self.report_warning( self.report_warning(
f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' f'{video_id}: {client_name} client formats require a PO Token which was not provided. '
'and may yield HTTP Error 403. They will be deprioritized', only_once=True) 'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
fps = int_or_none(fmt.get('fps')) or 0 fps = int_or_none(fmt.get('fps')) or 0
@ -4109,12 +4197,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
skip_manifests.add('dash') skip_manifests.add('dash')
def process_manifest_format(f, proto, client_name, itag): def process_manifest_format(f, proto, client_name, itag, po_token):
key = (proto, f.get('language')) key = (proto, f.get('language'))
if not all_formats and key in itags[itag]: if not all_formats and key in itags[itag]:
return False return False
itags[itag].add(key) itags[itag].add(key)
if f.get('source_preference') is None:
f['source_preference'] = -1
# Clients that require PO Token return videoplayback URLs that may return 403
# hls does not currently require PO Token
if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls':
self.report_warning(
f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. '
'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ')
f['source_preference'] -= 20
if itag and all_formats: if itag and all_formats:
f['format_id'] = f'{itag}-{proto}' f['format_id'] = f'{itag}-{proto}'
elif any(p != proto for p, _ in itags[itag]): elif any(p != proto for p, _ in itags[itag]):
@ -4126,9 +4226,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
f['language_preference'] = PREFERRED_LANG_VALUE f['language_preference'] = PREFERRED_LANG_VALUE
if f.get('source_preference') is None:
f['source_preference'] = -1
if itag in ('616', '235'): if itag in ('616', '235'):
f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
f['source_preference'] += 100 f['source_preference'] += 100
@ -4149,23 +4246,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
subtitles = {} subtitles = {}
for sd in streaming_data: for sd in streaming_data:
client_name = sd.get(STREAMING_DATA_CLIENT_NAME) client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
po_token = sd.get(STREAMING_DATA_PO_TOKEN)
hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
if hls_manifest_url: if hls_manifest_url:
if po_token:
hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
fmts, subs = self._extract_m3u8_formats_and_subtitles( fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
subtitles = self._merge_subtitles(subs, subtitles) subtitles = self._merge_subtitles(subs, subtitles)
for f in fmts: for f in fmts:
if process_manifest_format(f, 'hls', client_name, self._search_regex( if process_manifest_format(f, 'hls', client_name, self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)): r'/itag/(\d+)', f['url'], 'itag', default=None), po_token):
yield f yield f
dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
if dash_manifest_url: if dash_manifest_url:
if po_token:
dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH
for f in formats: for f in formats:
if process_manifest_format(f, 'dash', client_name, f['format_id']): if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
f['filesize'] = int_or_none(self._search_regex( f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
if needs_live_processing: if needs_live_processing:
@ -4987,7 +5088,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
def _rich_entries(self, rich_grid_renderer): def _rich_entries(self, rich_grid_renderer):
renderer = traverse_obj( renderer = traverse_obj(
rich_grid_renderer, rich_grid_renderer,
('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {}
video_id = renderer.get('videoId') video_id = renderer.get('videoId')
if video_id: if video_id:
yield self._extract_video(renderer) yield self._extract_video(renderer)
@ -4999,6 +5100,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
ie=YoutubeTabIE.ie_key(), video_id=playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
video_title=self._get_text(renderer, 'title')) video_title=self._get_text(renderer, 'title'))
return return
# shortsLockupViewModel extraction
entity_id = renderer.get('entityId')
if entity_id:
video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str}))
if not video_id:
return
yield self.url_result(
f'https://www.youtube.com/shorts/{video_id}',
ie=YoutubeIE, video_id=video_id,
**traverse_obj(renderer, ('overlayMetadata', {
'title': ('primaryText', 'content', {str}),
'view_count': ('secondaryText', 'content', {parse_count}),
})),
thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources'))
return
def _video_entry(self, video_renderer): def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId') video_id = video_renderer.get('videoId')

View file

@ -2919,6 +2919,7 @@ def mimetype2ext(mt, default=NO_DEFAULT):
'audio/webm': 'webm', 'audio/webm': 'webm',
'audio/x-matroska': 'mka', 'audio/x-matroska': 'mka',
'audio/x-mpegurl': 'm3u', 'audio/x-mpegurl': 'm3u',
'aacp': 'aac',
'midi': 'mid', 'midi': 'mid',
'ogg': 'ogg', 'ogg': 'ogg',
'wav': 'wav', 'wav': 'wav',