Update to ytdl-commit-9f6c03

[cbsnews] Fix extraction for python <3.6
9f6c03a006
This commit is contained in:
pukkandan 2021-04-17 08:32:33 +05:30
parent 5d34200268
commit 201c145953
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698
5 changed files with 163 additions and 41 deletions

View file

@@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE):
def _real_extract(self, url): def _real_extract(self, url):
item = self._parse_json(zlib.decompress(compat_b64decode( item = self._parse_json(zlib.decompress(compat_b64decode(
compat_urllib_parse_unquote(self._match_id(url))), compat_urllib_parse_unquote(self._match_id(url))),
-zlib.MAX_WBITS), None)['video']['items'][0] -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0]
return self._extract_video_info(item['mpxRefId'], 'cbsnews') return self._extract_video_info(item['mpxRefId'], 'cbsnews')

View file

@@ -122,6 +122,26 @@ class LBRYIE(LBRYBaseIE):
'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212',
'vcodec': 'none', 'vcodec': 'none',
} }
}, {
# HLS
'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e',
'md5': 'fc82f45ea54915b1495dd7cb5cc1289f',
'info_dict': {
'id': 'e51671357333fe22ae88aad320bde2f6f96b1410',
'ext': 'mp4',
'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁',
'description': 'md5:9c539c6a03fb843956de61a4d5288d5e',
'timestamp': 1618254123,
'upload_date': '20210412',
'release_timestamp': 1618254002,
'release_date': '20210412',
'tags': list,
'duration': 554,
'channel': 'Gardening In Canada',
'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc',
'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc',
'formats': 'mincount:3',
}
}, { }, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True, 'only_matching': True,
@@ -168,10 +188,18 @@ class LBRYIE(LBRYBaseIE):
streaming_url = self._call_api_proxy( streaming_url = self._call_api_proxy(
'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
info = self._parse_stream(result, url) info = self._parse_stream(result, url)
urlh = self._request_webpage(
streaming_url, display_id, note='Downloading streaming redirect url info')
if determine_ext(urlh.geturl()) == 'm3u8':
info['formats'] = self._extract_m3u8_formats(
urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
self._sort_formats(info['formats'])
else:
info['url'] = streaming_url
info.update({ info.update({
'id': claim_id, 'id': claim_id,
'title': title, 'title': title,
'url': streaming_url,
}) })
return info return info

View file

@@ -393,7 +393,7 @@ query viewClip {
# To somewhat reduce the probability of these consequences # To somewhat reduce the probability of these consequences
# we will sleep random amount of time before each call to ViewClip. # we will sleep random amount of time before each call to ViewClip.
self._sleep( self._sleep(
random.randint(2, 5), display_id, random.randint(5, 10), display_id,
'%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')
if not viewclip: if not viewclip:

View file

@@ -53,6 +53,10 @@ from ..utils import (
) )
def parse_qs(url):
return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors""" """Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
@@ -438,14 +442,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'(?:(?:www|dev)\.)?invidio\.us', r'(?:(?:www|dev)\.)?invidio\.us',
# Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
r'(?:www\.)?invidious\.pussthecat\.org', r'(?:www\.)?invidious\.pussthecat\.org',
r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.zee\.li', r'(?:www\.)?invidious\.zee\.li',
r'(?:www\.)?vid\.puffyan\.us',
r'(?:(?:www|au)\.)?ytprivate\.com', r'(?:(?:www|au)\.)?ytprivate\.com',
r'(?:www\.)?invidious\.namazso\.eu', r'(?:www\.)?invidious\.namazso\.eu',
r'(?:www\.)?invidious\.ethibox\.fr', r'(?:www\.)?invidious\.ethibox\.fr',
r'(?:www\.)?inv\.skyn3t\.in',
r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
@@ -454,25 +454,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'(?:(?:www|no)\.)?invidiou\.sh', r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
r'(?:www\.)?invidious\.kabi\.tk', r'(?:www\.)?invidious\.kabi\.tk',
r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.mastodon\.host',
r'(?:www\.)?invidious\.zapashcanon\.fr', r'(?:www\.)?invidious\.zapashcanon\.fr',
r'(?:www\.)?invidious\.kavin\.rocks', r'(?:www\.)?invidious\.kavin\.rocks',
r'(?:www\.)?invidious\.tinfoil-hat\.net',
r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?invidious\.reallyancient\.tech',
r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidious\.tube',
r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidiou\.site',
r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.site',
r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.xyz',
r'(?:www\.)?invidious\.nixnet\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz',
r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.drycat\.fr', r'(?:www\.)?invidious\.drycat\.fr',
r'(?:www\.)?inv\.skyn3t\.in',
r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.poal\.co',
r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?tube\.connect\.cafe',
r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.wxzm\.sx',
r'(?:www\.)?vid\.mint\.lgbt', r'(?:www\.)?vid\.mint\.lgbt',
r'(?:www\.)?vid\.puffyan\.us',
r'(?:www\.)?yewtu\.be', r'(?:www\.)?yewtu\.be',
r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.elukerio\.org',
r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?yt\.lelux\.fi',
r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?invidious\.ggc-project\.de',
r'(?:www\.)?yt\.maisputain\.ovh', r'(?:www\.)?yt\.maisputain\.ovh',
r'(?:www\.)?ytprivate\.com',
r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.toot\.koeln',
r'(?:www\.)?invidious\.fdn\.fr', r'(?:www\.)?invidious\.fdn\.fr',
r'(?:www\.)?watch\.nettohikari\.com', r'(?:www\.)?watch\.nettohikari\.com',
@@ -515,16 +522,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
) )
)? # all until now is optional -> you can pass the naked ID )? # all until now is optional -> you can pass the naked ID
(?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?!.*?\blist=
(?:
%(playlist_id)s| # combined list/video URLs are handled by the playlist IE
WL # WL are handled by the watch later IE
)
)
(?(1).+)? # if we found the ID, everything can follow (?(1).+)? # if we found the ID, everything can follow
$""" % { $""" % {
'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
'invidious': '|'.join(_INVIDIOUS_SITES), 'invidious': '|'.join(_INVIDIOUS_SITES),
} }
_PLAYER_INFO_RE = ( _PLAYER_INFO_RE = (
@@ -1009,6 +1009,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}, },
'skip': 'This video does not exist.', 'skip': 'This video does not exist.',
}, },
{
# Video with incomplete 'yt:stretch=16:'
'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
'only_matching': True,
},
{ {
# Video licensed under Creative Commons # Video licensed under Creative Commons
'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
@@ -1304,6 +1309,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}, },
] ]
@classmethod
def suitable(cls, url):
qs = parse_qs(url)
if qs.get('list', [None])[0]:
return False
return super(YoutubeIE, cls).suitable(url)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs) super(YoutubeIE, self).__init__(*args, **kwargs)
self._code_cache = {} self._code_cache = {}
@@ -2079,15 +2091,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
for keyword in keywords: for keyword in keywords:
if keyword.startswith('yt:stretch='): if keyword.startswith('yt:stretch='):
stretch_ratio = map( mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
lambda x: int_or_none(x, default=0), if mobj:
keyword.split('=')[1].split(':')) # NB: float is intentional for forcing float division
w, h = (list(stretch_ratio) + [0])[:2] w, h = (float(v) for v in mobj.groups())
if w > 0 and h > 0: if w > 0 and h > 0:
ratio = w / h ratio = w / h
for f in formats: for f in formats:
if f.get('vcodec') != 'none': if f.get('vcodec') != 'none':
f['stretched_ratio'] = ratio f['stretched_ratio'] = ratio
break
thumbnails = [] thumbnails = []
for container in (video_details, microformat): for container in (video_details, microformat):
@@ -2484,6 +2497,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
'uploader': 'Игорь Клейнер', 'uploader': 'Игорь Клейнер',
}, },
}, {
# playlists, series
'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
'playlist_mincount': 5,
'info_dict': {
'id': 'UCYO_jab_esuFRV4b17AJtAw',
'title': '3Blue1Brown - Playlists',
'description': 'md5:e1384e8a133307dd10edee76e875d62f',
},
}, { }, {
# playlists, singlepage # playlists, singlepage
'url': 'https://www.youtube.com/user/ThirstForScience/playlists', 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
@@ -2790,6 +2812,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'title': '#cctv9', 'title': '#cctv9',
}, },
'playlist_mincount': 350, 'playlist_mincount': 350,
}, {
'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
'only_matching': True,
}] }]
@classmethod @classmethod
@@ -2813,14 +2838,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
@staticmethod @staticmethod
def _extract_basic_item_renderer(item): def _extract_basic_item_renderer(item):
# Modified from _extract_grid_item_renderer # Modified from _extract_grid_item_renderer
known_renderers = ( known_basic_renderers = (
'playlistRenderer', 'videoRenderer', 'channelRenderer', 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer'
'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
) )
for key, renderer in item.items(): for key, renderer in item.items():
if key not in known_renderers: if not isinstance(renderer, dict):
continue continue
return renderer elif key in known_basic_renderers:
return renderer
elif key.startswith('grid') and key.endswith('Renderer'):
return renderer
def _grid_entries(self, grid_renderer): def _grid_entries(self, grid_renderer):
for item in grid_renderer['items']: for item in grid_renderer['items']:
@@ -2830,7 +2857,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not isinstance(renderer, dict): if not isinstance(renderer, dict):
continue continue
title = try_get( title = try_get(
renderer, lambda x: x['title']['runs'][0]['text'], compat_str) renderer, (lambda x: x['title']['runs'][0]['text'],
lambda x: x['title']['simpleText']), compat_str)
# playlist # playlist
playlist_id = renderer.get('playlistId') playlist_id = renderer.get('playlistId')
if playlist_id: if playlist_id:
@@ -2838,10 +2866,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'https://www.youtube.com/playlist?list=%s' % playlist_id, 'https://www.youtube.com/playlist?list=%s' % playlist_id,
ie=YoutubeTabIE.ie_key(), video_id=playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
video_title=title) video_title=title)
continue
# video # video
video_id = renderer.get('videoId') video_id = renderer.get('videoId')
if video_id: if video_id:
yield self._extract_video(renderer) yield self._extract_video(renderer)
continue
# channel # channel
channel_id = renderer.get('channelId') channel_id = renderer.get('channelId')
if channel_id: if channel_id:
@@ -2850,6 +2880,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield self.url_result( yield self.url_result(
'https://www.youtube.com/channel/%s' % channel_id, 'https://www.youtube.com/channel/%s' % channel_id,
ie=YoutubeTabIE.ie_key(), video_title=title) ie=YoutubeTabIE.ie_key(), video_title=title)
continue
# generic endpoint URL support
ep_url = urljoin('https://www.youtube.com/', try_get(
renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
compat_str))
if ep_url:
for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
if ie.suitable(ep_url):
yield self.url_result(
ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
break
def _shelf_entries_from_content(self, shelf_renderer): def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content') content = shelf_renderer.get('content')
@@ -3444,7 +3485,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
url = '%s/videos%s' % (mobj.get('pre'), mobj.get('post') or '') url = '%s/videos%s' % (mobj.get('pre'), mobj.get('post') or '')
# Handle both video/playlist URLs # Handle both video/playlist URLs
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) qs = parse_qs(url)
video_id = qs.get('v', [None])[0] video_id = qs.get('v', [None])[0]
playlist_id = qs.get('list', [None])[0] playlist_id = qs.get('list', [None])[0]
@@ -3550,12 +3591,16 @@ class YoutubePlaylistIE(InfoExtractor):
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
return False if YoutubeTabIE.suitable(url) else super( if YoutubeTabIE.suitable(url):
YoutubePlaylistIE, cls).suitable(url) return False
qs = parse_qs(url)
if qs.get('v', [None])[0]:
return False
return super(YoutubePlaylistIE, cls).suitable(url)
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) qs = parse_qs(url)
if not qs: if not qs:
qs = {'list': playlist_id} qs = {'list': playlist_id}
return self.url_result( return self.url_result(

View file

@@ -40,6 +40,7 @@ import zlib
from .compat import ( from .compat import (
compat_HTMLParseError, compat_HTMLParseError,
compat_HTMLParser, compat_HTMLParser,
compat_HTTPError,
compat_basestring, compat_basestring,
compat_chr, compat_chr,
compat_cookiejar, compat_cookiejar,
@@ -2925,12 +2926,60 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
if sys.version_info[0] < 3: """YoutubeDL redirect handler
def redirect_request(self, req, fp, code, msg, headers, newurl):
# On python 2 urlh.geturl() may sometimes return redirect URL The code is based on HTTPRedirectHandler implementation from CPython [1].
# as byte string instead of unicode. This workaround allows
# to force it always return unicode. This redirect handler solves two issues:
return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) - ensures redirect URL is always unicode under python 2
- introduces support for experimental HTTP response status code
308 Permanent Redirect [2] used by some sites [3]
1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
3. https://github.com/ytdl-org/youtube-dl/issues/28768
"""
http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
def redirect_request(self, req, fp, code, msg, headers, newurl):
"""Return a Request or None in response to a redirect.
This is called by the http_error_30x methods when a
redirection response is received. If a redirection should
take place, return a new Request to allow http_error_30x to
perform the redirect. Otherwise, raise HTTPError if no-one
else should try to handle this url. Return None if you can't
but another Handler might.
"""
m = req.get_method()
if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
or code in (301, 302, 303) and m == "POST")):
raise compat_HTTPError(req.full_url, code, msg, headers, fp)
# Strictly (according to RFC 2616), 301 or 302 in response to
# a POST MUST NOT cause a redirection without confirmation
# from the user (of urllib.request, in this case). In practice,
# essentially all clients do redirect in this case, so we do
# the same.
# On python 2 urlh.geturl() may sometimes return redirect URL
# as byte string instead of unicode. This workaround allows
# to force it always return unicode.
if sys.version_info[0] < 3:
newurl = compat_str(newurl)
# Be conciliant with URIs containing a space. This is mainly
# redundant with the more complete encoding done in http_error_302(),
# but it is kept for compatibility with other callers.
newurl = newurl.replace(' ', '%20')
CONTENT_HEADERS = ("content-length", "content-type")
# NB: don't use dict comprehension for python 2.6 compatibility
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in CONTENT_HEADERS)
return compat_urllib_request.Request(
newurl, headers=newheaders, origin_req_host=req.origin_req_host,
unverifiable=True)
def extract_timezone(date_str): def extract_timezone(date_str):