From f5f15c9993cf8087753a7ba2b57fee55e366b80e Mon Sep 17 00:00:00 2001 From: Luc Ritchie Date: Sun, 27 Mar 2022 23:21:42 -0400 Subject: [PATCH] [BiliIntl] Support user-generated videos (#3203) Authored by: wlritchi --- yt_dlp/extractor/bilibili.py | 67 ++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index b4eb20642b..dd1ff512eb 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -15,6 +15,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + filter_dict, int_or_none, float_or_none, mimetype2ext, @@ -755,15 +756,21 @@ class BiliIntlBaseIE(InfoExtractor): for i, line in enumerate(json['body']) if line.get('content')) return data - def _get_subtitles(self, ep_id): - sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id) + def _get_subtitles(self, *, ep_id=None, aid=None): + sub_json = self._call_api( + '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', + errnote='Unable to download subtitles list', query=filter_dict({ + 'platform': 'web', + 'episode_id': ep_id, + 'aid': aid, + })) subtitles = {} for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') if not sub_url: continue sub_data = self._download_json( - sub_url, ep_id, errnote='Unable to download subtitles', fatal=False, + sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') if not sub_data: continue @@ -773,9 +780,14 @@ class BiliIntlBaseIE(InfoExtractor): }) return subtitles - def _get_formats(self, ep_id): - video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id, - note='Downloading video formats', errnote='Unable to download video formats') + def _get_formats(self, *, ep_id=None, aid=None): + video_json = self._call_api( + '/web/playurl', ep_id or aid, 
note='Downloading video formats', + errnote='Unable to download video formats', query=filter_dict({ + 'platform': 'web', + 'ep_id': ep_id, + 'aid': aid, + })) video_json = video_json['playurl'] formats = [] for vid in video_json.get('video') or []: @@ -809,15 +821,15 @@ class BiliIntlBaseIE(InfoExtractor): self._sort_formats(formats) return formats - def _extract_ep_info(self, episode_data, ep_id): + def _extract_video_info(self, video_data, *, ep_id=None, aid=None): return { - 'id': ep_id, - 'title': episode_data.get('title_display') or episode_data['title'], - 'thumbnail': episode_data.get('cover'), + 'id': ep_id or aid, + 'title': video_data.get('title_display') or video_data.get('title'), + 'thumbnail': video_data.get('cover'), 'episode_number': int_or_none(self._search_regex( - r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)), - 'formats': self._get_formats(ep_id), - 'subtitles': self._get_subtitles(ep_id), + r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), 'extractor_key': BiliIntlIE.ie_key(), } @@ -854,7 +866,7 @@ class BiliIntlBaseIE(InfoExtractor): class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' _TESTS = [{ # Bstation page 'url': 'https://www.bilibili.tv/en/play/34613/341736', @@ -889,24 +901,35 @@ class BiliIntlIE(BiliIntlBaseIE): }, { 'url': 'https://www.biliintl.com/en/play/34613/341736', 'only_matching': True, + }, { + # User-generated content (as opposed to a series licensed from a studio) + 'url': 'https://bilibili.tv/en/video/2019955076', + 'only_matching': True, + }, { + # No language in URL + 'url': 'https://www.bilibili.tv/video/2019955076', + 
'only_matching': True, }] def _real_extract(self, url): - season_id, video_id = self._match_valid_url(url).groups() + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = ep_id or aid webpage = self._download_webpage(url, video_id) # Bstation layout initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, 'preload state', default='{}'), video_id, fatal=False) or {} - episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + video_data = ( + traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) - if not episode_data: + if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - episode_data = next( + video_data = next( episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) - if str(episode.get('episode_id')) == video_id) - return self._extract_ep_info(episode_data, video_id) + if str(episode.get('episode_id')) == ep_id) + return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): @@ -934,7 +957,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): episode_id = str(episode.get('episode_id')) - yield self._extract_ep_info(episode, episode_id) + yield self._extract_video_info(episode, ep_id=episode_id) def _real_extract(self, url): series_id = self._match_id(url)