import base64
import functools
import itertools
import math
import urllib.error
import urllib.parse

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    ExtractorError,
    GeoRestrictedError,
    InAdvancePagedList,
    OnDemandPagedList,
    filter_dict,
    float_or_none,
    format_field,
    int_or_none,
    make_archive_id,
    mimetype2ext,
    parse_count,
    parse_qs,
    qualities,
    srt_subtitles_timecode,
    str_or_none,
    traverse_obj,
    url_or_none,
    urlencode_postdata,
)


class BilibiliBaseIE(InfoExtractor):
    """Shared helpers for the Bilibili extractors below (formats, subtitles, comments, metadata)."""

    def extract_formats(self, play_info):
        """Build yt-dlp format dicts from a Bilibili ``playinfo`` DASH payload."""
        # Map quality id -> human-readable name (e.g. 1080P) for labelling video formats.
        format_names = {
            r['quality']: traverse_obj(r, 'new_description', 'display_desc')
            for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
        }

        audios = traverse_obj(play_info, ('dash', 'audio', ...))
        # Lossless (FLAC) audio, when present, lives in a separate branch of the payload.
        flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
        if flac_audio:
            audios.append(flac_audio)
        formats = [{
            'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
            'acodec': audio.get('codecs'),
            'vcodec': 'none',
            'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
            'filesize': int_or_none(audio.get('size'))
        } for audio in audios]

        formats.extend({
            'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'),
            'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')),
            'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')),
            'width': int_or_none(video.get('width')),
            'height': int_or_none(video.get('height')),
            'vcodec': video.get('codecs'),
            # Video-only streams when separate audio exists; otherwise codec unknown.
            'acodec': 'none' if audios else None,
            'tbr': float_or_none(video.get('bandwidth'), scale=1000),
            'filesize': int_or_none(video.get('size')),
            'quality': int_or_none(video.get('id')),
            'format': format_names.get(video.get('id')),
        } for video in traverse_obj(play_info, ('dash', 'video', ...)))

        # Qualities advertised in support_formats but absent from the DASH list
        # are login/premium-gated; tell the user instead of failing silently.
        missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality')))
        if missing_formats:
            self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; '
                           'you have to login or become premium member to download them')

        self._sort_formats(formats)
        return formats

    def json2srt(self, json_data):
        """Convert Bilibili's JSON subtitle format to SRT text."""
        srt_data = ''
        for idx, line in enumerate(json_data.get('body') or []):
            srt_data += (f'{idx + 1}\n'
                         f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n'
                         f'{line["content"]}\n\n')
        return srt_data

    def _get_subtitles(self, video_id, initial_state, cid):
        """Return subtitles: the danmaku (comment overlay) XML plus any per-language SRT tracks."""
        subtitles = {
            'danmaku': [{
                'ext': 'xml',
                'url': f'https://comment.bilibili.com/{cid}.xml',
            }]
        }
        for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []:
            subtitles.setdefault(s['lan'], []).append({
                'ext': 'srt',
                'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
            })
        return subtitles

    def _get_comments(self, aid):
        """Yield flattened comment dicts, paging through the reply API until it runs dry."""
        for idx in itertools.count(1):
            replies = traverse_obj(
                self._download_json(
                    f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
                    aid, note=f'Extracting comments from page {idx}', fatal=False),
                ('data', 'replies'))
            if not replies:
                return
            for children in map(self._get_all_children, replies):
                yield from children

    def _get_all_children(self, reply):
        """Yield a comment followed by all of its nested replies, depth-first."""
        yield {
            'author': traverse_obj(reply, ('member', 'uname')),
            'author_id': traverse_obj(reply, ('member', 'mid')),
            'id': reply.get('rpid'),
            'text': traverse_obj(reply, ('content', 'message')),
            'timestamp': reply.get('ctime'),
            'parent': reply.get('parent') or 'root',
        }
        for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))):
            yield from children

    def extract_common_info(self, video_id, initial_state, play_info, aid, cid):
        """Build the metadata fields shared by video and bangumi pages from __INITIAL_STATE__."""
        season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id'))
        # Season number = 1-based position of season_id within mediaInfo.seasons.
        season_number = season_id and next((
            idx + 1 for idx, e in enumerate(
                traverse_obj(initial_state, ('mediaInfo', 'seasons', ...)))
            if e.get('season_id') == season_id
        ), None)

        return {
            'title': traverse_obj(initial_state, 'h1Title'),
            'description': traverse_obj(initial_state, ('videoData', 'desc')),
            'duration': float_or_none(play_info.get('timelength'), scale=1000),
            'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')),
            'uploader': traverse_obj(initial_state, ('upData', 'name')),
            'uploader_id': traverse_obj(initial_state, ('upData', 'mid')),
            'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')),
            'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')),
            'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')) or None,
            'thumbnail': traverse_obj(
                initial_state, ('videoData', 'pic'), ('epInfo', 'cover')),
            'timestamp': traverse_obj(
                initial_state, ('videoData', 'pubdate'), ('epInfo', 'pub_time')),
            'episode': traverse_obj(initial_state, ('epInfo', 'long_title')),
            'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))),
            'series': traverse_obj(initial_state, ('mediaInfo', 'series')),
            'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')),
            'season_id': season_id,
            'season_number': season_number,
            'subtitles': self.extract_subtitles(video_id, initial_state, cid),
            '__post_extractor': self.extract_comments(aid),
        }


class BiliBiliIE(BilibiliBaseIE):
    # NOTE(review): the named groups in every _VALID_URL below had their angle
    # brackets stripped in the damaged source ("(?P[^/?#&]+)"); restored as
    # (?P<id>...) which _match_id requires.
    _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'https://www.bilibili.com/video/BV13x41117TL',
        'info_dict': {
            'id': 'BV13x41117TL',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'ext': 'mp4',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'upload_date': '20170301',
            'timestamp': 1488353834,
            'like_count': int,
            'view_count': int,
        },
    }, {
        # old av URL version
        'url': 'http://www.bilibili.com/video/av1074402/',
        'info_dict': {
            'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
            'ext': 'mp4',
            'uploader': '菊子桑',
            'uploader_id': '156160',
            'id': 'BV11x411K7CN',
            'title': '【金坷垃】金泡沫',
            'duration': 308.36,
            'upload_date': '20140420',
            'timestamp': 1397983878,
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'like_count': int,
            'comment_count': int,
            'view_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'note': 'Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797',
        'info_dict': {
            'id': 'BV1bK411W797',
            'title': '物语中的人物是如何吐槽自己的OP的'
        },
        'playlist_count': 18,
        'playlist': [{
            'info_dict': {
                'id': 'BV1bK411W797_p1',
                'ext': 'mp4',
                'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
                'tags': 'count:11',
                'timestamp': 1589601697,
                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
                'uploader': '打牌还是打桩',
                'uploader_id': '150259984',
                'like_count': int,
                'comment_count': int,
                'upload_date': '20200516',
                'view_count': int,
                'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
                'duration': 90.314,
            }
        }]
    }, {
        'note': 'Specific page of Anthology',
        'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1',
        'info_dict': {
            'id': 'BV1bK411W797_p1',
            'ext': 'mp4',
            'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
            'tags': 'count:11',
            'timestamp': 1589601697,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'uploader': '打牌还是打桩',
            'uploader_id': '150259984',
            'like_count': int,
            'comment_count': int,
            'upload_date': '20200516',
            'view_count': int,
            'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
            'duration': 90.314,
        }
    }, {
        'note': 'video has subtitles',
        'url': 'https://www.bilibili.com/video/BV12N4y1M7rh',
        'info_dict': {
            'id': 'BV12N4y1M7rh',
            'ext': 'mp4',
            'title': '游戏帧数增加40%?下代联发科天玑芯片或将支持光线追踪!从Immortalis-G715看下代联发科SoC的GPU表现 | Arm: 可以不用咬打火机了!',
            'tags': list,
            'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
            'duration': 313.557,
            'upload_date': '20220709',
            'uploader': '小夫Tech',
            'timestamp': 1657347907,
            'uploader_id': '1326814124',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'subtitles': 'count:2'
        },
        'params': {'listsubtitles': True},
    }, {
        'url': 'https://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': 'BV13x41117TL',
            'ext': 'mp4',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'upload_date': '20170301',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
            'timestamp': 1488353834,
            'uploader_id': '65880958',
            'uploader': '阿滴英文',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
            'duration': 554.117,
            'tags': list,
            'comment_count': int,
            'view_count': int,
            'like_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        initial_state = self._search_json(r'window.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
        play_info = self._search_json(r'window.__playinfo__\s*=', webpage, 'play info', video_id)['data']

        video_data = initial_state['videoData']
        video_id, title = video_data['bvid'], video_data.get('title')

        # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
        page_list_json = traverse_obj(
            self._download_json(
                'https://api.bilibili.com/x/player/pagelist', video_id,
                fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
                note='Extracting videos in anthology'),
            'data', expected_type=list) or []
        is_anthology = len(page_list_json) > 1

        part_id = int_or_none(parse_qs(url).get('p', [None])[-1])
        if is_anthology and not part_id and self._yes_playlist(video_id, video_id):
            return self.playlist_from_matches(
                page_list_json, video_id, title, ie=BiliBiliIE,
                getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}')

        if is_anthology:
            # FIX: part_id may be None here (anthology URL without ?p= and
            # --no-playlist); formatting None with :02d raises TypeError.
            # Default to part 1 for the title, matching the cid fallback below.
            title += f' p{part_id or 1:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}'

        aid = video_data.get('aid')
        old_video_id = format_field(aid, None, f'%s_part{part_id or 1}')

        return {
            'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
            'formats': self.extract_formats(play_info),
            '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
            'http_headers': {'Referer': url},
            **self.extract_common_info(video_id, initial_state, play_info, aid, cid=(
                traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id
                else video_data.get('cid'))),
            'title': title,
        }


class BiliBiliBangumiIE(BilibiliBaseIE):
    _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)'

    _TESTS = [{
        'url': 'https://www.bilibili.com/bangumi/play/ss897',
        'info_dict': {
            'id': 'ss897',
            'ext': 'mp4',
            'series': '神的记事本',
            'season': '神的记事本',
            'season_id': 897,
            'season_number': 1,
            'episode': '你与旅行包',
            'episode_number': 2,
            'title': '神的记事本:第2话 你与旅行包',
            'duration': 1428.487,
            'timestamp': 1310809380,
            'upload_date': '20110716',
            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
        },
    }, {
        'url': 'https://www.bilibili.com/bangumi/play/ep508406',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        if '您所在的地区无法观看本片' in webpage:
            raise GeoRestrictedError('This video is restricted')
        # Note precedence: (premium-banner AND no playinfo) OR preview-banner.
        elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage
                or '正在观看预览,大会员免费看全片' in webpage):
            self.raise_login_required('This video is for premium members only')

        play_info = self._search_json(r'window.__playinfo__\s*=\s*', webpage, 'play info', video_id)['data']
        formats = self.extract_formats(play_info)
        # A durl-only payload without dash is the teaser served to non-members.
        if (not formats and '成为大会员抢先看' in webpage
                and play_info.get('durl') and not play_info.get('dash')):
            self.raise_login_required('This video is for premium members only')

        initial_state = self._search_json(r'window.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)

        return {
            'id': video_id,
            'formats': formats,
            'http_headers': {'Referer': url, **self.geo_verification_headers()},
            **self.extract_common_info(
                video_id, initial_state, play_info,
                aid=traverse_obj(initial_state, ('epInfo', 'aid')),
                cid=traverse_obj(initial_state, ('epInfo', 'cid')))
        }


class BiliBiliBangumiMediaIE(InfoExtractor):
    _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.bilibili.com/bangumi/media/md24097891',
        'info_dict': {
            'id': '24097891',
        },
        'playlist_mincount': 25,
    }]

    def _real_extract(self, url):
        """Expand a media (md) page into a playlist of its main-section episodes."""
        media_id = self._match_id(url)
        webpage = self._download_webpage(url, media_id)

        initial_state = self._search_json(r'window.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
        episode_list = self._download_json(
            'https://api.bilibili.com/pgc/web/season/section', media_id,
            query={'season_id': initial_state['mediaInfo']['season_id']},
            note='Downloading season info')['result']['main_section']['episodes']

        return self.playlist_result((
            self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid'])
            for entry in episode_list), media_id)


class BilibiliSpaceBaseIE(InfoExtractor):
    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
        """Fetch page 0 eagerly for its metadata, then lazily page the rest.

        Returns (metadata, InAdvancePagedList); the first page is reused so it
        is only downloaded once.
        """
        first_page = fetch_page(0)
        metadata = get_metadata(first_page)

        paged_list = InAdvancePagedList(
            lambda idx: get_entries(fetch_page(idx) if idx else first_page),
            metadata['page_count'], metadata['page_size'])

        return metadata, paged_list


class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
    # NOTE(review): the source was truncated mid-pattern here; tail restored
    # per upstream yt-dlp — confirm against the rest of the file.
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)'