From 277d6ff5f2bd4f142429def30d01df264eb7c922 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 28 Feb 2021 20:26:08 +0530 Subject: [PATCH] Extract comments only when needed #95 (Closes #94) --- README.md | 8 +++++--- yt_dlp/YoutubeDL.py | 21 +++++++++++++++++++++ yt_dlp/extractor/bilibili.py | 15 +++++++++------ yt_dlp/extractor/common.py | 8 ++++++++ yt_dlp/extractor/youtube.py | 16 ++++++++++------ yt_dlp/options.py | 6 ++++-- 6 files changed, 57 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 8def97e4c1..4501ba426b 100644 --- a/README.md +++ b/README.md @@ -245,7 +245,7 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t "OUTPUT TEMPLATE" for a list of available keys) to match if the key is present, !key to check if the key is not present, - key>NUMBER (like "comment_count > 12", also + key>NUMBER (like "view_count > 12", also works with >=, <, <=, !=, =) to compare against a number, key = 'LITERAL' (like "uploader = 'Mike Smith'", also works with @@ -403,7 +403,9 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t --no-write-playlist-metafiles Do not write playlist metadata when using --write-info-json, --write-description etc. --get-comments Retrieve video comments to be placed in the - .info.json file + .info.json file. 
The comments are fetched + even without this option if the extraction + is known to be quick --load-info-json FILE JSON file containing the video information (created with the "--write-info-json" option) @@ -814,7 +816,7 @@ The available fields are: - `dislike_count` (numeric): Number of negative ratings of the video - `repost_count` (numeric): Number of reposts of the video - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage - - `comment_count` (numeric): Number of comments on the video + - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used) - `age_limit` (numeric): Age restriction for the video (years) - `is_live` (boolean): Whether this video is a live stream or a fixed-length video - `was_live` (boolean): Whether this video was originally a live stream diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3c53f4cd88..e9cb7e1876 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2041,6 +2041,7 @@ class YoutubeDL(object): self.to_stdout(formatSeconds(info_dict['duration'])) print_mandatory('format') if self.params.get('forcejson', False): + self.post_extract(info_dict) self.to_stdout(json.dumps(info_dict)) def process_info(self, info_dict): @@ -2064,6 +2065,7 @@ class YoutubeDL(object): if self._match_entry(info_dict, incomplete=False) is not None: return + self.post_extract(info_dict) self._num_downloads += 1 info_dict = self.pre_process(info_dict) @@ -2497,6 +2499,7 @@ class YoutubeDL(object): raise else: if self.params.get('dump_single_json', False): + self.post_extract(res) self.to_stdout(json.dumps(res)) return self._download_retcode @@ -2545,6 +2548,24 @@ class YoutubeDL(object): del files_to_move[old_filename] return files_to_move, infodict + @staticmethod + def post_extract(info_dict): + def actual_post_extract(info_dict): + if info_dict.get('_type') in ('playlist', 'multi_video'): + for 
video_dict in info_dict.get('entries', {}): + actual_post_extract(video_dict) + return + + if '__post_extractor' not in info_dict: + return + post_extractor = info_dict['__post_extractor'] + if post_extractor: + info_dict.update(post_extractor().items()) + del info_dict['__post_extractor'] + return + + actual_post_extract(info_dict) + def pre_process(self, ie_info): info = dict(ie_info) for pp in self._pps['beforedl']: diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index be117a2bb4..764ac4d3c9 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -255,10 +255,6 @@ class BiliBiliIE(InfoExtractor): info['uploader'] = self._html_search_meta( 'author', webpage, 'uploader', default=None) - comments = None - if self._downloader.params.get('getcomments', False): - comments = self._get_all_comment_pages(video_id) - raw_danmaku = self._get_raw_danmaku(video_id, cid) raw_tags = self._get_tags(video_id) @@ -266,11 +262,18 @@ class BiliBiliIE(InfoExtractor): top_level_info = { 'raw_danmaku': raw_danmaku, - 'comments': comments, - 'comment_count': len(comments) if comments is not None else None, 'tags': tags, 'raw_tags': raw_tags, } + if self._downloader.params.get('getcomments', False): + def get_comments(): + comments = self._get_all_comment_pages(video_id) + return { + 'comments': comments, + 'comment_count': len(comments) + } + + top_level_info['__post_extractor'] = get_comments ''' # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 47b91a00a7..3326d436bb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -294,6 +294,14 @@ class InfoExtractor(object): players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string specifying the criteria for embedability (Eg: 'whitelist'). 
+ __post_extractor: A function to be called just before the metadata is + written to either disk, logger or console. The function + must return a dict which will be added to the info_dict. + This is useful for additional information that is + time-consuming to extract. Note that the fields thus + extracted will not be available to output template and + match_filter. So, only "comments" and "comment_count" are + currently allowed to be extracted via this method. The following fields should only be used when the video belongs to some logical chapter or section: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 41b894776d..804186b851 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2012,9 +2012,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get comments # TODO: Refactor and move to seperate function - if get_comments: + def extract_comments(): expected_video_comment_count = 0 video_comments = [] + comment_xsrf = xsrf_token def find_value(html, key, num_chars=2, separator='"'): pos_begin = html.find(key) + len(key) + num_chars @@ -2083,7 +2084,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen('Downloading comments') while continuations: continuation = continuations.pop() - comment_response = get_continuation(continuation, xsrf_token) + comment_response = get_continuation(continuation, comment_xsrf) if not comment_response: continue if list(search_dict(comment_response, 'externalErrorMessage')): @@ -2094,7 +2095,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue # not sure if this actually helps if 'xsrf_token' in comment_response: - xsrf_token = comment_response['xsrf_token'] + comment_xsrf = comment_response['xsrf_token'] item_section = comment_response['response']['continuationContents']['itemSectionContinuation'] if first_continuation: @@ -2123,7 +2124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): while reply_continuations: time.sleep(1) continuation = reply_continuations.pop() - 
replies_data = get_continuation(continuation, xsrf_token, True) + replies_data = get_continuation(continuation, comment_xsrf, True) if not replies_data or 'continuationContents' not in replies_data[1]['response']: continue @@ -2152,10 +2153,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): time.sleep(1) self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count)) - info.update({ + return { 'comments': video_comments, 'comment_count': expected_video_comment_count - }) + } + + if get_comments: + info['__post_extractor'] = extract_comments self.mark_watched(video_id, player_response) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index ae11e6b8bc..2694990228 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -347,7 +347,7 @@ def parseOpts(overrideArguments=None): 'Specify any key (see "OUTPUT TEMPLATE" for a list of available keys) to ' 'match if the key is present, ' '!key to check if the key is not present, ' - 'key>NUMBER (like "comment_count > 12", also works with ' + 'key>NUMBER (like "view_count > 12", also works with ' '>=, <, <=, !=, =) to compare against a number, ' 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' 'to match against a string literal ' @@ -985,7 +985,9 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '--get-comments', action='store_true', dest='getcomments', default=False, - help='Retrieve video comments to be placed in the .info.json file') + help=( + 'Retrieve video comments to be placed in the .info.json file. ' + 'The comments are fetched even without this option if the extraction is known to be quick')) filesystem.add_option( '--load-info-json', '--load-info', dest='load_info_filename', metavar='FILE',