Reject entire playlists faster with --match-filter

Rejection based on `playlist_id` etc. can be checked before any entries are extracted

Related: #4383
This commit is contained in:
pukkandan 2022-07-26 09:28:37 +05:30
parent 7d0f6f0c45
commit 3bec830a59
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
3 changed files with 39 additions and 32 deletions

View file

@ -1309,7 +1309,7 @@ class YoutubeDL:
def _match_entry(self, info_dict, incomplete=False, silent=False): def _match_entry(self, info_dict, incomplete=False, silent=False):
""" Returns None if the file should be downloaded """ """ Returns None if the file should be downloaded """
video_title = info_dict.get('title', info_dict.get('id', 'video')) video_title = info_dict.get('title', info_dict.get('id', 'entry'))
def check_filter(): def check_filter():
if 'title' in info_dict: if 'title' in info_dict:
@ -1677,23 +1677,37 @@ class YoutubeDL:
return make_dir(path, self.report_error) return make_dir(path, self.report_error)
@staticmethod @staticmethod
def _playlist_infodict(ie_result, **kwargs): def _playlist_infodict(ie_result, strict=False, **kwargs):
return { info = {
**ie_result, 'playlist_count': ie_result.get('playlist_count'),
'playlist': ie_result.get('title') or ie_result.get('id'), 'playlist': ie_result.get('title') or ie_result.get('id'),
'playlist_id': ie_result.get('id'), 'playlist_id': ie_result.get('id'),
'playlist_title': ie_result.get('title'), 'playlist_title': ie_result.get('title'),
'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader': ie_result.get('uploader'),
'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_uploader_id': ie_result.get('uploader_id'),
'playlist_index': 0,
**kwargs, **kwargs,
} }
if strict:
return info
return {
**info,
'playlist_index': 0,
'__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
'webpage_url_basename': url_basename(ie_result['webpage_url']),
'webpage_url_domain': get_domain(ie_result['webpage_url']),
'extractor_key': ie_result['extractor_key'],
}
def __process_playlist(self, ie_result, download): def __process_playlist(self, ie_result, download):
"""Process each entry in the playlist""" """Process each entry in the playlist"""
assert ie_result['_type'] in ('playlist', 'multi_video') assert ie_result['_type'] in ('playlist', 'multi_video')
title = ie_result.get('title') or ie_result.get('id') or '<Untitled>' common_info = self._playlist_infodict(ie_result, strict=True)
title = common_info.get('title') or '<Untitled>'
if self._match_entry(common_info, incomplete=True) is not None:
return
self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}') self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
all_entries = PlaylistEntries(self, ie_result) all_entries = PlaylistEntries(self, ie_result)
@ -1711,12 +1725,14 @@ class YoutubeDL:
# Better to do this after potentially exhausting entries # Better to do this after potentially exhausting entries
ie_result['playlist_count'] = all_entries.get_full_count() ie_result['playlist_count'] = all_entries.get_full_count()
common_info = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
ie_copy = collections.ChainMap(ie_result, common_info)
_infojson_written = False _infojson_written = False
write_playlist_files = self.params.get('allow_playlist_files', True) write_playlist_files = self.params.get('allow_playlist_files', True)
if write_playlist_files and self.params.get('list_thumbnails'): if write_playlist_files and self.params.get('list_thumbnails'):
self.list_thumbnails(ie_result) self.list_thumbnails(ie_result)
if write_playlist_files and not self.params.get('simulate'): if write_playlist_files and not self.params.get('simulate'):
ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
_infojson_written = self._write_info_json( _infojson_written = self._write_info_json(
'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
if _infojson_written is None: if _infojson_written is None:
@ -1725,7 +1741,7 @@ class YoutubeDL:
self.prepare_filename(ie_copy, 'pl_description')) is None: self.prepare_filename(ie_copy, 'pl_description')) is None:
return return
# TODO: This should be passed to ThumbnailsConvertor if necessary # TODO: This should be passed to ThumbnailsConvertor if necessary
self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
if lazy: if lazy:
if self.params.get('playlistreverse') or self.params.get('playlistrandom'): if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
@ -1749,35 +1765,26 @@ class YoutubeDL:
for i, (playlist_index, entry) in enumerate(entries): for i, (playlist_index, entry) in enumerate(entries):
if lazy: if lazy:
resolved_entries.append((playlist_index, entry)) resolved_entries.append((playlist_index, entry))
if not entry:
# TODO: Add auto-generated fields
if not entry or self._match_entry(entry, incomplete=True) is not None:
continue continue
self.to_screen('[download] Downloading video %s of %s' % (
self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip') entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
if not lazy and 'playlist-index' in self.params.get('compat_opts', []): if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
playlist_index = ie_result['requested_entries'][i] playlist_index = ie_result['requested_entries'][i]
entry_result = self.__process_iterable_entry(entry, download, { extra = {
'n_entries': int_or_none(n_entries), **common_info,
'__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
'playlist_count': ie_result.get('playlist_count'),
'playlist_index': playlist_index, 'playlist_index': playlist_index,
'playlist_autonumber': i + 1, 'playlist_autonumber': i + 1,
'playlist': title, }
'playlist_id': ie_result.get('id'),
'playlist_title': ie_result.get('title'), if self._match_entry(collections.ChainMap(entry, extra), incomplete=True) is not None:
'playlist_uploader': ie_result.get('uploader'), continue
'playlist_uploader_id': ie_result.get('uploader_id'),
'extractor': ie_result['extractor'], self.to_screen('[download] Downloading video %s of %s' % (
'webpage_url': ie_result['webpage_url'], self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
'webpage_url_basename': url_basename(ie_result['webpage_url']),
'webpage_url_domain': get_domain(ie_result['webpage_url']), entry_result = self.__process_iterable_entry(entry, download, extra)
'extractor_key': ie_result['extractor_key'],
})
if not entry_result: if not entry_result:
failures += 1 failures += 1
if failures >= max_failures: if failures >= max_failures:

View file

@ -1149,9 +1149,9 @@ class FFmpegConcatPP(FFmpegPostProcessor):
if len(in_files) < len(entries): if len(in_files) < len(entries):
raise PostProcessingError('Aborting concatenation because some downloads failed') raise PostProcessingError('Aborting concatenation because some downloads failed')
ie_copy = self._downloader._playlist_infodict(info)
exts = traverse_obj(entries, (..., 'requested_downloads', 0, 'ext'), (..., 'ext')) exts = traverse_obj(entries, (..., 'requested_downloads', 0, 'ext'), (..., 'ext'))
ie_copy['ext'] = exts[0] if len(set(exts)) == 1 else 'mkv' ie_copy = collections.ChainMap({'ext': exts[0] if len(set(exts)) == 1 else 'mkv'},
info, self._downloader._playlist_infodict(info))
out_file = self._downloader.prepare_filename(ie_copy, 'pl_video') out_file = self._downloader.prepare_filename(ie_copy, 'pl_video')
files_to_delete = self.concat_files(in_files, out_file) files_to_delete = self.concat_files(in_files, out_file)

View file

@ -3666,7 +3666,7 @@ def match_filter_func(filters):
if not filters or any(match_str(f, info_dict, incomplete) for f in filters): if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
return NO_DEFAULT if interactive and not incomplete else None return NO_DEFAULT if interactive and not incomplete else None
else: else:
video_title = info_dict.get('title') or info_dict.get('id') or 'video' video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
filter_str = ') | ('.join(map(str.strip, filters)) filter_str = ') | ('.join(map(str.strip, filters))
return f'{video_title} does not pass filter ({filter_str}), skipping ..' return f'{video_title} does not pass filter ({filter_str}), skipping ..'
return _match_func return _match_func