Add option --parse-metadata

* The fields extracted by this can be used in `--output`
* Deprecated `--metadata-from-title`

:ci skip dl
This commit is contained in:
pukkandan 2021-01-26 15:50:20 +05:30
parent 9882064024
commit 5bfa486205
8 changed files with 162 additions and 110 deletions

View file

@ -610,16 +610,19 @@ Then simply type this
--no-embed-thumbnail Do not embed thumbnail (default) --no-embed-thumbnail Do not embed thumbnail (default)
--add-metadata Write metadata to the video file --add-metadata Write metadata to the video file
--no-add-metadata Do not write metadata (default) --no-add-metadata Do not write metadata (default)
--metadata-from-title FORMAT Parse additional metadata like song title / --parse-metadata FIELD:FORMAT Parse additional metadata like title/artist
artist from the video title. The format from other fields. Give field name to
syntax is the same as --output. Regular extract data from, and format of the field
expression with named capture groups may separated by a ":". The format syntax is
also be used. The parsed parameters replace the same as --output. Regular expression
existing values. Example: --metadata-from- with named capture groups may also be used.
title "%(artist)s - %(title)s" matches a The parsed parameters replace existing
values. This option can be used multiple
times. Example: --parse-metadata
"title:%(artist)s - %(title)s" matches a
title like "Coldplay - Paradise". Example title like "Coldplay - Paradise". Example
(regex): --metadata-from-title (regex): --parse-metadata
"(?P<artist>.+?) - (?P<title>.+)" "description:Artist - (?P<artist>.+?)"
--xattrs Write metadata to the video file's xattrs --xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards) (using dublin core and xdg standards)
--fixup POLICY Automatically correct known faults of the --fixup POLICY Automatically correct known faults of the
@ -1098,7 +1101,7 @@ $ youtube-dlc -S '+res:480,codec,br'
Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example. Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`) **Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
# MORE # MORE
For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl) For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)

View file

@ -8,10 +8,16 @@ import sys
import unittest import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dlc.postprocessor import MetadataFromTitlePP from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
class TestMetadataFromField(unittest.TestCase):
def test_format_to_regex(self):
pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
class TestMetadataFromTitle(unittest.TestCase): class TestMetadataFromTitle(unittest.TestCase):
def test_format_to_regex(self): def test_format_to_regex(self):
pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')

View file

@ -375,8 +375,7 @@ class YoutubeDL(object):
params = None params = None
_ies = [] _ies = []
_pps = [] _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
_pps_end = []
__prepare_filename_warned = False __prepare_filename_warned = False
_download_retcode = None _download_retcode = None
_num_downloads = None _num_downloads = None
@ -390,8 +389,7 @@ class YoutubeDL(object):
params = {} params = {}
self._ies = [] self._ies = []
self._ies_instances = {} self._ies_instances = {}
self._pps = [] self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
self._pps_end = []
self.__prepare_filename_warned = False self.__prepare_filename_warned = False
self._post_hooks = [] self._post_hooks = []
self._progress_hooks = [] self._progress_hooks = []
@ -494,11 +492,13 @@ class YoutubeDL(object):
pp_class = get_postprocessor(pp_def_raw['key']) pp_class = get_postprocessor(pp_def_raw['key'])
pp_def = dict(pp_def_raw) pp_def = dict(pp_def_raw)
del pp_def['key'] del pp_def['key']
after_move = pp_def.get('_after_move', False) if 'when' in pp_def:
if '_after_move' in pp_def: when = pp_def['when']
del pp_def['_after_move'] del pp_def['when']
else:
when = 'normal'
pp = pp_class(self, **compat_kwargs(pp_def)) pp = pp_class(self, **compat_kwargs(pp_def))
self.add_post_processor(pp, after_move=after_move) self.add_post_processor(pp, when=when)
for ph in self.params.get('post_hooks', []): for ph in self.params.get('post_hooks', []):
self.add_post_hook(ph) self.add_post_hook(ph)
@ -550,12 +550,9 @@ class YoutubeDL(object):
for ie in gen_extractor_classes(): for ie in gen_extractor_classes():
self.add_info_extractor(ie) self.add_info_extractor(ie)
def add_post_processor(self, pp, after_move=False): def add_post_processor(self, pp, when='normal'):
"""Add a PostProcessor object to the end of the chain.""" """Add a PostProcessor object to the end of the chain."""
if after_move: self._pps[when].append(pp)
self._pps_end.append(pp)
else:
self._pps.append(pp)
pp.set_downloader(self) pp.set_downloader(self)
def add_post_hook(self, ph): def add_post_hook(self, ph):
@ -1948,6 +1945,8 @@ class YoutubeDL(object):
self._num_downloads += 1 self._num_downloads += 1
info_dict = self.pre_process(info_dict)
filename = self.prepare_filename(info_dict, warn=True) filename = self.prepare_filename(info_dict, warn=True)
info_dict['_filename'] = full_filename = self.prepare_filepath(filename) info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
temp_filename = self.prepare_filepath(filename, 'temp') temp_filename = self.prepare_filepath(filename, 'temp')
@ -2400,20 +2399,14 @@ class YoutubeDL(object):
(k, v) for k, v in info_dict.items() (k, v) for k, v in info_dict.items()
if k not in ['requested_formats', 'requested_subtitles']) if k not in ['requested_formats', 'requested_subtitles'])
def post_process(self, filename, ie_info, files_to_move={}): def run_pp(self, pp, infodict, files_to_move={}):
"""Run all the postprocessors on the given file."""
info = dict(ie_info)
info['filepath'] = filename
def run_pp(pp):
files_to_delete = [] files_to_delete = []
infodict = info
try: try:
files_to_delete, infodict = pp.run(infodict) files_to_delete, infodict = pp.run(infodict)
except PostProcessingError as e: except PostProcessingError as e:
self.report_error(e.msg) self.report_error(e.msg)
if not files_to_delete: if not files_to_delete:
return infodict return files_to_move, infodict
if self.params.get('keepvideo', False): if self.params.get('keepvideo', False):
for f in files_to_delete: for f in files_to_delete:
@ -2427,14 +2420,24 @@ class YoutubeDL(object):
self.report_warning('Unable to remove downloaded original file') self.report_warning('Unable to remove downloaded original file')
if old_filename in files_to_move: if old_filename in files_to_move:
del files_to_move[old_filename] del files_to_move[old_filename]
return infodict return files_to_move, infodict
for pp in ie_info.get('__postprocessors', []) + self._pps: def pre_process(self, ie_info):
info = run_pp(pp) info = dict(ie_info)
info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move)) for pp in self._pps['beforedl']:
files_to_move = {} info = self.run_pp(pp, info)[1]
for pp in self._pps_end: return info
info = run_pp(pp)
def post_process(self, filename, ie_info, files_to_move={}):
"""Run all the postprocessors on the given file."""
info = dict(ie_info)
info['filepath'] = filename
for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
files_to_move, info = self.run_pp(pp, info, files_to_move)
info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
for pp in self._pps['aftermove']:
files_to_move, info = self.run_pp(pp, info, {})
def _make_archive_id(self, info_dict): def _make_archive_id(self, info_dict):
video_id = info_dict.get('id') video_id = info_dict.get('id')

View file

@ -45,6 +45,7 @@ from .downloader import (
from .extractor import gen_extractors, list_extractors from .extractor import gen_extractors, list_extractors
from .extractor.common import InfoExtractor from .extractor.common import InfoExtractor
from .extractor.adobepass import MSO_INFO from .extractor.adobepass import MSO_INFO
from .postprocessor.metadatafromfield import MetadataFromFieldPP
from .YoutubeDL import YoutubeDL from .YoutubeDL import YoutubeDL
@ -249,16 +250,25 @@ def _real_main(argv=None):
if re.match(InfoExtractor.FormatSort.regex, f) is None: if re.match(InfoExtractor.FormatSort.regex, f) is None:
parser.error('invalid format sort string "%s" specified' % f) parser.error('invalid format sort string "%s" specified' % f)
if opts.metafromfield is None:
opts.metafromfield = []
if opts.metafromtitle is not None:
opts.metafromfield.append('title:%s' % opts.metafromtitle)
for f in opts.metafromfield:
if re.match(MetadataFromFieldPP.regex, f) is None:
parser.error('invalid format string "%s" specified for --parse-metadata' % f)
any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
any_printing = opts.print_json any_printing = opts.print_json
download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
# PostProcessors # PostProcessors
postprocessors = [] postprocessors = []
if opts.metafromtitle: if opts.metafromfield:
postprocessors.append({ postprocessors.append({
'key': 'MetadataFromTitle', 'key': 'MetadataFromField',
'titleformat': opts.metafromtitle 'formats': opts.metafromfield,
'when': 'beforedl'
}) })
if opts.extractaudio: if opts.extractaudio:
postprocessors.append({ postprocessors.append({
@ -324,7 +334,7 @@ def _real_main(argv=None):
postprocessors.append({ postprocessors.append({
'key': 'ExecAfterDownload', 'key': 'ExecAfterDownload',
'exec_cmd': opts.exec_cmd, 'exec_cmd': opts.exec_cmd,
'_after_move': True 'when': 'aftermove'
}) })
_args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n' _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'

View file

@ -1078,14 +1078,20 @@ def parseOpts(overrideArguments=None):
postproc.add_option( postproc.add_option(
'--metadata-from-title', '--metadata-from-title',
metavar='FORMAT', dest='metafromtitle', metavar='FORMAT', dest='metafromtitle',
help=optparse.SUPPRESS_HELP)
postproc.add_option(
'--parse-metadata',
metavar='FIELD:FORMAT', dest='metafromfield', action='append',
help=( help=(
'Parse additional metadata like song title / artist from the video title. ' 'Parse additional metadata like title/artist from other fields. '
'The format syntax is the same as --output. Regular expression with ' 'Give field name to extract data from, and format of the field seperated by a ":". '
'named capture groups may also be used. ' 'The format syntax is the same as --output. '
'Regular expression with named capture groups may also be used. '
'The parsed parameters replace existing values. ' 'The parsed parameters replace existing values. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' 'This option can be used multiple times. '
'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise". ' '"Coldplay - Paradise". '
'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')) 'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
postproc.add_option( postproc.add_option(
'--xattrs', '--xattrs',
action='store_true', dest='xattrs', default=False, action='store_true', dest='xattrs', default=False,

View file

@ -16,7 +16,8 @@ from .ffmpeg import (
) )
from .xattrpp import XAttrMetadataPP from .xattrpp import XAttrMetadataPP
from .execafterdownload import ExecAfterDownloadPP from .execafterdownload import ExecAfterDownloadPP
from .metadatafromtitle import MetadataFromTitlePP from .metadatafromfield import MetadataFromFieldPP
from .metadatafromfield import MetadataFromTitlePP
from .movefilesafterdownload import MoveFilesAfterDownloadPP from .movefilesafterdownload import MoveFilesAfterDownloadPP
from .sponskrub import SponSkrubPP from .sponskrub import SponSkrubPP
@ -39,6 +40,7 @@ __all__ = [
'FFmpegSubtitlesConvertorPP', 'FFmpegSubtitlesConvertorPP',
'FFmpegVideoConvertorPP', 'FFmpegVideoConvertorPP',
'FFmpegVideoRemuxerPP', 'FFmpegVideoRemuxerPP',
'MetadataFromFieldPP',
'MetadataFromTitlePP', 'MetadataFromTitlePP',
'MoveFilesAfterDownloadPP', 'MoveFilesAfterDownloadPP',
'SponSkrubPP', 'SponSkrubPP',

View file

@ -0,0 +1,66 @@
from __future__ import unicode_literals
import re
from .common import PostProcessor
from ..compat import compat_str
class MetadataFromFieldPP(PostProcessor):
    """Parse extra metadata out of existing info-dict fields.

    Every entry of ``formats`` is a ``FIELD:FORMAT`` string. FORMAT is either
    an output-template-like string such as ``%(artist)s - %(title)s`` (which
    is converted to a regular expression) or a regular expression with named
    capture groups (which is used unchanged). When the expression is found in
    the value of FIELD, each named group is written into the info dict under
    the group's name, replacing any existing value.
    """

    # Shape of one FIELD:FORMAT specification
    regex = r'(?P<field>\w+):(?P<format>.+)$'

    def __init__(self, downloader, formats):
        """Pre-compute the field name and search regex for each specification.

        @param downloader  passed through to PostProcessor.__init__
        @param formats     list or tuple of 'FIELD:FORMAT' text strings
        """
        PostProcessor.__init__(self, downloader)
        assert isinstance(formats, (list, tuple))
        self._data = []
        for f in formats:
            assert isinstance(f, compat_str)
            match = re.match(self.regex, f)
            assert match is not None
            self._data.append({
                'field': match.group('field'),
                'format': match.group('format'),
                'regex': self.format_to_regex(match.group('format'))})

    def format_to_regex(self, fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'

        A string without any %(name)s placeholder is assumed to already be
        a regular expression and is returned unchanged.
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        regex = ''
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    def run(self, info):
        """Apply every stored specification to ``info``.

        Specifications whose field is missing, empty (None) or does not match
        are skipped with a warning; the remaining ones are still processed.

        Returns a tuple ([], info): no files to delete and the updated dict.
        """
        for dictn in self._data:
            field, regex = dictn['field'], dictn['regex']
            if field not in info:
                self.report_warning('Video doesnot have a %s' % field)
                continue
            data_to_parse = info[field]
            if data_to_parse is None:
                # The key exists but holds no value; re.search(None) would
                # raise TypeError and abort all remaining specifications
                self.report_warning('Field %s cannot be parsed' % field)
                continue
            if not isinstance(data_to_parse, (compat_str, bytes)):
                # Non-text values (numbers, lists, ...) are searched through
                # their text representation instead of crashing re.search
                data_to_parse = compat_str(data_to_parse)
            self.write_debug('Searching for r"%s" in %s' % (regex, field))
            match = re.search(regex, data_to_parse)
            if match is None:
                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
                continue
            for attribute, value in match.groupdict().items():
                info[attribute] = value
                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
        return [], info
class MetadataFromTitlePP(MetadataFromFieldPP):
    """Title-only variant of MetadataFromFieldPP, kept for backward compatibility.

    Accepts a single title format and exposes the ``_titleformat`` and
    ``_titleregex`` attributes in addition to what the parent class stores.
    """

    def __init__(self, downloader, titleformat):
        # Express the title format as a one-element FIELD:FORMAT list
        title_spec = 'title:%s' % titleformat
        super(MetadataFromTitlePP, self).__init__(downloader, [title_spec])
        # The parent has stored exactly one entry; mirror its regex
        self._titleregex = self._data[0]['regex']
        self._titleformat = titleformat

View file

@ -1,44 +0,0 @@
from __future__ import unicode_literals
import re
from .common import PostProcessor
class MetadataFromTitlePP(PostProcessor):
    """Parse additional metadata out of the video title."""

    def __init__(self, downloader, titleformat):
        """Store the title format and the regex used to match titles.

        A format containing %(name)s placeholders is converted to a regex;
        any other string is used as the regex unchanged.
        """
        super(MetadataFromTitlePP, self).__init__(downloader)
        self._titleformat = titleformat
        if re.search(r'%\(\w+\)s', titleformat):
            self._titleregex = self.format_to_regex(titleformat)
        else:
            self._titleregex = titleformat

    def format_to_regex(self, fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'
        """
        pieces = []
        cursor = 0
        # Alternate between escaped literal text and one named group
        # per %(..)s placeholder
        for placeholder in re.finditer(r'%\((\w+)\)s', fmt):
            pieces.append(re.escape(fmt[cursor:placeholder.start()]))
            pieces.append(r'(?P<' + placeholder.group(1) + '>.+)')
            cursor = placeholder.end()
        # Literal text after the last placeholder (may be empty)
        pieces.append(re.escape(fmt[cursor:]))
        return ''.join(pieces)

    def run(self, info):
        """Match the title against the stored regex and copy named groups into info.

        Returns a tuple ([], info): no files to delete and the updated dict.
        """
        parsed = re.match(self._titleregex, info['title'])
        if parsed is not None:
            for key, val in parsed.groupdict().items():
                info[key] = val
                self.to_screen('parsed %s: %s' % (key, val if val is not None else 'NA'))
        else:
            self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
        return [], info