From 5bfa48620542d9ee34958d7c96aa45465b058fbd Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan@gmail.com>
Date: Tue, 26 Jan 2021 15:50:20 +0530
Subject: [PATCH] Add option `--parse-metadata`

* The fields extracted by this can be used in `--output`
* Deprecated `--metadata-from-title`

:ci skip dl
---
 README.md                                     | 25 +++---
 test/test_postprocessors.py                   | 10 ++-
 youtube_dlc/YoutubeDL.py                      | 89 ++++++++++---------
 youtube_dlc/__init__.py                       | 18 +++-
 youtube_dlc/options.py                        | 16 ++--
 youtube_dlc/postprocessor/__init__.py         |  4 +-
 .../postprocessor/metadatafromfield.py        | 66 ++++++++++++++
 .../postprocessor/metadatafromtitle.py        | 44 ---------
 8 files changed, 162 insertions(+), 110 deletions(-)
 create mode 100644 youtube_dlc/postprocessor/metadatafromfield.py
 delete mode 100644 youtube_dlc/postprocessor/metadatafromtitle.py
diff --git a/README.md b/README.md
index 7524e84939..886ec245f4 100644
--- a/README.md
+++ b/README.md
@@ -610,16 +610,19 @@ Then simply type this
     --no-embed-thumbnail             Do not embed thumbnail (default)
     --add-metadata                   Write metadata to the video file
     --no-add-metadata                Do not write metadata (default)
-    --metadata-from-title FORMAT     Parse additional metadata like song title /
-                                     artist from the video title. The format
-                                     syntax is the same as --output. Regular
-                                     expression with named capture groups may
-                                     also be used. The parsed parameters replace
-                                     existing values. Example: --metadata-from-
-                                     title "%(artist)s - %(title)s" matches a
+    --parse-metadata FIELD:FORMAT    Parse additional metadata like title/artist
+                                     from other fields. Give field name to
+                                     extract data from, and format of the field
+                                     separated by a ":". The format syntax is
+                                     the same as --output. Regular expression
+                                     with named capture groups may also be used.
+                                     The parsed parameters replace existing
+                                     values. This option can be used multiple
+                                     times. Example: --parse-metadata
+                                     "title:%(artist)s - %(title)s" matches a
                                      title like "Coldplay - Paradise". Example
-                                     (regex): --metadata-from-title
-                                     "(?P<artist>.+?) - (?P<title>.+)"
+                                     (regex): --parse-metadata
+                                     "description:Artist - (?P<artist>.+?)"
     --xattrs                         Write metadata to the video file's xattrs
                                      (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the
@@ -1098,7 +1101,7 @@ $ youtube-dlc -S '+res:480,codec,br'
 
 Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
 
-**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
+**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
 
 # MORE
-For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
+For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)
diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py
index 6f538a3da0..fabe7e6fb9 100644
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@@ -8,10 +8,16 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from youtube_dlc.postprocessor import MetadataFromTitlePP
+from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
+
+
+class TestMetadataFromField(unittest.TestCase):
+    def test_format_to_regex(self):  # a %(name)s template must turn into named groups with the literal ' - ' escaped
+        pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
+        self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
 
 
 class TestMetadataFromTitle(unittest.TestCase):
     def test_format_to_regex(self):
         pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index ce990507ca..0e93303b1e 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -375,8 +375,7 @@ class YoutubeDL(object):
 
     params = None
     _ies = []
-    _pps = []
-    _pps_end = []
+    _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
     __prepare_filename_warned = False
     _download_retcode = None
     _num_downloads = None
@@ -390,8 +389,7 @@ class YoutubeDL(object):
             params = {}
         self._ies = []
         self._ies_instances = {}
-        self._pps = []
-        self._pps_end = []
+        self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
         self.__prepare_filename_warned = False
         self._post_hooks = []
         self._progress_hooks = []
@@ -494,11 +492,13 @@ class YoutubeDL(object):
             pp_class = get_postprocessor(pp_def_raw['key'])
             pp_def = dict(pp_def_raw)
             del pp_def['key']
-            after_move = pp_def.get('_after_move', False)
-            if '_after_move' in pp_def:
-                del pp_def['_after_move']
+            if 'when' in pp_def:
+                when = pp_def['when']
+                del pp_def['when']
+            else:
+                when = 'normal'
             pp = pp_class(self, **compat_kwargs(pp_def))
-            self.add_post_processor(pp, after_move=after_move)
+            self.add_post_processor(pp, when=when)
 
         for ph in self.params.get('post_hooks', []):
             self.add_post_hook(ph)
@@ -550,12 +550,9 @@ class YoutubeDL(object):
         for ie in gen_extractor_classes():
             self.add_info_extractor(ie)
 
-    def add_post_processor(self, pp, after_move=False):
+    def add_post_processor(self, pp, when='normal'):
         """Add a PostProcessor object to the end of the chain."""
-        if after_move:
-            self._pps_end.append(pp)
-        else:
-            self._pps.append(pp)
+        self._pps[when].append(pp)
         pp.set_downloader(self)
 
     def add_post_hook(self, ph):
@@ -1948,6 +1945,8 @@ class YoutubeDL(object):
 
         self._num_downloads += 1
 
+        info_dict = self.pre_process(info_dict)
+
         filename = self.prepare_filename(info_dict, warn=True)
         info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
         temp_filename = self.prepare_filepath(filename, 'temp')
@@ -2400,41 +2399,45 @@ class YoutubeDL(object):
             (k, v) for k, v in info_dict.items()
             if k not in ['requested_formats', 'requested_subtitles'])
 
+    def run_pp(self, pp, infodict, files_to_move=None):
+        """Run pp on infodict and keep (with 'keepvideo') or remove the files it reports; returns (files_to_move, infodict)."""
+        files_to_move = {} if files_to_move is None else files_to_move  # a `{}` default would be shared and mutated across calls
+        files_to_delete = []
+        try:
+            files_to_delete, infodict = pp.run(infodict)
+        except PostProcessingError as e:
+            self.report_error(e.msg)
+        if not files_to_delete:
+            return files_to_move, infodict
+        if self.params.get('keepvideo', False):
+            for f in files_to_delete:
+                files_to_move.setdefault(f, '')
+        else:
+            for old_filename in set(files_to_delete):
+                self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
+                try:
+                    os.remove(encodeFilename(old_filename))
+                except (IOError, OSError):
+                    self.report_warning('Unable to remove downloaded original file')
+                files_to_move.pop(old_filename, None)  # a removed file must no longer be scheduled for moving
+        return files_to_move, infodict
+
+    def pre_process(self, ie_info):  # runs the 'beforedl' postprocessors and returns the resulting info dict
+        info = dict(ie_info)  # shallow copy: keys set by a postprocessor do not land in ie_info itself
+        for pp in self._pps['beforedl']:
+            info = self.run_pp(pp, info)[1]  # run_pp returns (files_to_move, infodict); only the info dict is used here
+        return info
+
     def post_process(self, filename, ie_info, files_to_move={}):
         """Run all the postprocessors on the given file."""
         info = dict(ie_info)
         info['filepath'] = filename
 
-        def run_pp(pp):
-            files_to_delete = []
-            infodict = info
-            try:
-                files_to_delete, infodict = pp.run(infodict)
-            except PostProcessingError as e:
-                self.report_error(e.msg)
-            if not files_to_delete:
-                return infodict
-
-            if self.params.get('keepvideo', False):
-                for f in files_to_delete:
-                    files_to_move.setdefault(f, '')
-            else:
-                for old_filename in set(files_to_delete):
-                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
-                    try:
-                        os.remove(encodeFilename(old_filename))
-                    except (IOError, OSError):
-                        self.report_warning('Unable to remove downloaded original file')
-                    if old_filename in files_to_move:
-                        del files_to_move[old_filename]
-            return infodict
-
-        for pp in ie_info.get('__postprocessors', []) + self._pps:
-            info = run_pp(pp)
-        info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
-        files_to_move = {}
-        for pp in self._pps_end:
-            info = run_pp(pp)
+        for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
+            files_to_move, info = self.run_pp(pp, info, files_to_move)
+        info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
+        for pp in self._pps['aftermove']:
+            files_to_move, info = self.run_pp(pp, info, {})
 
     def _make_archive_id(self, info_dict):
         video_id = info_dict.get('id')
diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py
index e2db662665..5f97b51ff1 100644
--- a/youtube_dlc/__init__.py
+++ b/youtube_dlc/__init__.py
@@ -45,6 +45,7 @@ from .downloader import (
 from .extractor import gen_extractors, list_extractors
 from .extractor.common import InfoExtractor
 from .extractor.adobepass import MSO_INFO
+from .postprocessor.metadatafromfield import MetadataFromFieldPP
 from .YoutubeDL import YoutubeDL
 
 
@@ -249,16 +250,25 @@ def _real_main(argv=None):
         if re.match(InfoExtractor.FormatSort.regex, f) is None:
             parser.error('invalid format sort string "%s" specified' % f)
 
+    if opts.metafromfield is None:
+        opts.metafromfield = []
+    if opts.metafromtitle is not None:
+        opts.metafromfield.append('title:%s' % opts.metafromtitle)
+    for f in opts.metafromfield:
+        if re.match(MetadataFromFieldPP.regex, f) is None:
+            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
+
     any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
     any_printing = opts.print_json
     download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
 
     # PostProcessors
     postprocessors = []
-    if opts.metafromtitle:
+    if opts.metafromfield:
         postprocessors.append({
-            'key': 'MetadataFromTitle',
-            'titleformat': opts.metafromtitle
+            'key': 'MetadataFromField',
+            'formats': opts.metafromfield,
+            'when': 'beforedl'
         })
     if opts.extractaudio:
         postprocessors.append({
@@ -324,7 +334,7 @@ def _real_main(argv=None):
         postprocessors.append({
             'key': 'ExecAfterDownload',
             'exec_cmd': opts.exec_cmd,
-            '_after_move': True
+            'when': 'aftermove'
         })
 
     _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'
diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py
index 4910c2083f..859f28e2b1 100644
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@@ -1078,14 +1078,20 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '--metadata-from-title',
         metavar='FORMAT', dest='metafromtitle',
+        help=optparse.SUPPRESS_HELP)
+    postproc.add_option(
+        '--parse-metadata',
+        metavar='FIELD:FORMAT', dest='metafromfield', action='append',
         help=(
-            'Parse additional metadata like song title / artist from the video title. '
-            'The format syntax is the same as --output. Regular expression with '
-            'named capture groups may also be used. '
+            'Parse additional metadata like title/artist from other fields. '
+            'Give field name to extract data from, and format of the field seperated by a ":". '
+            'The format syntax is the same as --output. '
+            'Regular expression with named capture groups may also be used. '
             'The parsed parameters replace existing values. '
-            'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
+            'This option can be used multiple times. '
+            'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
             '"Coldplay - Paradise". '
-            'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
+            'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
     postproc.add_option(
         '--xattrs',
         action='store_true', dest='xattrs', default=False,
diff --git a/youtube_dlc/postprocessor/__init__.py b/youtube_dlc/postprocessor/__init__.py
index 840a83b0e2..c5aa925c65 100644
--- a/youtube_dlc/postprocessor/__init__.py
+++ b/youtube_dlc/postprocessor/__init__.py
@@ -16,7 +16,8 @@ from .ffmpeg import (
 )
 from .xattrpp import XAttrMetadataPP
 from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromtitle import MetadataFromTitlePP
+from .metadatafromfield import MetadataFromFieldPP
+from .metadatafromfield import MetadataFromTitlePP
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
 from .sponskrub import SponSkrubPP
 
@@ -39,6 +40,7 @@ __all__ = [
     'FFmpegSubtitlesConvertorPP',
     'FFmpegVideoConvertorPP',
     'FFmpegVideoRemuxerPP',
+    'MetadataFromFieldPP',
     'MetadataFromTitlePP',
     'MoveFilesAfterDownloadPP',
     'SponSkrubPP',
diff --git a/youtube_dlc/postprocessor/metadatafromfield.py b/youtube_dlc/postprocessor/metadatafromfield.py
new file mode 100644
index 0000000000..eb774326b8
--- /dev/null
+++ b/youtube_dlc/postprocessor/metadatafromfield.py
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import PostProcessor
+from ..compat import compat_str
+
+
+class MetadataFromFieldPP(PostProcessor):
+    """Parse metadata out of info-dict fields.
+
+    Each entry of `formats` is a 'FIELD:FORMAT' string. FIELD names the
+    info-dict key to read; FORMAT is either an --output style template
+    such as '%(artist)s - %(title)s' or a regular expression with named
+    capture groups. Every named group is written to the info dict under
+    its own name, replacing any existing value.
+    """
+
+    # Shape of one 'FIELD:FORMAT' specification; also used to validate --parse-metadata
+    regex = r'(?P<field>\w+):(?P<format>.+)$'
+
+    def __init__(self, downloader, formats):
+        PostProcessor.__init__(self, downloader)
+        assert isinstance(formats, (list, tuple))
+        self._data = []
+        for f in formats:
+            assert isinstance(f, compat_str)
+            match = re.match(self.regex, f)
+            assert match is not None
+            self._data.append({
+                'field': match.group('field'),
+                'format': match.group('format'),
+                'regex': self.format_to_regex(match.group('format'))})
+
+    def format_to_regex(self, fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'
+        A string without any %(..)s placeholder is returned unchanged,
+        i.e. it is treated as being a regex already.
+        """
+        if not re.search(r'%\(\w+\)s', fmt):
+            return fmt
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        """Apply every FIELD:FORMAT rule to info in order; returns ([], info)."""
+        for dictn in self._data:
+            field, regex = dictn['field'], dictn['regex']
+            source = info.get(field)
+            if source is None:
+                # A field that is present but None is as unusable as a missing one
+                self.report_warning('Video does not have a %s' % field)
+                continue
+            if not isinstance(source, compat_str):
+                # re.search() raises TypeError for non-string values such as numbers
+                source = compat_str(source)
+            self.write_debug('Searching for r"%s" in %s' % (regex, field))
+            match = re.search(regex, source)
+            if match is None:
+                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
+                continue
+            for attribute, value in match.groupdict().items():
+                info[attribute] = value
+                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
+        return [], info
+
+
+class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
+    """Single-rule form that parses the 'title' field with titleformat."""
+
+    def __init__(self, downloader, titleformat):
+        super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat])
+        self._titleformat = titleformat
+        self._titleregex = self._data[0]['regex']
diff --git a/youtube_dlc/postprocessor/metadatafromtitle.py b/youtube_dlc/postprocessor/metadatafromtitle.py
deleted file mode 100644
index 86df3b4f06..0000000000
--- a/youtube_dlc/postprocessor/metadatafromtitle.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import PostProcessor
-
-
-class MetadataFromTitlePP(PostProcessor):
-    def __init__(self, downloader, titleformat):
-        super(MetadataFromTitlePP, self).__init__(downloader)
-        self._titleformat = titleformat
-        self._titleregex = (self.format_to_regex(titleformat)
-                            if re.search(r'%\(\w+\)s', titleformat)
-                            else titleformat)
-
-    def format_to_regex(self, fmt):
-        r"""
-        Converts a string like
-           '%(title)s - %(artist)s'
-        to a regex like
-           '(?P<title>.+)\ \-\ (?P<artist>.+)'
-        """
-        lastpos = 0
-        regex = ''
-        # replace %(..)s with regex group and escape other string parts
-        for match in re.finditer(r'%\((\w+)\)s', fmt):
-            regex += re.escape(fmt[lastpos:match.start()])
-            regex += r'(?P<' + match.group(1) + '>.+)'
-            lastpos = match.end()
-        if lastpos < len(fmt):
-            regex += re.escape(fmt[lastpos:])
-        return regex
-
-    def run(self, info):
-        title = info['title']
-        match = re.match(self._titleregex, title)
-        if match is None:
-            self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
-            return [], info
-        for attribute, value in match.groupdict().items():
-            info[attribute] = value
-            self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA'))
-
-        return [], info