[extractor/telegram] Add playlist support and more metadata (#5358)

Authored by: bashonly, bsun0000
2024-12-26 21:59:08 +01:00 · 2022-11-06 19:05:09 +00:00 · 2022-11-06 19:05:09 +00:00 · 96b9e9cf62
commit 96b9e9cf62
parent cb1553e966
2 changed files with 122 additions and 26 deletions
--- a/yt_dlp/extractor/telegram.py
+++ b/yt_dlp/extractor/telegram.py
@ -1,41 +1,137 @@
+import re
+
 from .common import InfoExtractor
-from ..utils import clean_html, get_element_by_class
+from ..utils import (
+    clean_html,
+    format_field,
+    get_element_by_class,
+    parse_duration,
+    parse_qs,
+    traverse_obj,
+    unified_timestamp,
+    update_url_query,
+    url_basename,
+)


 class TelegramEmbedIE(InfoExtractor):
    IE_NAME = 'telegram:embed'
-    _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://t.me/europa_press/613',
+        'md5': 'dd707708aea958c11a590e8068825f22',
        'info_dict': {
            'id': '613',
            'ext': 'mp4',
-            'title': 'Europa Press',
-            'description': '6ce2d7e8d56eda16d80607b23db7b252',
-            'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+',
+            'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
+            'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
+            'channel_id': 'europa_press',
+            'channel': 'Europa Press ✔',
+            'thumbnail': r're:^https?://.+',
+            'timestamp': 1635631203,
+            'upload_date': '20211030',
+            'duration': 61,
+        },
+    }, {
+        # 2-video post
+        'url': 'https://t.me/vorposte/29342',
+        'info_dict': {
+            'id': 'vorposte-29342',
+            'title': 'Форпост 29342',
+            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+        },
+        'playlist_count': 2,
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # 2-video post with --no-playlist
+        'url': 'https://t.me/vorposte/29343',
+        'md5': '1724e96053c18e788c8464038876e245',
+        'info_dict': {
+            'id': '29343',
+            'ext': 'mp4',
+            'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+            'channel_id': 'vorposte',
+            'channel': 'Форпост',
+            'thumbnail': r're:^https?://.+',
+            'timestamp': 1666384480,
+            'upload_date': '20221021',
+            'duration': 35,
+        },
+        'params': {
+            'noplaylist': True,
+        }
+    }, {
+        # 2-video post with 'single' query param
+        'url': 'https://t.me/vorposte/29342?single',
+        'md5': 'd20b202f1e41400a9f43201428add18f',
+        'info_dict': {
+            'id': '29342',
+            'ext': 'mp4',
+            'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+            'channel_id': 'vorposte',
+            'channel': 'Форпост',
+            'thumbnail': r're:^https?://.+',
+            'timestamp': 1666384480,
+            'upload_date': '20221021',
+            'duration': 33,
        },
    }]

    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id, query={'embed': 0})
-        webpage_embed = self._download_webpage(url, video_id, query={'embed': 1}, note='Downloading ermbed page')
+        channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
+        embed = self._download_webpage(
+            url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame')

-        formats = [{
-            'url': self._proto_relative_url(self._search_regex(
-                '<video[^>]+src="([^"]+)"', webpage_embed, 'source')),
-            'ext': 'mp4',
-        }]
-        self._sort_formats(formats)
+        def clean_text(html_class, html):
+            text = clean_html(get_element_by_class(html_class, html))
+            return text.replace('\n', ' ') if text else None

-        return {
-            'id': video_id,
-            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
-            'description': self._html_search_meta(
-                ['og:description', 'twitter:description'], webpage,
-                default=clean_html(get_element_by_class('tgme_widget_message_text', webpage_embed))),
-            'thumbnail': self._search_regex(
-                r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
-                webpage_embed, 'thumbnail'),
-            'formats': formats,
+        description = clean_text('tgme_widget_message_text', embed)
+        message = {
+            'title': description or '',
+            'description': description,
+            'channel': clean_text('tgme_widget_message_author', embed),
+            'channel_id': channel_id,
+            'timestamp': unified_timestamp(self._search_regex(
+                r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)),
        }
+
+        videos = []
+        for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed):
+            video_url = self._search_regex(
+                r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False)
+            webpage_url = self._search_regex(
+                r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
+                video, 'webpage URL', fatal=False)
+            if not video_url or not webpage_url:
+                continue
+            formats = [{
+                'url': video_url,
+                'ext': 'mp4',
+            }]
+            self._sort_formats(formats)
+            videos.append({
+                'id': url_basename(webpage_url),
+                'webpage_url': update_url_query(webpage_url, {'single': True}),
+                'duration': parse_duration(self._search_regex(
+                    r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)),
+                'thumbnail': self._search_regex(
+                    r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
+                    video, 'thumbnail', fatal=False),
+                'formats': formats,
+                **message,
+            })
+
+        playlist_id = None
+        if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True):
+            playlist_id = f'{channel_id}-{msg_id}'
+
+        if self._yes_playlist(playlist_id, msg_id):
+            return self.playlist_result(
+                videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description)
+        else:
+            return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -3092,8 +3092,8 @@ def escape_url(url):
    ).geturl()


-def parse_qs(url):
-    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
+def parse_qs(url, **kwargs):
+    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)


 def read_batch_urls(batch_fd):