mirror of
https://github.com/yt-dlp/yt-dlp
synced 2024-12-26 21:59:08 +01:00
[extractor/telegram] Add playlist support and more metadata (#5358)
Authored by: bashonly, bsun0000
This commit is contained in:
parent
cb1553e966
commit
96b9e9cf62
2 changed files with 122 additions and 26 deletions
|
@ -1,41 +1,137 @@
|
|||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import clean_html, get_element_by_class
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
format_field,
|
||||
get_element_by_class,
|
||||
parse_duration,
|
||||
parse_qs,
|
||||
traverse_obj,
|
||||
unified_timestamp,
|
||||
update_url_query,
|
||||
url_basename,
|
||||
)
|
||||
|
||||
|
||||
# Extractor for individual Telegram posts addressed as t.me/<channel>/<numeric message id>
# (see _VALID_URL below).
# NOTE(review): this listing is a rendered diff with its indentation stripped; rows are
# separated by `|` / `||||` lines and statements from two revisions of the class appear
# interleaved without +/- markers. Statements using `video_id`, `webpage` and `webpage_embed`
# presumably belong to the earlier revision, and those using `channel_id`, `msg_id` and
# `embed` to the later one -- confirm against the upstream commit before relying on this.
class TelegramEmbedIE(InfoExtractor):
|
||||
IE_NAME = 'telegram:embed'
|
||||
# Two variants of the URL pattern follow; they differ only in the name of the first
# capture group (`channel_name` vs. `channel_id`). `_real_extract` reads `channel_id`.
_VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)'
|
||||
_VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
|
||||
_TESTS = [{
|
||||
'url': 'https://t.me/europa_press/613',
|
||||
'md5': 'dd707708aea958c11a590e8068825f22',
|
||||
'info_dict': {
|
||||
'id': '613',
|
||||
'ext': 'mp4',
|
||||
# NOTE(review): 'title', 'description' and 'thumbnail' each occur twice in this dict;
# presumably the first occurrence is the replaced expectation and the second the new one.
'title': 'Europa Press',
|
||||
'description': '6ce2d7e8d56eda16d80607b23db7b252',
|
||||
'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+',
|
||||
'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
|
||||
'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
|
||||
'channel_id': 'europa_press',
|
||||
'channel': 'Europa Press ✔',
|
||||
'thumbnail': r're:^https?://.+',
|
||||
'timestamp': 1635631203,
|
||||
'upload_date': '20211030',
|
||||
'duration': 61,
|
||||
},
|
||||
}, {
|
||||
# 2-video post
|
||||
'url': 'https://t.me/vorposte/29342',
|
||||
'info_dict': {
|
||||
# Playlist id has the form '<channel_id>-<msg_id>' (built in _real_extract).
'id': 'vorposte-29342',
|
||||
'title': 'Форпост 29342',
|
||||
'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
|
||||
},
|
||||
'playlist_count': 2,
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
}, {
|
||||
# 2-video post with --no-playlist
|
||||
'url': 'https://t.me/vorposte/29343',
|
||||
'md5': '1724e96053c18e788c8464038876e245',
|
||||
'info_dict': {
|
||||
'id': '29343',
|
||||
'ext': 'mp4',
|
||||
'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
|
||||
'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
|
||||
'channel_id': 'vorposte',
|
||||
'channel': 'Форпост',
|
||||
'thumbnail': r're:^https?://.+',
|
||||
'timestamp': 1666384480,
|
||||
'upload_date': '20221021',
|
||||
'duration': 35,
|
||||
},
|
||||
'params': {
|
||||
'noplaylist': True,
|
||||
}
|
||||
}, {
|
||||
# 2-video post with 'single' query param
|
||||
'url': 'https://t.me/vorposte/29342?single',
|
||||
'md5': 'd20b202f1e41400a9f43201428add18f',
|
||||
'info_dict': {
|
||||
'id': '29342',
|
||||
'ext': 'mp4',
|
||||
'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
|
||||
'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
|
||||
'channel_id': 'vorposte',
|
||||
'channel': 'Форпост',
|
||||
'thumbnail': r're:^https?://.+',
|
||||
'timestamp': 1666384480,
|
||||
'upload_date': '20221021',
|
||||
'duration': 33,
|
||||
},
|
||||
}]
|
||||
|
||||
# Extract the video(s) attached to one Telegram post.
# The `embed`-based path downloads the post's embed frame, collects post-level metadata
# (description, channel, timestamp) once, builds one entry per video player block found in
# the frame, and then returns either a playlist of all entries or the single entry whose
# id equals the message id taken from the URL.
def _real_extract(self, url):
|
||||
# NOTE(review): the next three assignments (`video_id`, `webpage`, `webpage_embed`) are
# only referenced by the `formats = [{...}]` / `return {...}` statements further down that
# do not use `embed`; presumably they are the replaced implementation.
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id, query={'embed': 0})
|
||||
# NOTE(review): 'ermbed' in the note string below looks like a typo for 'embed'.
webpage_embed = self._download_webpage(url, video_id, query={'embed': 1}, note='Downloading ermbed page')
|
||||
# Both named groups of _VALID_URL are needed: the channel for metadata and the playlist
# id, the message id for selecting a single video.
channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
|
||||
embed = self._download_webpage(
|
||||
url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame')
|
||||
|
||||
formats = [{
|
||||
'url': self._proto_relative_url(self._search_regex(
|
||||
'<video[^>]+src="([^"]+)"', webpage_embed, 'source')),
|
||||
'ext': 'mp4',
|
||||
}]
|
||||
self._sort_formats(formats)
|
||||
# Local helper: text content of the element with class `html_class` inside `html`, with
# newlines replaced by spaces; yields None when the cleaned text is missing or empty.
def clean_text(html_class, html):
|
||||
text = clean_html(get_element_by_class(html_class, html))
|
||||
return text.replace('\n', ' ') if text else None
|
||||
|
||||
# NOTE(review): the dict literal opened by `return {` below is never closed in this
# listing; its entries read `video_id`, `webpage` and `webpage_embed` only.
return {
|
||||
'id': video_id,
|
||||
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
|
||||
'description': self._html_search_meta(
|
||||
['og:description', 'twitter:description'], webpage,
|
||||
default=clean_html(get_element_by_class('tgme_widget_message_text', webpage_embed))),
|
||||
'thumbnail': self._search_regex(
|
||||
r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
|
||||
webpage_embed, 'thumbnail'),
|
||||
'formats': formats,
|
||||
# Post-level metadata shared by every video of the message. The message text doubles as
# the title, falling back to an empty string when there is no text.
description = clean_text('tgme_widget_message_text', embed)
|
||||
message = {
|
||||
'title': description or '',
|
||||
'description': description,
|
||||
'channel': clean_text('tgme_widget_message_author', embed),
|
||||
'channel_id': channel_id,
|
||||
'timestamp': unified_timestamp(self._search_regex(
|
||||
r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)),
|
||||
}
|
||||
|
||||
videos = []
|
||||
# Each match spans from a video player anchor up to the first following `</time>` tag
# (non-greedy, with `(?s:...)` letting `.` cross line breaks).
for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed):
|
||||
video_url = self._search_regex(
|
||||
r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False)
|
||||
webpage_url = self._search_regex(
|
||||
r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
|
||||
video, 'webpage URL', fatal=False)
|
||||
# Both lookups are non-fatal; a block lacking either URL is skipped rather than
# aborting the whole extraction.
if not video_url or not webpage_url:
|
||||
continue
|
||||
formats = [{
|
||||
'url': video_url,
|
||||
'ext': 'mp4',
|
||||
}]
|
||||
self._sort_formats(formats)
|
||||
videos.append({
|
||||
# The entry id is the last path component of the video's own post URL.
'id': url_basename(webpage_url),
|
||||
# presumably `single` makes the entry's URL resolve to just this video -- TODO confirm
'webpage_url': update_url_query(webpage_url, {'single': True}),
|
||||
'duration': parse_duration(self._search_regex(
|
||||
r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)),
|
||||
'thumbnail': self._search_regex(
|
||||
r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
|
||||
video, 'thumbnail', fatal=False),
|
||||
'formats': formats,
|
||||
# Merge the shared post-level fields into every entry.
**message,
|
||||
})
|
||||
|
||||
# A playlist id is only assigned when the post has more than one video and the URL has no
# `single` query parameter. `keep_blank_values=True` is what keeps a value-less `?single`
# (as in the last test URL) visible in the parsed query.
playlist_id = None
|
||||
if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True):
|
||||
playlist_id = f'{channel_id}-{msg_id}'
|
||||
|
||||
# presumably `_yes_playlist` is falsy for a None playlist id or when the user asked for no
# playlist (cf. the 'noplaylist' test above) -- verify against InfoExtractor.
if self._yes_playlist(playlist_id, msg_id):
|
||||
# Playlist title is '<channel> <msg_id>', matching the 2-video test expectation.
return self.playlist_result(
|
||||
videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description)
|
||||
else:
|
||||
# Select the entry whose id equals the message id from the URL.
# NOTE(review): presumably yields None when no entry matches -- confirm caller handling.
return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)
|
||||
|
|
|
@ -3092,8 +3092,8 @@ def escape_url(url):
|
|||
).geturl()
|
||||
|
||||
|
||||
def parse_qs(url):
    """Parse the query component of *url* into a dict mapping names to value lists."""
    query_string = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query_string)
|
||||
def parse_qs(url, **kwargs):
    """Parse the query component of *url* into a dict mapping names to value lists.

    Extra keyword arguments (e.g. ``keep_blank_values=True``) are passed
    through unchanged to ``urllib.parse.parse_qs``.
    """
    query_string = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query_string, **kwargs)
|
||||
|
||||
|
||||
def read_batch_urls(batch_fd):
|
||||
|
|
Loading…
Reference in a new issue