From c567d27ede734c9400d34b06b6d3213501d2de21 Mon Sep 17 00:00:00 2001 From: sepro Date: Sat, 7 Dec 2024 15:57:57 +0100 Subject: [PATCH] Rework extractor --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/plvideo.py | 188 +++++++++++++++++++------------- 2 files changed, 114 insertions(+), 76 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3d01ed6fe..bbd6d21bd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1551,7 +1551,7 @@ from .pluralsight import ( PluralsightIE, ) from .plutotv import PlutoTVIE -from .plvideo import PlVideoVideoIE +from .plvideo import PlVideoIE from .podbayfm import ( PodbayFMChannelIE, PodbayFMIE, diff --git a/yt_dlp/extractor/plvideo.py b/yt_dlp/extractor/plvideo.py index 700a06fdb..7c91c9201 100644 --- a/yt_dlp/extractor/plvideo.py +++ b/yt_dlp/extractor/plvideo.py @@ -1,91 +1,129 @@ -from yt_dlp.utils._utils import qualities - from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + parse_resolution, + url_or_none, +) +from ..utils.traversal import traverse_obj -class PlVideoVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/watch\?v=(?P\w+)&?(.+)?' # type: ignore - _TESTS = [ - { - 'url': 'https://plvideo.ru/watch?v=lYmu2gcUKOa9', - 'md5': 'eb3e7830abb375a782d943f593d2646b', - 'info_dict': { - 'id': 'lYmu2gcUKOa9', - 'ext': 'mp4', - 'title': 'Запретная страсть. Премьера 2024. 18+Мелодрама. Триллер. 18+', - 'uploader_id': 'y__S081jJiUt', - 'uploader': 'Tvoja Mediateka', - 'duration': 6238333, - 'like_count': int, - 'description': str, - 'comment_count': int, - 'thumbnail': r're:^https?://.*\.jpg', - 'type': 'video', - 'view_count': int, - 'dislike_count': int, - }, +class PlVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://plvideo.ru/watch?v=owo7vk1sTqzA', + 'md5': 'be768d1d4c44462f180ca39927ad07f2', + 'info_dict': { + 'id': 'owo7vk1sTqzA', + 'ext': 'mp4', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/d9/e9/d9e9a78134c01ca56e9e795244e1ba95/original6753a778ab7fe1.79895808.jpg', + 'title': 'Две угрозы для ВСУ на Донбассе, интервью Лаврова Карлсону, что означают для Украины события в Сирии.', + 'channel': 'Страна.ua', + 'channel_id': 'hX0oxkAgBfaK', + 'media_type': 'video', + 'comment_count': int, + 'tags': ['политика', 'путин', 'зеленский', 'украина', 'война', 'новости', 'сша', 'такеркарлсон', 'интервью'], + 'description': 'md5:52e3cb3cf9deac3a0d9c3b6523a1c1ff', + 'released_timestamp': 1733535609, + 'channel_is_verified': False, + 'like_count': int, + 'timestamp': 1733535474, + 'duration': 1112011, + 'view_count': int, + 'dislike_count': int, + 'upload_date': '20241207', + 'modified_date': '20241207', + 'channel_follower_count': int, + 'modified_timestamp': 1733535710, }, - ] - - def _quality_to_dimensions(self, quality): - mapped = { - '240p': (426, 240), - '360p': (640, 360), - '468p': (720, 468), - '480p': (720, 480), - '720p': (1280, 720), - '1080p': (1920, 1080), - } - return mapped.get(quality) + }, { + 'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX', + 'md5': '7d8fa2279406c69d2fd2a6fc548a9805', + 'info_dict': { + 'id': 'S3Uo9c-VLwFX', + 'ext': 'mp4', + 'channel': 'Romaatom', + 'tags': 'count:22', + 'dislike_count': int, + 'upload_date': '20241130', + 'description': 'md5:452e6de219bf2f32bb95806c51c3b364', + 'duration': 58433, + 'modified_date': '20241130', + 'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg', + 'media_type': 'shorts', + 'like_count': int, + 'modified_timestamp': 1732961458, + 'channel_is_verified': True, + 'channel_id': 'erJyyTIbmUd1', + 'timestamp': 1732961355, + 'comment_count': int, + 'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе', + 'channel_follower_count': int, + 'view_count': int, + 'released_timestamp': 1732961458, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) - api_url = f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18' - result = self._download_json(api_url, video_id, 'Downloading video JSON') - assert result.get('code') == 200, 'Failed to download video JSON' - - item = result.get('item') - assert item is not None, 'Bad API response' - - thumbnail = item.get('cover').get('paths').get('original').get('src') + video_data = self._download_json( + f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id) + is_live = False formats = [] - preference = qualities(['240p', '360p', '468p', '480p', '720p', '1080p']) - - for key, value in item.get('profiles').items(): - hlsurl = value.get('hls') - dimensions = self._quality_to_dimensions(key) - fmt = { - 'url': hlsurl, + subtitles = {} + automatic_captions = {} + for quality, data in traverse_obj(video_data, ('item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))): + formats.append({ + 'format_id': quality, 'ext': 'mp4', - 'quality': preference(key), - 'width': dimensions[0], - 'height': dimensions[1], - 'format_id': key, 'protocol': 'm3u8_native', - 'aspect_ratio': float(value.get('aspectRatio')), - } + **traverse_obj(data, { + 'url': 'hls', + 'fps': ('fps', {float_or_none}), + 'aspect_ratio': ('aspectRatio', {float_or_none}), + }), + **parse_resolution(quality), + }) + if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})): + is_live = True + formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True)) + for lang, url in traverse_obj(video_data, ('item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))): + if lang.endswith('-auto'): + automatic_captions.setdefault(lang[:-5], []).append({ + 'url': url, + }) + else: + subtitles.setdefault(lang, []).append({ + 'url': url, + }) - formats.append(fmt) - - result = { + return { 'id': video_id, - 'title': item.get('title'), 'formats': formats, - 'thumbnails': [{'url': thumbnail}], - 'uploader': item.get('channel').get('name'), - 'duration': item.get('uploadFile').get('videoDuration'), - 'uploader_id': item.get('channel').get('id'), - 'view_count': item.get('stats').get('viewTotalCount'), - 'like_count': item.get('stats').get('likeCount'), - 'comment_count': item.get('stats').get('commentCount'), - 'dislike_count': item.get('stats').get('dislikeCount'), - 'type': item.get('type'), + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + 'is_live': is_live, + **traverse_obj(video_data, ('item', { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}), + 'duration': ('uploadFile', 'videoDuration', {int_or_none}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'id', {str}), + 'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}), + 'channel_is_verified': ('channel', 'verified', {bool}), + 'tags': ('tags', ..., {str}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'released_timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 'view_count': ('stats', 'viewTotalCount', {int_or_none}), + 'like_count': ('stats', 'likeCount', {int_or_none}), + 'dislike_count': ('stats', 'dislikeCount', {int_or_none}), + 'comment_count': ('stats', 'commentCount', {int_or_none}), + 'media_type': ('type', {str}), + })), } - - description = item.get('description') - if description: - result['description'] = description - - return result