Rework extractor

2024-12-26 21:59:08 +01:00 · 2024-12-07 15:57:57 +01:00 · 2024-12-07 15:57:57 +01:00 · c567d27ede
commit c567d27ede
parent f58fc3c88d
2 changed files with 114 additions and 76 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1551,7 +1551,7 @@ from .pluralsight import (
    PluralsightIE,
 )
 from .plutotv import PlutoTVIE
-from .plvideo import PlVideoVideoIE
+from .plvideo import PlVideoIE
 from .podbayfm import (
    PodbayFMChannelIE,
    PodbayFMIE,
--- a/yt_dlp/extractor/plvideo.py
+++ b/yt_dlp/extractor/plvideo.py
@@ -1,91 +1,129 @@
-from yt_dlp.utils._utils import qualities
-
 from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    parse_resolution,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj


-class PlVideoVideoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/watch\?v=(?P<id>\w+)&?(.+)?'  # type: ignore
-    _TESTS = [
-        {
-            'url': 'https://plvideo.ru/watch?v=lYmu2gcUKOa9',
-            'md5': 'eb3e7830abb375a782d943f593d2646b',
+class PlVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://plvideo.ru/watch?v=owo7vk1sTqzA',
+        'md5': 'be768d1d4c44462f180ca39927ad07f2',
        'info_dict': {
-                'id': 'lYmu2gcUKOa9',
+            'id': 'owo7vk1sTqzA',
            'ext': 'mp4',
-                'title': 'Запретная страсть. Премьера 2024. 18+Мелодрама. Триллер. 18+',
-                'uploader_id': 'y__S081jJiUt',
-                'uploader': 'Tvoja Mediateka',
-                'duration': 6238333,
-                'like_count': int,
-                'description': str,
+            'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/d9/e9/d9e9a78134c01ca56e9e795244e1ba95/original6753a778ab7fe1.79895808.jpg',
+            'title': 'Две угрозы для ВСУ на Донбассе, интервью Лаврова Карлсону, что означают для Украины события в Сирии.',
+            'channel': 'Страна.ua',
+            'channel_id': 'hX0oxkAgBfaK',
+            'media_type': 'video',
            'comment_count': int,
-                'thumbnail': r're:^https?://.*\.jpg',
-                'type': 'video',
+            'tags': ['политика', 'путин', 'зеленский', 'украина', 'война', 'новости', 'сша', 'такеркарлсон', 'интервью'],
+            'description': 'md5:52e3cb3cf9deac3a0d9c3b6523a1c1ff',
+            'released_timestamp': 1733535609,
+            'channel_is_verified': False,
+            'like_count': int,
+            'timestamp': 1733535474,
+            'duration': 1112011,
            'view_count': int,
            'dislike_count': int,
+            'upload_date': '20241207',
+            'modified_date': '20241207',
+            'channel_follower_count': int,
+            'modified_timestamp': 1733535710,
        },
+    }, {
+        'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX',
+        'md5': '7d8fa2279406c69d2fd2a6fc548a9805',
+        'info_dict': {
+            'id': 'S3Uo9c-VLwFX',
+            'ext': 'mp4',
+            'channel': 'Romaatom',
+            'tags': 'count:22',
+            'dislike_count': int,
+            'upload_date': '20241130',
+            'description': 'md5:452e6de219bf2f32bb95806c51c3b364',
+            'duration': 58433,
+            'modified_date': '20241130',
+            'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg',
+            'media_type': 'shorts',
+            'like_count': int,
+            'modified_timestamp': 1732961458,
+            'channel_is_verified': True,
+            'channel_id': 'erJyyTIbmUd1',
+            'timestamp': 1732961355,
+            'comment_count': int,
+            'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе',
+            'channel_follower_count': int,
+            'view_count': int,
+            'released_timestamp': 1732961458,
        },
-    ]
-
-    def _quality_to_dimensions(self, quality):
-        mapped = {
-            '240p': (426, 240),
-            '360p': (640, 360),
-            '468p': (720, 468),
-            '480p': (720, 480),
-            '720p': (1280, 720),
-            '1080p': (1920, 1080),
-        }
-        return mapped.get(quality)
+    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
-        api_url = f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18'

-        result = self._download_json(api_url, video_id, 'Downloading video JSON')
-        assert result.get('code') == 200, 'Failed to download video JSON'
-
-        item = result.get('item')
-        assert item is not None, 'Bad API response'
-
-        thumbnail = item.get('cover').get('paths').get('original').get('src')
+        video_data = self._download_json(
+            f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id)

+        is_live = False
        formats = []
-        preference = qualities(['240p', '360p', '468p', '480p', '720p', '1080p'])
-
-        for key, value in item.get('profiles').items():
-            hlsurl = value.get('hls')
-            dimensions = self._quality_to_dimensions(key)
-            fmt = {
-                'url': hlsurl,
+        subtitles = {}
+        automatic_captions = {}
+        for quality, data in traverse_obj(video_data, ('item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))):
+            formats.append({
+                'format_id': quality,
                'ext': 'mp4',
-                'quality': preference(key),
-                'width': dimensions[0],
-                'height': dimensions[1],
-                'format_id': key,
                'protocol': 'm3u8_native',
-                'aspect_ratio': float(value.get('aspectRatio')),
-            }
+                **traverse_obj(data, {
+                    'url': 'hls',
+                    'fps': ('fps', {float_or_none}),
+                    'aspect_ratio': ('aspectRatio', {float_or_none}),
+                }),
+                **parse_resolution(quality),
+            })
+        if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})):
+            is_live = True
+            formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True))
+        for lang, url in traverse_obj(video_data, ('item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))):
+            if lang.endswith('-auto'):
+                automatic_captions.setdefault(lang[:-5], []).append({
+                    'url': url,
+                })
+            else:
+                subtitles.setdefault(lang, []).append({
+                    'url': url,
+                })

-            formats.append(fmt)
-
-        result = {
+        return {
            'id': video_id,
-            'title': item.get('title'),
            'formats': formats,
-            'thumbnails': [{'url': thumbnail}],
-            'uploader': item.get('channel').get('name'),
-            'duration': item.get('uploadFile').get('videoDuration'),
-            'uploader_id': item.get('channel').get('id'),
-            'view_count': item.get('stats').get('viewTotalCount'),
-            'like_count': item.get('stats').get('likeCount'),
-            'comment_count': item.get('stats').get('commentCount'),
-            'dislike_count': item.get('stats').get('dislikeCount'),
-            'type': item.get('type'),
+            'subtitles': subtitles,
+            'automatic_captions': automatic_captions,
+            'is_live': is_live,
+            **traverse_obj(video_data, ('item', {
+                'id': ('id', {str}),
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}),
+                'duration': ('uploadFile', 'videoDuration', {int_or_none}),
+                'channel': ('channel', 'name', {str}),
+                'channel_id': ('channel', 'id', {str}),
+                'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}),
+                'channel_is_verified': ('channel', 'verified', {bool}),
+                'tags': ('tags', ..., {str}),
+                'timestamp': ('createdAt', {parse_iso8601}),
+                'released_timestamp': ('publishedAt', {parse_iso8601}),
+                'modified_timestamp': ('updatedAt', {parse_iso8601}),
+                'view_count': ('stats', 'viewTotalCount', {int_or_none}),
+                'like_count': ('stats', 'likeCount', {int_or_none}),
+                'dislike_count': ('stats', 'dislikeCount', {int_or_none}),
+                'comment_count': ('stats', 'commentCount', {int_or_none}),
+                'media_type': ('type', {str}),
+            })),
        }
-
-        description = item.get('description')
-        if description:
-            result['description'] = description
-
-        return result