Rework extractor

This commit is contained in:
sepro 2024-12-07 15:57:57 +01:00
parent f58fc3c88d
commit c567d27ede
2 changed files with 114 additions and 76 deletions

View file

@ -1551,7 +1551,7 @@ from .pluralsight import (
PluralsightIE, PluralsightIE,
) )
from .plutotv import PlutoTVIE from .plutotv import PlutoTVIE
from .plvideo import PlVideoVideoIE from .plvideo import PlVideoIE
from .podbayfm import ( from .podbayfm import (
PodbayFMChannelIE, PodbayFMChannelIE,
PodbayFMIE, PodbayFMIE,

View file

@ -1,91 +1,129 @@
from yt_dlp.utils._utils import qualities
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
parse_iso8601,
parse_resolution,
url_or_none,
)
from ..utils.traversal import traverse_obj
class PlVideoIE(InfoExtractor):
    """Extractor for plvideo.ru regular videos (``/watch?v=``) and shorts (``/shorts/``)."""

    _VALID_URL = r'https?://(?:www\.)?plvideo\.ru/(?:watch\?(?:[^#]+&)?v=|shorts/)(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://plvideo.ru/watch?v=owo7vk1sTqzA',
        'md5': 'be768d1d4c44462f180ca39927ad07f2',
        'info_dict': {
            'id': 'owo7vk1sTqzA',
            'ext': 'mp4',
            'thumbnail': 'https://img.plvideo.ru/images/fp-2024-images/v/cover/d9/e9/d9e9a78134c01ca56e9e795244e1ba95/original6753a778ab7fe1.79895808.jpg',
            'title': 'Две угрозы для ВСУ на Донбассе, интервью Лаврова Карлсону, что означают для Украины события в Сирии.',
            'channel': 'Страна.ua',
            'channel_id': 'hX0oxkAgBfaK',
            'media_type': 'video',
            'comment_count': int,
            'tags': ['политика', 'путин', 'зеленский', 'украина', 'война', 'новости', 'сша', 'такеркарлсон', 'интервью'],
            'description': 'md5:52e3cb3cf9deac3a0d9c3b6523a1c1ff',
            'released_timestamp': 1733535609,
            'channel_is_verified': False,
            'like_count': int,
            'timestamp': 1733535474,
            'duration': 1112.011,
            'view_count': int,
            'dislike_count': int,
            'upload_date': '20241207',
            'modified_date': '20241207',
            'channel_follower_count': int,
            'modified_timestamp': 1733535710,
        },
    }, {
        'url': 'https://plvideo.ru/shorts/S3Uo9c-VLwFX',
        'md5': '7d8fa2279406c69d2fd2a6fc548a9805',
        'info_dict': {
            'id': 'S3Uo9c-VLwFX',
            'ext': 'mp4',
            'channel': 'Romaatom',
            'tags': 'count:22',
            'dislike_count': int,
            'upload_date': '20241130',
            'description': 'md5:452e6de219bf2f32bb95806c51c3b364',
            'duration': 58.433,
            'modified_date': '20241130',
            'thumbnail': 'https://img.plvideo.ru/images/fp-2024-11-cover/S3Uo9c-VLwFX/f9318999-a941-482b-b700-2102a7049366.jpg',
            'media_type': 'shorts',
            'like_count': int,
            'modified_timestamp': 1732961458,
            'channel_is_verified': True,
            'channel_id': 'erJyyTIbmUd1',
            'timestamp': 1732961355,
            'comment_count': int,
            'title': 'Белоусов отменил приказы о кадровом резерве на гражданской службе',
            'channel_follower_count': int,
            'view_count': int,
            'released_timestamp': 1732961458,
        },
    }]

    def _real_extract(self, url):
        """Fetch the video JSON from the plvideo API and build the info dict.

        Formats come from the per-quality ``profiles`` mapping (HLS URLs) and,
        when present, from the ``livestream`` playlist. Subtitle tracks whose
        language key ends in ``-auto`` are reported as automatic captions.
        """
        video_id = self._match_id(url)

        video_data = self._download_json(
            f'https://api.g1.plvideo.ru/v1/videos/{video_id}?Aud=18', video_id)

        is_live = False
        formats = []
        subtitles = {}
        automatic_captions = {}

        # 'profiles' maps a quality label (e.g. '720p') to a dict holding the HLS URL;
        # entries without a valid 'hls' URL are filtered out by the lambda.
        for quality, data in traverse_obj(video_data, (
                'item', 'profiles', {dict.items}, lambda _, v: url_or_none(v[1]['hls']))):
            formats.append({
                'format_id': quality,
                'ext': 'mp4',
                'protocol': 'm3u8_native',
                **traverse_obj(data, {
                    'url': 'hls',
                    'fps': ('fps', {float_or_none}),
                    'aspect_ratio': ('aspectRatio', {float_or_none}),
                }),
                # The quality label is also the only source of the resolution
                **parse_resolution(quality),
            })

        if livestream_url := traverse_obj(video_data, ('item', 'livestream', 'url', {url_or_none})):
            is_live = True
            formats.extend(self._extract_m3u8_formats(livestream_url, video_id, 'mp4', live=True))

        # Named 'sub_url' so the 'url' argument of this method is not shadowed
        for lang, sub_url in traverse_obj(video_data, (
                'item', 'subtitles', {dict.items}, lambda _, v: url_or_none(v[1]))):
            if lang.endswith('-auto'):
                # Strip the '-auto' suffix to recover the plain language key
                automatic_captions.setdefault(lang[:-5], []).append({
                    'url': sub_url,
                })
            else:
                subtitles.setdefault(lang, []).append({
                    'url': sub_url,
                })

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'automatic_captions': automatic_captions,
            'is_live': is_live,
            **traverse_obj(video_data, ('item', {
                'id': ('id', {str}),
                'title': ('title', {str}),
                'description': ('description', {str}),
                'thumbnail': ('cover', 'paths', 'original', 'src', {url_or_none}),
                # NOTE(review): 'videoDuration' looks like milliseconds (58433 for a
                # "shorts" clip), while the info dict expects seconds - hence the scale.
                'duration': ('uploadFile', 'videoDuration', {lambda v: float_or_none(v, scale=1000)}),
                'channel': ('channel', 'name', {str}),
                'channel_id': ('channel', 'id', {str}),
                'channel_follower_count': ('channel', 'stats', 'subscribers', {int_or_none}),
                'channel_is_verified': ('channel', 'verified', {bool}),
                'tags': ('tags', ..., {str}),
                'timestamp': ('createdAt', {parse_iso8601}),
                'released_timestamp': ('publishedAt', {parse_iso8601}),
                'modified_timestamp': ('updatedAt', {parse_iso8601}),
                'view_count': ('stats', 'viewTotalCount', {int_or_none}),
                'like_count': ('stats', 'likeCount', {int_or_none}),
                'dislike_count': ('stats', 'dislikeCount', {int_or_none}),
                'comment_count': ('stats', 'commentCount', {int_or_none}),
                'media_type': ('type', {str}),
            })),
        }