From fbfde1c3e6b59c5ff94e2604f1502acdeb14f8f0 Mon Sep 17 00:00:00 2001 From: Fam0r Date: Sun, 3 Apr 2022 18:11:50 +0300 Subject: [PATCH] [elonet] Rewrite extractor (#3277) Closes #2911 Authored by: Fam0r, pukkandan --- yt_dlp/extractor/common.py | 4 +- yt_dlp/extractor/elonet.py | 85 ++++++++++++++------------------------ 2 files changed, 34 insertions(+), 55 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d0e57da23d..af964c5278 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1297,8 +1297,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' + % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py index eefba4e242..9c6aea28e8 100644 --- a/yt_dlp/extractor/elonet.py +++ b/yt_dlp/extractor/elonet.py @@ -1,30 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - base_url, - ExtractorError, - try_get, -) -from ..compat import compat_str +from ..utils import determine_ext class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P[0-9]+)' _TESTS = [{ - # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', - 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { 'id': '107867', 'ext': 'mp4', 'title': 'Valkoinen peura', - 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+', + 'description': 'md5:bded4201c9677fab10854884fe8f7312', }, + 'params': {'skip_download': 'dash'}, }, { # DASH with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', @@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor): 'id': '116539', 'ext': 'mp4', 'title': 'Minulla on tiikeri', - 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', - } + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+', + 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419', + }, + 'params': {'skip_download': 'dash'}, + }, { + # Page with multiple videos, download the main one + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396', + 'info_dict': { + 'id': '117396', + 'ext': 'mp4', + 'title': 'Sampo', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+', + 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7', + }, + 'params': {'skip_download': 'dash'}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r']+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src'] + ext = determine_ext(src) - json_s = self._html_search_regex( - r'data-video-sources="(.+?)"', webpage, 'json') - src = try_get( - self._parse_json(json_s, video_id), - lambda x: x[0]["src"], compat_str) - formats = [] - subtitles = {} - if re.search(r'\.m3u8\??', src): - res = self._download_webpage_handle( - # elonet servers have certificate problems - src.replace('https:', 'http:'), video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') - if res: - doc, urlh = res - url = urlh.geturl() - formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) - for f in formats: - f['ext'] = 'mp4' - elif re.search(r'\.mpd\??', src): - res = self._download_xml_handle( - src, video_id, - note='Downloading MPD manifest', - errnote='Failed to download MPD manifest') - if res: - doc, urlh = res - url = base_url(urlh.geturl()) - formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + elif ext == 'mpd': + formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False) else: - raise ExtractorError("Unknown streaming format") + formats, subtitles = [], {} + self.raise_no_formats(f'Unknown streaming format {ext}') + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'subtitles': subtitles, }