From 34236d0b95245b1684bebc7d898ae3083940a57b Mon Sep 17 00:00:00 2001 From: MrDemocracy Date: Sun, 6 Oct 2024 01:35:35 +0200 Subject: [PATCH] [nrk] Add 1080p support, linting improvements, and update tests --- yt_dlp/extractor/nrk.py | 207 +++++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 67 deletions(-) diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index ec69a94356..1c3c2f21d9 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -31,18 +31,20 @@ class NRKBaseIE(InfoExtractor): _NETRC_MACHINE = 'nrk' _LOGIN_URL = 'https://innlogging.nrk.no/logginn' _AUTH_TOKEN = '' + _API_CALL_HEADERS = {'Accept': 'application/json;device=player-core'} + + def _extract_nrk_formats_and_subtitles(self, asset_url, video_id): - def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): return self._extract_akamai_formats(asset_url, video_id) - asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) - formats = self._extract_m3u8_formats( + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only|adap=.+?\b)&?', '', asset_url) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) if not formats and re.search(self._CDN_REPL_REGEX, asset_url): - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) - return formats + return formats, subtitles def _raise_error(self, data): MESSAGES = { @@ -64,7 +66,7 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('https://psapi.nrk.no/', path), video_id, note or f'Downloading {item} JSON', - fatal=fatal, query=query, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'} if self._AUTH_TOKEN else None) + fatal=fatal, query=query, headers=self._API_CALL_HEADERS) class NRKIE(NRKBaseIE): @@ -83,13 +85,17 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'md5': '2b88a652ad2e275591e61cf550887eec', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', 'duration': 262, + 'timestamp': 1395751833, + 'upload_date': '20140325', + 'thumbnail': 'https://gfx.nrk.no/0mZgeckEzRU6qTWrbQHD2QcyralHrYB08wBvh-K-AtAQ', + 'alt_title': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', }, }, { # audio @@ -101,6 +107,10 @@ class NRKIE(NRKBaseIE): 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, + 'alt_title': 'Cathrine Lie Wathne er blind, og bruker hurtigtaster for å navigere seg rundt på ulike nettsider.', + 'upload_date': '20140425', + 'timestamp': 1398429565, + 'thumbnail': 'https://gfx.nrk.no/urxQMSXF-WnbfjBH5ke2igLGyN27EdJVWZ6FOsEAclhA', }, }, { 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', @@ -151,10 +161,9 @@ class NRKIE(NRKBaseIE): video_id = self._match_id(url).split('/')[-1] # known values for preferredCdn: akamai, iponly, minicdn and telenor - manifest = self._call_api(f'playback/manifest/{video_id}', video_id, "manifest", query={'preferredCdn': 'akamai'}) - - type = try_get(manifest, lambda x: x['_links']['self']['href'], str).split("/")[3] + manifest = self._call_api(f'playback/manifest/{video_id}', video_id, 'manifest', query={'preferredCdn': 'akamai'}) + manifest_type = try_get(manifest, lambda x: x['_links']['self']['href'], str).split('/')[3] video_id = try_get(manifest, lambda x: x['id'], str) or video_id @@ -164,6 +173,7 @@ class NRKIE(NRKBaseIE): playable = manifest['playable'] formats = [] + subtitles = {} for asset in playable['assets']: if not isinstance(asset, dict): continue @@ -174,7 +184,9 @@ class NRKIE(NRKBaseIE): continue asset_format = (asset.get('format') or '').lower() if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_nrk_formats(format_url, video_id)) + fmts, subs = self._extract_nrk_formats_and_subtitles(format_url, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif asset_format == 'mp3': formats.append({ 'url': format_url, @@ -190,7 +202,7 @@ class NRKIE(NRKBaseIE): alt_title = try_get(titles, lambda x: x['subtitle']) description = try_get(preplay, lambda x: x['description'].replace('\r', '\n')) - duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) or self._extract_m3u8_vod_duration(formats[0]['url'], video_id) thumbnails = [] for image in try_get( @@ -206,13 +218,13 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) - subtitles = {} for sub in try_get(playable, lambda x: x['subtitles'], list) or []: if not isinstance(sub, dict): continue sub_url = url_or_none(sub.get('webVtt')) if not sub_url: continue + sub_key = str_or_none(sub.get('language')) or 'nb' sub_type = str_or_none(sub.get('type')) if sub_type: @@ -226,19 +238,18 @@ class NRKIE(NRKBaseIE): chapters = [item for item in [{ 'start_time': float_or_none(traverse_obj(data, ('skipDialogInfo', 'startIntroInSeconds'))), 'end_time': float_or_none(traverse_obj(data, ('skipDialogInfo', 'endIntroInSeconds'))), - 'title': 'Intro' + 'title': 'Intro', }, { 'start_time': float_or_none(traverse_obj(data, ('skipDialogInfo', 'startCreditsInSeconds'))), 'end_time': duration, - 'title': 'Outro' - }] if item['start_time'] != item['end_time']] - + 'title': 'Outro', + }] if not item['start_time'] == item['end_time']] if try_get(data, lambda x: x['preplay']['indexPoints']): seconds_or_none = lambda x: float_or_none(parse_duration(x)) chapters += traverse_obj(data['preplay'], ('indexPoints', ..., { 'start_time': ('startPoint', {seconds_or_none}), 'end_time': ('endPoint', {seconds_or_none}), - 'title': ('title', {lambda x: x}) + 'title': ('title', {lambda x: x}), })) chapters = sorted(chapters, key=lambda x: x['start_time']) if chapters else None legal_age = try_get( @@ -264,14 +275,14 @@ class NRKIE(NRKBaseIE): 'formats': formats, 'subtitles': subtitles, 'chapters': chapters, - 'timestamp': parse_iso8601(try_get(data, lambda x: x['availability']['onDemand']['from'], str)) + 'timestamp': parse_iso8601(try_get(data, lambda x: x['availability']['onDemand']['from'], str)), } if is_series: series = season_id = season_number = episode = episode_number = None programs = self._call_api( f'programs/{video_id}', video_id, 'programs', fatal=False) - match = re.search(r"\d+", try_get(programs, lambda x: x['firstTimeTransmitted']['publicationDate'] or x['usageRights']['availableFrom'], str) or try_get(programs, lambda x: x['usageRights']['availableFrom'], str)) + match = re.search(r'\d+', try_get(programs, lambda x: x['firstTimeTransmitted']['publicationDate'] or x['usageRights']['availableFrom'], str) or try_get(programs, lambda x: x['usageRights']['availableFrom'], str)) if match: info.update({'timestamp': min(info['timestamp'], int(match.group()) // 1000)}) if programs and isinstance(programs, dict): @@ -314,12 +325,12 @@ class NRKIE(NRKBaseIE): self._download_json( self._LOGIN_URL, None, headers={'Content-Type': 'application/json; charset=UTF-8', 'accept': 'application/json; charset=utf-8'}, data=json.dumps({ - 'clientId': "", + 'clientId': '', 'hashedPassword': {'current': { 'hash': password, 'recipe': { - 'algorithm': "cleartext", - 'salt': "" + 'algorithm': 'cleartext', + 'salt': '' } } }, @@ -327,9 +338,10 @@ class NRKIE(NRKBaseIE): 'username': username, }).encode()) - self._download_webpage("https://tv.nrk.no/auth/web/login/opsession", None) - response = self._download_json("https://tv.nrk.no/auth/session/tokenforsub/_", None) + self._download_webpage('https://tv.nrk.no/auth/web/login/opsession', None) + response = self._download_json('https://tv.nrk.no/auth/session/tokenforsub/_', None) self._AUTH_TOKEN = try_get(response, lambda x: x['session']['accessToken']) + self._API_CALL_HEADERS['authorization'] = f'Bearer {self._AUTH_TOKEN}' except ExtractorError as e: message = None if isinstance(e.cause, HTTPError) and e.cause.status in (401, 400): @@ -362,6 +374,14 @@ class NRKTVIE(InfoExtractor): 'ext': 'vtt', }], }, + 'upload_date': '20170627', + 'chapters': [{'start_time': 0, 'end_time': 2213.0, 'title': ''}, {'start_time': 2213.0, 'end_time': 2223.44, 'title': 'Outro'}], + 'timestamp': 1498591822, + 'thumbnail': 'https://gfx.nrk.no/myRSc4vuFlahB60P3n6swwRTQUZI1LqJZl9B7icZFgzA', + 'alt_title': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', @@ -373,9 +393,20 @@ class NRKTVIE(InfoExtractor): 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, + 'age_limit': 0, 'series': '20 spørsmål', 'episode': '23. mai 2014', - 'age_limit': 0, + 'upload_date': '20140523', + 'thumbnail': 'https://gfx.nrk.no/u7uCe79SEfPVGRAGVp2_uAZnNc4mfz_kjXg6Bgek8lMQ', + 'season_id': '126936', + 'season_number': 2014, + 'season': 'Season 2014', + 'chapters': [{'start_time': 0.0, 'end_time': 39.0, 'title': 'Intro'}, {'start_time': 0.0, 'title': 'Velkommen', 'end_time': 152.32}, {'start_time': 152.32, 'title': 'Tannpirker', 'end_time': 304.76}, {'start_time': 304.76, 'title': 'Orgelbrus', 'end_time': 513.48}, {'start_time': 513.48, 'title': 'G-streng', 'end_time': 712.96}, {'start_time': 712.96, 'title': 'Medalje', 'end_time': 837.76}, {'start_time': 837.76, 'title': 'Globus', 'end_time': 1124.48}, {'start_time': 1124.48, 'title': 'Primstav', 'end_time': 1417.4}, {'start_time': 1417.4, 'title': 'Fyr', 'end_time': 1721.0}, {'start_time': 1721.0, 'end_time': 1741.0, 'title': 'Outro'}], + 'episode_number': 3, + 'timestamp': 1400871900, + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', @@ -383,11 +414,20 @@ class NRKTVIE(InfoExtractor): 'id': 'MDFP15000514', 'ext': 'mp4', 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'description': 'md5:09fd0f9cd47ba6b857836a385b88ed56', 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', 'age_limit': 0, + 'upload_date': '20140524', + 'episode_number': 17, + 'chapters': [{'start_time': 0, 'end_time': 4595.0, 'title': ''}, {'start_time': 4595.0, 'end_time': 4605.08, 'title': 'Outro'}], + 'season': 'Season 2014', + 'timestamp': 1400937600, + 'thumbnail': 'https://gfx.nrk.no/D2u6-EyVUZpVCq0PdSNHRgdBZCV40ekpk6s9fZWiMtyg', + 'season_number': 2014, + 'season_id': '39240', + 'alt_title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', }, 'params': { 'skip_download': True, @@ -435,6 +475,7 @@ class NRKTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { @@ -476,13 +517,20 @@ class NRKTVEpisodeIE(InfoExtractor): 'id': 'MUHH36005220', 'ext': 'mp4', 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', - 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', 'duration': 1563.92, 'series': 'Hellums kro', 'season_number': 1, 'episode_number': 2, 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, + 'timestamp': 1572584520, + 'upload_date': '20191101', + 'thumbnail': 'https://gfx.nrk.no/2_4mhU2JhR-8IYRC_OMmAQDbbOHgwcHqgi2sBrNrsjkg', + 'alt_title': '2. Kro, krig og kjærlighet', + 'season': 'Season 1', + 'season_id': '124163', + 'chapters': [{'start_time': 0, 'end_time': 29.0, 'title': ''}, {'start_time': 29.0, 'end_time': 50.0, 'title': 'Intro'}, {'start_time': 1530.0, 'end_time': 1563.92, 'title': 'Outro'}], }, 'params': { 'skip_download': True, @@ -512,11 +560,8 @@ class NRKTVEpisodeIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - info = self._search_json_ld(webpage, display_id, default={}) - nrk_id = info.get('@id') or self._html_search_meta( - 'nrk:program-id', webpage, default=None) or self._search_regex( - rf'data-program-id=["\']({NRKTVIE._EPISODE_RE})', webpage, - 'nrk id') + info = self._parse_json(self._search_regex(r'