From 3ee1194288981c4f2c4abd8315326de0c424d2ce Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 21 Apr 2024 13:40:38 +0200 Subject: [PATCH] [ie] Make `_search_nextjs_data` non fatal (#8937) Authored by: Grub4K --- test/test_InfoExtractor.py | 9 +++++++++ yt_dlp/extractor/asobistage.py | 2 +- yt_dlp/extractor/common.py | 16 ++++++++++------ yt_dlp/extractor/stv.py | 2 +- yt_dlp/extractor/tiktok.py | 2 +- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index b7dee496af..c633ce3e47 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1906,6 +1906,15 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ expected_status=TEAPOT_RESPONSE_STATUS) self.assertEqual(content, TEAPOT_RESPONSE_BODY) + def test_search_nextjs_data(self): + data = '' + self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}}) + self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {}) + self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None) + self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {}) + with self.assertRaises(DeprecationWarning): + self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py index b088a1b132..8fa8f3edb6 100644 --- a/yt_dlp/extractor/asobistage.py +++ b/yt_dlp/extractor/asobistage.py @@ -105,7 +105,7 @@ class AsobiStageIE(InfoExtractor): video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_] webpage = self._download_webpage(url, video_id) event_data = traverse_obj( - self._search_nextjs_data(webpage, video_id, default='{}'), + self._search_nextjs_data(webpage, video_id, default={}), ('props', 'pageProps', 'eventCMSData', { 'title': ('event_name', {str}), 'thumbnail': ('event_thumbnail_image', {url_or_none}), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 57bbf9bdf1..bebbc6b43f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1738,12 +1738,16 @@ class InfoExtractor: traverse_json_ld(json_ld) return filter_dict(info) - def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): - return self._parse_json( - self._search_regex( - r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', fatal=fatal, **kw), - video_id, transform_source=transform_source, fatal=fatal) + def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw): + if default == '{}': + self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead') + default = {} + if default is not NO_DEFAULT: + fatal = False + + return self._search_json( + r']+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', + video_id, end_pattern='', fatal=fatal, default=default, **kw) def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index 8b3e63538c..0ab7801004 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -41,7 +41,7 @@ class STVPlayerIE(InfoExtractor): ptype, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} + props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 3f5261ad96..3d965dd452 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -776,7 +776,7 @@ class TikTokIE(TikTokBaseIE): status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) - elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'): + elif next_data := self._search_nextjs_data(webpage, video_id, default={}): self.write_debug('Found next.js data') status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))