From 05403ea5ad7e57dcacde98055267285c93a3278b Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:31:08 +1200 Subject: [PATCH] add tests, fix regex, improve stability. --- yt_dlp/extractor/googledrive.py | 71 +++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 4f4a9cf3f2..7a9d0befb0 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -1,4 +1,3 @@ -import json import re import urllib.parse @@ -298,23 +297,69 @@ class GoogleDriveFolderIE(InfoExtractor): 'title': 'Forrest', }, 'playlist_count': 3, + }, { + # Contains various formats + 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', + 'info_dict': { + 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', + 'title': 'public folder', + }, + 'playlist_count': 4, }] + _JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:' + _JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:' + _ARRAY_RE = r'\[(?s:.+)\]' - def _extract_json(self, idx, webpage): - RE = r'AF_initDataCallback\(\{key:\s*([\'"])ds:\s*%d\1,[^}]*data:(?P\[.*?\]),\s*sideChannel:\s\{' - return json.loads(self._html_search_regex(RE % idx, webpage, 'JSON', group='data')) + def _extract_json_ds(self, dsval, webpage, video_id, **kwargs): + """ + Searches for json with the 'ds' value(0~5) from the webpage with regex. + Folder info: ds=0; Folder items: ds=4. + For example, if the webpage contains the line below, the empty data array + can be got by passing dsval=3 to this function. + AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); + """ + return self._search_json(self._JSON_DS_RE % dsval, webpage, + f'webpage JSON ds:{dsval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) - def _get_folder_items(self, results): - yield from results + def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): + """ + Searches for json with the 'hash' value(1~6) from the webpage with regex. + Folder info: hash=1; Folder items: hash=6. + For example, if the webpage contains the line below, the empty data array + can be got by passing hashval=2 to this function. + AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); + """ + return self._search_json(self._JSON_HASH_RE % hashval, webpage, + f'webpage JSON hash:{hashval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) def _real_extract(self, url): + def item_url_getter(item): + url_from_0 = f'https://drive.google.com/file/d/{item[0]}' + if GoogleDriveIE.suitable(url_from_0): + return url_from_0 + else: + for attr in item: + if isinstance(attr, str) and GoogleDriveIE.suitable(attr): + return attr + self.write_debug('Failed to extract url!') + return None + folder_id = self._match_id(url) + headers = self.geo_verification_headers() - webpage = self._download_webpage(url, folder_id) + webpage = self._download_webpage(url, folder_id, headers=headers) + json_folder_info = ( + self._extract_json_ds(0, webpage, folder_id, default=None) + or self._extract_json_hash(1, webpage, folder_id) + ) + json_items = ( + self._extract_json_ds(4, webpage, folder_id, default=None) + or self._extract_json_hash(6, webpage, folder_id) + ) + title = json_folder_info[1][2] + items = json_items[-1] - title = self._extract_json(0, webpage)[1][2] - results_4 = self._extract_json(4, webpage)[-1] - - return self.playlist_from_matches( - self._get_folder_items(results_4), folder_id, title, - ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item[0]}') + return self.playlist_from_matches((item for item in items), folder_id, title, + ie=GoogleDriveIE, getter=item_url_getter)