mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-15 03:41:33 +01:00
[extractor, cleanup] Refactor _download_... methods
This commit is contained in:
parent
8a7f6d7a15
commit
617f658b7e
1 changed file with 101 additions and 168 deletions
|
@ -791,8 +791,35 @@ class InfoExtractor:
|
||||||
"""
|
"""
|
||||||
Return a tuple (page content as string, URL handle).
|
Return a tuple (page content as string, URL handle).
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
Arguments:
|
||||||
|
url_or_request -- plain text URL as a string or
|
||||||
|
a compat_urllib_request.Request object
|
||||||
|
video_id -- Video/playlist/item identifier (string)
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
note -- note printed before downloading (string)
|
||||||
|
errnote -- note printed in case of an error (string)
|
||||||
|
fatal -- flag denoting whether error should be considered fatal,
|
||||||
|
i.e. whether it should cause ExtractionError to be raised,
|
||||||
|
otherwise a warning will be reported and extraction continued
|
||||||
|
encoding -- encoding used to decode the page content, guessed automatically
|
||||||
|
when not explicitly specified
|
||||||
|
data -- POST data (bytes)
|
||||||
|
headers -- HTTP headers (dict)
|
||||||
|
query -- URL query (dict)
|
||||||
|
expected_status -- allows accepting failed HTTP requests (non-2xx
|
||||||
|
status code) by explicitly specifying a set of accepted status
|
||||||
|
codes. Can be any of the following entities:
|
||||||
|
- an integer type specifying an exact failed status code to
|
||||||
|
accept
|
||||||
|
- a list or a tuple of integer types specifying a list of
|
||||||
|
failed status codes to accept
|
||||||
|
- a callable accepting an actual failed status code and
|
||||||
|
returning True if it should be accepted
|
||||||
|
Note that this argument does not affect success status codes (2xx)
|
||||||
|
which are always accepted.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Strip hashes from the URL (#1038)
|
# Strip hashes from the URL (#1038)
|
||||||
if isinstance(url_or_request, (compat_str, str)):
|
if isinstance(url_or_request, (compat_str, str)):
|
||||||
url_or_request = url_or_request.partition('#')[0]
|
url_or_request = url_or_request.partition('#')[0]
|
||||||
|
@ -887,102 +914,6 @@ class InfoExtractor:
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def _download_webpage(
|
|
||||||
self, url_or_request, video_id, note=None, errnote=None,
|
|
||||||
fatal=True, tries=1, timeout=5, encoding=None, data=None,
|
|
||||||
headers={}, query={}, expected_status=None):
|
|
||||||
"""
|
|
||||||
Return the data of the page as a string.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
url_or_request -- plain text URL as a string or
|
|
||||||
a compat_urllib_request.Request object
|
|
||||||
video_id -- Video/playlist/item identifier (string)
|
|
||||||
|
|
||||||
Keyword arguments:
|
|
||||||
note -- note printed before downloading (string)
|
|
||||||
errnote -- note printed in case of an error (string)
|
|
||||||
fatal -- flag denoting whether error should be considered fatal,
|
|
||||||
i.e. whether it should cause ExtractionError to be raised,
|
|
||||||
otherwise a warning will be reported and extraction continued
|
|
||||||
tries -- number of tries
|
|
||||||
timeout -- sleep interval between tries
|
|
||||||
encoding -- encoding used to decode the page content, guessed automatically
|
|
||||||
when not explicitly specified
|
|
||||||
data -- POST data (bytes)
|
|
||||||
headers -- HTTP headers (dict)
|
|
||||||
query -- URL query (dict)
|
|
||||||
expected_status -- allows accepting failed HTTP requests (non-2xx
|
|
||||||
status code) by explicitly specifying a set of accepted status
|
|
||||||
codes. Can be any of the following entities:
|
|
||||||
- an integer type specifying an exact failed status code to
|
|
||||||
accept
|
|
||||||
- a list or a tuple of integer types specifying a list of
|
|
||||||
failed status codes to accept
|
|
||||||
- a callable accepting an actual failed status code and
|
|
||||||
returning True if it should be accepted
|
|
||||||
Note that this argument does not affect success status codes (2xx)
|
|
||||||
which are always accepted.
|
|
||||||
"""
|
|
||||||
|
|
||||||
success = False
|
|
||||||
try_count = 0
|
|
||||||
while success is False:
|
|
||||||
try:
|
|
||||||
res = self._download_webpage_handle(
|
|
||||||
url_or_request, video_id, note, errnote, fatal,
|
|
||||||
encoding=encoding, data=data, headers=headers, query=query,
|
|
||||||
expected_status=expected_status)
|
|
||||||
success = True
|
|
||||||
except compat_http_client.IncompleteRead as e:
|
|
||||||
try_count += 1
|
|
||||||
if try_count >= tries:
|
|
||||||
raise e
|
|
||||||
self._sleep(timeout, video_id)
|
|
||||||
if res is False:
|
|
||||||
return res
|
|
||||||
else:
|
|
||||||
content, _ = res
|
|
||||||
return content
|
|
||||||
|
|
||||||
def _download_xml_handle(
|
|
||||||
self, url_or_request, video_id, note='Downloading XML',
|
|
||||||
errnote='Unable to download XML', transform_source=None,
|
|
||||||
fatal=True, encoding=None, data=None, headers={}, query={},
|
|
||||||
expected_status=None):
|
|
||||||
"""
|
|
||||||
Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
|
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
|
||||||
"""
|
|
||||||
res = self._download_webpage_handle(
|
|
||||||
url_or_request, video_id, note, errnote, fatal=fatal,
|
|
||||||
encoding=encoding, data=data, headers=headers, query=query,
|
|
||||||
expected_status=expected_status)
|
|
||||||
if res is False:
|
|
||||||
return res
|
|
||||||
xml_string, urlh = res
|
|
||||||
return self._parse_xml(
|
|
||||||
xml_string, video_id, transform_source=transform_source,
|
|
||||||
fatal=fatal), urlh
|
|
||||||
|
|
||||||
def _download_xml(
|
|
||||||
self, url_or_request, video_id,
|
|
||||||
note='Downloading XML', errnote='Unable to download XML',
|
|
||||||
transform_source=None, fatal=True, encoding=None,
|
|
||||||
data=None, headers={}, query={}, expected_status=None):
|
|
||||||
"""
|
|
||||||
Return the xml as an xml.etree.ElementTree.Element.
|
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
|
||||||
"""
|
|
||||||
res = self._download_xml_handle(
|
|
||||||
url_or_request, video_id, note=note, errnote=errnote,
|
|
||||||
transform_source=transform_source, fatal=fatal, encoding=encoding,
|
|
||||||
data=data, headers=headers, query=query,
|
|
||||||
expected_status=expected_status)
|
|
||||||
return res if res is False else res[0]
|
|
||||||
|
|
||||||
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
|
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
|
||||||
if transform_source:
|
if transform_source:
|
||||||
xml_string = transform_source(xml_string)
|
xml_string = transform_source(xml_string)
|
||||||
|
@ -995,44 +926,6 @@ class InfoExtractor:
|
||||||
else:
|
else:
|
||||||
self.report_warning(errmsg + str(ve))
|
self.report_warning(errmsg + str(ve))
|
||||||
|
|
||||||
def _download_json_handle(
|
|
||||||
self, url_or_request, video_id, note='Downloading JSON metadata',
|
|
||||||
errnote='Unable to download JSON metadata', transform_source=None,
|
|
||||||
fatal=True, encoding=None, data=None, headers={}, query={},
|
|
||||||
expected_status=None):
|
|
||||||
"""
|
|
||||||
Return a tuple (JSON object, URL handle).
|
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
|
||||||
"""
|
|
||||||
res = self._download_webpage_handle(
|
|
||||||
url_or_request, video_id, note, errnote, fatal=fatal,
|
|
||||||
encoding=encoding, data=data, headers=headers, query=query,
|
|
||||||
expected_status=expected_status)
|
|
||||||
if res is False:
|
|
||||||
return res
|
|
||||||
json_string, urlh = res
|
|
||||||
return self._parse_json(
|
|
||||||
json_string, video_id, transform_source=transform_source,
|
|
||||||
fatal=fatal), urlh
|
|
||||||
|
|
||||||
def _download_json(
|
|
||||||
self, url_or_request, video_id, note='Downloading JSON metadata',
|
|
||||||
errnote='Unable to download JSON metadata', transform_source=None,
|
|
||||||
fatal=True, encoding=None, data=None, headers={}, query={},
|
|
||||||
expected_status=None):
|
|
||||||
"""
|
|
||||||
Return the JSON object as a dict.
|
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
|
||||||
"""
|
|
||||||
res = self._download_json_handle(
|
|
||||||
url_or_request, video_id, note=note, errnote=errnote,
|
|
||||||
transform_source=transform_source, fatal=fatal, encoding=encoding,
|
|
||||||
data=data, headers=headers, query=query,
|
|
||||||
expected_status=expected_status)
|
|
||||||
return res if res is False else res[0]
|
|
||||||
|
|
||||||
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
|
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
|
||||||
if transform_source:
|
if transform_source:
|
||||||
json_string = transform_source(json_string)
|
json_string = transform_source(json_string)
|
||||||
|
@ -1058,43 +951,83 @@ class InfoExtractor:
|
||||||
data[data.find('{'):data.rfind('}') + 1],
|
data[data.find('{'):data.rfind('}') + 1],
|
||||||
video_id, transform_source, fatal)
|
video_id, transform_source, fatal)
|
||||||
|
|
||||||
def _download_socket_json_handle(
|
def __create_download_methods(name, parser, note, errnote, return_value):
|
||||||
self, url_or_request, video_id, note='Polling socket',
|
|
||||||
errnote='Unable to poll socket', transform_source=None,
|
|
||||||
fatal=True, encoding=None, data=None, headers={}, query={},
|
|
||||||
expected_status=None):
|
|
||||||
"""
|
|
||||||
Return a tuple (JSON object, URL handle).
|
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
def parse(ie, content, *args, **kwargs):
|
||||||
"""
|
if parser is None:
|
||||||
res = self._download_webpage_handle(
|
return content
|
||||||
url_or_request, video_id, note, errnote, fatal=fatal,
|
# parser is fetched by name so subclasses can override it
|
||||||
encoding=encoding, data=data, headers=headers, query=query,
|
return getattr(ie, parser)(content, *args, **kwargs)
|
||||||
expected_status=expected_status)
|
|
||||||
if res is False:
|
|
||||||
return res
|
|
||||||
webpage, urlh = res
|
|
||||||
return self._parse_socket_response_as_json(
|
|
||||||
webpage, video_id, transform_source=transform_source,
|
|
||||||
fatal=fatal), urlh
|
|
||||||
|
|
||||||
def _download_socket_json(
|
def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
|
||||||
self, url_or_request, video_id, note='Polling socket',
|
transform_source=None, fatal=True, *args, **kwargs):
|
||||||
errnote='Unable to poll socket', transform_source=None,
|
res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, *args, **kwargs)
|
||||||
fatal=True, encoding=None, data=None, headers={}, query={},
|
if res is False:
|
||||||
expected_status=None):
|
return res
|
||||||
"""
|
content, urlh = res
|
||||||
Return the JSON object as a dict.
|
return parse(self, content, video_id, transform_source, fatal), urlh
|
||||||
|
|
||||||
See _download_webpage docstring for arguments specification.
|
def download_content(
|
||||||
|
self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs):
|
||||||
|
args = [url_or_request, video_id, note, errnote, transform_source, *args]
|
||||||
|
if parser is None:
|
||||||
|
args.pop(4) # transform_source
|
||||||
|
# The method is fetched by name so subclasses can override _download_..._handle
|
||||||
|
res = getattr(self, download_handle.__name__)(*args, **kwargs)
|
||||||
|
return res if res is False else res[0]
|
||||||
|
|
||||||
|
def impersonate(func, name, return_value):
|
||||||
|
func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
|
||||||
|
func.__doc__ = f'''
|
||||||
|
@param transform_source Apply this transformation before parsing
|
||||||
|
@returns {return_value}
|
||||||
|
|
||||||
|
See _download_webpage_handle docstring for other arguments specification
|
||||||
|
'''
|
||||||
|
|
||||||
|
impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
|
||||||
|
impersonate(download_content, f'_download_{name}', f'{return_value}')
|
||||||
|
return download_handle, download_content
|
||||||
|
|
||||||
|
_download_xml_handle, _download_xml = __create_download_methods(
|
||||||
|
'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
|
||||||
|
_download_json_handle, _download_json = __create_download_methods(
|
||||||
|
'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
|
||||||
|
_download_socket_json_handle, _download_socket_json = __create_download_methods(
|
||||||
|
'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
|
||||||
|
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
|
||||||
|
|
||||||
|
def _download_webpage(
|
||||||
|
self, url_or_request, video_id, note=None, errnote=None,
|
||||||
|
fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
res = self._download_socket_json_handle(
|
Return the data of the page as a string.
|
||||||
url_or_request, video_id, note=note, errnote=errnote,
|
|
||||||
transform_source=transform_source, fatal=fatal, encoding=encoding,
|
Keyword arguments:
|
||||||
data=data, headers=headers, query=query,
|
tries -- number of tries
|
||||||
expected_status=expected_status)
|
timeout -- sleep interval between tries
|
||||||
return res if res is False else res[0]
|
|
||||||
|
See _download_webpage_handle docstring for other arguments specification.
|
||||||
|
"""
|
||||||
|
|
||||||
|
R''' # NB: These are unused; should they be deprecated?
|
||||||
|
if tries != 1:
|
||||||
|
self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
|
||||||
|
if timeout is NO_DEFAULT:
|
||||||
|
timeout = 5
|
||||||
|
else:
|
||||||
|
self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
|
||||||
|
'''
|
||||||
|
|
||||||
|
try_count = 0
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
|
||||||
|
except compat_http_client.IncompleteRead as e:
|
||||||
|
try_count += 1
|
||||||
|
if try_count >= tries:
|
||||||
|
raise e
|
||||||
|
self._sleep(timeout, video_id)
|
||||||
|
|
||||||
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
|
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
|
||||||
idstr = format_field(video_id, template='%s: ')
|
idstr = format_field(video_id, template='%s: ')
|
||||||
|
|
Loading…
Reference in a new issue