mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-27 19:58:29 +01:00
[YoutubeDL:utils] Move the percent-encoding workaround for non-ASCII URLs to http_request and simplify (Closes #6457)
This commit is contained in:
parent
47f53ad958
commit
51f267d9d4
2 changed files with 20 additions and 21 deletions
|
@ -1860,27 +1860,6 @@ class YoutubeDL(object):
|
||||||
|
|
||||||
def urlopen(self, req):
|
def urlopen(self, req):
|
||||||
""" Start an HTTP download """
|
""" Start an HTTP download """
|
||||||
|
|
||||||
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
|
|
||||||
# always respected by websites, some tend to give out URLs with non percent-encoded
|
|
||||||
# non-ASCII characters (see telemb.py, ard.py [#3412])
|
|
||||||
# urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
|
|
||||||
# To work around aforementioned issue we will replace request's original URL with
|
|
||||||
# percent-encoded one
|
|
||||||
req_is_string = isinstance(req, compat_basestring)
|
|
||||||
url = req if req_is_string else req.get_full_url()
|
|
||||||
url_escaped = escape_url(url)
|
|
||||||
|
|
||||||
# Substitute URL if any change after escaping
|
|
||||||
if url != url_escaped:
|
|
||||||
if req_is_string:
|
|
||||||
req = url_escaped
|
|
||||||
else:
|
|
||||||
req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
|
|
||||||
req = req_type(
|
|
||||||
url_escaped, data=req.data, headers=req.headers,
|
|
||||||
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
|
|
||||||
|
|
||||||
return self._opener.open(req, timeout=self._socket_timeout)
|
return self._opener.open(req, timeout=self._socket_timeout)
|
||||||
|
|
||||||
def print_debug_header(self):
|
def print_debug_header(self):
|
||||||
|
|
|
@ -651,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
|
# According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
|
||||||
|
# always respected by websites, some tend to give out URLs with non percent-encoded
|
||||||
|
# non-ASCII characters (see telemb.py, ard.py [#3412])
|
||||||
|
# urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
|
||||||
|
# To work around the aforementioned issue, we will replace the request's original URL with a
|
||||||
|
# percent-encoded one
|
||||||
|
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
|
||||||
|
# the code of this workaround has been moved here from YoutubeDL.urlopen()
|
||||||
|
url = req.get_full_url()
|
||||||
|
url_escaped = escape_url(url)
|
||||||
|
|
||||||
|
# Substitute URL if any change after escaping
|
||||||
|
if url != url_escaped:
|
||||||
|
req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
|
||||||
|
new_req = req_type(
|
||||||
|
url_escaped, data=req.data, headers=req.headers,
|
||||||
|
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
|
||||||
|
new_req.timeout = req.timeout
|
||||||
|
req = new_req
|
||||||
|
|
||||||
for h, v in std_headers.items():
|
for h, v in std_headers.items():
|
||||||
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
|
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
|
||||||
# The dict keys are capitalized because of this bug by urllib
|
# The dict keys are capitalized because of this bug by urllib
|
||||||
|
|
Loading…
Add table
Reference in a new issue