mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-18 10:26:48 +01:00
[utils] Decode HTML5 entities
Used in test_Vporn_1. Also related to #9270
This commit is contained in:
parent
9631a94fb5
commit
55b2f099c0
2 changed files with 12 additions and 2 deletions
|
@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase):
|
||||||
self.assertEqual(unescapeHTML('&#47;'), '/')
|
self.assertEqual(unescapeHTML('&#47;'), '/')
|
||||||
self.assertEqual(unescapeHTML('&eacute;'), 'é')
|
self.assertEqual(unescapeHTML('&eacute;'), 'é')
|
||||||
self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
|
self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
|
||||||
|
# HTML5 entities
|
||||||
|
self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
|
||||||
|
|
||||||
def test_date_from_str(self):
|
def test_date_from_str(self):
|
||||||
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
|
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
|
||||||
|
|
|
@ -39,6 +39,7 @@ from .compat import (
|
||||||
compat_chr,
|
compat_chr,
|
||||||
compat_etree_fromstring,
|
compat_etree_fromstring,
|
||||||
compat_html_entities,
|
compat_html_entities,
|
||||||
|
compat_html_entities_html5,
|
||||||
compat_http_client,
|
compat_http_client,
|
||||||
compat_kwargs,
|
compat_kwargs,
|
||||||
compat_parse_qs,
|
compat_parse_qs,
|
||||||
|
@ -456,12 +457,19 @@ def orderedSet(iterable):
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def _htmlentity_transform(entity):
|
def _htmlentity_transform(entity_with_semicolon):
|
||||||
"""Transforms an HTML entity to a character."""
|
"""Transforms an HTML entity to a character."""
|
||||||
|
entity = entity_with_semicolon[:-1]
|
||||||
|
|
||||||
# Known non-numeric HTML entity
|
# Known non-numeric HTML entity
|
||||||
if entity in compat_html_entities.name2codepoint:
|
if entity in compat_html_entities.name2codepoint:
|
||||||
return compat_chr(compat_html_entities.name2codepoint[entity])
|
return compat_chr(compat_html_entities.name2codepoint[entity])
|
||||||
|
|
||||||
|
# TODO: HTML5 allows entities without a semicolon. For example,
|
||||||
|
# '&Eacuteric' should be decoded as 'Éric'.
|
||||||
|
if entity_with_semicolon in compat_html_entities_html5:
|
||||||
|
return compat_html_entities_html5[entity_with_semicolon]
|
||||||
|
|
||||||
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
|
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
|
||||||
if mobj is not None:
|
if mobj is not None:
|
||||||
numstr = mobj.group(1)
|
numstr = mobj.group(1)
|
||||||
|
@ -486,7 +494,7 @@ def unescapeHTML(s):
|
||||||
assert type(s) == compat_str
|
assert type(s) == compat_str
|
||||||
|
|
||||||
return re.sub(
|
return re.sub(
|
||||||
r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
|
r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
|
||||||
|
|
||||||
|
|
||||||
def get_subprocess_encoding():
|
def get_subprocess_encoding():
|
||||||
|
|
Loading…
Reference in a new issue