mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-17 06:11:41 +01:00
[utils] more forgiving html parsing + unit tests
This commit is contained in:
parent
da0d84258b
commit
af03fa4542
2 changed files with 24 additions and 7 deletions
|
@@ -1796,6 +1796,25 @@ Line 1
|
||||||
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
|
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
|
||||||
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
||||||
|
|
||||||
|
def test_get_element_text_and_html_by_tag_malformed(self):
|
||||||
|
inner_text = 'inner_text'
|
||||||
|
malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
|
||||||
|
html = f'<div>{malnested_elements}</div>'
|
||||||
|
|
||||||
|
self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html))
|
||||||
|
self.assertEqual(
|
||||||
|
get_element_text_and_html_by_tag('malnested_a', html),
|
||||||
|
(f'<malnested_b>{inner_text}',
|
||||||
|
f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
|
||||||
|
self.assertEqual(
|
||||||
|
get_element_text_and_html_by_tag('malnested_b', html),
|
||||||
|
(f'{inner_text}</malnested_a>',
|
||||||
|
f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
|
||||||
|
self.assertRaises(
|
||||||
|
compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
|
||||||
|
self.assertRaises(
|
||||||
|
compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
|
||||||
|
|
||||||
def test_iri_to_uri(self):
|
def test_iri_to_uri(self):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
||||||
|
|
|
@@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def handle_starttag(self, tag, _):
|
def handle_starttag(self, tag, _):
|
||||||
self.tagstack.append(tag)
|
self.tagstack.appendleft(tag)
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
if not self.tagstack:
|
if not self.tagstack:
|
||||||
raise compat_HTMLParseError('no tags in the stack')
|
raise compat_HTMLParseError('no tags in the stack')
|
||||||
while self.tagstack:
|
with contextlib.suppress(ValueError):
|
||||||
inner_tag = self.tagstack.pop()
|
self.tagstack.remove(tag)
|
||||||
if inner_tag == tag:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
|
|
||||||
if not self.tagstack:
|
if not self.tagstack:
|
||||||
raise self.HTMLBreakOnClosingTagException()
|
raise self.HTMLBreakOnClosingTagException()
|
||||||
|
|
||||||
|
@@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html):
|
||||||
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
|
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
|
||||||
try:
|
try:
|
||||||
parser.feed(html[offset:offset + next_closing_tag_end])
|
parser.feed(html[offset:offset + next_closing_tag_end])
|
||||||
|
if tag not in parser.tagstack:
|
||||||
|
raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
|
||||||
offset += next_closing_tag_end
|
offset += next_closing_tag_end
|
||||||
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
|
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
|
||||||
return html[content_start:offset + next_closing_tag_start], \
|
return html[content_start:offset + next_closing_tag_start], \
|
||||||
|
|
Loading…
Reference in a new issue