mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-01-17 06:11:41 +01:00
[utils] more forgiving html parsing + unit tests
This commit is contained in:
parent
da0d84258b
commit
af03fa4542
2 changed files with 24 additions and 7 deletions
|
@ -1796,6 +1796,25 @@ Line 1
|
|||
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
|
||||
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
||||
|
||||
def test_get_element_text_and_html_by_tag_malformed(self):
    # Mis-nested markup: <malnested_a> is closed before the <malnested_b>
    # that was opened inside it.
    inner_text = 'inner_text'
    malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
    html = f'<div>{malnested_elements}</div>'

    # Expected (text, html) pair for each tag looked up in the document;
    # each element is expected to end at the first closing tag of its own name.
    expected_by_tag = {
        'div': (malnested_elements, html),
        'malnested_a': (
            f'<malnested_b>{inner_text}',
            f'<malnested_a><malnested_b>{inner_text}</malnested_a>'),
        'malnested_b': (
            f'{inner_text}</malnested_a>',
            f'<malnested_b>{inner_text}</malnested_a></malnested_b>'),
    }
    for tag, expected in expected_by_tag.items():
        self.assertEqual(get_element_text_and_html_by_tag(tag, html), expected)

    # An <orphan> tag that is only closed, or only opened, must be rejected.
    for unbalanced in (f'{html}</orphan>', f'<orphan>{html}'):
        with self.assertRaises(compat_HTMLParseError):
            get_element_text_and_html_by_tag('orphan', unbalanced)
|
||||
|
||||
def test_iri_to_uri(self):
|
||||
self.assertEqual(
|
||||
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
||||
|
|
|
@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
|
|||
pass
|
||||
|
||||
# Parser callback for an opening tag: records the tag name in self.tagstack.
# NOTE(review): this is diff residue; the +/- markers were lost in this
# rendering, so the old and the new statement appear back to back. Judging by
# the hunk header (17 old lines -> 13 new lines), the first statement looks
# like the removed one and the second like its replacement; confirm against
# the raw diff.
def handle_starttag(self, tag, _):
|
||||
self.tagstack.append(tag)  # presumably the removed line: add the tag at the right end
|
||||
self.tagstack.appendleft(tag)  # presumably the added line: add the tag at the left end (implies a deque-like tagstack)
|
||||
|
||||
# Parser callback for a closing tag: updates self.tagstack and raises
# HTMLBreakOnClosingTagException once the stack has become empty.
# Raises compat_HTMLParseError when a closing tag arrives while no tag is open.
# NOTE(review): this is diff residue; the +/- markers were lost in this
# rendering, so two versions of the body are interleaved. Judging by the hunk
# header (17 old lines -> 13 new lines), the while/else loop looks like the
# removed code and the contextlib.suppress block like its replacement; confirm
# against the raw diff.
def handle_endtag(self, tag):
|
||||
if not self.tagstack:  # closing tag seen while nothing is open
|
||||
raise compat_HTMLParseError('no tags in the stack')
|
||||
while self.tagstack:  # presumably removed: discard tags until one matches the closing tag
|
||||
inner_tag = self.tagstack.pop()
|
||||
if inner_tag == tag:
|
||||
break
|
||||
else:  # loop ended without a break: no recorded tag matched
|
||||
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
|
||||
with contextlib.suppress(ValueError):  # presumably added: an unmatched closing tag is ignored instead of raising
|
||||
self.tagstack.remove(tag)
|
||||
if not self.tagstack:  # nothing left open: signal the caller to stop parsing
|
||||
raise self.HTMLBreakOnClosingTagException()
|
||||
|
||||
|
@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html):
|
|||
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
|
||||
try:
|
||||
parser.feed(html[offset:offset + next_closing_tag_end])
|
||||
if tag not in parser.tagstack:
|
||||
raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
|
||||
offset += next_closing_tag_end
|
||||
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
|
||||
return html[content_start:offset + next_closing_tag_start], \
|
||||
|
|
Loading…
Reference in a new issue