[utils] more forgiving html parsing + unit tests

This commit is contained in:
Marcel 2022-11-17 01:20:25 +01:00
parent da0d84258b
commit af03fa4542
No known key found for this signature in database
GPG key ID: 7813C97693AD6AAE
2 changed files with 24 additions and 7 deletions

View file

@@ -1796,6 +1796,25 @@ Line 1
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
def test_get_element_text_and_html_by_tag_malformed(self):
    # Mis-nested tags inside a well-formed wrapper are expected to be
    # tolerated, while an orphaned opening or closing tag must still raise.
    inner_text = 'inner_text'
    malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
    html = f'<div>{malnested_elements}</div>'

    # tag to look up -> expected (inner content, whole element) pair
    expected_by_tag = {
        'div': (malnested_elements, html),
        'malnested_a': (
            f'<malnested_b>{inner_text}',
            f'<malnested_a><malnested_b>{inner_text}</malnested_a>'),
        'malnested_b': (
            f'{inner_text}</malnested_a>',
            f'<malnested_b>{inner_text}</malnested_a></malnested_b>'),
    }
    for wanted_tag, expected in expected_by_tag.items():
        self.assertEqual(get_element_text_and_html_by_tag(wanted_tag, html), expected)

    # First input: a closing </orphan> with no opener.
    # Second input: an <orphan> opener that is never closed.
    for broken_html in (f'{html}</orphan>', f'<orphan>{html}'):
        with self.assertRaises(compat_HTMLParseError):
            get_element_text_and_html_by_tag('orphan', broken_html)
def test_iri_to_uri(self): def test_iri_to_uri(self):
self.assertEqual( self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),

View file

@@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
pass pass
def handle_starttag(self, tag, _): def handle_starttag(self, tag, _):
self.tagstack.append(tag) self.tagstack.appendleft(tag)
def handle_endtag(self, tag): def handle_endtag(self, tag):
if not self.tagstack: if not self.tagstack:
raise compat_HTMLParseError('no tags in the stack') raise compat_HTMLParseError('no tags in the stack')
while self.tagstack: with contextlib.suppress(ValueError):
inner_tag = self.tagstack.pop() self.tagstack.remove(tag)
if inner_tag == tag:
break
else:
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
if not self.tagstack: if not self.tagstack:
raise self.HTMLBreakOnClosingTagException() raise self.HTMLBreakOnClosingTagException()
@@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html):
next_closing_tag_end = next_closing_tag_start + len(closing_tag) next_closing_tag_end = next_closing_tag_start + len(closing_tag)
try: try:
parser.feed(html[offset:offset + next_closing_tag_end]) parser.feed(html[offset:offset + next_closing_tag_end])
if tag not in parser.tagstack:
raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
offset += next_closing_tag_end offset += next_closing_tag_end
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
return html[content_start:offset + next_closing_tag_start], \ return html[content_start:offset + next_closing_tag_start], \