diff --git a/test/test_utils.py b/test/test_utils.py index 334423619..022e821a6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1796,6 +1796,25 @@ Line 1 (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + def test_get_element_text_and_html_by_tag_malformed(self): + inner_text = 'inner_text' + malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>' + html = f'<div>{malnested_elements}</div>' + + self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html)) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_a', html), + (f'<malnested_b>{inner_text}', + f'<malnested_a><malnested_b>{inner_text}</malnested_a>')) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_b', html), + (f'{inner_text}</malnested_a>', + f'<malnested_b>{inner_text}</malnested_a></malnested_b>')) + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>') + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}') + + def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 8c2c5593c..de058b0e6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): pass def handle_starttag(self, tag, _): - self.tagstack.append(tag) + self.tagstack.appendleft(tag) def handle_endtag(self, tag): if not self.tagstack: raise compat_HTMLParseError('no tags in the stack') - while self.tagstack: - inner_tag = self.tagstack.pop() - if inner_tag == tag: - break - else: - raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found') + with contextlib.suppress(ValueError): + self.tagstack.remove(tag) if not self.tagstack: raise self.HTMLBreakOnClosingTagException() @@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html): next_closing_tag_end = next_closing_tag_start + len(closing_tag) try: parser.feed(html[offset:offset + next_closing_tag_end]) + if tag not in parser.tagstack: + raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException() offset += next_closing_tag_end except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: return html[content_start:offset + next_closing_tag_start], \