diff --git a/test/test_utils.py b/test/test_utils.py
index 334423619..022e821a6 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1796,6 +1796,25 @@ Line 1
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+ def test_get_element_text_and_html_by_tag_malformed(self):
+ inner_text = 'inner_text'
+ malnested_elements = f'{inner_text}'
+ html = f'
{malnested_elements}
'
+
+ self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('malnested_a', html),
+ (f'{inner_text}',
+ f'{inner_text}'))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('malnested_b', html),
+ (f'{inner_text}',
+ f'{inner_text}'))
+ self.assertRaises(
+ compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
+ self.assertRaises(
+ compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 8c2c5593c..de058b0e6 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -466,17 +466,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
pass
def handle_starttag(self, tag, _):
- self.tagstack.append(tag)
+ self.tagstack.appendleft(tag)
def handle_endtag(self, tag):
if not self.tagstack:
raise compat_HTMLParseError('no tags in the stack')
- while self.tagstack:
- inner_tag = self.tagstack.pop()
- if inner_tag == tag:
- break
- else:
- raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
+ with contextlib.suppress(ValueError):
+ self.tagstack.remove(tag)
if not self.tagstack:
raise self.HTMLBreakOnClosingTagException()
@@ -510,6 +506,8 @@ def get_element_text_and_html_by_tag(tag, html):
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
try:
parser.feed(html[offset:offset + next_closing_tag_end])
+ if tag not in parser.tagstack:
+ raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
offset += next_closing_tag_end
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
return html[content_start:offset + next_closing_tag_start], \