import textwrap
import unittest

from parsing import (
    FirstMatchingElementParser,
    HTMLTagParser,
    MatchingElementParser,
)

from yt_dlp.compat import compat_HTMLParseError

# Short aliases for the parser entry points exercised by the tests below.
# NOTE(review): several aliases are bound to the parser class itself rather
# than to a method of it, while the tests call them like plain functions
# (e.g. get_element_by_class('foo', html) == 'nice').  The bindings are kept
# exactly as found; confirm against the `parsing` module whether they should
# be `<Parser>.<same_name>` like the two `FirstMatchingElementParser.*` ones.
get_element_by_attribute = FirstMatchingElementParser
get_element_by_class = FirstMatchingElementParser
get_element_html_by_attribute = FirstMatchingElementParser
get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser
get_elements_by_class = MatchingElementParser
get_elements_html_by_attribute = MatchingElementParser
# NOTE(review): a plural getter taken from the "first match" parser looks
# unintended -- presumably this belongs to MatchingElementParser; verify.
get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser


class TestParsing(unittest.TestCase):
    """Tests for the element getters and for HTMLTagParser.

    NOTE(review): the HTML markup inside the fixtures was missing from the
    copy of this file that was reviewed and has been reconstructed from the
    expected values of the assertions (and, for the tag test, from the
    hard-coded slice offsets).  Confirm the exact fixture text against the
    upstream test file before relying on it.
    """

    GET_ELEMENT_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span>
    '''

    def test_get_element_by_class(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

        self.assertEqual(get_element_by_class('foo', html), 'nice')
        self.assertIsNone(get_element_by_class('no-such-class', html))

    def test_get_element_html_by_class(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

        self.assertEqual(get_element_html_by_class('foo', html), html.strip())
        self.assertIsNone(get_element_by_class('no-such-class', html))

    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
        <div itemprop="author" itemscope>foo</div>
    '''

    def test_get_element_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

        # the attribute value has to match as a whole, not per class name
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
        self.assertIsNone(get_element_by_attribute('class', 'foo', html))
        self.assertIsNone(get_element_by_attribute('class', 'no-such-foo', html))

        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING

        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')

    def test_get_element_html_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
        self.assertIsNone(get_element_html_by_attribute('class', 'foo', html))
        self.assertIsNone(get_element_html_by_attribute('class', 'no-such-foo', html))

        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING

        self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())

    GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span><span class="foo bar">also nice</span>
    '''
    GET_ELEMENTS_BY_CLASS_RES = [
        '<span class="foo bar">nice</span>',
        '<span class="foo bar">also nice</span>',
    ]

    def test_get_elements_by_class(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_class('no-such-class', html), [])

    def test_get_elements_html_by_class(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_class('no-such-class', html), [])

    def test_get_elements_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(
            get_elements_html_by_attribute('class', 'foo bar', html), self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_text_and_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', 'foo bar', html),
            list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])

        # `tag` restricts the match to one element type
        self.assertEqual(
            get_elements_text_and_html_by_attribute(
                'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a'),
            [('nice', '<a class="foo">nice</a>')])

    def test_get_element_text_and_html_by_tag(self):
        # The slice offsets used below depend on the exact layout of this
        # literal (after dedent + 4-space indent + strip): the outer <div>
        # spans [32:276] and the <span> spans [78:119].
        get_element_by_tag_test_string = '''
        random text lorem ipsum</p>
        <div>
            this should be returned
            <span>this should also be returned</span>
            <div>
                this should also be returned
            </div>
            closing tag above should not trick, so this should also be returned
        </div>
        but this text should not be returned
        '''
        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
        # drop '<div>' (5 chars) and '</div>' (6 chars)
        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
        # drop '<span>' (6 chars) and '</span>' (7 chars)
        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]

        self.assertEqual(
            get_element_text_and_html_by_tag('div', html),
            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('span', html),
            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
        self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)

    def test_get_element_text_and_html_by_tag_malformed(self):
        inner_text = 'inner text'
        malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
        # NOTE(review): comment body reconstructed; only its presence matters here
        commented_html = '<!--<div>inner comment</div>-->'
        outerdiv_html = f'<div>{malnested_elements}</div>'
        html = f'{commented_html}{outerdiv_html}'

        self.assertEqual(
            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_a', html),
            (f'<malnested_b>{inner_text}',
             f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_b', html),
            (f'{inner_text}</malnested_a>',
             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
        # a closing tag without an opening one, and vice versa
        self.assertRaises(
            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
        self.assertRaises(
            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')

    def test_strict_html_parsing(self):
        class StrictTagParser(HTMLTagParser):
            STRICT = True

        parser = StrictTagParser()
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
            parser.taglist('</p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
            parser.taglist('<div><p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
            parser.taglist('<div><p></div></p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
            parser.taglist('<div><p>/p></div>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
            parser.taglist('<div><p></p<< </div>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
            parser.taglist('<img>must be empty</img>', reset=True)

    def test_relaxed_html_parsing(self):
        Tag = HTMLTagParser.Tag
        parser = HTMLTagParser()

        # the same inputs that raise in strict mode are tolerated here
        self.assertEqual(parser.taglist('</p>', reset=True), [])
        self.assertEqual(parser.taglist('<div><p>', reset=True), [])

        tags = parser.taglist('<div><p></div></p>', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])

        tags = parser.taglist('<div><p>/p></div>', reset=True)
        self.assertEqual(tags, [Tag('div')])

        tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
        self.assertEqual(tags, [Tag('p'), Tag('div')])
        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))

        # NOTE(review): the attribute on <img> is reconstructed; confirm upstream
        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
        self.assertEqual(tags, [Tag('img')])
        self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))

    def test_compliant_html_parsing(self):
        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
        Tag = HTMLTagParser.Tag
        html = '''
            no error without closing tag: <img>
            self closing is ok: <img />
        '''
        parser = HTMLTagParser()
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags, [Tag('img'), Tag('img')])

        # don't get fooled by '>' in attributes
        html = '''<img greater_a='1>0' greater_b="1>0">'''
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags[0].text_and_html(), ('', html))