diff --git a/test/test_parsing.py b/test/test_parsing.py new file mode 100644 index 000000000..782a1196d --- /dev/null +++ b/test/test_parsing.py @@ -0,0 +1,218 @@ +import textwrap +import unittest + +from parsing import ( + FirstMatchingElementParser, + HTMLTagParser, + MatchingElementParser, +) + +from yt_dlp.compat import compat_HTMLParseError + +get_element_by_attribute = FirstMatchingElementParser +get_element_by_class = FirstMatchingElementParser +get_element_html_by_attribute = FirstMatchingElementParser +get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser +get_elements_by_class = MatchingElementParser +get_elements_html_by_attribute = MatchingElementParser +get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser + + +class TestParsing(unittest.TestCase): + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' + nice + ''' + + def test_get_element_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + + def test_get_element_html_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_by_class('no-such-class', html), None) + + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' + + ''' + + def test_get_element_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice') + self.assertEqual(get_element_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + + def test_get_element_html_by_attribute(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip()) + self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None) + self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None) + + html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING + + self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip()) + + GET_ELEMENTS_BY_CLASS_TEST_STRING = ''' + nice + also nice + ''' + GET_ELEMENTS_BY_CLASS_RES = [ + 'nice', + 'also nice' + ] + + def test_get_elements_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_class('no-such-class', html), []) + + def test_get_elements_html_by_class(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_class('no-such-class', html), []) + + def test_get_elements_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) + self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html), + self.GET_ELEMENTS_BY_CLASS_RES) + self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), []) + + def test_get_elements_text_and_html_by_attribute(self): + html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING + + self.assertEqual( + get_elements_text_and_html_by_attribute('class', 'foo bar', html), + list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), []) + self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), []) + + self.assertEqual(get_elements_text_and_html_by_attribute( + 'class', 'foo', 'nicenice', tag='a'), + [('nice', 'nice')]) + + def test_get_element_text_and_html_by_tag(self): + get_element_by_tag_test_string = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned +
+ this should also be returned +
+ closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) + get_element_by_tag_res_outerdiv_html = html.strip()[32:276] + get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] + get_element_by_tag_res_innerspan_html = html.strip()[78:119] + get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), + (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('span', html), + (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) + self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) + + def test_get_element_text_and_html_by_tag_malformed(self): + inner_text = 'inner text' + malnested_elements = f'{inner_text}' + commented_html = '' + outerdiv_html = f'
{malnested_elements}
' + html = f'{commented_html}{outerdiv_html}' + + self.assertEqual( + get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_a', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertEqual( + get_element_text_and_html_by_tag('malnested_b', html), + (f'{inner_text}', + f'{inner_text}')) + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + self.assertRaises( + compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') + + def test_strict_html_parsing(self): + class StrictTagParser(HTMLTagParser): + STRICT = True + + parser = StrictTagParser() + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): + parser.taglist('

/p>

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): + parser.taglist('

', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): + parser.taglist('must be empty', reset=True) + + def test_relaxed_html_parsing(self): + Tag = HTMLTagParser.Tag + parser = HTMLTagParser() + + self.assertEqual(parser.taglist('

', reset=True), []) + self.assertEqual(parser.taglist('

', reset=True), []) + + tags = parser.taglist('

', reset=True) + self.assertEqual(tags, [Tag('div'), Tag('p')]) + + tags = parser.taglist('

/p>

', reset=True) + self.assertEqual(tags, [Tag('div')]) + + tags = parser.taglist('

paragraph

', reset=True) + self.assertEqual(tags, [Tag('p'), Tag('div')]) + self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) diff --git a/test/test_utils.py b/test/test_utils.py index d9a62258c..3045b6d7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,6 @@ import os import re import sys -import textwrap import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -21,14 +20,6 @@ from yt_dlp.compat import ( compat_HTMLParseError, compat_os_name, ) -from yt_dlp.parsing import ( - HTMLTagParser, - FirstMatchingElementParser, -) - -# some testcases don't work with current functions -get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag - from yt_dlp.utils import ( Config, DateRange, @@ -68,6 +59,7 @@ from yt_dlp.utils import ( get_element_by_class, get_element_html_by_attribute, get_element_html_by_class, + get_element_text_and_html_by_tag, get_elements_by_attribute, get_elements_by_class, get_elements_html_by_attribute, @@ -1776,110 +1768,34 @@ Line 1 self.assertEqual(list(get_elements_text_and_html_by_attribute( 'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')]) - def test_get_element_text_and_html_by_tag(self): - get_element_by_tag_test_string = ''' - random text lorem ipsum

+ GET_ELEMENT_BY_TAG_TEST_STRING = ''' + random text lorem ipsum

+
+ this should be returned + this should also be returned
- this should be returned - this should also be returned -
- this should also be returned -
- closing tag above should not trick, so this should also be returned + this should also be returned
- but this text should not be returned - ''' - html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4) - get_element_by_tag_res_outerdiv_html = html.strip()[32:276] - get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6] - get_element_by_tag_res_innerspan_html = html.strip()[78:119] - get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7] + closing tag above should not trick, so this should also be returned +
+ but this text should not be returned + ''' + GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276] + GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119] + GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7] + + def test_get_element_text_and_html_by_tag(self): + html = self.GET_ELEMENT_BY_TAG_TEST_STRING self.assertEqual( get_element_text_and_html_by_tag('div', html), - (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html)) + (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML)) self.assertEqual( get_element_text_and_html_by_tag('span', html), - (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html)) + (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML)) self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html) - def test_get_element_text_and_html_by_tag_malformed(self): - inner_text = 'inner text' - malnested_elements = f'{inner_text}' - commented_html = '' - outerdiv_html = f'
{malnested_elements}
' - html = f'{commented_html}{outerdiv_html}' - - self.assertEqual( - get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html)) - self.assertEqual( - get_element_text_and_html_by_tag('malnested_a', html), - (f'{inner_text}', - f'{inner_text}')) - self.assertEqual( - get_element_text_and_html_by_tag('malnested_b', html), - (f'{inner_text}', - f'{inner_text}')) - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - self.assertRaises( - compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}') - - def test_strict_html_parsing(self): - class StrictTagParser(HTMLTagParser): - STRICT = True - - parser = StrictTagParser() - with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '

'"): - parser.taglist('

/p>

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"): - parser.taglist('

', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"): - parser.taglist('must be empty', reset=True) - - def test_relaxed_html_parsing(self): - Tag = HTMLTagParser.Tag - parser = HTMLTagParser() - - self.assertEqual(parser.taglist('

', reset=True), []) - self.assertEqual(parser.taglist('

', reset=True), []) - - tags = parser.taglist('

', reset=True) - self.assertEqual(tags, [Tag('div'), Tag('p')]) - - tags = parser.taglist('

/p>

', reset=True) - self.assertEqual(tags, [Tag('div')]) - - tags = parser.taglist('

paragraph

', reset=True) - self.assertEqual(tags, [Tag('p'), Tag('div')]) - self.assertEqual(tags[0].text_and_html(), ('paragraph', '

paragraphmust be empty', reset=True) - self.assertEqual(tags, [Tag('img')]) - self.assertEqual(tags[0].text_and_html(), ('', '')) - - def test_compliant_html_parsing(self): - # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) - Tag = HTMLTagParser.Tag - html = ''' - no error without closing tag: - self closing is ok: - ''' - parser = HTMLTagParser() - tags = parser.taglist(html, reset=True) - self.assertEqual(tags, [Tag('img'), Tag('img')]) - - # don't get fooled by '>' in attributes - html = '''''' - tags = parser.taglist(html, reset=True) - self.assertEqual(tags[0].text_and_html(), ('', html)) - def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),