diff --git a/test/test_parsing.py b/test/test_parsing.py new file mode 100644 index 000000000..782a1196d --- /dev/null +++ b/test/test_parsing.py @@ -0,0 +1,218 @@ +import textwrap +import unittest + +from parsing import ( + FirstMatchingElementParser, + HTMLTagParser, + MatchingElementParser, +) + +from yt_dlp.compat import compat_HTMLParseError + +get_element_by_attribute = FirstMatchingElementParser +get_element_by_class = FirstMatchingElementParser +get_element_html_by_attribute = FirstMatchingElementParser +get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class +get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag +get_elements_by_attribute = MatchingElementParser +get_elements_by_class = MatchingElementParser +get_elements_html_by_attribute = MatchingElementParser +get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class +get_elements_text_and_html_by_attribute = MatchingElementParser + + +class TestParsing(unittest.TestCase): + GET_ELEMENT_BY_CLASS_TEST_STRING = ''' +
+ ''' + + def test_get_element_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + + def test_get_element_html_by_class(self): + html = self.GET_ELEMENT_BY_CLASS_TEST_STRING + + self.assertEqual(get_element_html_by_class('foo', html), html.strip()) + self.assertEqual(get_element_by_class('no-such-class', html), None) + + GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = ''' +', reset=True) + with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"): + parser.taglist('/p>
', reset=True), []) + + tags = parser.taglist('
/p>
paragraph
paragraph
must be empty', reset=True) + self.assertEqual(tags, [Tag('img')]) + self.assertEqual(tags[0].text_and_html(), ('', '')) + + def test_compliant_html_parsing(self): + # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) + Tag = HTMLTagParser.Tag + html = ''' + no error without closing tag: + self closing is ok: + ''' + parser = HTMLTagParser() + tags = parser.taglist(html, reset=True) + self.assertEqual(tags, [Tag('img'), Tag('img')]) + + # don't get fooled by '>' in attributes + html = '''''' + tags = parser.taglist(html, reset=True) + self.assertEqual(tags[0].text_and_html(), ('', html)) diff --git a/test/test_utils.py b/test/test_utils.py index d9a62258c..3045b6d7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,6 @@ import os import re import sys -import textwrap import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -21,14 +20,6 @@ from yt_dlp.compat import ( compat_HTMLParseError, compat_os_name, ) -from yt_dlp.parsing import ( - HTMLTagParser, - FirstMatchingElementParser, -) - -# some testcases don't work with current functions -get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag - from yt_dlp.utils import ( Config, DateRange, @@ -68,6 +59,7 @@ from yt_dlp.utils import ( get_element_by_class, get_element_html_by_attribute, get_element_html_by_class, + get_element_text_and_html_by_tag, get_elements_by_attribute, get_elements_by_class, get_elements_html_by_attribute, @@ -1776,110 +1768,34 @@ Line 1 self.assertEqual(list(get_elements_text_and_html_by_attribute( 'class', 'foo', 'nicenice', tag='a')), [('nice', 'nice')]) - def test_get_element_text_and_html_by_tag(self): - get_element_by_tag_test_string = ''' - random text lorem ipsum + GET_ELEMENT_BY_TAG_TEST_STRING = ''' + random text lorem ipsum +', reset=True) - with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"): - parser.taglist('/p>
', reset=True), []) - - tags = parser.taglist('
/p>
paragraph
paragraph
must be empty', reset=True) - self.assertEqual(tags, [Tag('img')]) - self.assertEqual(tags[0].text_and_html(), ('', '')) - - def test_compliant_html_parsing(self): - # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS) - Tag = HTMLTagParser.Tag - html = ''' - no error without closing tag: - self closing is ok: - ''' - parser = HTMLTagParser() - tags = parser.taglist(html, reset=True) - self.assertEqual(tags, [Tag('img'), Tag('img')]) - - # don't get fooled by '>' in attributes - html = '''''' - tags = parser.taglist(html, reset=True) - self.assertEqual(tags[0].text_and_html(), ('', html)) - def test_iri_to_uri(self): self.assertEqual( iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),