Mirror of https://github.com/yt-dlp/yt-dlp (synced 2025-01-13 20:01:57 +01:00)
[utils] Improve get_elements_text_and_html_by_attribute regex (#2280)
Authored by: zmousm, pukkandan
This commit is contained in:
parent a70b71e85a
commit 0254f16274
2 changed files with 15 additions and 16 deletions
|
@@ -1659,10 +1659,10 @@ Line 1
|
||||||
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
get_elements_text_and_html_by_attribute('class', 'foo bar', html),
|
list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)),
|
||||||
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
|
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
|
||||||
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
|
self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), [])
|
||||||
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
|
self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), [])
|
||||||
|
|
||||||
GET_ELEMENT_BY_TAG_TEST_STRING = '''
|
GET_ELEMENT_BY_TAG_TEST_STRING = '''
|
||||||
random text lorem ipsum</p>
|
random text lorem ipsum</p>
|
||||||
|
|
|
@@ -473,24 +473,23 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
|
||||||
attribute in the passed HTML document
|
attribute in the passed HTML document
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
|
||||||
|
|
||||||
value = re.escape(value) if escape_value else value
|
value = re.escape(value) if escape_value else value
|
||||||
|
|
||||||
retlist = []
|
partial_element_re = r'''(?x)
|
||||||
for m in re.finditer(r'''(?xs)
|
|
||||||
<(?P<tag>[a-zA-Z0-9:._-]+)
|
<(?P<tag>[a-zA-Z0-9:._-]+)
|
||||||
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
|
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
|
||||||
\s+%(attribute)s(?:=%(value)s|\s*=\s*(?P<_q>['"]?)%(value)s(?P=_q))
|
\s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
|
||||||
(?:\s+[a-zA-Z0-9_:.-]+(?:=\S*?|\s*=\s*(?:"[^"]*"|'[^']*')|))*?
|
''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}
|
||||||
\s*>
|
|
||||||
''' % {'attribute': re.escape(attribute), 'value': value}, html):
|
for m in re.finditer(partial_element_re, html):
|
||||||
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
|
||||||
|
|
||||||
retlist.append((
|
yield (
|
||||||
unescapeHTML(re.sub(r'(?s)^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content)),
|
unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
|
||||||
whole,
|
whole
|
||||||
))
|
)
|
||||||
|
|
||||||
return retlist
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
|
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
|
||||||
|
|
Loading…
Reference in a new issue