2022-11-22 14:07:14 +01:00
|
|
|
import collections
|
|
|
|
import contextlib
|
|
|
|
import itertools
|
|
|
|
import re
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
2022-11-22 19:58:06 +01:00
|
|
|
from .compat import compat_HTMLParseError
|
2022-11-22 14:07:14 +01:00
|
|
|
from .utils import orderedSet
|
|
|
|
|
|
|
|
|
2022-11-22 19:58:06 +01:00
|
|
|
def iter_find(string, sub: str):
    """Yield the start offsets of the non-overlapping occurrences of `sub` in `string`

    The search runs left to right using `string.find()`; after a hit it resumes
    behind the matched text, so 'aa' is found twice in 'aaaa' (offsets 0 and 2).

    An empty `sub` matches at every offset from 0 up to and including
    len(string), mirroring `str.count('')`, instead of never terminating.

    @param string: text to search in
    @param sub: text to search for
    @return: generator of int offsets in increasing order
    """
    # advance by at least one character so that an empty `sub` cannot stall the search
    step = max(len(sub), 1)
    idx = -step
    while True:
        idx = string.find(sub, idx + step)
        if idx == -1:
            return
        yield idx
|
2022-11-22 14:07:14 +01:00
|
|
|
|
|
|
|
|
2022-11-22 19:58:06 +01:00
|
|
|
class HTMLCommentRanges:
    """computes the offsets of HTML comments

    comments start with '<!--' and end with the first '-->' encountered
    note: markers within quotes are not ignored

    usage:
        comments = HTMLCommentRanges(html)
        if offset in comments:
            ...

    The ranges are computed lazily, therefore offsets have to be tested
    in increasing order.
    """

    def __init__(self, html):
        self._range_iter = self.ranges(html)
        # comment range currently under inspection, None when there are no more comments
        self._range = next(self._range_iter, None)
        self._last_offset = 0

    @staticmethod
    def ranges(string, sopen='<!--', sclose='-->'):
        """ generate a slice(start, stop) for every comment found in string

        @param string: text to scan
        @param sopen: marker which opens a comment
        @param sclose: marker which closes a comment
        @return: generator of slices in increasing order, 'stop' is the offset
                 behind the closing marker or None if the comment is never closed
        """
        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
        open_iter = iter_find(string, sopen)
        close_len = len(sclose)
        # offsets just behind each closing marker
        close_iter = (idx + close_len for idx in iter_find(string, sclose))
        next_open = next(open_iter, None)
        next_close = next(close_iter, None)

        while next_open is not None:
            # skip closing markers which end at or before the opening marker;
            # '>=' is required for adjacent comments ('<!--a--><!--b-->'), where the
            # next comment opens exactly where the previous one ended, otherwise the
            # stale end offset is kept and an empty range is yielded endlessly
            while next_close is not None and next_open >= next_close:
                next_close = next(close_iter, None)
            yield slice(next_open, next_close)
            if next_close is None:
                # unterminated comment: it extends to the end of the string
                return
            # opening markers inside the comment just reported do not start a new one
            while next_open is not None and next_open < next_close:
                next_open = next(open_iter, None)

    def __contains__(self, offset):
        """ return True if offset is located inside of a comment

        offsets must be passed in increasing order (checked by assertion)
        """
        assert isinstance(offset, int)
        assert offset >= self._last_offset, 'offset must be in increasing order'
        self._last_offset = offset
        # drop the ranges which end at or before offset
        while self._range and self._range.stop is not None and offset >= self._range.stop:
            self._range = next(self._range_iter, None)

        return not (self._range is None or offset < self._range.start)
|
2022-11-22 14:07:14 +01:00
|
|
|
|
2022-11-22 19:58:06 +01:00
|
|
|
|
|
|
|
class HTMLTagParser(HTMLParser):
    """HTML parser which returns found elements as instances of 'Tag'
    when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements

    usage:
        parser = HTMLTagParser()
        for tag_obj in parser.taglist(html):
            tag_obj.text_and_html()

    Subclasses can override predicate() to select the tags which are recorded
    and callback() to get notified when a recorded tag is complete.
    """

    STRICT = False
    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
    # tags which are treated as closed right after their opening tag
    VOID_TAGS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    }

    class Tag:
        """ a found element: tag name, attributes and the offsets of its
        opening and closing tag inside of the parsed string """

        __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'

        def __init__(self, name, *, string='', attrs=()):
            self.name = name
            # the string which the ranges below refer to
            self.string = string
            self.attrs = tuple(attrs)
            # slices covering the opening / closing tag, None until set
            self._openrange = None
            self._closerange = None

        def __str__(self):
            return self.name

        def __repr__(self):
            return f'{self.__class__.__name__}({str(self)!r})'

        def __eq__(self, other):
            # compare by name, this allows to look up tag objects by a plain tag name
            return self.name == other

        def openrange(self, offset, startlen=0):
            """ set the range of the opening tag, either as slice or as offset and length """
            if isinstance(offset, slice):
                self._openrange = offset
            else:
                self._openrange = slice(offset, offset + startlen)

        def closerange(self, offset, stoplen=0):
            """ set the range of the closing tag, either as slice or as offset and length """
            if isinstance(offset, slice):
                self._closerange = offset
            else:
                self._closerange = slice(offset, offset + stoplen)

        def opentag(self):
            """ return the opening tag, or '' if its range is not known """
            return self.string[self._openrange] if self._openrange else ''

        def html(self):
            """ return the element including its tags, only the opening tag
            if the element was not closed, or '' if no range is known """
            if not self._openrange:
                return ''
            if self._closerange:
                return self.string[self._openrange.start:self._closerange.stop]
            return self.string[self._openrange]

        def text(self):
            """ return the content between opening and closing tag, '' if not closed """
            if self._openrange and self._closerange:
                return self.string[self._openrange.stop:self._closerange.start]
            return ''

        def text_and_html(self):
            """ return the tuple (text(), html()) """
            return self.text(), self.html()

    class AbortException(Exception):
        """ can be raised from predicate() or callback() to end the parsing in taglist() """

    def __init__(self):
        # currently open tags, innermost first; holds Tag objects for
        # recorded tags and plain tag names for all others
        self.tagstack = collections.deque()
        # stack of lists: [0] collects the top level results,
        # [-1] is the list of the innermost open recorded tag
        self._nestedtags = [[]]
        super().__init__()
        self._offset = self.offset

    def predicate(self, tag, attrs):
        """ return True for every encountered opening tag that should be processed """
        return True

    def callback(self, tag_obj):
        """ this will be called when the requested tag is closed """

    def reset(self):
        super().reset()
        self.tagstack.clear()
        # also forget the recorded tags: when an exception escapes feed(), taglist()
        # cannot flush them and they would show up in the result of the next parse
        self._nestedtags = [[]]

    def taglist(self, data, reset=True, depth_first=False):
        """ parse data and return found tag objects
        @param data: html string
        @param reset: reset state
        @param depth_first: return order: as opened (False), as closed (True), nested (None)
        @return: list of Tag objects
        """
        def flatten(_list, first=True):
            # every nested list starts with its own tag followed by the child elements;
            # for depth first order the tag is moved behind its children
            rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
            for item in rlist:
                if isinstance(item, list):
                    yield from flatten(item, first=False)
                else:
                    yield item

        if reset:
            self.reset()
        with contextlib.suppress(HTMLTagParser.AbortException):
            self.feed(data)
        if self.STRICT and self.tagstack:
            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
            raise compat_HTMLParseError(f'unclosed tag {orphans}')
        taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
        self._nestedtags = [[]]
        return taglist

    def updatepos(self, i, j):
        # remember the offset of the element that is handled next
        offset = self._offset = super().updatepos(i, j)
        return offset

    def handle_starttag(self, tag, attrs):
        try:
            # we use internal variable for performance reasons
            tag_text = getattr(self, '_HTMLParser__starttag_text')
        except AttributeError:
            # match in place at the current offset instead of copying the rest of the buffer
            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata, self._offset).group()

        # tags which are not recorded are tracked by their plain name only
        tag_obj = tag
        tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
        if self.predicate(tag, attrs):
            tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
            tag_obj.openrange(self._offset, len(tag_text))
            if tag_is_open:
                nesting = [tag_obj]
                self._nestedtags[-1].append(nesting)
                self._nestedtags.append(nesting)
            else:
                # self closing and void tags are complete right away
                self._nestedtags[-1].append(tag_obj)
                self.callback(tag_obj)
        if tag_is_open:
            self.tagstack.appendleft(tag_obj)

    handle_startendtag = handle_starttag

    def handle_endtag(self, tag):
        if '<' in tag:
            if self.STRICT:
                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
            tag = tag[:tag.index('<')]

        try:
            # raises ValueError if no such tag is open
            idx = self.tagstack.index(tag)
            if self.STRICT and idx:
                open_tags = ''.join(f'</{tag}>' for tag in itertools.islice(self.tagstack, idx))
                raise compat_HTMLParseError(
                    f'malnested closing tag {tag!r}, expected after {open_tags!r}')
            tag_obj = self.tagstack[idx]
            self.tagstack.remove(tag)
            if isinstance(tag_obj, self.Tag):
                close_idx = self.rawdata.find('>', self._offset) + 1
                tag_obj.closerange(self._offset, close_idx - self._offset)
                # taglist() flushes the nesting while tagstack may still hold open tags
                # (reset=False), never remove the top level list in that case
                if len(self._nestedtags) > 1:
                    self._nestedtags.pop()
                self.callback(tag_obj)
        except ValueError as exc:
            if isinstance(exc, compat_HTMLParseError):
                raise
            if self.STRICT:
                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
|
2022-11-22 14:07:14 +01:00
|
|
|
|
|
|
|
|
2022-11-22 19:58:06 +01:00
|
|
|
class MatchingElementParser(HTMLTagParser):
    """ optimized version of HTMLTagParser

    Candidate tags are located with a regular expression first, the parser is
    then started at each candidate, records only the first tag accepted by
    'matchfunc' and aborts as soon as that tag is complete.
    """
    def __init__(self, matchfunc):
        super().__init__()
        # callable(tag, attrs) which decides whether a tag is the wanted one
        self.matchfunc = matchfunc
        # True as long as no tag has been accepted in the current parse
        self.found_none = True

    def reset(self):
        super().reset()
        self.found_none = True

    def callback(self, tag_obj):
        # the only recorded tag is complete, nothing else has to be parsed
        raise self.AbortException()

    def predicate(self, tag, attrs):
        """ accept only the first tag for which matchfunc returns a true value """
        if self.found_none and self.matchfunc(tag, attrs):
            self.found_none = False
            return True
        return False

    @staticmethod
    def class_value_regex(class_name):
        """ return a regex matching a class attribute value which contains class_name """
        return rf'[\w\s\-]*(?<![\w\-]){re.escape(class_name)}(?![\w\-])[\w\s\-]*'

    @staticmethod
    def matching_tag_regex(tag, attribute, value_regex, escape=True):
        """ return a regex matching an opening tag up to and including the wanted attribute

        @param tag: regex for the tag name
        @param attribute: attribute name (literal)
        @param value_regex: attribute value, either a compiled pattern, a regex string
                            (escape=False) or a literal string (escape=True)
        @param escape: escape value_regex unless it is a compiled pattern
        """
        if isinstance(value_regex, re.Pattern):
            value_regex = value_regex.pattern
        elif escape:
            value_regex = re.escape(value_regex)

        return rf'''(?x)
            <(?:{tag})
            (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
            \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
        '''

    @classmethod
    def iter_tags(cls, regex, html, *, matchfunc):
        """ yield the Tag objects found at the matches of regex which are
        not located inside of a HTML comment

        @param regex: regex locating the candidate opening tags
        @param html: html string
        @param matchfunc: callable(tag, attrs) which confirms a parsed tag
        """
        comments = HTMLCommentRanges(html)
        parser = cls(matchfunc)
        for match in re.finditer(regex, html):
            if match.start() not in comments:
                yield from parser.taglist(html[match.start():], reset=True)

    @classmethod
    def tags_by_name(cls, tag, html):
        """ yield the Tag objects of all elements with the tag name 'tag' """
        def matchfunc(tag_str, _attrs):
            return tag_str == tag

        tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)

    @classmethod
    def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
        """ yield the Tag objects of all elements which have the given attribute value

        @param attribute: attribute name
        @param value: attribute value, either a compiled pattern, a regex string
                      (escape_value=False) or a literal string (escape_value=True)
        @param html: html string
        @param tag: regex for the tag name
        @param escape_value: treat a string value as literal text
        """
        # the parsed value has to be tested with the same interpretation of 'value'
        # as used for the search regex: a literal value must be escaped here as well,
        # otherwise values like 'a+b' never match and 'foo(' raises re.error
        if isinstance(value, re.Pattern):
            value_pattern = value
        else:
            value_pattern = re.compile(re.escape(value) if escape_value else value)

        def matchfunc(_tag_str, attrs):
            # attributes without a value are reported with the value None
            return any(attr == attribute and value_str is not None
                       and value_pattern.fullmatch(value_str)
                       for attr, value_str in attrs)

        tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)

    @classmethod
    def extract_attributes(cls, html):
        """ return the attributes of the first tag found in html as dict """
        attr_dict = {}

        def matchfunc(_tag, attrs):
            attr_dict.update(attrs)
            # the first tag is all that is needed
            raise cls.AbortException()

        with contextlib.suppress(cls.AbortException):
            cls(matchfunc).feed(html)

        return attr_dict

    @classmethod
    def get_elements_text_and_html_by_tag(cls, tag, html):
        """ return a list of (text, html) of all elements with the given tag name """
        return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]

    @classmethod
    def get_element_text_and_html_by_tag(cls, tag, html):
        """ return (text, html) of the first element with the given tag name, or None """
        tag = next(cls.tags_by_name(tag, html), None)
        return tag and tag.text_and_html()

    @classmethod
    def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
        """ return a list of (text, html), arguments as for tags_by_attribute() """
        return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_elements_by_attribute(cls, *args, **kwargs):
        """ return a list of element texts, arguments as for tags_by_attribute() """
        return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_elements_html_by_attribute(cls, *args, **kwargs):
        """ return a list of element html, arguments as for tags_by_attribute() """
        return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_element_by_attribute(cls, *args, **kwargs):
        """ return the text of the first found element or None, arguments as for tags_by_attribute() """
        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
        return tag and tag.text()

    @classmethod
    def get_element_html_by_attribute(cls, *args, **kwargs):
        """ return the html of the first found element or None, arguments as for tags_by_attribute() """
        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
        return tag and tag.html()

    @classmethod
    def get_elements_by_class(cls, class_name, html):
        """ return a list with the text of all elements with the given class """
        value = cls.class_value_regex(class_name)
        return [tag.text() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_elements_html_by_class(cls, class_name, html):
        """ return a list with the html of all elements with the given class """
        value = cls.class_value_regex(class_name)
        return [tag.html() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_elements_text_and_html_by_class(cls, class_name, html):
        """ return a list of (text, html) of all elements with the given class """
        value = cls.class_value_regex(class_name)
        return [tag.text_and_html() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_element_html_by_class(cls, class_name, html):
        """ return the html of the first element with the given class, or None """
        value = cls.class_value_regex(class_name)
        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
        return tag and tag.html()

    @classmethod
    def get_element_by_class(cls, class_name, html):
        """ return the text of the first element with the given class, or None """
        value = cls.class_value_regex(class_name)
        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
        return tag and tag.text()
|