diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py
index 80745c65d..706e8ad83 100644
--- a/yt_dlp/extractor/boomplay.py
+++ b/yt_dlp/extractor/boomplay.py
@@ -15,11 +15,12 @@ from ..utils import (
int_or_none,
join_nonempty,
merge_dicts,
- orderedSet,
parse_count,
parse_duration,
+ smuggle_url,
strip_or_none,
unified_strdate,
+ unsmuggle_url,
url_or_none,
urlencode_postdata,
urljoin,
@@ -45,7 +46,7 @@ class BoomplayBaseIE(InfoExtractor):
"""
# get_elements_text_and_html_by_attribute returns a generator
return get_elements_text_and_html_by_attribute(
- 'class', rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html,
+ attribute='class', value=rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html=html,
tag=tag, escape_value=False)
@classmethod
@@ -111,7 +112,7 @@ class BoomplayBaseIE(InfoExtractor):
metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or ''
metadata_entries = re.findall(r'(?si)(?P.*?)', metadata_div) or []
description = re.sub(
- '(?i)Listen and download music for free on Boomplay!', '',
+ r'(?i)Listen and download music for free on Boomplay!', '',
clean_html(self._get_element_by_class_and_tag(
'description_content', 'span', webpage)) or '') or None
@@ -145,39 +146,55 @@ class BoomplayBaseIE(InfoExtractor):
page_metadata['release_year'] = int_or_none(v)
return page_metadata
- def _extract_suitable_links(self, webpage, media_types=None):
- if media_types is None:
- media_types = self._MEDIA_TYPES
- media_types = list(variadic(media_types))
+ @classmethod
+ def _extract_from_webpage(cls, url, webpage, **kwargs):
+ if kwargs:
+ url = smuggle_url(url, kwargs)
+ return super()._extract_from_webpage(url, webpage)
- for idx, v in enumerate(media_types):
- media_types[idx] = re.escape(v) if v in self._MEDIA_TYPES else ''
- media_types = join_nonempty(*media_types, delim='|')
- return orderedSet(traverse_obj(re.finditer(
- rf'''(?x)
- "']|"[^"]*"|'[^']*')*)?
- (?<=\s)href\s*=\s*(?P<_q>['"])
- (?:
- (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?:;@]*)
- )
- (?P=_q)
- (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
- >''', webpage), (..., 'link', {self._urljoin}, {self.url_result})))
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ url, smuggled_data = unsmuggle_url(url)
+ media_types = variadic(smuggled_data.get('media_types', cls._MEDIA_TYPES))
+ media_types = join_nonempty(*(
+ re.escape(v)for v in media_types if v in cls._MEDIA_TYPES),
+ delim='|')
- def _extract_playlist_entries(self, webpage, media_types, warn=True):
+ for mobj in re.finditer(
+ rf'''(?ix)
+ "']|"[^"]*"|'[^']*')*)?
+ (?<=\s)href\s*=\s*(?P<_q>['"])
+ (?:
+ (?!javascript:)(?P/(?:{media_types})/\d+/?[\-a-zA-Z=?:;@]*)
+ )
+ (?P=_q)
+ (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+ >''', webpage):
+ if url := cls._urljoin(mobj.group('href')):
+ yield url
+
+ @classmethod
+ def _extract_playlist_entries(cls, webpage, media_types, warn=True):
song_list = strip_or_none(
- self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage)
- or self._get_element_by_class_and_tag('morePart', 'ol', webpage)
+ cls._get_element_by_class_and_tag('morePart_musics', 'ol', webpage)
+ or cls._get_element_by_class_and_tag('morePart', 'ol', webpage)
or '')
- entries = traverse_obj(self.__yield_elements_html_by_class_and_tag(
+ entries = traverse_obj(cls.__yield_elements_html_by_class_and_tag(
'songName', 'a', song_list),
- (..., {extract_attributes}, 'href', {self._urljoin}, {self.url_result}))
+ (..., {extract_attributes}, 'href', {cls._urljoin}, {cls.url_result}))
if not entries:
if warn:
- self.report_warning('Failed to extract playlist entries, finding suitable links instead!')
- return self._extract_suitable_links(webpage, media_types)
+ cls.report_warning('Failed to extract playlist entries, finding suitable links instead!')
+
+ def strip_ie(entry):
+ # All our IEs have a _VALID_URL and set a key: don't use it
+ entry.pop('ie_key', None)
+ return entry
+
+ return (strip_ie(result) for result in
+ cls._extract_from_webpage(cls._BASE, webpage, media_types=media_types))
return entries
@@ -302,7 +319,7 @@ class BoomplayPodcastIE(BoomplayBaseIE):
webpage = self._download_webpage(url, playlist_id)
song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage)
song_list = traverse_obj(re.finditer(
- r'''(?x)
+ r'''(?ix)
"']|"[^"]*"|'[^']*')*)?
\sdata-id\s*=\s*