From 6d2de79b7a419e822ed3e2f1308fde121b413094 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 13 Oct 2024 23:07:33 +1300 Subject: [PATCH] BoomPlayGenericPlaylistIE, BoomPlaySearchIE --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/boomplay.py | 152 ++++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1abca1ed9..5208639e3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -282,9 +282,11 @@ from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .boomplay import ( BoomPlayEpisodeIE, + BoomPlayGenericPlaylistIE, BoomPlayMusicIE, BoomPlayPlaylistIE, BoomPlayPodcastIE, + BoomPlaySearchIE, BoomPlayVideoIE, ) from .boosty import BoostyIE diff --git a/yt_dlp/extractor/boomplay.py b/yt_dlp/extractor/boomplay.py index dba8a1c9c..692f4d98b 100644 --- a/yt_dlp/extractor/boomplay.py +++ b/yt_dlp/extractor/boomplay.py @@ -2,22 +2,29 @@ import base64 import functools import json import re +import urllib.parse -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..aes import aes_cbc_decrypt_bytes, aes_cbc_encrypt_bytes, unpad_pkcs7 from ..utils import ( ExtractorError, clean_html, + extract_attributes, get_element_by_attribute, get_element_by_class, get_elements_by_attribute, int_or_none, + join_nonempty, merge_dicts, + orderedSet, + parse_count, parse_duration, strip_or_none, unified_strdate, url_or_none, urlencode_postdata, + urljoin, + variadic, ) from ..utils.traversal import traverse_obj @@ -27,6 +34,14 @@ class BoomPlayBaseIE(InfoExtractor): # Note that the real key/iv differs from `lhx.AESUtils.key`/`lhx.AESUtils.iv` _KEY = b'boomplayVr3xopAM' _IV = b'boomplay8xIsKTn9' + _BASE = 'https://www.boomplay.com' + _MEDIA_TYPES = ('songs', 'video', 'episode', 'podcasts', 'playlists', 'artists', 'albums') + + @classmethod + def _urljoin(cls, path): + if not hasattr(path, 'startswith') or path.startswith('javascript:'): + return None + return url_or_none(urljoin(base=cls._BASE, path=path)) def _get_playurl(self, item_id, item_type): resp = self._download_json( @@ -49,7 +64,6 @@ class BoomPlayBaseIE(InfoExtractor): if url := url_or_none(self._get_playurl(_id, item_type)): return [{ 'format_id': '0', - 'vcodec': 'none' if item_type == 'MUSIC' else None, 'url': url, 'http_headers': { 'Origin': 'https://www.boomplay.com', @@ -79,12 +93,12 @@ class BoomPlayBaseIE(InfoExtractor): metadata_entries.extend(re.findall(r'(?s)
  • (?P.*?)
  • ', details_section) or []) page_metadata = { 'id': _id, - 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=''), + 'title': self._html_search_regex(r'

    ([^<]+)

    ', metadata_div, 'title', default=None), 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=''), - 'like_count': int_or_none(get_element_by_class('btn_favorite', metadata_div)), - 'repost_count': int_or_none(get_element_by_class('btn_share', metadata_div)), - 'comment_count': int_or_none(get_element_by_class('btn_comment', metadata_div)), + 'like_count': parse_count(get_element_by_class('btn_favorite', metadata_div)), + 'repost_count': parse_count(get_element_by_class('btn_share', metadata_div)), + 'comment_count': parse_count(get_element_by_class('btn_comment', metadata_div)), 'duration': parse_duration(get_element_by_class('btn_duration', metadata_div)), 'upload_date': unified_strdate(strip_or_none(get_element_by_class('btn_pubDate', metadata_div))), 'description': description, @@ -104,7 +118,55 @@ class BoomPlayBaseIE(InfoExtractor): page_metadata['release_year'] = int_or_none(v) return page_metadata - extract = lambda self, url: self.write_debug(json.dumps(a := super().extract(url), indent=2)) or a # rm + def _extract_suitable_links(self, webpage, media_types): + if not media_types: + media_types = self._MEDIA_TYPES + media_types = list(variadic(media_types)) + + for idx, v in enumerate(media_types): + media_types[idx] = re.escape(v) if v in self._MEDIA_TYPES else '' + media_types = join_nonempty(*media_types, delim='|') + return orderedSet(traverse_obj(re.finditer( + rf'''(?x) + "']|"[^"]*"|'[^']*')*)? + (?<=\s)href\s*=\s*(?P<_q>['"]) + (?: + (?!javascript:)(?P/(?:{media_types})/\d+?) + ) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + ''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) + + def _extract_playlist_entries(self, webpage, media_types, warn=True): + song_list = strip_or_none( + get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])morePart_musics(?=[\'"\s])[^\'"]*', webpage, + tag='ol', escape_value=False) + or get_element_by_attribute( + 'class', r'[^\'"]*(?<=[\'"\s])morePart(?=[\'"\s])[^\'"]*', webpage, + tag='ol', escape_value=False) + or '') + + entries = traverse_obj(re.finditer( + r'''(?x) + "']|"[^"]*"|'[^']*')*)? + (?<=\s)class\s*=\s*(?P<_q>['"]) + (?: + [^\'"]*(?<=[\'"\s])songName(?=[\'"\s])[^\'"]* + ) + (?P=_q) + (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? + > + ''', song_list), + (..., 0, {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) + if not entries: + if warn: + self.report_warning('Failed to extract playlist entries, finding suitable links instead!') + return self._extract_suitable_links(webpage, media_types) + + return entries class BoomPlayMusicIE(BoomPlayBaseIE): @@ -143,7 +205,7 @@ class BoomPlayMusicIE(BoomPlayBaseIE): 'artists': ('byArtist', ..., 'name'), 'duration': ('duration', {parse_duration}), }), { - 'formats': self._extract_formats(song_id, 'MUSIC'), + 'formats': self._extract_formats(song_id, 'MUSIC', vcodec='none'), }) @@ -217,6 +279,7 @@ class BoomPlayPodcastIE(BoomPlayBaseIE): 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/12/22/6f9cf97ad6f846a0a7882c98dfcf4f8c_320_320.jpg', 'repost_count': int, 'comment_count': int, + 'like_count': int, }, } @@ -253,6 +316,7 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/08/19/d05d431ee616412caeacd7f78f4f68f5_320_320.jpeg', 'repost_count': int, 'comment_count': int, + 'like_count': int, 'description': 'md5:7ebdffc5137c77acb62acb3c89248445', }, 'playlist_count': 10, @@ -281,3 +345,75 @@ class BoomPlayPlaylistIE(BoomPlayBaseIE): 'artists': ('byArtist', ..., 'name'), 'channel_url': ('byArtist', 0, '@id'), }))) + + +class BoomPlayGenericPlaylistIE(BoomPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?boomplay\.com/.+' + _TESTS = [{ + 'url': 'https://www.boomplay.com/search/default/Rise%20of%20the%20Fallen%20Heroes', + 'md5': 'c5fb4f23e6aae98064230ef3c39c2178', + 'info_dict': { + 'id': '165481965', + 'ext': 'mp3', + 'title': 'Rise of the Fallen Heroes', + 'duration': 125.0, + 'genres': ['Metal'], + 'artists': ['fatbunny'], + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/04/29/375ecda38f6f48179a93c72ab909118f_464_464.jpg', + 'channel_url': 'https://www.boomplay.com/artists/52723101', + 'comment_count': int, + 'repost_count': int, + 'album': 'Legendary Battle', + 'release_year': 2024, + 'like_count': int, + }, + }, { + 'url': 'https://www.boomplay.com/search/video/%20Autumn%20blues', + 'md5': 'd9b67ad333d2292a82922062d065352d', + 'info_dict': { + 'id': '1154892', + 'title': 'Autumn blues', + 'ext': 'mp4', + 'timestamp': 1728599214, + 'view_count': int, + 'thumbnail': 'https://source.boomplaymusic.com/group10/M00/10/10/2171dee9e1f8452e84021560729edb88.jpg', + 'description': 'Autumn blues by Lugo', + 'upload_date': '20241010', + 'duration': 177.0, + }, + 'expected_warnings': ['Failed to extract playlist entries, finding suitable links instead!'], + 'params': {'playlist_items': '1'}, + }] + + @classmethod + def suitable(cls, url): + if not any(ie.suitable(url) for ie in ( + BoomPlayEpisodeIE, + BoomPlayMusicIE, + BoomPlayPlaylistIE, + BoomPlayPodcastIE, + BoomPlayVideoIE, + )): + return super().suitable(url) + return False + + def _real_extract(self, url): + _id = self._generic_id(url) + webpage = self._download_webpage(url, _id) + # TODO: pass media types based on search types + return self.playlist_result( + self._extract_playlist_entries(webpage, self._MEDIA_TYPES), + **self._extract_page_metadata(webpage, _id)) + + +class BoomPlaySearchIE(SearchInfoExtractor): + _SEARCH_KEY = 'boomplaysearch' + _RETURN_TYPE = 'url' + _TEST = { + 'url': 'boomplaysearch:rise of the fallen heroes', + 'only_matching': True, + } + + def _search_results(self, query): + yield self.url_result( + f'https://www.boomplay.com/search/default/{urllib.parse.quote(query)}')