diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 967010826..3aba459fe 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -584,6 +584,10 @@ from .egghead import (
     EggheadCourseIE,
     EggheadLessonIE,
 )
+from .eggs import (
+    EggsIE,
+    EggsArtistIE,
+)
 from .eighttracks import EightTracksIE
 from .eitb import EitbIE
 from .elementorembed import ElementorEmbedIE
diff --git a/yt_dlp/extractor/eggs.py b/yt_dlp/extractor/eggs.py
new file mode 100644
index 000000000..368374bb7
--- /dev/null
+++ b/yt_dlp/extractor/eggs.py
@@ -0,0 +1,174 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    url_or_none,
+    unescapeHTML,
+)
+
+
+class EggsBaseIE(InfoExtractor):
+    """Shared HTML-scraping helpers for the eggs.mu extractors."""
+
+    def _parse_artist_name(self, webpage):
+        """Return the artist name found in ``webpage``.
+
+        The element with the ``artist_name`` class is tried first, then the
+        ``og:title`` meta tag; 'Unknown Artist' is returned if both fail.
+        """
+        artist = self._search_regex(
+            r'<[^>]+\bclass=(["\'])artist_name\1[^>]*>([^<]+)',
+            webpage, 'artist name', default='', group=2).strip()
+        if artist:
+            return artist
+
+        og_title = self._html_search_meta(['og:title'], webpage, 'og:title', default=None)
+        if og_title:
+            # Pattern: artist name, an optional "(...)" part, then "のEggsページ"
+            artist_match = re.search(r'(?P<artist>[^()]+)(?:\([^)]*\))?のEggsページ', og_title)
+            if artist_match:
+                return artist_match.group('artist').strip()
+
+        return 'Unknown Artist'
+
+    def _parse_single_song(self, url, webpage, default_artist='Unknown Artist'):
+        """Build the info dict for the song page ``webpage`` fetched from ``url``.
+
+        ``default_artist`` is used as the uploader unless it is empty or the
+        'Unknown Artist' placeholder, in which case the page is searched again.
+        Raises ExtractorError when no valid audio URL can be found.
+        """
+        song_id = self._search_regex(
+            r'/song/(?P<id>[^/?#&]+)', url, 'song id', default=None, group='id')
+
+        track_title = self._search_regex(
+            r'<[^>]+\bclass=(["\'])product_name\1[^>]*>\s*(?:<[^>]+>\s*)*([^<]+)',
+            webpage, 'track title', default='', group=2).strip()
+        if not track_title:
+            # Fall back to the text between 「」 in the page <title>
+            page_title = self._search_regex(
+                r'<title>(?P<title>[^<]+)</title>',
+                webpage, 'page title', default='', group='title')
+            inner_match = re.search(r'「(?P<inner>[^」]+)」', page_title)
+            if inner_match:
+                track_title = inner_match.group('inner').strip()
+
+        artist = default_artist
+        if not artist or artist == 'Unknown Artist':
+            fallback_artist = self._search_regex(
+                r'<[^>]+\bclass=(["\'])artist_name\1[^>]*>\s*<[^>]+>([^<]+)',
+                webpage, 'artist name', default='', group=2).strip()
+            if fallback_artist:
+                artist = fallback_artist
+
+        # No default here: a page without the player's data-src must fail loudly
+        audio_url = url_or_none(unescapeHTML(self._search_regex(
+            r'<[^>]+\bclass=(["\'])[^"\']*player[^"\']*\1[^>]+data-src=(["\'])(?P<audio_url>[^"\']+)\2',
+            webpage, 'audio url', group='audio_url')))
+        if not audio_url:
+            raise ExtractorError('Invalid audio URL.', expected=True)
+
+        thumbnail = (
+            self._html_search_meta(['og:image'], webpage, 'thumbnail', default=None)
+            or self._search_regex(
+                r'<[^>]+>\s*<img[^>]+\bsrc=(["\'])(?P<thumb>[^"\']+)\1',
+                webpage, 'thumbnail', default=None, group='thumb'))
+
+        return {
+            'id': song_id,
+            'url': audio_url,
+            'title': track_title or 'Unknown Title',
+            'uploader': artist,
+            'vcodec': 'none',
+            'thumbnail': thumbnail,
+        }
+
+    def _parse_artist_page(self, webpage, artist_id, artist_name):
+        """Return one info dict per playable song listed on an artist page.
+
+        Song blocks lacking an audio URL or a track id are skipped.
+        ``artist_id`` is accepted for interface compatibility but not used.
+        """
+        entries = []
+        # Backreference the tag name so each block ends at its own closing tag
+        for block_match in re.finditer(
+                r'(?s)<(?P<tag>\w+)[^>]+\bid="songs\d+"[^>]*>.*?</(?P=tag)>', webpage):
+            block = block_match.group(0)
+
+            audio_url = url_or_none(unescapeHTML(self._search_regex(
+                r'data-src=(["\'])(?P<url>https?://.*?\.(?:mp3|m4a).*?)\1',
+                block, 'audio url', default=None, group='url')))
+            track_id = self._search_regex(
+                r'data-srcid=(["\'])(?P<id>[^"\'<>]+)\1',
+                block, 'track id', default=None, group='id')
+            if not audio_url or not track_id:
+                continue
+
+            title = self._search_regex(
+                r'data-srcname=(["\'])(?P<title>[^"\']+)\1',
+                block, 'track title', default=None, group='title')
+            thumbnail = self._search_regex(
+                r'<img[^>]+src=(["\'])(?P<th>[^"\']+)\1',
+                block, 'thumbnail', default=None, group='th')
+
+            entries.append({
+                'id': track_id,
+                'url': audio_url,
+                'title': title or 'Unknown Title',
+                'uploader': artist_name,
+                'vcodec': 'none',
+                'thumbnail': thumbnail,
+            })
+
+        return entries
+
+
+class EggsIE(EggsBaseIE):
+    """Extractor for a single eggs.mu song page."""
+
+    IE_NAME = 'eggs:single'
+    _VALID_URL = r'https?://(?:www\.)?eggs\.mu/artist/(?P<artist_id>[^/]+)/song/(?P<song_id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://eggs.mu/artist/32_sunny_girl/song/0e95fd1d-4d61-4d5b-8b18-6092c551da90',
+        'info_dict': {
+            'id': '0e95fd1d-4d61-4d5b-8b18-6092c551da90',
+            'ext': 'm4a',
+            'title': 'シネマと信号',
+            'uploader': 'Sunny Girl',
+            'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_valid_url(url).group('song_id')
+        webpage = self._download_webpage(url, song_id)
+        return self._parse_single_song(url, webpage, self._parse_artist_name(webpage))
+
+
+class EggsArtistIE(EggsBaseIE):
+    """Extractor that turns an eggs.mu artist page into a playlist."""
+
+    IE_NAME = 'eggs:artist'
+    _VALID_URL = r'https?://(?:www\.)?eggs\.mu/artist/(?P<artist_id>[^/?#]+)(?:[/?#].*)?$'
+    _TESTS = [{
+        'url': 'https://eggs.mu/artist/32_sunny_girl',
+        'info_dict': {
+            'id': '32_sunny_girl',
+            'title': 'Sunny Girl',
+        },
+        'playlist_count': 18,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        # _VALID_URL also matches .../song/<id> URLs; leave those to EggsIE
+        return not EggsIE.suitable(url) and super().suitable(url)
+
+    def _real_extract(self, url):
+        artist_id = self._match_valid_url(url).group('artist_id')
+        webpage = self._download_webpage(url, artist_id)
+        artist_name = self._parse_artist_name(webpage)
+        return self.playlist_result(
+            self._parse_artist_page(webpage, artist_id, artist_name),
+            playlist_id=artist_id, playlist_title=artist_name)