[spotify] Detect iframe embeds (#3430)

Authored by: fstirlitz
2024-12-27 21:59:17 +01:00 · 2022-04-14 13:22:47 +00:00 · 2022-04-14 13:22:47 +00:00 · a49e777d59
commit a49e777d59
parent cda1bc5197
2 changed files with 18 additions and 3 deletions
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@ -67,6 +67,7 @@ from .simplecast import SimplecastIE
 from .soundcloud import SoundcloudEmbedIE
 from .spankwire import SpankwireIE
 from .sportbox import SportBoxIE
+from .spotify import SpotifyBaseIE
 from .springboardplatform import SpringboardPlatformIE
 from .svt import SVTIE
 from .teachable import TeachableIE
@ -3164,6 +3165,11 @@ class GenericIE(InfoExtractor):
        if sportbox_urls:
            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())

+        # Look for embedded Spotify player
+        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        if spotify_urls:
+            return self.playlist_from_matches(spotify_urls, video_id, video_title)
+
        # Look for embedded XHamster player
        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
        if xhamster_urls:
--- a/yt_dlp/extractor/spotify.py
+++ b/yt_dlp/extractor/spotify.py
@ -19,7 +19,7 @@ class SpotifyBaseIE(InfoExtractor):
        'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0',
        'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d',
    }
-    _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)'
+    _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)'

    def _real_initialize(self):
        self._ACCESS_TOKEN = self._download_json(
@ -93,11 +93,17 @@ class SpotifyBaseIE(InfoExtractor):
            'series': series,
        }

+    @classmethod
+    def _extract_embed_urls(cls, webpage):
+        """Return the src URLs of all Spotify embed/embed-podcast player iframes found in webpage."""
+        return re.findall(
+            r'<iframe[^>]+src="(https?://open\.spotify\.com/embed(?:-podcast)?/[^"]+)"', webpage)
+

 class SpotifyIE(SpotifyBaseIE):
    IE_NAME = 'spotify'
    _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode'
-    _TEST = {
+    _TESTS = [{
        'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo',
        'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b',
        'info_dict': {
@ -109,7 +115,10 @@ class SpotifyIE(SpotifyBaseIE):
            'release_date': '20201217',
            'series': "The Guardian's Audio Long Reads",
        }
-    }
+    }, {
+        'url': 'https://open.spotify.com/embed/episode/4TvCsKKs2thXmarHigWvXE?si=7eatS8AbQb6RxqO2raIuWA',
+        'only_matching': True,
+    }]

    def _real_extract(self, url):
        episode_id = self._match_id(url)