Accept station domains in the PBS show URL validity check

This commit is contained in:
Jesse Bannon 2024-05-11 20:56:52 -07:00
parent 53cebe3a41
commit 69f2dea115

View file

@ -1,4 +1,5 @@
import re
import urllib.parse
from .common import InfoExtractor
from ..compat import compat_str
@ -192,7 +193,7 @@ class PBSIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
(?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
(?:%s)/(?!show)(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?!show)(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
@ -763,11 +764,14 @@ class PBSKidsIE(InfoExtractor):
class PBSShowIE(InfoExtractor):
_VALID_URL = r'(?:https://)?(?:www\.)?pbs\.org\/show\/(?P<presumptive_id>[^/]+?)(?:\.html)?\/?(?:$|[?#])'
_VALID_URL = r'''(?x)https?://
(?:www\.)?(?:%s)/show\/(?P<presumptive_id>[^/]+?)(?:\.html)?\/?(?:$|[?#])
''' % '|'.join(list(zip(*PBSIE._STATIONS))[0])
_TESTS = [
# Full Show
{
'url': 'https://www.pbs.org/show/oregon-experience',
'url': 'https://video.ksps.org/show/oregon-experience/',
'info_dict': {
'id': 'oregon-experience',
'title': 'Oregon Experience',
@ -780,7 +784,7 @@ class PBSShowIE(InfoExtractor):
},
# Single Special
{
'url': 'https://www.pbs.org/show/betrayed-survivng-american-concentration-camp',
'url': 'https://video.ksps.org/show/betrayed-survivng-american-concentration-camp',
'info_dict': {
'id': 'betrayed-survivng-american-concentration-camp',
'title': 'Betrayed: Surviving an American Concentration Camp',
@ -793,7 +797,7 @@ class PBSShowIE(InfoExtractor):
},
# Non-Season Episodes (uses season 1)
{
'url': 'https://www.pbs.org/show/a-brief-history-of-the-future/',
'url': 'https://video.ksps.org/show/a-brief-history-of-the-future/',
'info_dict': {
'id': 'a-brief-history-of-the-future',
'title': 'A Brief History of the Future',
@ -810,9 +814,8 @@ class PBSShowIE(InfoExtractor):
_SHOW_JSON_SEARCH = r'GTMDataLayer\.push\('
@staticmethod
def _make_url(playlist_id):
# pbs does not show metadata, use a different station that does
return f'https://video.ksps.org/show/{playlist_id}'
def _make_url(url, playlist_id):
return f'https://{urllib.parse.urlparse(url).netloc}/show/{playlist_id}'
@staticmethod
def _extract_episode(popover_html):
@ -822,15 +825,15 @@ class PBSShowIE(InfoExtractor):
return maybe_ep[1]
return None
def _iterate_entries(self, playlist_id, season_indices):
playlist_url = self._make_url(playlist_id)
def _iterate_entries(self, url, playlist_id, season_indices):
base_url = urllib.parse.urlparse(url).netloc
for season_idx in season_indices:
season_id = f'{playlist_id}-season-{season_idx}'
season_page = self._download_webpage(
f'{playlist_url}/episodes/season/{season_idx}'
if season_idx > 0 else f'{playlist_url}/specials',
f'{url}/episodes/season/{season_idx}'
if season_idx > 0 else f'{url}/specials',
video_id=season_id
)
episodes = [
@ -850,7 +853,7 @@ class PBSShowIE(InfoExtractor):
url_kwargs['episode'] = episode_indices[i]
yield self.url_result(
url=f'https://pbs.org/video/{ep["data-video-slug"]}',
url=f'https://{base_url}/video/{ep["data-video-slug"]}',
ie=PBSIE,
video_id=ep["data-cid"],
url_transparent=True,
@ -861,7 +864,9 @@ class PBSShowIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_valid_url(url).group('presumptive_id')
webpage = self._download_webpage(self._make_url(playlist_id), playlist_id)
url = self._make_url(url=url, playlist_id=playlist_id)
webpage = self._download_webpage(url, playlist_id)
show_data = self._search_json(self._JSON_SEARCH, webpage, 'seasons', playlist_id)
playlist_description = clean_html(get_element_html_by_class(
@ -886,7 +891,7 @@ class PBSShowIE(InfoExtractor):
season_indices = [0] + season_indices
return self.playlist_result(
LazyList(self._iterate_entries(playlist_id, season_indices)),
LazyList(self._iterate_entries(url, playlist_id, season_indices)),
playlist_id=playlist_id,
playlist_title=playlist_title,
playlist_description=playlist_description,