From 611ac379bb466267aded6726f9c85e79b08168c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:34:24 +0600 Subject: [PATCH] [vpro] Fix extraction and add support for vpro playlists --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/npo.py | 35 +++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1d55275dc4..06f21064b6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -389,7 +389,7 @@ from .npo import ( NPOLiveIE, NPORadioIE, NPORadioFragmentIE, - TegenlichtVproIE, + VPROIE, WNLIE ) from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c6bf7619de..28d5c90b39 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -404,9 +404,8 @@ class NPORadioFragmentIE(InfoExtractor): } -class TegenlichtVproIE(NPOIE): - IE_NAME = 'tegenlicht.vpro.nl' - _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' 
+class VPROIE(NPOIE): + _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _TESTS = [ { @@ -416,19 +415,35 @@ class TegenlichtVproIE(NPOIE): 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, }, + { + 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', + 'info_dict': { + 'id': 'sergio-herman', + 'title': 'Sergio Herman: Fucking perfect', + }, + 'playlist_count': 2, + } ] def _real_extract(self, url): - name = url_basename(url) - webpage = self._download_webpage(url, name) - urn = self._html_search_meta('mediaurn', webpage) - info_page = self._download_json( - 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) - return self._get_info(info_page['mid']) + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) + ] + + playlist_title = self._search_regex( + r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>', + webpage, 'playlist title', default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) class WNLIE(InfoExtractor):