From f58848011e746a53b9d9c4a45c18e47a05178a0a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:44:48 +0200 Subject: [PATCH] Move blip.tv extractors into their own file --- youtube_dl/InfoExtractors.py | 156 +---------------------------- youtube_dl/extractor/bliptv.py | 177 +++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 155 deletions(-) create mode 100644 youtube_dl/extractor/bliptv.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fb10c2ec4a..a3cc2e6f5d 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -20,6 +20,7 @@ from .extractor.common import InfoExtractor, SearchInfoExtractor from .extractor.ard import ARDIE from .extractor.arte import ArteTvIE +from .extractor.bliptv import BlipTVIE, BlipTVUserIE from .extractor.dailymotion import DailymotionIE from .extractor.gametrailers import GametrailersIE from .extractor.generic import GenericIE @@ -47,64 +48,6 @@ from .extractor.zdf import ZDFIE -class BlipTVUserIE(InfoExtractor): - """Information Extractor for blip.tv users.""" - - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = u'blip.tv:user' - - def _real_extract(self, url): - # Extract username - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, u'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage(url, username, - u'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(url, 'BlipTV') for url in urls] - return [self.playlist_result(url_entries, playlist_title = username)] class DepositFilesIE(InfoExtractor): @@ -249,103 +192,6 @@ class FacebookIE(InfoExtractor): return [info] -class BlipTVIE(InfoExtractor): - """Information extractor for blip.tv""" - - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' - _URL_EXT = r'^.*\.([a-z0-9]+)$' - IE_NAME = u'blip.tv' - - def report_direct_download(self, title): - """Report information extraction.""" - self.to_screen(u'%s: Direct download detected' % title) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - # See https://github.com/rg3/youtube-dl/issues/857 - api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P[\d\w]+)', url) - if api_mobj is not None: - url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') - urlp = compat_urllib_parse_urlparse(url) - if urlp.path.startswith('/play/'): - request = compat_urllib_request.Request(url) - response = compat_urllib_request.urlopen(request) - redirecturl = response.geturl() - rurlp = compat_urllib_parse_urlparse(redirecturl) - file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] - url = 'http://blip.tv/a/a-' + file_id - return self._real_extract(url) - - - if '?' in url: - cchar = '&' - else: - cchar = '?' - json_url = url + cchar + 'skin=json&version=2&no_wrap=1' - request = compat_urllib_request.Request(json_url) - request.add_header('User-Agent', 'iTunes/10.6.1') - self.report_extraction(mobj.group(1)) - info = None - try: - urlh = compat_urllib_request.urlopen(request) - if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download - basename = url.split('/')[-1] - title,ext = os.path.splitext(basename) - title = title.decode('UTF-8') - ext = ext.replace('.', '') - self.report_direct_download(title) - info = { - 'id': title, - 'url': url, - 'uploader': None, - 'upload_date': None, - 'title': title, - 'ext': ext, - 'urlhandle': urlh - } - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) - if info is None: # Regular URL - try: - json_code_bytes = urlh.read() - json_code = json_code_bytes.decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) - - try: - json_data = json.loads(json_code) - if 'Post' in json_data: - data = json_data['Post'] - else: - data = json_data - - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - video_url = data['media']['url'] - umobj = re.match(self._URL_EXT, video_url) - if umobj is None: - raise ValueError('Can not determine filename extension') - ext = umobj.group(1) - - info = { - 'id': data['item_id'], - 'url': video_url, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'ext': ext, - 'format': data['media']['mimeType'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'player_url': data['embedUrl'], - 'user_agent': 'iTunes/10.6.1', - } - except (ValueError,KeyError) as err: - raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) - - return [info] class MyVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py new file mode 100644 index 0000000000..df2ad4be2a --- /dev/null +++ b/youtube_dl/extractor/bliptv.py @@ -0,0 +1,177 @@ +import datetime +import json +import os +import re +import socket + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_parse_qs, + compat_str, + compat_urllib_error, + compat_urllib_parse_urlparse, + compat_urllib_request, + + ExtractorError, + unescapeHTML, +) + + +class BlipTVIE(InfoExtractor): + """Information extractor for blip.tv""" + + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' + _URL_EXT = r'^.*\.([a-z0-9]+)$' + IE_NAME = u'blip.tv' + + def report_direct_download(self, title): + """Report information extraction.""" + self.to_screen(u'%s: Direct download detected' % title) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + # See https://github.com/rg3/youtube-dl/issues/857 + api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P[\d\w]+)', url) + if api_mobj is not None: + url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + + if '?' in url: + cchar = '&' + else: + cchar = '?' + json_url = url + cchar + 'skin=json&version=2&no_wrap=1' + request = compat_urllib_request.Request(json_url) + request.add_header('User-Agent', 'iTunes/10.6.1') + self.report_extraction(mobj.group(1)) + info = None + try: + urlh = compat_urllib_request.urlopen(request) + if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download + basename = url.split('/')[-1] + title,ext = os.path.splitext(basename) + title = title.decode('UTF-8') + ext = ext.replace('.', '') + self.report_direct_download(title) + info = { + 'id': title, + 'url': url, + 'uploader': None, + 'upload_date': None, + 'title': title, + 'ext': ext, + 'urlhandle': urlh + } + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) + if info is None: # Regular URL + try: + json_code_bytes = urlh.read() + json_code = json_code_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) + + try: + json_data = json.loads(json_code) + if 'Post' in json_data: + data = json_data['Post'] + else: + data = json_data + + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') + video_url = data['media']['url'] + umobj = re.match(self._URL_EXT, video_url) + if umobj is None: + raise ValueError('Can not determine filename extension') + ext = umobj.group(1) + + info = { + 'id': data['item_id'], + 'url': video_url, + 'uploader': data['display_name'], + 'upload_date': upload_date, + 'title': data['title'], + 'ext': ext, + 'format': data['media']['mimeType'], + 'thumbnail': data['thumbnailUrl'], + 'description': data['description'], + 'player_url': data['embedUrl'], + 'user_agent': 'iTunes/10.6.1', + } + except (ValueError,KeyError) as err: + raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) + + return [info] + + +class BlipTVUserIE(InfoExtractor): + """Information Extractor for blip.tv users.""" + + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' + _PAGE_SIZE = 12 + IE_NAME = u'blip.tv:user' + + def _real_extract(self, url): + # Extract username + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + username = mobj.group(1) + + page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' + + page = self._download_webpage(url, username, u'Downloading user page') + mobj = re.search(r'data-users-id="([^"]+)"', page) + page_base = page_base % mobj.group(1) + + + # Download video ids using BlipTV Ajax calls. Result size per + # query is limited (currently to 12 videos) so we need to query + # page by page until there are no video ids - it means we got + # all of them. + + video_ids = [] + pagenum = 1 + + while True: + url = page_base + "&page=" + str(pagenum) + page = self._download_webpage(url, username, + u'Downloading video ids from page %d' % pagenum) + + # Extract video identifiers + ids_in_page = [] + + for mobj in re.finditer(r'href="/([^"]+)"', page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(unescapeHTML(mobj.group(1))) + + video_ids.extend(ids_in_page) + + # A little optimization - if current page is not + # "full", ie. does not contain PAGE_SIZE video ids then + # we can assume that this page is the last one - there + # are no more ids on further pages - no need to query + # again. + + if len(ids_in_page) < self._PAGE_SIZE: + break + + pagenum += 1 + + urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + url_entries = [self.url_result(url, 'BlipTV') for url in urls] + return [self.playlist_result(url_entries, playlist_title = username)]