diff --git a/.travis.yml b/.travis.yml index fb499845e4..c53c77e07f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,4 +35,7 @@ jobs: - env: JYTHON=true; YTDL_TEST_SET=core before_install: - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi +before_script: + - rm -rf /youtube_dlc/* + - cp /youtube_dl/* /youtube_dlc script: ./devscripts/run_tests.sh diff --git a/setup.py b/setup.py index ac56e49954..f5f0bae624 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ setup( long_description=LONG_DESCRIPTION, # long_description_content_type="text/markdown", url="https://github.com/blackjack4494/youtube-dlc", - packages=find_packages(), + packages=find_packages(exclude=("youtube_dl",)), #packages=[ # 'youtube_dlc', # 'youtube_dlc.extractor', 'youtube_dlc.downloader', diff --git a/youtube-dlc.spec b/youtube-dlc.spec new file mode 100644 index 0000000000..d2f4ca4537 --- /dev/null +++ b/youtube-dlc.spec @@ -0,0 +1,33 @@ +# -*- mode: python ; coding: utf-8 -*- + +block_cipher = None + + +a = Analysis(['youtube_dlc\\__main__.py'], + pathex=['D:\\gitkraken\\youtube-dl'], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + [], + name='youtube-dlc', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + upx_exclude=[], + runtime_tmpdir=None, + console=True ) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py new file mode 100644 index 0000000000..f79d31debb --- /dev/null +++ b/youtube_dl/YoutubeDL.py @@ -0,0 +1,2417 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import absolute_import, unicode_literals + +import collections +import contextlib +import copy +import datetime +import errno +import fileinput +import io +import itertools +import json +import locale +import operator +import os +import platform +import re +import shutil +import subprocess +import socket +import sys +import time +import tokenize +import traceback +import random + +from string import ascii_letters + +from .compat import ( + compat_basestring, + compat_cookiejar, + compat_get_terminal_size, + compat_http_client, + compat_kwargs, + compat_numeric_types, + compat_os_name, + compat_str, + compat_tokenize_tokenize, + compat_urllib_error, + compat_urllib_request, + compat_urllib_request_DataHandler, +) +from .utils import ( + age_restricted, + args_to_str, + ContentTooShortError, + date_from_str, + DateRange, + DEFAULT_OUTTMPL, + determine_ext, + determine_protocol, + DownloadError, + encode_compat_str, + encodeFilename, + error_to_compat_str, + expand_path, + ExtractorError, + format_bytes, + formatSeconds, + GeoRestrictedError, + int_or_none, + ISO3166Utils, + locked_file, + make_HTTPS_handler, + MaxDownloadsReached, + orderedSet, + PagedList, + parse_filesize, + PerRequestProxyHandler, + platform_name, + PostProcessingError, + preferredencoding, + prepend_extension, + register_socks_protocols, + render_table, + replace_extension, + SameFileError, + sanitize_filename, + sanitize_path, + sanitize_url, + sanitized_Request, + std_headers, + str_or_none, + subtitles_filename, + UnavailableVideoError, + url_basename, + version_tuple, + write_json_file, + write_string, + YoutubeDLCookieJar, + YoutubeDLCookieProcessor, + YoutubeDLHandler, + YoutubeDLRedirectHandler, +) +from .cache import Cache +from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER +from .extractor.openload import PhantomJSwrapper +from .downloader import get_suitable_downloader +from .downloader.rtmp import rtmpdump_version +from .postprocessor import ( + FFmpegFixupM3u8PP, + FFmpegFixupM4aPP, + FFmpegFixupStretchedPP, + FFmpegMergerPP, + FFmpegPostProcessor, + get_postprocessor, +) +from .version import __version__ + +if compat_os_name == 'nt': + import ctypes + + +class YoutubeDL(object): + """YoutubeDL class. + + YoutubeDL objects are the ones responsible of downloading the + actual video file and writing it to disk if the user has requested + it, among some other tasks. In most cases there should be one per + program. As, given a video URL, the downloader doesn't know how to + extract all the needed information, task that InfoExtractors do, it + has to pass the URL to one of them. + + For this, YoutubeDL objects have a method that allows + InfoExtractors to be registered in a given order. When it is passed + a URL, the YoutubeDL object handles it to the first InfoExtractor it + finds that reports being able to handle it. The InfoExtractor extracts + all the information about the video or videos the URL refers to, and + YoutubeDL process the extracted information, possibly using a File + Downloader to download the video. + + YoutubeDL objects accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. These options are available through the params + attribute for the InfoExtractors to use. The YoutubeDL also + registers itself as the downloader in charge for the InfoExtractors + that are added to it, so this is a "mutual registration". + + Available options: + + username: Username for authentication purposes. + password: Password for authentication purposes. + videopassword: Password for accessing a video. + ap_mso: Adobe Pass multiple-system operator identifier. + ap_username: Multiple-system operator account username. + ap_password: Multiple-system operator account password. + usenetrc: Use netrc for authentication instead. + verbose: Print additional info to stdout. + quiet: Do not print messages to stdout. + no_warnings: Do not print out anything for warnings. + forceurl: Force printing final URL. + forcetitle: Force printing title. + forceid: Force printing ID. + forcethumbnail: Force printing thumbnail URL. + forcedescription: Force printing description. + forcefilename: Force printing final filename. + forceduration: Force printing duration. + forcejson: Force printing info_dict as JSON. + dump_single_json: Force printing the info_dict of the whole playlist + (or video) as a single JSON line. + simulate: Do not download the video files. + format: Video format code. See options.py for more information. + outtmpl: Template for output names. + restrictfilenames: Do not allow "&" and spaces in file names + ignoreerrors: Do not stop on download errors. + force_generic_extractor: Force downloader to use the generic extractor + nooverwrites: Prevent overwriting files. + playliststart: Playlist item to start at. + playlistend: Playlist item to end at. + playlist_items: Specific indices of playlist to download. + playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. + logger: Log messages to a logging.Logger instance. + logtostderr: Log messages to stderr instead of stdout. + writedescription: Write the video description to a .description file + writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file + writethumbnail: Write the thumbnail image to a file + write_all_thumbnails: Write all thumbnail formats to files + writesubtitles: Write the video subtitles to a file + writeautomaticsub: Write the automatically generated subtitles to a file + allsubtitles: Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + listsubtitles: Lists all available subtitles for the video + subtitlesformat: The format code for subtitles + subtitleslangs: List of languages of the subtitles to download + keepvideo: Keep the video file after post-processing + daterange: A DateRange object, download only if the upload_date is in the range. + skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + False to disable filesystem cache. + noplaylist: Download single video instead of a playlist if in doubt. + age_limit: An integer representing the user's age in years. + Unsuitable videos for the given age are skipped. + min_views: An integer representing the minimum view count the video + must have in order to not be skipped. + Videos without view count information are always + downloaded. None for no limit. + max_views: An integer representing the maximum view count. + Videos that are more popular than that are not + downloaded. + Videos without view count information are always + downloaded. None for no limit. + download_archive: File name of a file where all downloads are recorded. + Videos already present in the file are not downloaded + again. + cookiefile: File name where cookies should be read from and dumped to. + nocheckcertificate:Do not verify SSL certificates + prefer_insecure: Use HTTP instead of HTTPS to retrieve information. + At the moment, this is only supported by YouTube. + proxy: URL of the proxy server to use + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. + socket_timeout: Time to wait for unresponsive hosts, in seconds + bidi_workaround: Work around buggy terminals without bidirectional text + support, using fridibi + debug_printtraffic:Print out sent and received HTTP traffic + include_ads: Download ads as well + default_search: Prepend this string if an input url is not valid. + 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. + extract_flat: Do not resolve URLs, return the immediate result. + Pass in 'in_playlist' to only show this behavior for + playlist items. + postprocessors: A list of dictionaries, each with an entry + * key: The name of the postprocessor. See + youtube_dlc/postprocessor/__init__.py for a list. + as well as any further keyword arguments for the + postprocessor. + progress_hooks: A list of functions that get called on download + progress, with a dictionary with the entries + * status: One of "downloading", "error", or "finished". + Check this first and ignore unknown values. + + If status is one of "downloading", or "finished", the + following properties may also be present: + * filename: The final filename (always present) + * tmpfilename: The filename we're currently writing to + * downloaded_bytes: Bytes on disk + * total_bytes: Size of the whole file, None if unknown + * total_bytes_estimate: Guess of the eventual file size, + None if unavailable. + * elapsed: The number of seconds since download started. + * eta: The estimated time in seconds, None if unknown + * speed: The download speed in bytes/second, None if + unknown + * fragment_index: The counter of the currently + downloaded video fragment. + * fragment_count: The number of fragments (= individual + files that will be merged) + + Progress hooks are guaranteed to be called at least once + (with status "finished") if the download is successful. + merge_output_format: Extension to use when merging formats. + fixup: Automatically correct known faults of the file. + One of: + - "never": do nothing + - "warn": only emit a warning + - "detect_or_warn": check whether we can do anything + about it, warn otherwise (default) + source_address: Client-side IP address to bind to. + call_home: Boolean, true iff we are allowed to contact the + youtube-dlc servers for debugging. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. + listformats: Print an overview of available video formats and exit. + list_thumbnails: Print a table of all thumbnails and exit. + match_filter: A function that gets called with the info_dict of + every video. + If it returns a message, the video is ignored. + If it returns None, the video is downloaded. + match_filter_func in utils.py is one example for this. + no_color: Do not emit color codes in output. + geo_bypass: Bypass geographic restriction via faking X-Forwarded-For + HTTP header + geo_bypass_country: + Two-letter ISO 3166-2 country code that will be used for + explicit geographic restriction bypassing via faking + X-Forwarded-For HTTP header + geo_bypass_ip_block: + IP range in CIDR notation that will be used similarly to + geo_bypass_country + + The following options determine which downloader is picked: + external_downloader: Executable of the external downloader to call. + None or unset for standard (built-in) downloader. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. + + The following parameters are not used by YoutubeDL itself, they are used by + the downloader (see youtube_dlc/downloader/common.py): + nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, + noresizebuffer, retries, continuedl, noprogress, consoletitle, + xattr_set_filesize, external_downloader_args, hls_use_mpegts, + http_chunk_size. + + The following options are used by the post processors: + prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. + ffmpeg_location: Location of the ffmpeg/avconv binary; either the path + to the binary or its containing directory. + postprocessor_args: A list of additional command-line arguments for the + postprocessor. + + The following options are used by the Youtube extractor: + youtube_include_dash_manifest: If True (default), DASH manifests and related + data will be downloaded and processed by extractor. + You can reduce network I/O by disabling it if you don't + care about DASH. + """ + + _NUMERIC_FIELDS = set(( + 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + 'timestamp', 'upload_year', 'upload_month', 'upload_day', + 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', + 'average_rating', 'comment_count', 'age_limit', + 'start_time', 'end_time', + 'chapter_number', 'season_number', 'episode_number', + 'track_number', 'disc_number', 'release_year', + 'playlist_index', + )) + + params = None + _ies = [] + _pps = [] + _download_retcode = None + _num_downloads = None + _screen_file = None + + def __init__(self, params=None, auto_init=True): + """Create a FileDownloader object with the given options.""" + if params is None: + params = {} + self._ies = [] + self._ies_instances = {} + self._pps = [] + self._progress_hooks = [] + self._download_retcode = 0 + self._num_downloads = 0 + self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self._err_file = sys.stderr + self.params = { + # Default parameters + 'nocheckcertificate': False, + } + self.params.update(params) + self.cache = Cache(self) + + def check_deprecated(param, option, suggestion): + if self.params.get(param) is not None: + self.report_warning( + '%s is deprecated. Use %s instead.' % (option, suggestion)) + return True + return False + + if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + + check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits') + check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"') + check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') + + if params.get('bidi_workaround', False): + try: + import pty + master, slave = pty.openpty() + width = compat_get_terminal_size().columns + if width is None: + width_args = [] + else: + width_args = ['-w', str(width)] + sp_kwargs = dict( + stdin=subprocess.PIPE, + stdout=slave, + stderr=self._err_file) + try: + self._output_process = subprocess.Popen( + ['bidiv'] + width_args, **sp_kwargs + ) + except OSError: + self._output_process = subprocess.Popen( + ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_channel = os.fdopen(master, 'rb') + except OSError as ose: + if ose.errno == errno.ENOENT: + self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise + + if (sys.platform != 'win32' + and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not params.get('restrictfilenames', False)): + # Unicode filesystem API will throw errors (#1474, #13027) + self.report_warning( + 'Assuming --restrict-filenames since file system encoding ' + 'cannot encode all characters. ' + 'Set the LC_ALL environment variable to fix this.') + self.params['restrictfilenames'] = True + + if isinstance(params.get('outtmpl'), bytes): + self.report_warning( + 'Parameter outtmpl is bytes, but should be a unicode string. ' + 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') + + self._setup_opener() + + if auto_init: + self.print_debug_header() + self.add_default_info_extractors() + + for pp_def_raw in self.params.get('postprocessors', []): + pp_class = get_postprocessor(pp_def_raw['key']) + pp_def = dict(pp_def_raw) + del pp_def['key'] + pp = pp_class(self, **compat_kwargs(pp_def)) + self.add_post_processor(pp) + + for ph in self.params.get('progress_hooks', []): + self.add_progress_hook(ph) + + register_socks_protocols() + + def warn_if_short_id(self, argv): + # short YouTube ID starting with dash? + idxs = [ + i for i, a in enumerate(argv) + if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] + if idxs: + correct_argv = ( + ['youtube-dlc'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] + ) + self.report_warning( + 'Long argument string detected. ' + 'Use -- to separate parameters and URLs, like this:\n%s\n' % + args_to_str(correct_argv)) + + def add_info_extractor(self, ie): + """Add an InfoExtractor object to the end of the list.""" + self._ies.append(ie) + if not isinstance(ie, type): + self._ies_instances[ie.ie_key()] = ie + ie.set_downloader(self) + + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. + """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + + def add_default_info_extractors(self): + """ + Add the InfoExtractors returned by gen_extractors to the end of the list + """ + for ie in gen_extractor_classes(): + self.add_info_extractor(ie) + + def add_post_processor(self, pp): + """Add a PostProcessor object to the end of the chain.""" + self._pps.append(pp) + pp.set_downloader(self) + + def add_progress_hook(self, ph): + """Add the progress hook (currently only for the file downloader)""" + self._progress_hooks.append(ph) + + def _bidi_workaround(self, message): + if not hasattr(self, '_output_channel'): + return message + + assert hasattr(self, '_output_process') + assert isinstance(message, compat_str) + line_count = message.count('\n') + 1 + self._output_process.stdin.write((message + '\n').encode('utf-8')) + self._output_process.stdin.flush() + res = ''.join(self._output_channel.readline().decode('utf-8') + for _ in range(line_count)) + return res[:-len('\n')] + + def to_screen(self, message, skip_eol=False): + """Print message to stdout if not in quiet mode.""" + return self.to_stdout(message, skip_eol, check_quiet=True) + + def _write_string(self, s, out=None): + write_string(s, out=out, encoding=self.params.get('encoding')) + + def to_stdout(self, message, skip_eol=False, check_quiet=False): + """Print message to stdout if not in quiet mode.""" + if self.params.get('logger'): + self.params['logger'].debug(message) + elif not check_quiet or not self.params.get('quiet', False): + message = self._bidi_workaround(message) + terminator = ['\n', ''][skip_eol] + output = message + terminator + + self._write_string(output, self._screen_file) + + def to_stderr(self, message): + """Print message to stderr.""" + assert isinstance(message, compat_str) + if self.params.get('logger'): + self.params['logger'].error(message) + else: + message = self._bidi_workaround(message) + output = message + '\n' + self._write_string(output, self._err_file) + + def to_console_title(self, message): + if not self.params.get('consoletitle', False): + return + if compat_os_name == 'nt': + if ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + self._write_string('\033]0;%s\007' % message, self._screen_file) + + def save_console_title(self): + if not self.params.get('consoletitle', False): + return + if self.params.get('simulate', False): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: + # Save the title on stack + self._write_string('\033[22;0t', self._screen_file) + + def restore_console_title(self): + if not self.params.get('consoletitle', False): + return + if self.params.get('simulate', False): + return + if compat_os_name != 'nt' and 'TERM' in os.environ: + # Restore the title from stack + self._write_string('\033[23;0t', self._screen_file) + + def __enter__(self): + self.save_console_title() + return self + + def __exit__(self, *args): + self.restore_console_title() + + if self.params.get('cookiefile') is not None: + self.cookiejar.save(ignore_discard=True, ignore_expires=True) + + def trouble(self, message=None, tb=None): + """Determine action to take when a download problem appears. + + Depending on if the downloader has been configured to ignore + download errors or not, this method may throw an exception or + not when errors are found, after printing the message. + + tb, if given, is additional traceback information. + """ + if message is not None: + self.to_stderr(message) + if self.params.get('verbose'): + if tb is None: + if sys.exc_info()[0]: # if .trouble has been called from an except block + tb = '' + if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) + tb += encode_compat_str(traceback.format_exc()) + else: + tb_data = traceback.format_list(traceback.extract_stack()) + tb = ''.join(tb_data) + self.to_stderr(tb) + if not self.params.get('ignoreerrors', False): + if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + exc_info = sys.exc_info()[1].exc_info + else: + exc_info = sys.exc_info() + raise DownloadError(message, exc_info) + self._download_retcode = 1 + + def report_warning(self, message): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if self.params.get('logger') is not None: + self.params['logger'].warning(message) + else: + if self.params.get('no_warnings'): + return + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': + _msg_header = '\033[0;33mWARNING:\033[0m' + else: + _msg_header = 'WARNING:' + warning_message = '%s %s' % (_msg_header, message) + self.to_stderr(warning_message) + + def report_error(self, message, tb=None): + ''' + Do the same as trouble, but prefixes the message with 'ERROR:', colored + in red if stderr is a tty file. + ''' + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': + _msg_header = '\033[0;31mERROR:\033[0m' + else: + _msg_header = 'ERROR:' + error_message = '%s %s' % (_msg_header, message) + self.trouble(error_message, tb) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen('[download] %s has already been downloaded' % file_name) + except UnicodeEncodeError: + self.to_screen('[download] The file has already been downloaded') + + def prepare_filename(self, info_dict): + """Generate the output filename.""" + try: + template_dict = dict(info_dict) + + template_dict['epoch'] = int(time.time()) + autonumber_size = self.params.get('autonumber_size') + if autonumber_size is None: + autonumber_size = 5 + template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads + if template_dict.get('resolution') is None: + if template_dict.get('width') and template_dict.get('height'): + template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) + elif template_dict.get('height'): + template_dict['resolution'] = '%sp' % template_dict['height'] + elif template_dict.get('width'): + template_dict['resolution'] = '%dx?' % template_dict['width'] + + sanitize = lambda k, v: sanitize_filename( + compat_str(v), + restricted=self.params.get('restrictfilenames'), + is_id=(k == 'id' or k.endswith('_id'))) + template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) + for k, v in template_dict.items() + if v is not None and not isinstance(v, (list, tuple, dict))) + template_dict = collections.defaultdict(lambda: 'NA', template_dict) + + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) + + # For fields playlist_index and autonumber convert all occurrences + # of %(field)s to %(field)0Nd for backward compatibility + field_size_compat_map = { + 'playlist_index': len(str(template_dict['n_entries'])), + 'autonumber': autonumber_size, + } + FIELD_SIZE_COMPAT_RE = r'(?autonumber|playlist_index)\)s' + mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) + if mobj: + outtmpl = re.sub( + FIELD_SIZE_COMPAT_RE, + r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], + outtmpl) + + # Missing numeric fields used together with integer presentation types + # in format specification will break the argument substitution since + # string 'NA' is returned for missing fields. We will patch output + # template for missing fields to meet string presentation type. + for numeric_field in self._NUMERIC_FIELDS: + if numeric_field not in template_dict: + # As of [1] format syntax is: + # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type + # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting + FORMAT_RE = r'''(?x) + (? max_views: + return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) + if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): + return 'Skipping "%s" because it is age restricted' % video_title + if self.in_download_archive(info_dict): + return '%s has already been recorded in archive' % video_title + + if not incomplete: + match_filter = self.params.get('match_filter') + if match_filter is not None: + ret = match_filter(info_dict) + if ret is not None: + return ret + + return None + + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value in extra_info.items(): + info_dict.setdefault(key, value) + + def extract_info(self, url, download=True, ie_key=None, extra_info={}, + process=True, force_generic_extractor=False): + ''' + Returns a list with a dictionary for each video we find. + If 'download', also downloads the videos. + extra_info is a dict containing the extra values to add to each result + ''' + + if not ie_key and force_generic_extractor: + ie_key = 'Generic' + + if ie_key: + ies = [self.get_info_extractor(ie_key)] + else: + ies = self._ies + + for ie in ies: + if not ie.suitable(url): + continue + + ie = self.get_info_extractor(ie.ie_key()) + if not ie.working(): + self.report_warning('The program functionality for this site has been marked as broken, ' + 'and will probably not work.') + + try: + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + break + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + break + except ExtractorError as e: # An error we somewhat expected + self.report_error(compat_str(e), e.format_traceback()) + break + except MaxDownloadsReached: + raise + except Exception as e: + if self.params.get('ignoreerrors', False): + self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) + break + else: + raise + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def add_default_extra_info(self, ie_result, ie, url): + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'webpage_url_basename': url_basename(url), + 'extractor_key': ie.ie_key(), + }) + + def process_ie_result(self, ie_result, download=True, extra_info={}): + """ + Take the result of the ie(may be modified) and resolve all unresolved + references (URLs, playlist items). + + It will also download the videos if 'download'. + Returns the resolved ie_result. + """ + result_type = ie_result.get('_type', 'video') + + if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url(ie_result['url']) + extract_flat = self.params.get('extract_flat', False) + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) + or extract_flat is True): + self.__forced_printings( + ie_result, self.prepare_filename(ie_result), + incomplete=True) + return ie_result + + if result_type == 'video': + self.add_extra_info(ie_result, extra_info) + return self.process_video_result(ie_result, download=download) + elif result_type == 'url': + # We have to add extra_info to the results because it may be + # contained in a playlist + return self.extract_info(ie_result['url'], + download, + ie_key=ie_result.get('ie_key'), + extra_info=extra_info) + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + + # extract_info may return None when ignoreerrors is enabled and + # extraction failed with an error, don't crash and return early + # in this case + if not info: + return info + + force_properties = dict( + (k, v) for k, v in ie_result.items() if v is not None) + for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'): + if f in force_properties: + del force_properties[f] + new_result = info.copy() + new_result.update(force_properties) + + # Extracted info may not be a video result (i.e. + # info.get('_type', 'video') != video) but rather an url or + # url_transparent. In such cases outer metadata (from ie_result) + # should be propagated to inner one (info). For this to happen + # _type of info should be overridden with url_transparent. This + # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163. + if new_result.get('_type') == 'url': + new_result['_type'] = 'url_transparent' + + return self.process_ie_result( + new_result, download=download, extra_info=extra_info) + elif result_type in ('playlist', 'multi_video'): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.process_ie_result(entry, + download=download, + extra_info=extra) + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + elif result_type == 'compat_list': + self.report_warning( + 'Extractor %s returned a compat_list result. ' + 'It needs to be updated.' % ie_result.get('extractor')) + + def _fixup(r): + self.add_extra_info( + r, + { + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + ) + return r + ie_result['entries'] = [ + self.process_ie_result(_fixup(r), download, extra_info) + for r in ie_result['entries'] + ] + return ie_result + else: + raise Exception('Invalid result type: %s' % result_type) + + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " + + OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s* + (?Pwidth|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) + \s*(?P%s)(?P\s*\?)?\s* + (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) + $ + ''' % '|'.join(map(re.escape, OPERATORS.keys()))) + m = operator_rex.search(filter_spec) + if m: + try: + comparison_value = int(m.group('value')) + except ValueError: + comparison_value = parse_filesize(m.group('value')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('value') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid value %r in format specification %r' % ( + m.group('value'), filter_spec)) + op = OPERATORS[m.group('op')] + + if not m: + STR_OPERATORS = { + '=': operator.eq, + '^=': lambda attr, value: attr.startswith(value), + '$=': lambda attr, value: attr.endswith(value), + '*=': lambda attr, value: value in attr, + } + str_operator_rex = re.compile(r'''(?x) + \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?P!\s*)?(?P%s)(?P\s*\?)? + \s*(?P[a-zA-Z0-9._-]+) + \s*$ + ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) + m = str_operator_rex.search(filter_spec) + if m: + comparison_value = m.group('value') + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op(attr, value) + else: + op = str_op + + if not m: + raise ValueError('Invalid filter specification %r' % filter_spec) + + def _filter(f): + actual_value = f.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + return _filter + + def _default_format_spec(self, info_dict, download=True): + + def can_merge(): + merger = FFmpegMergerPP(self) + return merger.available and merger.can_merge() + + def prefer_best(): + if self.params.get('simulate', False): + return False + if not download: + return False + if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': + return True + if info_dict.get('is_live'): + return True + if not can_merge(): + return True + return False + + req_format_list = ['bestvideo+bestaudio', 'best'] + if prefer_best(): + req_format_list.reverse() + return '/'.join(req_format_list) + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + GROUP = 'GROUP' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the surrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) + selectors.append(current_selector) + current_selector = None + elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(ctx): + for f in fs: + for format in f(ctx): + yield format + return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(ctx): + for f in fs: + picked_formats = list(f(ctx)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(ctx): + formats = list(ctx['formats']) + if not formats: + return + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fallback to best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + # Formats must be opposite (video+audio) + if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': + self.report_error( + 'Both formats %s and %s are video-only, you must specify "-f video+audio"' + % (format_1, format_2)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) + + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): + yield _merge(pair) + + filters = [self._build_format_filter(f) for f in selector.filters] + + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) + for _filter in filters: + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) + return final_selector + + stream = io.BytesIO(format_spec.encode('utf-8')) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) + + def _calc_headers(self, info_dict): + res = std_headers.copy() + + add_headers = info_dict.get('http_headers') + if add_headers: + res.update(add_headers) + + cookies = self._calc_cookies(info_dict) + if cookies: + res['Cookie'] = cookies + + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + + return res + + def _calc_cookies(self, info_dict): + pr = sanitized_Request(info_dict['url']) + self.cookiejar.add_cookie_header(pr) + return pr.get_header('Cookie') + + def process_video_result(self, info_dict, download=True): + assert info_dict.get('_type', 'video') == 'video' + + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result') + + def report_force_conversion(field, field_not, conversion): + self.report_warning( + '"%s" field is not %s - forcing %s conversion, there is an error in extractor' + % (field, field_not, conversion)) + + def sanitize_string_field(info, string_field): + field = info.get(string_field) + if field is None or isinstance(field, compat_str): + return + report_force_conversion(string_field, 'a string', 'string') + info[string_field] = compat_str(field) + + def sanitize_numeric_fields(info): + for numeric_field in self._NUMERIC_FIELDS: + field = info.get(numeric_field) + if field is None or isinstance(field, compat_numeric_types): + continue + report_force_conversion(numeric_field, 'numeric', 'int') + info[numeric_field] = int_or_none(field) + + sanitize_string_field(info_dict, 'id') + sanitize_numeric_fields(info_dict) + + if 'playlist' not in info_dict: + # It isn't part of a playlist + info_dict['playlist'] = None + info_dict['playlist_index'] = None + + thumbnails = info_dict.get('thumbnails') + if thumbnails is None: + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] + if thumbnails: + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', t.get('url'))) + for i, t in enumerate(thumbnails): + t['url'] = sanitize_url(t['url']) + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + if t.get('id') is None: + t['id'] = '%d' % i + + if self.params.get('list_thumbnails'): + self.list_thumbnails(info_dict) + return + + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: + info_dict['thumbnail'] = thumbnails[-1]['url'] + + if 'display_id' not in info_dict and 'id' in info_dict: + info_dict['display_id'] = info_dict['id'] + + if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass + + # Auto generate title fields corresponding to the *_number fields when missing + # in order to always have clean titles. This is very common for TV series. + for field in ('chapter', 'season', 'episode'): + if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') + subtitles = info_dict.get('subtitles') + + if self.params.get('listsubtitles', False): + if 'automatic_captions' in info_dict: + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') + return + + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], subtitles, automatic_captions) + + # We now pick which formats have to be downloaded + if info_dict.get('formats') is None: + # There's only one format available + formats = [info_dict] + else: + formats = info_dict['formats'] + + if not formats: + raise ExtractorError('No video formats found!') + + def is_wellformed(f): + url = f.get('url') + if not url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats)) + + formats_dict = {} + + # We check that all the formats have the format and format_id fields + for i, format in enumerate(formats): + sanitize_string_field(format, 'format_id') + sanitize_numeric_fields(format) + format['url'] = sanitize_url(format['url']) + if not format.get('format_id'): + format['format_id'] = compat_str(i) + else: + # Sanitize format_id from characters used in format selector expression + format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): + if format.get('format') is None: + format['format'] = '{id} - {res}{note}'.format( + id=format['format_id'], + res=self.format_resolution(format), + note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', + ) + # Automatically determine file extension if missing + if format.get('ext') is None: + format['ext'] = determine_ext(format['url']).lower() + # Automatically determine protocol if missing (useful for format + # selection purposes) + if format.get('protocol') is None: + format['protocol'] = determine_protocol(format) + # Add HTTP headers, so that external programs can use them from the + # json output + full_format_info = info_dict.copy() + full_format_info.update(format) + format['http_headers'] = self._calc_headers(full_format_info) + # Remove private housekeeping stuff + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] + + # TODO Central sorting goes here + + if formats[0] is not info_dict: + # only set the 'formats' fields if the original info_dict list them + # otherwise we end up with a circular reference, the first (and unique) + # element in the 'formats' field in info_dict is info_dict itself, + # which can't be exported to json + info_dict['formats'] = formats + if self.params.get('listformats'): + self.list_formats(info_dict) + return + + req_format = self.params.get('format') + if req_format is None: + req_format = self._default_format_spec(info_dict, download=download) + if self.params.get('verbose'): + self.to_stdout('[debug] Default format spec: %s' % req_format) + + format_selector = self.build_format_selector(req_format) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/ytdl-org/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/ytdl-org/youtube-dl/issues/10083). + incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) + if not formats_to_download: + raise ExtractorError('requested format not available', + expected=True) + + if download: + if len(formats_to_download) > 1: + self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) + for format in formats_to_download: + new_info = dict(info_dict) + new_info.update(format) + self.process_info(new_info) + # We update the info dict with the best quality format (backwards compatibility) + info_dict.update(formats_to_download[-1]) + return info_dict + + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + + def __forced_printings(self, info_dict, filename, incomplete): + def print_mandatory(field): + if (self.params.get('force%s' % field, False) + and (not incomplete or info_dict.get(field) is not None)): + self.to_stdout(info_dict[field]) + + def print_optional(field): + if (self.params.get('force%s' % field, False) + and info_dict.get(field) is not None): + self.to_stdout(info_dict[field]) + + print_mandatory('title') + print_mandatory('id') + if self.params.get('forceurl', False) and not incomplete: + if info_dict.get('requested_formats') is not None: + for f in info_dict['requested_formats']: + self.to_stdout(f['url'] + f.get('play_path', '')) + else: + # For RTMP URLs, also include the playpath + self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) + print_optional('thumbnail') + print_optional('description') + if self.params.get('forcefilename', False) and filename is not None: + self.to_stdout(filename) + if self.params.get('forceduration', False) and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) + print_mandatory('format') + if self.params.get('forcejson', False): + self.to_stdout(json.dumps(info_dict)) + + def process_info(self, info_dict): + """Process a single resolved IE result.""" + + assert info_dict.get('_type', 'video') == 'video' + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads >= int(max_downloads): + raise MaxDownloadsReached() + + # TODO: backward compatibility, to be removed + info_dict['fulltitle'] = info_dict['title'] + + if 'format' not in info_dict: + info_dict['format'] = info_dict['ext'] + + reason = self._match_entry(info_dict, incomplete=False) + if reason is not None: + self.to_screen('[download] ' + reason) + return + + self._num_downloads += 1 + + info_dict['_filename'] = filename = self.prepare_filename(info_dict) + + # Forced printings + self.__forced_printings(info_dict, filename, incomplete=False) + + # Do nothing else if in simulate mode + if self.params.get('simulate', False): + return + + if filename is None: + return + + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): + return + + if self.params.get('writedescription', False): + descfn = replace_extension(filename, 'description', info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): + self.to_screen('[info] Video description is already present') + elif info_dict.get('description') is None: + self.report_warning('There\'s no description to write.') + else: + try: + self.to_screen('[info] Writing video description to: ' + descfn) + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(info_dict['description']) + except (OSError, IOError): + self.report_error('Cannot write description file ' + descfn) + return + + if self.params.get('writeannotations', False): + annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): + self.to_screen('[info] Video annotations are already present') + elif not info_dict.get('annotations'): + self.report_warning('There are no annotations to write.') + else: + try: + self.to_screen('[info] Writing video annotations to: ' + annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning('There are no annotations to write.') + except (OSError, IOError): + self.report_error('Cannot write annotations file: ' + annofn) + return + + def dl(name, info): + fd = get_suitable_downloader(info, self.params)(self, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + if self.params.get('verbose'): + self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + return fd.download(name, info) + + subtitles_are_requested = any([self.params.get('writesubtitles', False), + self.params.get('writeautomaticsub')]) + + if subtitles_are_requested and info_dict.get('requested_subtitles'): + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + subtitles = info_dict['requested_subtitles'] + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) + else: + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/ytdl-org/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return + else: + try: + dl(sub_filename, sub_info) + except (ExtractorError, IOError, OSError, ValueError, + compat_urllib_error.URLError, + compat_http_client.HTTPException, + socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue + + if self.params.get('writeinfojson', False): + infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + self.to_screen('[info] Video description metadata is already present') + else: + self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) + try: + write_json_file(self.filter_requested_info(info_dict), infofn) + except (OSError, IOError): + self.report_error('Cannot write metadata to JSON file ' + infofn) + return + + self._write_thumbnails(info_dict, filename) + + if not self.params.get('skip_download', False): + try: + if info_dict.get('requested_formats') is not None: + downloaded = [] + success = True + merger = FFmpegMergerPP(self) + if not merger.available: + postprocessors = [] + self.report_warning('You have requested multiple ' + 'formats but ffmpeg or avconv are not installed.' + ' The formats won\'t be merged.') + else: + postprocessors = [merger] + + def compatible_formats(formats): + video, audio = formats + # Check extension + video_ext, audio_ext = video.get('ext'), audio.get('ext') + if video_ext and audio_ext: + COMPATIBLE_EXTS = ( + ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), + ('webm') + ) + for exts in COMPATIBLE_EXTS: + if video_ext in exts and audio_ext in exts: + return True + # TODO: Check acodec/vcodec + return False + + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext == info_dict['ext'] + else filename) + requested_formats = info_dict['requested_formats'] + if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): + info_dict['ext'] = 'mkv' + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into mkv.') + # Ensure filename always has a correct extension for successful merge + filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) + if os.path.exists(encodeFilename(filename)): + self.to_screen( + '[download] %s has already been downloaded and ' + 'merged' % filename) + else: + for f in requested_formats: + new_info = dict(info_dict) + new_info.update(f) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return + downloaded.append(fname) + partial_success = dl(fname, new_info) + success = success and partial_success + info_dict['__postprocessors'] = postprocessors + info_dict['__files_to_merge'] = downloaded + else: + # Just a single file + success = dl(filename, info_dict) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) + return + except (OSError, IOError) as err: + raise UnavailableVideoError(err) + except (ContentTooShortError, ) as err: + self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) + return + + if success and filename != '-': + # Fixup content + fixup_policy = self.params.get('fixup') + if fixup_policy is None: + fixup_policy = 'detect_or_warn' + + INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' + + stretched_ratio = info_dict.get('stretched_ratio') + if stretched_ratio is not None and stretched_ratio != 1: + if fixup_policy == 'warn': + self.report_warning('%s: Non-uniform pixel ratio (%s)' % ( + info_dict['id'], stretched_ratio)) + elif fixup_policy == 'detect_or_warn': + stretched_pp = FFmpegFixupStretchedPP(self) + if stretched_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(stretched_pp) + else: + self.report_warning( + '%s: Non-uniform pixel ratio (%s). %s' + % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + if (info_dict.get('requested_formats') is None + and info_dict.get('container') == 'm4a_dash'): + if fixup_policy == 'warn': + self.report_warning( + '%s: writing DASH m4a. ' + 'Only some players support this container.' + % info_dict['id']) + elif fixup_policy == 'detect_or_warn': + fixup_pp = FFmpegFixupM4aPP(self) + if fixup_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(fixup_pp) + else: + self.report_warning( + '%s: writing DASH m4a. ' + 'Only some players support this container. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + if (info_dict.get('protocol') == 'm3u8_native' + or info_dict.get('protocol') == 'm3u8' + and self.params.get('hls_prefer_native')): + if fixup_policy == 'warn': + self.report_warning('%s: malformed AAC bitstream detected.' % ( + info_dict['id'])) + elif fixup_policy == 'detect_or_warn': + fixup_pp = FFmpegFixupM3u8PP(self) + if fixup_pp.available: + info_dict.setdefault('__postprocessors', []) + info_dict['__postprocessors'].append(fixup_pp) + else: + self.report_warning( + '%s: malformed AAC bitstream detected. %s' + % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) + else: + assert fixup_policy in ('ignore', 'never') + + try: + self.post_process(filename, info_dict) + except (PostProcessingError) as err: + self.report_error('postprocessing: %s' % str(err)) + return + self.record_download_archive(info_dict) + + def download(self, url_list): + """Download a given list of URLs.""" + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) + if (len(url_list) > 1 + and outtmpl != '-' + and '%' not in outtmpl + and self.params.get('max_downloads') != 1): + raise SameFileError(outtmpl) + + for url in url_list: + try: + # It also downloads the videos + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) + except UnavailableVideoError: + self.report_error('unable to download video') + except MaxDownloadsReached: + self.to_screen('[info] Maximum number of downloaded files reached.') + raise + else: + if self.params.get('dump_single_json', False): + self.to_stdout(json.dumps(res)) + + return self._download_retcode + + def download_with_info_file(self, info_filename): + with contextlib.closing(fileinput.FileInput( + [info_filename], mode='r', + openhook=fileinput.hook_encoded('utf-8'))) as f: + # FileInput doesn't have a read method, we can't call json.load + info = self.filter_requested_info(json.loads('\n'.join(f))) + try: + self.process_ie_result(info, download=True) + except DownloadError: + webpage_url = info.get('webpage_url') + if webpage_url is not None: + self.report_warning('The info failed to download, trying with "%s"' % webpage_url) + return self.download([webpage_url]) + else: + raise + return self._download_retcode + + @staticmethod + def filter_requested_info(info_dict): + return dict( + (k, v) for k, v in info_dict.items() + if k not in ['requested_formats', 'requested_subtitles']) + + def post_process(self, filename, ie_info): + """Run all the postprocessors on the given file.""" + info = dict(ie_info) + info['filepath'] = filename + pps_chain = [] + if ie_info.get('__postprocessors') is not None: + pps_chain.extend(ie_info['__postprocessors']) + pps_chain.extend(self._pps) + for pp in pps_chain: + files_to_delete = [] + try: + files_to_delete, info = pp.run(info) + except PostProcessingError as e: + self.report_error(e.msg) + if files_to_delete and not self.params.get('keepvideo', False): + for old_filename in files_to_delete: + self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) + try: + os.remove(encodeFilename(old_filename)) + except (IOError, OSError): + self.report_warning('Unable to remove downloaded original file') + + def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist + if extractor is None: + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(url): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if not vid_id: + return False # Incomplete video information + + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + if line.strip() == vid_id: + return True + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = self._make_archive_id(info_dict) + assert vid_id + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') + + @staticmethod + def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none': + return 'audio only' + if format.get('resolution') is not None: + return format['resolution'] + if format.get('height') is not None: + if format.get('width') is not None: + res = '%sx%s' % (format['width'], format['height']) + else: + res = '%sp' % format['height'] + elif format.get('width') is not None: + res = '%dx?' % format['width'] + else: + res = default + return res + + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported) ' + if fdict.get('language'): + if res: + res += ' ' + res += '[%s] ' % fdict['language'] + if fdict.get('format_note') is not None: + res += fdict['format_note'] + ' ' + if fdict.get('tbr') is not None: + res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None + and fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] + if fdict.get('vbr') is not None: + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('fps') is not None: + if res: + res += ', ' + res += '%sfps' % fdict['fps'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) + return res + + def list_formats(self, info_dict): + formats = info_dict.get('formats', [info_dict]) + table = [ + [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)] + for f in formats + if f.get('preference') is None or f['preference'] >= -1000] + if len(formats) > 1: + table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' + + header_line = ['format code', 'extension', 'resolution', 'note'] + self.to_screen( + '[info] Available formats for %s:\n%s' % + (info_dict['id'], render_table(header_line, table))) + + def list_thumbnails(self, info_dict): + thumbnails = info_dict.get('thumbnails') + if not thumbnails: + self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + return + + self.to_screen( + '[info] Thumbnails for %s:' % info_dict['id']) + self.to_screen(render_table( + ['ID', 'width', 'height', 'URL'], + [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + + def list_subtitles(self, video_id, subtitles, name='subtitles'): + if not subtitles: + self.to_screen('%s has no %s' % (video_id, name)) + return + self.to_screen( + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) + + def urlopen(self, req): + """ Start an HTTP download """ + if isinstance(req, compat_basestring): + req = sanitized_Request(req) + return self._opener.open(req, timeout=self._socket_timeout) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + + if type('') is not compat_str: + # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326) + self.report_warning( + 'Your Python is broken! Update to a newer and supported version') + + stdout_encoding = getattr( + sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) + encoding_str = ( + '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + stdout_encoding, + self.get_encoding())) + write_string(encoding_str, encoding=None) + + self._write_string('[debug] youtube-dlc version ' + __version__ + '\n') + if _LAZY_LOADER: + self._write_string('[debug] Lazy loading extractors enabled' + '\n') + try: + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + self._write_string('[debug] Git HEAD: ' + out + '\n') + except Exception: + try: + sys.exc_clear() + except Exception: + pass + + def python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + self._write_string('[debug] Python version %s (%s) - %s\n' % ( + platform.python_version(), python_implementation(), + platform_name())) + + exe_versions = FFmpegPostProcessor.get_versions(self) + exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() + exe_str = ', '.join( + '%s %s' % (exe, v) + for exe, v in sorted(exe_versions.items()) + if v + ) + if not exe_str: + exe_str = 'none' + self._write_string('[debug] exe versions: %s\n' % exe_str) + + proxy_map = {} + for handler in self._opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + + if self.params.get('call_home', False): + ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') + self._write_string('[debug] Public IP address: %s\n' % ipaddr) + latest_version = self.urlopen( + 'https://yt-dl.org/latest/version').read().decode('utf-8') + if version_tuple(latest_version) > version_tuple(__version__): + self.report_warning( + 'You are using an outdated version (newest version: %s)! ' + 'See https://yt-dl.org/update if you need help updating.' % + latest_version) + + def _setup_opener(self): + timeout_val = self.params.get('socket_timeout') + self._socket_timeout = 600 if timeout_val is None else float(timeout_val) + + opts_cookiefile = self.params.get('cookiefile') + opts_proxy = self.params.get('proxy') + + if opts_cookiefile is None: + self.cookiejar = compat_cookiejar.CookieJar() + else: + opts_cookiefile = expand_path(opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) + if os.access(opts_cookiefile, os.R_OK): + self.cookiejar.load(ignore_discard=True, ignore_expires=True) + + cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) + if opts_proxy is not None: + if opts_proxy == '': + proxies = {} + else: + proxies = {'http': opts_proxy, 'https': opts_proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = PerRequestProxyHandler(proxies) + + debuglevel = 1 if self.params.get('debug_printtraffic') else 0 + https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) + ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + redirect_handler = YoutubeDLRedirectHandler() + data_handler = compat_urllib_request_DataHandler() + + # When passing our own FileHandler instance, build_opener won't add the + # default FileHandler and allows us to disable the file protocol, which + # can be used for malicious purposes (see + # https://github.com/ytdl-org/youtube-dl/issues/8227) + file_handler = compat_urllib_request.FileHandler() + + def file_open(*args, **kwargs): + raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dlc for security reasons') + file_handler.file_open = file_open + + opener = compat_urllib_request.build_opener( + proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) + + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) + opener.addheaders = [] + self._opener = opener + + def encode(self, s): + if isinstance(s, bytes): + return s # Already encoded + + try: + return s.encode(self.get_encoding()) + except UnicodeEncodeError as err: + err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' + raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding + + def _write_thumbnails(self, info_dict, filename): + if self.params.get('writethumbnail', False): + thumbnails = info_dict.get('thumbnails') + if thumbnails: + thumbnails = [thumbnails[-1]] + elif self.params.get('write_all_thumbnails', False): + thumbnails = info_dict.get('thumbnails') + else: + return + + if not thumbnails: + # No thumbnails present, so return immediately + return + + for t in thumbnails: + thumb_ext = determine_ext(t['url'], 'jpg') + suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' + thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): + self.to_screen('[%s] %s: Thumbnail %sis already present' % + (info_dict['extractor'], info_dict['id'], thumb_display_id)) + else: + self.to_screen('[%s] %s: Downloading thumbnail %s...' % + (info_dict['extractor'], info_dict['id'], thumb_display_id)) + try: + uf = self.urlopen(t['url']) + with open(encodeFilename(thumb_filename), 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % + (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download thumbnail "%s": %s' % + (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py new file mode 100644 index 0000000000..a663417dab --- /dev/null +++ b/youtube_dl/__init__.py @@ -0,0 +1,483 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +__license__ = 'Public Domain' + +import codecs +import io +import os +import random +import sys + + +from .options import ( + parseOpts, +) +from .compat import ( + compat_getpass, + compat_shlex_split, + workaround_optparse_bug9161, +) +from .utils import ( + DateRange, + decodeOption, + DEFAULT_OUTTMPL, + DownloadError, + expand_path, + match_filter_func, + MaxDownloadsReached, + preferredencoding, + read_batch_urls, + SameFileError, + setproctitle, + std_headers, + write_string, + render_table, +) +from .update import update_self +from .downloader import ( + FileDownloader, +) +from .extractor import gen_extractors, list_extractors +from .extractor.adobepass import MSO_INFO +from .YoutubeDL import YoutubeDL + + +def _real_main(argv=None): + # Compatibility fixes for Windows + if sys.platform == 'win32': + # https://github.com/ytdl-org/youtube-dl/issues/820 + codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + + workaround_optparse_bug9161() + + setproctitle('youtube-dlc') + + parser, opts, args = parseOpts(argv) + + # Set user agent + if opts.user_agent is not None: + std_headers['User-Agent'] = opts.user_agent + + # Set referer + if opts.referer is not None: + std_headers['Referer'] = opts.referer + + # Custom HTTP headers + if opts.headers is not None: + for h in opts.headers: + if ':' not in h: + parser.error('wrong header formatting, it should be key:value, not "%s"' % h) + key, value = h.split(':', 1) + if opts.verbose: + write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) + std_headers[key] = value + + # Dump user agent + if opts.dump_user_agent: + write_string(std_headers['User-Agent'] + '\n', out=sys.stdout) + sys.exit(0) + + # Batch file verification + batch_urls = [] + if opts.batchfile is not None: + try: + if opts.batchfile == '-': + batchfd = sys.stdin + else: + batchfd = io.open( + expand_path(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') + batch_urls = read_batch_urls(batchfd) + if opts.verbose: + write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') + except IOError: + sys.exit('ERROR: batch file %s could not be read' % opts.batchfile) + all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls + _enc = preferredencoding() + all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] + + if opts.list_extractors: + for ie in list_extractors(opts.age_limit): + write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout) + matchedUrls = [url for url in all_urls if ie.suitable(url)] + for mu in matchedUrls: + write_string(' ' + mu + '\n', out=sys.stdout) + sys.exit(0) + if opts.list_extractor_descriptions: + for ie in list_extractors(opts.age_limit): + if not ie._WORKING: + continue + desc = getattr(ie, 'IE_DESC', ie.IE_NAME) + if desc is False: + continue + if hasattr(ie, 'SEARCH_KEY'): + _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') + _COUNTS = ('', '5', '10', 'all') + desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) + write_string(desc + '\n', out=sys.stdout) + sys.exit(0) + if opts.ap_list_mso: + table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] + write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) + sys.exit(0) + + # Conflicting, missing and erroneous options + if opts.usenetrc and (opts.username is not None or opts.password is not None): + parser.error('using .netrc conflicts with giving username/password') + if opts.password is not None and opts.username is None: + parser.error('account username missing\n') + if opts.ap_password is not None and opts.ap_username is None: + parser.error('TV Provider account username missing\n') + if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): + parser.error('using output template conflicts with using title, video ID or auto number') + if opts.autonumber_size is not None: + if opts.autonumber_size <= 0: + parser.error('auto number size must be positive') + if opts.autonumber_start is not None: + if opts.autonumber_start < 0: + parser.error('auto number start must be positive or 0') + if opts.usetitle and opts.useid: + parser.error('using title conflicts with using video ID') + if opts.username is not None and opts.password is None: + opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') + if opts.ratelimit is not None: + numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) + if numeric_limit is None: + parser.error('invalid rate limit specified') + opts.ratelimit = numeric_limit + if opts.min_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) + if numeric_limit is None: + parser.error('invalid min_filesize specified') + opts.min_filesize = numeric_limit + if opts.max_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) + if numeric_limit is None: + parser.error('invalid max_filesize specified') + opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.sleep_interval is None: + parser.error('min sleep interval must be specified, use --min-sleep-interval') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval + if opts.ap_mso and opts.ap_mso not in MSO_INFO: + parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') + else: + try: + parsed_retries = int(retries) + except (TypeError, ValueError): + parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) + if opts.buffersize is not None: + numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) + if numeric_buffersize is None: + parser.error('invalid buffer size specified') + opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize + if opts.playliststart <= 0: + raise ValueError('Playlist start must be positive') + if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: + raise ValueError('Playlist end must be greater than playlist start') + if opts.extractaudio: + if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: + parser.error('invalid audio format specified') + if opts.audioquality: + opts.audioquality = opts.audioquality.strip('k').strip('K') + if not opts.audioquality.isdigit(): + parser.error('invalid audio quality specified') + if opts.recodevideo is not None: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: + parser.error('invalid video recode format specified') + if opts.convertsubtitles is not None: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: + parser.error('invalid subtitle format specified') + + if opts.date is not None: + date = DateRange.day(opts.date) + else: + date = DateRange(opts.dateafter, opts.datebefore) + + # Do not download videos when there are audio-only formats + if opts.extractaudio and not opts.keepvideo and opts.format is None: + opts.format = 'bestaudio/best' + + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + # this was the old behaviour if only --all-sub was given. + if opts.allsubtitles and not opts.writeautomaticsub: + opts.writesubtitles = True + + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) + or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') + or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') + or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') + or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') + or (opts.useid and '%(id)s.%(ext)s') + or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') + or DEFAULT_OUTTMPL) + if not os.path.splitext(outtmpl)[1] and opts.extractaudio: + parser.error('Cannot download a video and extract audio into the same' + ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' + ' template'.format(outtmpl)) + + any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json + any_printing = opts.print_json + download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive + + # PostProcessors + postprocessors = [] + if opts.metafromtitle: + postprocessors.append({ + 'key': 'MetadataFromTitle', + 'titleformat': opts.metafromtitle + }) + if opts.extractaudio: + postprocessors.append({ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': opts.audioformat, + 'preferredquality': opts.audioquality, + 'nopostoverwrites': opts.nopostoverwrites, + }) + if opts.recodevideo: + postprocessors.append({ + 'key': 'FFmpegVideoConvertor', + 'preferedformat': opts.recodevideo, + }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # And this post-processor should be placed before other metadata + # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of + # extra metadata. By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. + if opts.addmetadata: + postprocessors.append({'key': 'FFmpegMetadata'}) + if opts.convertsubtitles: + postprocessors.append({ + 'key': 'FFmpegSubtitlesConvertor', + 'format': opts.convertsubtitles, + }) + if opts.embedsubtitles: + postprocessors.append({ + 'key': 'FFmpegEmbedSubtitle', + }) + if opts.embedthumbnail: + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True + # XAttrMetadataPP should be run after post-processors that may change file + # contents + if opts.xattrs: + postprocessors.append({'key': 'XAttrMetadata'}) + # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. + # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. + if opts.exec_cmd: + postprocessors.append({ + 'key': 'ExecAfterDownload', + 'exec_cmd': opts.exec_cmd, + }) + external_downloader_args = None + if opts.external_downloader_args: + external_downloader_args = compat_shlex_split(opts.external_downloader_args) + postprocessor_args = None + if opts.postprocessor_args: + postprocessor_args = compat_shlex_split(opts.postprocessor_args) + match_filter = ( + None if opts.match_filter is None + else match_filter_func(opts.match_filter)) + + ydl_opts = { + 'usenetrc': opts.usenetrc, + 'username': opts.username, + 'password': opts.password, + 'twofactor': opts.twofactor, + 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, + 'quiet': (opts.quiet or any_getting or any_printing), + 'no_warnings': opts.no_warnings, + 'forceurl': opts.geturl, + 'forcetitle': opts.gettitle, + 'forceid': opts.getid, + 'forcethumbnail': opts.getthumbnail, + 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, + 'forcefilename': opts.getfilename, + 'forceformat': opts.getformat, + 'forcejson': opts.dumpjson or opts.print_json, + 'dump_single_json': opts.dump_single_json, + 'simulate': opts.simulate or any_getting, + 'skip_download': opts.skip_download, + 'format': opts.format, + 'listformats': opts.listformats, + 'outtmpl': outtmpl, + 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, + 'restrictfilenames': opts.restrictfilenames, + 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, + 'ratelimit': opts.ratelimit, + 'nooverwrites': opts.nooverwrites, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, + 'keep_fragments': opts.keep_fragments, + 'buffersize': opts.buffersize, + 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, + 'continuedl': opts.continue_dl, + 'noprogress': opts.noprogress, + 'progress_with_newline': opts.progress_with_newline, + 'playliststart': opts.playliststart, + 'playlistend': opts.playlistend, + 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, + 'noplaylist': opts.noplaylist, + 'logtostderr': opts.outtmpl == '-', + 'consoletitle': opts.consoletitle, + 'nopart': opts.nopart, + 'updatetime': opts.updatetime, + 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, + 'writeinfojson': opts.writeinfojson, + 'writethumbnail': opts.writethumbnail, + 'write_all_thumbnails': opts.write_all_thumbnails, + 'writesubtitles': opts.writesubtitles, + 'writeautomaticsub': opts.writeautomaticsub, + 'allsubtitles': opts.allsubtitles, + 'listsubtitles': opts.listsubtitles, + 'subtitlesformat': opts.subtitlesformat, + 'subtitleslangs': opts.subtitleslangs, + 'matchtitle': decodeOption(opts.matchtitle), + 'rejecttitle': decodeOption(opts.rejecttitle), + 'max_downloads': opts.max_downloads, + 'prefer_free_formats': opts.prefer_free_formats, + 'verbose': opts.verbose, + 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, + 'test': opts.test, + 'keepvideo': opts.keepvideo, + 'min_filesize': opts.min_filesize, + 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, + 'daterange': date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': download_archive_fn, + 'cookiefile': opts.cookiefile, + 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, + 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, + 'bidi_workaround': opts.bidi_workaround, + 'debug_printtraffic': opts.debug_printtraffic, + 'prefer_ffmpeg': opts.prefer_ffmpeg, + 'include_ads': opts.include_ads, + 'default_search': opts.default_search, + 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'encoding': opts.encoding, + 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, + 'merge_output_format': opts.merge_output_format, + 'postprocessors': postprocessors, + 'fixup': opts.fixup, + 'source_address': opts.source_address, + 'call_home': opts.call_home, + 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, + 'external_downloader': opts.external_downloader, + 'list_thumbnails': opts.list_thumbnails, + 'playlist_items': opts.playlist_items, + 'xattr_set_filesize': opts.xattr_set_filesize, + 'match_filter': match_filter, + 'no_color': opts.no_color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, + 'hls_use_mpegts': opts.hls_use_mpegts, + 'external_downloader_args': external_downloader_args, + 'postprocessor_args': postprocessor_args, + 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + 'config_location': opts.config_location, + 'geo_bypass': opts.geo_bypass, + 'geo_bypass_country': opts.geo_bypass_country, + 'geo_bypass_ip_block': opts.geo_bypass_ip_block, + # just for deprecation check + 'autonumber': opts.autonumber if opts.autonumber is True else None, + 'usetitle': opts.usetitle if opts.usetitle is True else None, + } + + with YoutubeDL(ydl_opts) as ydl: + # Update version + if opts.update_self: + update_self(ydl.to_screen, opts.verbose, ydl._opener) + + # Remove cache dir + if opts.rm_cachedir: + ydl.cache.remove() + + # Maybe do nothing + if (len(all_urls) < 1) and (opts.load_info_filename is None): + if opts.update_self or opts.rm_cachedir: + sys.exit() + + ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) + parser.error( + 'You must provide at least one URL.\n' + 'Type youtube-dlc --help to see a list of all options.') + + try: + if opts.load_info_filename is not None: + retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) + else: + retcode = ydl.download(all_urls) + except MaxDownloadsReached: + ydl.to_screen('--max-download limit reached, aborting.') + retcode = 101 + + sys.exit(retcode) + + +def main(argv=None): + try: + _real_main(argv) + except DownloadError: + sys.exit(1) + except SameFileError: + sys.exit('ERROR: fixed output name but more than one file to download') + except KeyboardInterrupt: + sys.exit('\nERROR: Interrupted by user') + + +__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py new file mode 100644 index 0000000000..0e76016868 --- /dev/null +++ b/youtube_dl/__main__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Execute with +# $ python youtube_dlc/__main__.py (2.6+) +# $ python -m youtube_dlc (2.7+) + +import sys + +if __package__ is None and not hasattr(sys, 'frozen'): + # direct call of __main__.py + import os.path + path = os.path.realpath(os.path.abspath(__file__)) + sys.path.insert(0, os.path.dirname(os.path.dirname(path))) + +import youtube_dlc + +if __name__ == '__main__': + youtube_dlc.main() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py new file mode 100644 index 0000000000..461bb6d413 --- /dev/null +++ b/youtube_dl/aes.py @@ -0,0 +1,361 @@ +from __future__ import unicode_literals + +from math import ceil + +from .compat import compat_b64decode +from .utils import bytes_to_intlist, intlist_to_bytes + +BLOCK_SIZE_BYTES = 16 + + +def aes_ctr_decrypt(data, key, counter): + """ + Decrypt with aes in counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block) + returns the next counter block + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + for i in range(block_count): + counter_block = counter.next_value() + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + cipher_counter_block = aes_encrypt(counter_block, expanded_key) + decrypted_data += xor(block, cipher_counter_block) + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_encrypt(data, key, iv): + """ + Encrypt with aes in CBC mode. Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + remaining_length = BLOCK_SIZE_BYTES - len(block) + block += [remaining_length] * remaining_length + mixed_block = xor(block, previous_cipher_block) + + encrypted_block = aes_encrypt(mixed_block, expanded_key) + encrypted_data += encrypted_block + + previous_cipher_block = encrypted_block + + return encrypted_data + + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def aes_encrypt(data, expanded_key): + """ + Encrypt one block with aes + + @param {int[]} data 16-Byte state + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte cipher + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + for i in range(1, rounds + 1): + data = sub_bytes(data) + data = shift_rows(data) + if i != rounds: + data = mix_columns(data) + data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + + return data + + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + @param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) + if i != rounds: + data = mix_columns_inv(data) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + + return data + + +def aes_decrypt_text(data, password, key_size_bytes): + """ + Decrypt text + - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter + - The cipher key is retrieved by encrypting the first 16 Byte of 'password' + with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) + - Mode of operation is 'counter' + + @param {str} data Base64 encoded string + @param {str,unicode} password Password (will be encoded with utf-8) + @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit + @returns {str} Decrypted data + """ + NONCE_LENGTH_BYTES = 8 + + data = bytes_to_intlist(compat_b64decode(data)) + password = bytes_to_intlist(password.encode('utf-8')) + + key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) + key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) + + nonce = data[:NONCE_LENGTH_BYTES] + cipher = data[NONCE_LENGTH_BYTES:] + + class Counter(object): + __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) + + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + + decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) + plaintext = intlist_to_bytes(decrypted_data) + + return plaintext + + +RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) +SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) +SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) +MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1), + (0x1, 0x2, 0x3, 0x1), + (0x1, 0x1, 0x2, 0x3), + (0x3, 0x1, 0x1, 0x2)) +MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9), + (0x9, 0xE, 0xB, 0xD), + (0xD, 0x9, 0xE, 0xB), + (0xB, 0xD, 0x9, 0xE)) +RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) +RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) + + +def sub_bytes(data): + return [SBOX[x] for x in data] + + +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + + +def rotate(data): + return data[1:] + [data[0]] + + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + + return data + + +def xor(data1, data2): + return [x ^ y for x, y in zip(data1, data2)] + + +def rijndael_mul(a, b): + if(a == 0 or b == 0): + return 0 + return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] + + +def mix_column(data, matrix): + data_mixed = [] + for row in range(4): + mixed = 0 + for column in range(4): + # xor is (+) and (-) + mixed ^= rijndael_mul(data[column], matrix[row][column]) + data_mixed.append(mixed) + return data_mixed + + +def mix_columns(data, matrix=MIX_COLUMN_MATRIX): + data_mixed = [] + for i in range(4): + column = data[i * 4: (i + 1) * 4] + data_mixed += mix_column(column, matrix) + return data_mixed + + +def mix_columns_inv(data): + return mix_columns(data, MIX_COLUMN_MATRIX_INV) + + +def shift_rows(data): + data_shifted = [] + for column in range(4): + for row in range(4): + data_shifted.append(data[((column + row) & 0b11) * 4 + row]) + return data_shifted + + +def shift_rows_inv(data): + data_shifted = [] + for column in range(4): + for row in range(4): + data_shifted.append(data[((column - row) & 0b11) * 4 + row]) + return data_shifted + + +def inc(data): + data = data[:] # copy + for i in range(len(data) - 1, -1, -1): + if data[i] == 255: + data[i] = 0 + else: + data[i] = data[i] + 1 + break + return data + + +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py new file mode 100644 index 0000000000..ada6aa1f28 --- /dev/null +++ b/youtube_dl/cache.py @@ -0,0 +1,96 @@ +from __future__ import unicode_literals + +import errno +import io +import json +import os +import re +import shutil +import traceback + +from .compat import compat_getenv +from .utils import ( + expand_path, + write_json_file, +) + + +class Cache(object): + def __init__(self, ydl): + self._ydl = ydl + + def _get_root_dir(self): + res = self._ydl.params.get('cachedir') + if res is None: + cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') + res = os.path.join(cache_root, 'youtube-dlc') + return expand_path(res) + + def _get_cache_fn(self, section, key, dtype): + assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ + 'invalid section %r' % section + assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key + return os.path.join( + self._get_root_dir(), section, '%s.%s' % (key, dtype)) + + @property + def enabled(self): + return self._ydl.params.get('cachedir') is not False + + def store(self, section, key, data, dtype='json'): + assert dtype in ('json',) + + if not self.enabled: + return + + fn = self._get_cache_fn(section, key, dtype) + try: + try: + os.makedirs(os.path.dirname(fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(data, fn) + except Exception: + tb = traceback.format_exc() + self._ydl.report_warning( + 'Writing cache to %r failed: %s' % (fn, tb)) + + def load(self, section, key, dtype='json', default=None): + assert dtype in ('json',) + + if not self.enabled: + return default + + cache_fn = self._get_cache_fn(section, key, dtype) + try: + try: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + return json.load(cachef) + except ValueError: + try: + file_size = os.path.getsize(cache_fn) + except (OSError, IOError) as oe: + file_size = str(oe) + self._ydl.report_warning( + 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) + except IOError: + pass # No cache available + + return default + + def remove(self): + if not self.enabled: + self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') + return + + cachedir = self._get_root_dir() + if not any((term in cachedir) for term in ('cache', 'tmp')): + raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) + + self._ydl.to_screen( + 'Removing cache dir %s .' % cachedir, skip_eol=True) + if os.path.exists(cachedir): + self._ydl.to_screen('.', skip_eol=True) + shutil.rmtree(cachedir) + self._ydl.to_screen('.') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py new file mode 100644 index 0000000000..1cf7efed61 --- /dev/null +++ b/youtube_dl/compat.py @@ -0,0 +1,3050 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import binascii +import collections +import ctypes +import email +import getpass +import io +import itertools +import optparse +import os +import platform +import re +import shlex +import shutil +import socket +import struct +import subprocess +import sys +import xml.etree.ElementTree + + +try: + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + from urllib.parse import urlparse as compat_urllib_parse_urlparse +except ImportError: # Python 2 + from urlparse import urlparse as compat_urllib_parse_urlparse + +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +if sys.version_info[0] == 2: + class compat_cookiejar_Cookie(compat_cookiejar.Cookie): + def __init__(self, version, name, value, *args, **kwargs): + if isinstance(name, compat_str): + name = name.encode() + if isinstance(value, compat_str): + value = value.encode() + compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) +else: + compat_cookiejar_Cookie = compat_cookiejar.Cookie + +try: + import http.cookies as compat_cookies +except ImportError: # Python 2 + import Cookie as compat_cookies + +try: + import html.entities as compat_html_entities +except ImportError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: # Python >= 3.3 + compat_html_entities_html5 = compat_html_entities.html5 +except AttributeError: + # Copied from CPython 3.5.1 html/entities.py + compat_html_entities_html5 = { + 'Aacute': '\xc1', + 'aacute': '\xe1', + 'Aacute;': '\xc1', + 'aacute;': '\xe1', + 'Abreve;': '\u0102', + 'abreve;': '\u0103', + 'ac;': '\u223e', + 'acd;': '\u223f', + 'acE;': '\u223e\u0333', + 'Acirc': '\xc2', + 'acirc': '\xe2', + 'Acirc;': '\xc2', + 'acirc;': '\xe2', + 'acute': '\xb4', + 'acute;': '\xb4', + 'Acy;': '\u0410', + 'acy;': '\u0430', + 'AElig': '\xc6', + 'aelig': '\xe6', + 'AElig;': '\xc6', + 'aelig;': '\xe6', + 'af;': '\u2061', + 'Afr;': '\U0001d504', + 'afr;': '\U0001d51e', + 'Agrave': '\xc0', + 'agrave': '\xe0', + 'Agrave;': '\xc0', + 'agrave;': '\xe0', + 'alefsym;': '\u2135', + 'aleph;': '\u2135', + 'Alpha;': '\u0391', + 'alpha;': '\u03b1', + 'Amacr;': '\u0100', + 'amacr;': '\u0101', + 'amalg;': '\u2a3f', + 'AMP': '&', + 'amp': '&', + 'AMP;': '&', + 'amp;': '&', + 'And;': '\u2a53', + 'and;': '\u2227', + 'andand;': '\u2a55', + 'andd;': '\u2a5c', + 'andslope;': '\u2a58', + 'andv;': '\u2a5a', + 'ang;': '\u2220', + 'ange;': '\u29a4', + 'angle;': '\u2220', + 'angmsd;': '\u2221', + 'angmsdaa;': '\u29a8', + 'angmsdab;': '\u29a9', + 'angmsdac;': '\u29aa', + 'angmsdad;': '\u29ab', + 'angmsdae;': '\u29ac', + 'angmsdaf;': '\u29ad', + 'angmsdag;': '\u29ae', + 'angmsdah;': '\u29af', + 'angrt;': '\u221f', + 'angrtvb;': '\u22be', + 'angrtvbd;': '\u299d', + 'angsph;': '\u2222', + 'angst;': '\xc5', + 'angzarr;': '\u237c', + 'Aogon;': '\u0104', + 'aogon;': '\u0105', + 'Aopf;': '\U0001d538', + 'aopf;': '\U0001d552', + 'ap;': '\u2248', + 'apacir;': '\u2a6f', + 'apE;': '\u2a70', + 'ape;': '\u224a', + 'apid;': '\u224b', + 'apos;': "'", + 'ApplyFunction;': '\u2061', + 'approx;': '\u2248', + 'approxeq;': '\u224a', + 'Aring': '\xc5', + 'aring': '\xe5', + 'Aring;': '\xc5', + 'aring;': '\xe5', + 'Ascr;': '\U0001d49c', + 'ascr;': '\U0001d4b6', + 'Assign;': '\u2254', + 'ast;': '*', + 'asymp;': '\u2248', + 'asympeq;': '\u224d', + 'Atilde': '\xc3', + 'atilde': '\xe3', + 'Atilde;': '\xc3', + 'atilde;': '\xe3', + 'Auml': '\xc4', + 'auml': '\xe4', + 'Auml;': '\xc4', + 'auml;': '\xe4', + 'awconint;': '\u2233', + 'awint;': '\u2a11', + 'backcong;': '\u224c', + 'backepsilon;': '\u03f6', + 'backprime;': '\u2035', + 'backsim;': '\u223d', + 'backsimeq;': '\u22cd', + 'Backslash;': '\u2216', + 'Barv;': '\u2ae7', + 'barvee;': '\u22bd', + 'Barwed;': '\u2306', + 'barwed;': '\u2305', + 'barwedge;': '\u2305', + 'bbrk;': '\u23b5', + 'bbrktbrk;': '\u23b6', + 'bcong;': '\u224c', + 'Bcy;': '\u0411', + 'bcy;': '\u0431', + 'bdquo;': '\u201e', + 'becaus;': '\u2235', + 'Because;': '\u2235', + 'because;': '\u2235', + 'bemptyv;': '\u29b0', + 'bepsi;': '\u03f6', + 'bernou;': '\u212c', + 'Bernoullis;': '\u212c', + 'Beta;': '\u0392', + 'beta;': '\u03b2', + 'beth;': '\u2136', + 'between;': '\u226c', + 'Bfr;': '\U0001d505', + 'bfr;': '\U0001d51f', + 'bigcap;': '\u22c2', + 'bigcirc;': '\u25ef', + 'bigcup;': '\u22c3', + 'bigodot;': '\u2a00', + 'bigoplus;': '\u2a01', + 'bigotimes;': '\u2a02', + 'bigsqcup;': '\u2a06', + 'bigstar;': '\u2605', + 'bigtriangledown;': '\u25bd', + 'bigtriangleup;': '\u25b3', + 'biguplus;': '\u2a04', + 'bigvee;': '\u22c1', + 'bigwedge;': '\u22c0', + 'bkarow;': '\u290d', + 'blacklozenge;': '\u29eb', + 'blacksquare;': '\u25aa', + 'blacktriangle;': '\u25b4', + 'blacktriangledown;': '\u25be', + 'blacktriangleleft;': '\u25c2', + 'blacktriangleright;': '\u25b8', + 'blank;': '\u2423', + 'blk12;': '\u2592', + 'blk14;': '\u2591', + 'blk34;': '\u2593', + 'block;': '\u2588', + 'bne;': '=\u20e5', + 'bnequiv;': '\u2261\u20e5', + 'bNot;': '\u2aed', + 'bnot;': '\u2310', + 'Bopf;': '\U0001d539', + 'bopf;': '\U0001d553', + 'bot;': '\u22a5', + 'bottom;': '\u22a5', + 'bowtie;': '\u22c8', + 'boxbox;': '\u29c9', + 'boxDL;': '\u2557', + 'boxDl;': '\u2556', + 'boxdL;': '\u2555', + 'boxdl;': '\u2510', + 'boxDR;': '\u2554', + 'boxDr;': '\u2553', + 'boxdR;': '\u2552', + 'boxdr;': '\u250c', + 'boxH;': '\u2550', + 'boxh;': '\u2500', + 'boxHD;': '\u2566', + 'boxHd;': '\u2564', + 'boxhD;': '\u2565', + 'boxhd;': '\u252c', + 'boxHU;': '\u2569', + 'boxHu;': '\u2567', + 'boxhU;': '\u2568', + 'boxhu;': '\u2534', + 'boxminus;': '\u229f', + 'boxplus;': '\u229e', + 'boxtimes;': '\u22a0', + 'boxUL;': '\u255d', + 'boxUl;': '\u255c', + 'boxuL;': '\u255b', + 'boxul;': '\u2518', + 'boxUR;': '\u255a', + 'boxUr;': '\u2559', + 'boxuR;': '\u2558', + 'boxur;': '\u2514', + 'boxV;': '\u2551', + 'boxv;': '\u2502', + 'boxVH;': '\u256c', + 'boxVh;': '\u256b', + 'boxvH;': '\u256a', + 'boxvh;': '\u253c', + 'boxVL;': '\u2563', + 'boxVl;': '\u2562', + 'boxvL;': '\u2561', + 'boxvl;': '\u2524', + 'boxVR;': '\u2560', + 'boxVr;': '\u255f', + 'boxvR;': '\u255e', + 'boxvr;': '\u251c', + 'bprime;': '\u2035', + 'Breve;': '\u02d8', + 'breve;': '\u02d8', + 'brvbar': '\xa6', + 'brvbar;': '\xa6', + 'Bscr;': '\u212c', + 'bscr;': '\U0001d4b7', + 'bsemi;': '\u204f', + 'bsim;': '\u223d', + 'bsime;': '\u22cd', + 'bsol;': '\\', + 'bsolb;': '\u29c5', + 'bsolhsub;': '\u27c8', + 'bull;': '\u2022', + 'bullet;': '\u2022', + 'bump;': '\u224e', + 'bumpE;': '\u2aae', + 'bumpe;': '\u224f', + 'Bumpeq;': '\u224e', + 'bumpeq;': '\u224f', + 'Cacute;': '\u0106', + 'cacute;': '\u0107', + 'Cap;': '\u22d2', + 'cap;': '\u2229', + 'capand;': '\u2a44', + 'capbrcup;': '\u2a49', + 'capcap;': '\u2a4b', + 'capcup;': '\u2a47', + 'capdot;': '\u2a40', + 'CapitalDifferentialD;': '\u2145', + 'caps;': '\u2229\ufe00', + 'caret;': '\u2041', + 'caron;': '\u02c7', + 'Cayleys;': '\u212d', + 'ccaps;': '\u2a4d', + 'Ccaron;': '\u010c', + 'ccaron;': '\u010d', + 'Ccedil': '\xc7', + 'ccedil': '\xe7', + 'Ccedil;': '\xc7', + 'ccedil;': '\xe7', + 'Ccirc;': '\u0108', + 'ccirc;': '\u0109', + 'Cconint;': '\u2230', + 'ccups;': '\u2a4c', + 'ccupssm;': '\u2a50', + 'Cdot;': '\u010a', + 'cdot;': '\u010b', + 'cedil': '\xb8', + 'cedil;': '\xb8', + 'Cedilla;': '\xb8', + 'cemptyv;': '\u29b2', + 'cent': '\xa2', + 'cent;': '\xa2', + 'CenterDot;': '\xb7', + 'centerdot;': '\xb7', + 'Cfr;': '\u212d', + 'cfr;': '\U0001d520', + 'CHcy;': '\u0427', + 'chcy;': '\u0447', + 'check;': '\u2713', + 'checkmark;': '\u2713', + 'Chi;': '\u03a7', + 'chi;': '\u03c7', + 'cir;': '\u25cb', + 'circ;': '\u02c6', + 'circeq;': '\u2257', + 'circlearrowleft;': '\u21ba', + 'circlearrowright;': '\u21bb', + 'circledast;': '\u229b', + 'circledcirc;': '\u229a', + 'circleddash;': '\u229d', + 'CircleDot;': '\u2299', + 'circledR;': '\xae', + 'circledS;': '\u24c8', + 'CircleMinus;': '\u2296', + 'CirclePlus;': '\u2295', + 'CircleTimes;': '\u2297', + 'cirE;': '\u29c3', + 'cire;': '\u2257', + 'cirfnint;': '\u2a10', + 'cirmid;': '\u2aef', + 'cirscir;': '\u29c2', + 'ClockwiseContourIntegral;': '\u2232', + 'CloseCurlyDoubleQuote;': '\u201d', + 'CloseCurlyQuote;': '\u2019', + 'clubs;': '\u2663', + 'clubsuit;': '\u2663', + 'Colon;': '\u2237', + 'colon;': ':', + 'Colone;': '\u2a74', + 'colone;': '\u2254', + 'coloneq;': '\u2254', + 'comma;': ',', + 'commat;': '@', + 'comp;': '\u2201', + 'compfn;': '\u2218', + 'complement;': '\u2201', + 'complexes;': '\u2102', + 'cong;': '\u2245', + 'congdot;': '\u2a6d', + 'Congruent;': '\u2261', + 'Conint;': '\u222f', + 'conint;': '\u222e', + 'ContourIntegral;': '\u222e', + 'Copf;': '\u2102', + 'copf;': '\U0001d554', + 'coprod;': '\u2210', + 'Coproduct;': '\u2210', + 'COPY': '\xa9', + 'copy': '\xa9', + 'COPY;': '\xa9', + 'copy;': '\xa9', + 'copysr;': '\u2117', + 'CounterClockwiseContourIntegral;': '\u2233', + 'crarr;': '\u21b5', + 'Cross;': '\u2a2f', + 'cross;': '\u2717', + 'Cscr;': '\U0001d49e', + 'cscr;': '\U0001d4b8', + 'csub;': '\u2acf', + 'csube;': '\u2ad1', + 'csup;': '\u2ad0', + 'csupe;': '\u2ad2', + 'ctdot;': '\u22ef', + 'cudarrl;': '\u2938', + 'cudarrr;': '\u2935', + 'cuepr;': '\u22de', + 'cuesc;': '\u22df', + 'cularr;': '\u21b6', + 'cularrp;': '\u293d', + 'Cup;': '\u22d3', + 'cup;': '\u222a', + 'cupbrcap;': '\u2a48', + 'CupCap;': '\u224d', + 'cupcap;': '\u2a46', + 'cupcup;': '\u2a4a', + 'cupdot;': '\u228d', + 'cupor;': '\u2a45', + 'cups;': '\u222a\ufe00', + 'curarr;': '\u21b7', + 'curarrm;': '\u293c', + 'curlyeqprec;': '\u22de', + 'curlyeqsucc;': '\u22df', + 'curlyvee;': '\u22ce', + 'curlywedge;': '\u22cf', + 'curren': '\xa4', + 'curren;': '\xa4', + 'curvearrowleft;': '\u21b6', + 'curvearrowright;': '\u21b7', + 'cuvee;': '\u22ce', + 'cuwed;': '\u22cf', + 'cwconint;': '\u2232', + 'cwint;': '\u2231', + 'cylcty;': '\u232d', + 'Dagger;': '\u2021', + 'dagger;': '\u2020', + 'daleth;': '\u2138', + 'Darr;': '\u21a1', + 'dArr;': '\u21d3', + 'darr;': '\u2193', + 'dash;': '\u2010', + 'Dashv;': '\u2ae4', + 'dashv;': '\u22a3', + 'dbkarow;': '\u290f', + 'dblac;': '\u02dd', + 'Dcaron;': '\u010e', + 'dcaron;': '\u010f', + 'Dcy;': '\u0414', + 'dcy;': '\u0434', + 'DD;': '\u2145', + 'dd;': '\u2146', + 'ddagger;': '\u2021', + 'ddarr;': '\u21ca', + 'DDotrahd;': '\u2911', + 'ddotseq;': '\u2a77', + 'deg': '\xb0', + 'deg;': '\xb0', + 'Del;': '\u2207', + 'Delta;': '\u0394', + 'delta;': '\u03b4', + 'demptyv;': '\u29b1', + 'dfisht;': '\u297f', + 'Dfr;': '\U0001d507', + 'dfr;': '\U0001d521', + 'dHar;': '\u2965', + 'dharl;': '\u21c3', + 'dharr;': '\u21c2', + 'DiacriticalAcute;': '\xb4', + 'DiacriticalDot;': '\u02d9', + 'DiacriticalDoubleAcute;': '\u02dd', + 'DiacriticalGrave;': '`', + 'DiacriticalTilde;': '\u02dc', + 'diam;': '\u22c4', + 'Diamond;': '\u22c4', + 'diamond;': '\u22c4', + 'diamondsuit;': '\u2666', + 'diams;': '\u2666', + 'die;': '\xa8', + 'DifferentialD;': '\u2146', + 'digamma;': '\u03dd', + 'disin;': '\u22f2', + 'div;': '\xf7', + 'divide': '\xf7', + 'divide;': '\xf7', + 'divideontimes;': '\u22c7', + 'divonx;': '\u22c7', + 'DJcy;': '\u0402', + 'djcy;': '\u0452', + 'dlcorn;': '\u231e', + 'dlcrop;': '\u230d', + 'dollar;': '$', + 'Dopf;': '\U0001d53b', + 'dopf;': '\U0001d555', + 'Dot;': '\xa8', + 'dot;': '\u02d9', + 'DotDot;': '\u20dc', + 'doteq;': '\u2250', + 'doteqdot;': '\u2251', + 'DotEqual;': '\u2250', + 'dotminus;': '\u2238', + 'dotplus;': '\u2214', + 'dotsquare;': '\u22a1', + 'doublebarwedge;': '\u2306', + 'DoubleContourIntegral;': '\u222f', + 'DoubleDot;': '\xa8', + 'DoubleDownArrow;': '\u21d3', + 'DoubleLeftArrow;': '\u21d0', + 'DoubleLeftRightArrow;': '\u21d4', + 'DoubleLeftTee;': '\u2ae4', + 'DoubleLongLeftArrow;': '\u27f8', + 'DoubleLongLeftRightArrow;': '\u27fa', + 'DoubleLongRightArrow;': '\u27f9', + 'DoubleRightArrow;': '\u21d2', + 'DoubleRightTee;': '\u22a8', + 'DoubleUpArrow;': '\u21d1', + 'DoubleUpDownArrow;': '\u21d5', + 'DoubleVerticalBar;': '\u2225', + 'DownArrow;': '\u2193', + 'Downarrow;': '\u21d3', + 'downarrow;': '\u2193', + 'DownArrowBar;': '\u2913', + 'DownArrowUpArrow;': '\u21f5', + 'DownBreve;': '\u0311', + 'downdownarrows;': '\u21ca', + 'downharpoonleft;': '\u21c3', + 'downharpoonright;': '\u21c2', + 'DownLeftRightVector;': '\u2950', + 'DownLeftTeeVector;': '\u295e', + 'DownLeftVector;': '\u21bd', + 'DownLeftVectorBar;': '\u2956', + 'DownRightTeeVector;': '\u295f', + 'DownRightVector;': '\u21c1', + 'DownRightVectorBar;': '\u2957', + 'DownTee;': '\u22a4', + 'DownTeeArrow;': '\u21a7', + 'drbkarow;': '\u2910', + 'drcorn;': '\u231f', + 'drcrop;': '\u230c', + 'Dscr;': '\U0001d49f', + 'dscr;': '\U0001d4b9', + 'DScy;': '\u0405', + 'dscy;': '\u0455', + 'dsol;': '\u29f6', + 'Dstrok;': '\u0110', + 'dstrok;': '\u0111', + 'dtdot;': '\u22f1', + 'dtri;': '\u25bf', + 'dtrif;': '\u25be', + 'duarr;': '\u21f5', + 'duhar;': '\u296f', + 'dwangle;': '\u29a6', + 'DZcy;': '\u040f', + 'dzcy;': '\u045f', + 'dzigrarr;': '\u27ff', + 'Eacute': '\xc9', + 'eacute': '\xe9', + 'Eacute;': '\xc9', + 'eacute;': '\xe9', + 'easter;': '\u2a6e', + 'Ecaron;': '\u011a', + 'ecaron;': '\u011b', + 'ecir;': '\u2256', + 'Ecirc': '\xca', + 'ecirc': '\xea', + 'Ecirc;': '\xca', + 'ecirc;': '\xea', + 'ecolon;': '\u2255', + 'Ecy;': '\u042d', + 'ecy;': '\u044d', + 'eDDot;': '\u2a77', + 'Edot;': '\u0116', + 'eDot;': '\u2251', + 'edot;': '\u0117', + 'ee;': '\u2147', + 'efDot;': '\u2252', + 'Efr;': '\U0001d508', + 'efr;': '\U0001d522', + 'eg;': '\u2a9a', + 'Egrave': '\xc8', + 'egrave': '\xe8', + 'Egrave;': '\xc8', + 'egrave;': '\xe8', + 'egs;': '\u2a96', + 'egsdot;': '\u2a98', + 'el;': '\u2a99', + 'Element;': '\u2208', + 'elinters;': '\u23e7', + 'ell;': '\u2113', + 'els;': '\u2a95', + 'elsdot;': '\u2a97', + 'Emacr;': '\u0112', + 'emacr;': '\u0113', + 'empty;': '\u2205', + 'emptyset;': '\u2205', + 'EmptySmallSquare;': '\u25fb', + 'emptyv;': '\u2205', + 'EmptyVerySmallSquare;': '\u25ab', + 'emsp13;': '\u2004', + 'emsp14;': '\u2005', + 'emsp;': '\u2003', + 'ENG;': '\u014a', + 'eng;': '\u014b', + 'ensp;': '\u2002', + 'Eogon;': '\u0118', + 'eogon;': '\u0119', + 'Eopf;': '\U0001d53c', + 'eopf;': '\U0001d556', + 'epar;': '\u22d5', + 'eparsl;': '\u29e3', + 'eplus;': '\u2a71', + 'epsi;': '\u03b5', + 'Epsilon;': '\u0395', + 'epsilon;': '\u03b5', + 'epsiv;': '\u03f5', + 'eqcirc;': '\u2256', + 'eqcolon;': '\u2255', + 'eqsim;': '\u2242', + 'eqslantgtr;': '\u2a96', + 'eqslantless;': '\u2a95', + 'Equal;': '\u2a75', + 'equals;': '=', + 'EqualTilde;': '\u2242', + 'equest;': '\u225f', + 'Equilibrium;': '\u21cc', + 'equiv;': '\u2261', + 'equivDD;': '\u2a78', + 'eqvparsl;': '\u29e5', + 'erarr;': '\u2971', + 'erDot;': '\u2253', + 'Escr;': '\u2130', + 'escr;': '\u212f', + 'esdot;': '\u2250', + 'Esim;': '\u2a73', + 'esim;': '\u2242', + 'Eta;': '\u0397', + 'eta;': '\u03b7', + 'ETH': '\xd0', + 'eth': '\xf0', + 'ETH;': '\xd0', + 'eth;': '\xf0', + 'Euml': '\xcb', + 'euml': '\xeb', + 'Euml;': '\xcb', + 'euml;': '\xeb', + 'euro;': '\u20ac', + 'excl;': '!', + 'exist;': '\u2203', + 'Exists;': '\u2203', + 'expectation;': '\u2130', + 'ExponentialE;': '\u2147', + 'exponentiale;': '\u2147', + 'fallingdotseq;': '\u2252', + 'Fcy;': '\u0424', + 'fcy;': '\u0444', + 'female;': '\u2640', + 'ffilig;': '\ufb03', + 'fflig;': '\ufb00', + 'ffllig;': '\ufb04', + 'Ffr;': '\U0001d509', + 'ffr;': '\U0001d523', + 'filig;': '\ufb01', + 'FilledSmallSquare;': '\u25fc', + 'FilledVerySmallSquare;': '\u25aa', + 'fjlig;': 'fj', + 'flat;': '\u266d', + 'fllig;': '\ufb02', + 'fltns;': '\u25b1', + 'fnof;': '\u0192', + 'Fopf;': '\U0001d53d', + 'fopf;': '\U0001d557', + 'ForAll;': '\u2200', + 'forall;': '\u2200', + 'fork;': '\u22d4', + 'forkv;': '\u2ad9', + 'Fouriertrf;': '\u2131', + 'fpartint;': '\u2a0d', + 'frac12': '\xbd', + 'frac12;': '\xbd', + 'frac13;': '\u2153', + 'frac14': '\xbc', + 'frac14;': '\xbc', + 'frac15;': '\u2155', + 'frac16;': '\u2159', + 'frac18;': '\u215b', + 'frac23;': '\u2154', + 'frac25;': '\u2156', + 'frac34': '\xbe', + 'frac34;': '\xbe', + 'frac35;': '\u2157', + 'frac38;': '\u215c', + 'frac45;': '\u2158', + 'frac56;': '\u215a', + 'frac58;': '\u215d', + 'frac78;': '\u215e', + 'frasl;': '\u2044', + 'frown;': '\u2322', + 'Fscr;': '\u2131', + 'fscr;': '\U0001d4bb', + 'gacute;': '\u01f5', + 'Gamma;': '\u0393', + 'gamma;': '\u03b3', + 'Gammad;': '\u03dc', + 'gammad;': '\u03dd', + 'gap;': '\u2a86', + 'Gbreve;': '\u011e', + 'gbreve;': '\u011f', + 'Gcedil;': '\u0122', + 'Gcirc;': '\u011c', + 'gcirc;': '\u011d', + 'Gcy;': '\u0413', + 'gcy;': '\u0433', + 'Gdot;': '\u0120', + 'gdot;': '\u0121', + 'gE;': '\u2267', + 'ge;': '\u2265', + 'gEl;': '\u2a8c', + 'gel;': '\u22db', + 'geq;': '\u2265', + 'geqq;': '\u2267', + 'geqslant;': '\u2a7e', + 'ges;': '\u2a7e', + 'gescc;': '\u2aa9', + 'gesdot;': '\u2a80', + 'gesdoto;': '\u2a82', + 'gesdotol;': '\u2a84', + 'gesl;': '\u22db\ufe00', + 'gesles;': '\u2a94', + 'Gfr;': '\U0001d50a', + 'gfr;': '\U0001d524', + 'Gg;': '\u22d9', + 'gg;': '\u226b', + 'ggg;': '\u22d9', + 'gimel;': '\u2137', + 'GJcy;': '\u0403', + 'gjcy;': '\u0453', + 'gl;': '\u2277', + 'gla;': '\u2aa5', + 'glE;': '\u2a92', + 'glj;': '\u2aa4', + 'gnap;': '\u2a8a', + 'gnapprox;': '\u2a8a', + 'gnE;': '\u2269', + 'gne;': '\u2a88', + 'gneq;': '\u2a88', + 'gneqq;': '\u2269', + 'gnsim;': '\u22e7', + 'Gopf;': '\U0001d53e', + 'gopf;': '\U0001d558', + 'grave;': '`', + 'GreaterEqual;': '\u2265', + 'GreaterEqualLess;': '\u22db', + 'GreaterFullEqual;': '\u2267', + 'GreaterGreater;': '\u2aa2', + 'GreaterLess;': '\u2277', + 'GreaterSlantEqual;': '\u2a7e', + 'GreaterTilde;': '\u2273', + 'Gscr;': '\U0001d4a2', + 'gscr;': '\u210a', + 'gsim;': '\u2273', + 'gsime;': '\u2a8e', + 'gsiml;': '\u2a90', + 'GT': '>', + 'gt': '>', + 'GT;': '>', + 'Gt;': '\u226b', + 'gt;': '>', + 'gtcc;': '\u2aa7', + 'gtcir;': '\u2a7a', + 'gtdot;': '\u22d7', + 'gtlPar;': '\u2995', + 'gtquest;': '\u2a7c', + 'gtrapprox;': '\u2a86', + 'gtrarr;': '\u2978', + 'gtrdot;': '\u22d7', + 'gtreqless;': '\u22db', + 'gtreqqless;': '\u2a8c', + 'gtrless;': '\u2277', + 'gtrsim;': '\u2273', + 'gvertneqq;': '\u2269\ufe00', + 'gvnE;': '\u2269\ufe00', + 'Hacek;': '\u02c7', + 'hairsp;': '\u200a', + 'half;': '\xbd', + 'hamilt;': '\u210b', + 'HARDcy;': '\u042a', + 'hardcy;': '\u044a', + 'hArr;': '\u21d4', + 'harr;': '\u2194', + 'harrcir;': '\u2948', + 'harrw;': '\u21ad', + 'Hat;': '^', + 'hbar;': '\u210f', + 'Hcirc;': '\u0124', + 'hcirc;': '\u0125', + 'hearts;': '\u2665', + 'heartsuit;': '\u2665', + 'hellip;': '\u2026', + 'hercon;': '\u22b9', + 'Hfr;': '\u210c', + 'hfr;': '\U0001d525', + 'HilbertSpace;': '\u210b', + 'hksearow;': '\u2925', + 'hkswarow;': '\u2926', + 'hoarr;': '\u21ff', + 'homtht;': '\u223b', + 'hookleftarrow;': '\u21a9', + 'hookrightarrow;': '\u21aa', + 'Hopf;': '\u210d', + 'hopf;': '\U0001d559', + 'horbar;': '\u2015', + 'HorizontalLine;': '\u2500', + 'Hscr;': '\u210b', + 'hscr;': '\U0001d4bd', + 'hslash;': '\u210f', + 'Hstrok;': '\u0126', + 'hstrok;': '\u0127', + 'HumpDownHump;': '\u224e', + 'HumpEqual;': '\u224f', + 'hybull;': '\u2043', + 'hyphen;': '\u2010', + 'Iacute': '\xcd', + 'iacute': '\xed', + 'Iacute;': '\xcd', + 'iacute;': '\xed', + 'ic;': '\u2063', + 'Icirc': '\xce', + 'icirc': '\xee', + 'Icirc;': '\xce', + 'icirc;': '\xee', + 'Icy;': '\u0418', + 'icy;': '\u0438', + 'Idot;': '\u0130', + 'IEcy;': '\u0415', + 'iecy;': '\u0435', + 'iexcl': '\xa1', + 'iexcl;': '\xa1', + 'iff;': '\u21d4', + 'Ifr;': '\u2111', + 'ifr;': '\U0001d526', + 'Igrave': '\xcc', + 'igrave': '\xec', + 'Igrave;': '\xcc', + 'igrave;': '\xec', + 'ii;': '\u2148', + 'iiiint;': '\u2a0c', + 'iiint;': '\u222d', + 'iinfin;': '\u29dc', + 'iiota;': '\u2129', + 'IJlig;': '\u0132', + 'ijlig;': '\u0133', + 'Im;': '\u2111', + 'Imacr;': '\u012a', + 'imacr;': '\u012b', + 'image;': '\u2111', + 'ImaginaryI;': '\u2148', + 'imagline;': '\u2110', + 'imagpart;': '\u2111', + 'imath;': '\u0131', + 'imof;': '\u22b7', + 'imped;': '\u01b5', + 'Implies;': '\u21d2', + 'in;': '\u2208', + 'incare;': '\u2105', + 'infin;': '\u221e', + 'infintie;': '\u29dd', + 'inodot;': '\u0131', + 'Int;': '\u222c', + 'int;': '\u222b', + 'intcal;': '\u22ba', + 'integers;': '\u2124', + 'Integral;': '\u222b', + 'intercal;': '\u22ba', + 'Intersection;': '\u22c2', + 'intlarhk;': '\u2a17', + 'intprod;': '\u2a3c', + 'InvisibleComma;': '\u2063', + 'InvisibleTimes;': '\u2062', + 'IOcy;': '\u0401', + 'iocy;': '\u0451', + 'Iogon;': '\u012e', + 'iogon;': '\u012f', + 'Iopf;': '\U0001d540', + 'iopf;': '\U0001d55a', + 'Iota;': '\u0399', + 'iota;': '\u03b9', + 'iprod;': '\u2a3c', + 'iquest': '\xbf', + 'iquest;': '\xbf', + 'Iscr;': '\u2110', + 'iscr;': '\U0001d4be', + 'isin;': '\u2208', + 'isindot;': '\u22f5', + 'isinE;': '\u22f9', + 'isins;': '\u22f4', + 'isinsv;': '\u22f3', + 'isinv;': '\u2208', + 'it;': '\u2062', + 'Itilde;': '\u0128', + 'itilde;': '\u0129', + 'Iukcy;': '\u0406', + 'iukcy;': '\u0456', + 'Iuml': '\xcf', + 'iuml': '\xef', + 'Iuml;': '\xcf', + 'iuml;': '\xef', + 'Jcirc;': '\u0134', + 'jcirc;': '\u0135', + 'Jcy;': '\u0419', + 'jcy;': '\u0439', + 'Jfr;': '\U0001d50d', + 'jfr;': '\U0001d527', + 'jmath;': '\u0237', + 'Jopf;': '\U0001d541', + 'jopf;': '\U0001d55b', + 'Jscr;': '\U0001d4a5', + 'jscr;': '\U0001d4bf', + 'Jsercy;': '\u0408', + 'jsercy;': '\u0458', + 'Jukcy;': '\u0404', + 'jukcy;': '\u0454', + 'Kappa;': '\u039a', + 'kappa;': '\u03ba', + 'kappav;': '\u03f0', + 'Kcedil;': '\u0136', + 'kcedil;': '\u0137', + 'Kcy;': '\u041a', + 'kcy;': '\u043a', + 'Kfr;': '\U0001d50e', + 'kfr;': '\U0001d528', + 'kgreen;': '\u0138', + 'KHcy;': '\u0425', + 'khcy;': '\u0445', + 'KJcy;': '\u040c', + 'kjcy;': '\u045c', + 'Kopf;': '\U0001d542', + 'kopf;': '\U0001d55c', + 'Kscr;': '\U0001d4a6', + 'kscr;': '\U0001d4c0', + 'lAarr;': '\u21da', + 'Lacute;': '\u0139', + 'lacute;': '\u013a', + 'laemptyv;': '\u29b4', + 'lagran;': '\u2112', + 'Lambda;': '\u039b', + 'lambda;': '\u03bb', + 'Lang;': '\u27ea', + 'lang;': '\u27e8', + 'langd;': '\u2991', + 'langle;': '\u27e8', + 'lap;': '\u2a85', + 'Laplacetrf;': '\u2112', + 'laquo': '\xab', + 'laquo;': '\xab', + 'Larr;': '\u219e', + 'lArr;': '\u21d0', + 'larr;': '\u2190', + 'larrb;': '\u21e4', + 'larrbfs;': '\u291f', + 'larrfs;': '\u291d', + 'larrhk;': '\u21a9', + 'larrlp;': '\u21ab', + 'larrpl;': '\u2939', + 'larrsim;': '\u2973', + 'larrtl;': '\u21a2', + 'lat;': '\u2aab', + 'lAtail;': '\u291b', + 'latail;': '\u2919', + 'late;': '\u2aad', + 'lates;': '\u2aad\ufe00', + 'lBarr;': '\u290e', + 'lbarr;': '\u290c', + 'lbbrk;': '\u2772', + 'lbrace;': '{', + 'lbrack;': '[', + 'lbrke;': '\u298b', + 'lbrksld;': '\u298f', + 'lbrkslu;': '\u298d', + 'Lcaron;': '\u013d', + 'lcaron;': '\u013e', + 'Lcedil;': '\u013b', + 'lcedil;': '\u013c', + 'lceil;': '\u2308', + 'lcub;': '{', + 'Lcy;': '\u041b', + 'lcy;': '\u043b', + 'ldca;': '\u2936', + 'ldquo;': '\u201c', + 'ldquor;': '\u201e', + 'ldrdhar;': '\u2967', + 'ldrushar;': '\u294b', + 'ldsh;': '\u21b2', + 'lE;': '\u2266', + 'le;': '\u2264', + 'LeftAngleBracket;': '\u27e8', + 'LeftArrow;': '\u2190', + 'Leftarrow;': '\u21d0', + 'leftarrow;': '\u2190', + 'LeftArrowBar;': '\u21e4', + 'LeftArrowRightArrow;': '\u21c6', + 'leftarrowtail;': '\u21a2', + 'LeftCeiling;': '\u2308', + 'LeftDoubleBracket;': '\u27e6', + 'LeftDownTeeVector;': '\u2961', + 'LeftDownVector;': '\u21c3', + 'LeftDownVectorBar;': '\u2959', + 'LeftFloor;': '\u230a', + 'leftharpoondown;': '\u21bd', + 'leftharpoonup;': '\u21bc', + 'leftleftarrows;': '\u21c7', + 'LeftRightArrow;': '\u2194', + 'Leftrightarrow;': '\u21d4', + 'leftrightarrow;': '\u2194', + 'leftrightarrows;': '\u21c6', + 'leftrightharpoons;': '\u21cb', + 'leftrightsquigarrow;': '\u21ad', + 'LeftRightVector;': '\u294e', + 'LeftTee;': '\u22a3', + 'LeftTeeArrow;': '\u21a4', + 'LeftTeeVector;': '\u295a', + 'leftthreetimes;': '\u22cb', + 'LeftTriangle;': '\u22b2', + 'LeftTriangleBar;': '\u29cf', + 'LeftTriangleEqual;': '\u22b4', + 'LeftUpDownVector;': '\u2951', + 'LeftUpTeeVector;': '\u2960', + 'LeftUpVector;': '\u21bf', + 'LeftUpVectorBar;': '\u2958', + 'LeftVector;': '\u21bc', + 'LeftVectorBar;': '\u2952', + 'lEg;': '\u2a8b', + 'leg;': '\u22da', + 'leq;': '\u2264', + 'leqq;': '\u2266', + 'leqslant;': '\u2a7d', + 'les;': '\u2a7d', + 'lescc;': '\u2aa8', + 'lesdot;': '\u2a7f', + 'lesdoto;': '\u2a81', + 'lesdotor;': '\u2a83', + 'lesg;': '\u22da\ufe00', + 'lesges;': '\u2a93', + 'lessapprox;': '\u2a85', + 'lessdot;': '\u22d6', + 'lesseqgtr;': '\u22da', + 'lesseqqgtr;': '\u2a8b', + 'LessEqualGreater;': '\u22da', + 'LessFullEqual;': '\u2266', + 'LessGreater;': '\u2276', + 'lessgtr;': '\u2276', + 'LessLess;': '\u2aa1', + 'lesssim;': '\u2272', + 'LessSlantEqual;': '\u2a7d', + 'LessTilde;': '\u2272', + 'lfisht;': '\u297c', + 'lfloor;': '\u230a', + 'Lfr;': '\U0001d50f', + 'lfr;': '\U0001d529', + 'lg;': '\u2276', + 'lgE;': '\u2a91', + 'lHar;': '\u2962', + 'lhard;': '\u21bd', + 'lharu;': '\u21bc', + 'lharul;': '\u296a', + 'lhblk;': '\u2584', + 'LJcy;': '\u0409', + 'ljcy;': '\u0459', + 'Ll;': '\u22d8', + 'll;': '\u226a', + 'llarr;': '\u21c7', + 'llcorner;': '\u231e', + 'Lleftarrow;': '\u21da', + 'llhard;': '\u296b', + 'lltri;': '\u25fa', + 'Lmidot;': '\u013f', + 'lmidot;': '\u0140', + 'lmoust;': '\u23b0', + 'lmoustache;': '\u23b0', + 'lnap;': '\u2a89', + 'lnapprox;': '\u2a89', + 'lnE;': '\u2268', + 'lne;': '\u2a87', + 'lneq;': '\u2a87', + 'lneqq;': '\u2268', + 'lnsim;': '\u22e6', + 'loang;': '\u27ec', + 'loarr;': '\u21fd', + 'lobrk;': '\u27e6', + 'LongLeftArrow;': '\u27f5', + 'Longleftarrow;': '\u27f8', + 'longleftarrow;': '\u27f5', + 'LongLeftRightArrow;': '\u27f7', + 'Longleftrightarrow;': '\u27fa', + 'longleftrightarrow;': '\u27f7', + 'longmapsto;': '\u27fc', + 'LongRightArrow;': '\u27f6', + 'Longrightarrow;': '\u27f9', + 'longrightarrow;': '\u27f6', + 'looparrowleft;': '\u21ab', + 'looparrowright;': '\u21ac', + 'lopar;': '\u2985', + 'Lopf;': '\U0001d543', + 'lopf;': '\U0001d55d', + 'loplus;': '\u2a2d', + 'lotimes;': '\u2a34', + 'lowast;': '\u2217', + 'lowbar;': '_', + 'LowerLeftArrow;': '\u2199', + 'LowerRightArrow;': '\u2198', + 'loz;': '\u25ca', + 'lozenge;': '\u25ca', + 'lozf;': '\u29eb', + 'lpar;': '(', + 'lparlt;': '\u2993', + 'lrarr;': '\u21c6', + 'lrcorner;': '\u231f', + 'lrhar;': '\u21cb', + 'lrhard;': '\u296d', + 'lrm;': '\u200e', + 'lrtri;': '\u22bf', + 'lsaquo;': '\u2039', + 'Lscr;': '\u2112', + 'lscr;': '\U0001d4c1', + 'Lsh;': '\u21b0', + 'lsh;': '\u21b0', + 'lsim;': '\u2272', + 'lsime;': '\u2a8d', + 'lsimg;': '\u2a8f', + 'lsqb;': '[', + 'lsquo;': '\u2018', + 'lsquor;': '\u201a', + 'Lstrok;': '\u0141', + 'lstrok;': '\u0142', + 'LT': '<', + 'lt': '<', + 'LT;': '<', + 'Lt;': '\u226a', + 'lt;': '<', + 'ltcc;': '\u2aa6', + 'ltcir;': '\u2a79', + 'ltdot;': '\u22d6', + 'lthree;': '\u22cb', + 'ltimes;': '\u22c9', + 'ltlarr;': '\u2976', + 'ltquest;': '\u2a7b', + 'ltri;': '\u25c3', + 'ltrie;': '\u22b4', + 'ltrif;': '\u25c2', + 'ltrPar;': '\u2996', + 'lurdshar;': '\u294a', + 'luruhar;': '\u2966', + 'lvertneqq;': '\u2268\ufe00', + 'lvnE;': '\u2268\ufe00', + 'macr': '\xaf', + 'macr;': '\xaf', + 'male;': '\u2642', + 'malt;': '\u2720', + 'maltese;': '\u2720', + 'Map;': '\u2905', + 'map;': '\u21a6', + 'mapsto;': '\u21a6', + 'mapstodown;': '\u21a7', + 'mapstoleft;': '\u21a4', + 'mapstoup;': '\u21a5', + 'marker;': '\u25ae', + 'mcomma;': '\u2a29', + 'Mcy;': '\u041c', + 'mcy;': '\u043c', + 'mdash;': '\u2014', + 'mDDot;': '\u223a', + 'measuredangle;': '\u2221', + 'MediumSpace;': '\u205f', + 'Mellintrf;': '\u2133', + 'Mfr;': '\U0001d510', + 'mfr;': '\U0001d52a', + 'mho;': '\u2127', + 'micro': '\xb5', + 'micro;': '\xb5', + 'mid;': '\u2223', + 'midast;': '*', + 'midcir;': '\u2af0', + 'middot': '\xb7', + 'middot;': '\xb7', + 'minus;': '\u2212', + 'minusb;': '\u229f', + 'minusd;': '\u2238', + 'minusdu;': '\u2a2a', + 'MinusPlus;': '\u2213', + 'mlcp;': '\u2adb', + 'mldr;': '\u2026', + 'mnplus;': '\u2213', + 'models;': '\u22a7', + 'Mopf;': '\U0001d544', + 'mopf;': '\U0001d55e', + 'mp;': '\u2213', + 'Mscr;': '\u2133', + 'mscr;': '\U0001d4c2', + 'mstpos;': '\u223e', + 'Mu;': '\u039c', + 'mu;': '\u03bc', + 'multimap;': '\u22b8', + 'mumap;': '\u22b8', + 'nabla;': '\u2207', + 'Nacute;': '\u0143', + 'nacute;': '\u0144', + 'nang;': '\u2220\u20d2', + 'nap;': '\u2249', + 'napE;': '\u2a70\u0338', + 'napid;': '\u224b\u0338', + 'napos;': '\u0149', + 'napprox;': '\u2249', + 'natur;': '\u266e', + 'natural;': '\u266e', + 'naturals;': '\u2115', + 'nbsp': '\xa0', + 'nbsp;': '\xa0', + 'nbump;': '\u224e\u0338', + 'nbumpe;': '\u224f\u0338', + 'ncap;': '\u2a43', + 'Ncaron;': '\u0147', + 'ncaron;': '\u0148', + 'Ncedil;': '\u0145', + 'ncedil;': '\u0146', + 'ncong;': '\u2247', + 'ncongdot;': '\u2a6d\u0338', + 'ncup;': '\u2a42', + 'Ncy;': '\u041d', + 'ncy;': '\u043d', + 'ndash;': '\u2013', + 'ne;': '\u2260', + 'nearhk;': '\u2924', + 'neArr;': '\u21d7', + 'nearr;': '\u2197', + 'nearrow;': '\u2197', + 'nedot;': '\u2250\u0338', + 'NegativeMediumSpace;': '\u200b', + 'NegativeThickSpace;': '\u200b', + 'NegativeThinSpace;': '\u200b', + 'NegativeVeryThinSpace;': '\u200b', + 'nequiv;': '\u2262', + 'nesear;': '\u2928', + 'nesim;': '\u2242\u0338', + 'NestedGreaterGreater;': '\u226b', + 'NestedLessLess;': '\u226a', + 'NewLine;': '\n', + 'nexist;': '\u2204', + 'nexists;': '\u2204', + 'Nfr;': '\U0001d511', + 'nfr;': '\U0001d52b', + 'ngE;': '\u2267\u0338', + 'nge;': '\u2271', + 'ngeq;': '\u2271', + 'ngeqq;': '\u2267\u0338', + 'ngeqslant;': '\u2a7e\u0338', + 'nges;': '\u2a7e\u0338', + 'nGg;': '\u22d9\u0338', + 'ngsim;': '\u2275', + 'nGt;': '\u226b\u20d2', + 'ngt;': '\u226f', + 'ngtr;': '\u226f', + 'nGtv;': '\u226b\u0338', + 'nhArr;': '\u21ce', + 'nharr;': '\u21ae', + 'nhpar;': '\u2af2', + 'ni;': '\u220b', + 'nis;': '\u22fc', + 'nisd;': '\u22fa', + 'niv;': '\u220b', + 'NJcy;': '\u040a', + 'njcy;': '\u045a', + 'nlArr;': '\u21cd', + 'nlarr;': '\u219a', + 'nldr;': '\u2025', + 'nlE;': '\u2266\u0338', + 'nle;': '\u2270', + 'nLeftarrow;': '\u21cd', + 'nleftarrow;': '\u219a', + 'nLeftrightarrow;': '\u21ce', + 'nleftrightarrow;': '\u21ae', + 'nleq;': '\u2270', + 'nleqq;': '\u2266\u0338', + 'nleqslant;': '\u2a7d\u0338', + 'nles;': '\u2a7d\u0338', + 'nless;': '\u226e', + 'nLl;': '\u22d8\u0338', + 'nlsim;': '\u2274', + 'nLt;': '\u226a\u20d2', + 'nlt;': '\u226e', + 'nltri;': '\u22ea', + 'nltrie;': '\u22ec', + 'nLtv;': '\u226a\u0338', + 'nmid;': '\u2224', + 'NoBreak;': '\u2060', + 'NonBreakingSpace;': '\xa0', + 'Nopf;': '\u2115', + 'nopf;': '\U0001d55f', + 'not': '\xac', + 'Not;': '\u2aec', + 'not;': '\xac', + 'NotCongruent;': '\u2262', + 'NotCupCap;': '\u226d', + 'NotDoubleVerticalBar;': '\u2226', + 'NotElement;': '\u2209', + 'NotEqual;': '\u2260', + 'NotEqualTilde;': '\u2242\u0338', + 'NotExists;': '\u2204', + 'NotGreater;': '\u226f', + 'NotGreaterEqual;': '\u2271', + 'NotGreaterFullEqual;': '\u2267\u0338', + 'NotGreaterGreater;': '\u226b\u0338', + 'NotGreaterLess;': '\u2279', + 'NotGreaterSlantEqual;': '\u2a7e\u0338', + 'NotGreaterTilde;': '\u2275', + 'NotHumpDownHump;': '\u224e\u0338', + 'NotHumpEqual;': '\u224f\u0338', + 'notin;': '\u2209', + 'notindot;': '\u22f5\u0338', + 'notinE;': '\u22f9\u0338', + 'notinva;': '\u2209', + 'notinvb;': '\u22f7', + 'notinvc;': '\u22f6', + 'NotLeftTriangle;': '\u22ea', + 'NotLeftTriangleBar;': '\u29cf\u0338', + 'NotLeftTriangleEqual;': '\u22ec', + 'NotLess;': '\u226e', + 'NotLessEqual;': '\u2270', + 'NotLessGreater;': '\u2278', + 'NotLessLess;': '\u226a\u0338', + 'NotLessSlantEqual;': '\u2a7d\u0338', + 'NotLessTilde;': '\u2274', + 'NotNestedGreaterGreater;': '\u2aa2\u0338', + 'NotNestedLessLess;': '\u2aa1\u0338', + 'notni;': '\u220c', + 'notniva;': '\u220c', + 'notnivb;': '\u22fe', + 'notnivc;': '\u22fd', + 'NotPrecedes;': '\u2280', + 'NotPrecedesEqual;': '\u2aaf\u0338', + 'NotPrecedesSlantEqual;': '\u22e0', + 'NotReverseElement;': '\u220c', + 'NotRightTriangle;': '\u22eb', + 'NotRightTriangleBar;': '\u29d0\u0338', + 'NotRightTriangleEqual;': '\u22ed', + 'NotSquareSubset;': '\u228f\u0338', + 'NotSquareSubsetEqual;': '\u22e2', + 'NotSquareSuperset;': '\u2290\u0338', + 'NotSquareSupersetEqual;': '\u22e3', + 'NotSubset;': '\u2282\u20d2', + 'NotSubsetEqual;': '\u2288', + 'NotSucceeds;': '\u2281', + 'NotSucceedsEqual;': '\u2ab0\u0338', + 'NotSucceedsSlantEqual;': '\u22e1', + 'NotSucceedsTilde;': '\u227f\u0338', + 'NotSuperset;': '\u2283\u20d2', + 'NotSupersetEqual;': '\u2289', + 'NotTilde;': '\u2241', + 'NotTildeEqual;': '\u2244', + 'NotTildeFullEqual;': '\u2247', + 'NotTildeTilde;': '\u2249', + 'NotVerticalBar;': '\u2224', + 'npar;': '\u2226', + 'nparallel;': '\u2226', + 'nparsl;': '\u2afd\u20e5', + 'npart;': '\u2202\u0338', + 'npolint;': '\u2a14', + 'npr;': '\u2280', + 'nprcue;': '\u22e0', + 'npre;': '\u2aaf\u0338', + 'nprec;': '\u2280', + 'npreceq;': '\u2aaf\u0338', + 'nrArr;': '\u21cf', + 'nrarr;': '\u219b', + 'nrarrc;': '\u2933\u0338', + 'nrarrw;': '\u219d\u0338', + 'nRightarrow;': '\u21cf', + 'nrightarrow;': '\u219b', + 'nrtri;': '\u22eb', + 'nrtrie;': '\u22ed', + 'nsc;': '\u2281', + 'nsccue;': '\u22e1', + 'nsce;': '\u2ab0\u0338', + 'Nscr;': '\U0001d4a9', + 'nscr;': '\U0001d4c3', + 'nshortmid;': '\u2224', + 'nshortparallel;': '\u2226', + 'nsim;': '\u2241', + 'nsime;': '\u2244', + 'nsimeq;': '\u2244', + 'nsmid;': '\u2224', + 'nspar;': '\u2226', + 'nsqsube;': '\u22e2', + 'nsqsupe;': '\u22e3', + 'nsub;': '\u2284', + 'nsubE;': '\u2ac5\u0338', + 'nsube;': '\u2288', + 'nsubset;': '\u2282\u20d2', + 'nsubseteq;': '\u2288', + 'nsubseteqq;': '\u2ac5\u0338', + 'nsucc;': '\u2281', + 'nsucceq;': '\u2ab0\u0338', + 'nsup;': '\u2285', + 'nsupE;': '\u2ac6\u0338', + 'nsupe;': '\u2289', + 'nsupset;': '\u2283\u20d2', + 'nsupseteq;': '\u2289', + 'nsupseteqq;': '\u2ac6\u0338', + 'ntgl;': '\u2279', + 'Ntilde': '\xd1', + 'ntilde': '\xf1', + 'Ntilde;': '\xd1', + 'ntilde;': '\xf1', + 'ntlg;': '\u2278', + 'ntriangleleft;': '\u22ea', + 'ntrianglelefteq;': '\u22ec', + 'ntriangleright;': '\u22eb', + 'ntrianglerighteq;': '\u22ed', + 'Nu;': '\u039d', + 'nu;': '\u03bd', + 'num;': '#', + 'numero;': '\u2116', + 'numsp;': '\u2007', + 'nvap;': '\u224d\u20d2', + 'nVDash;': '\u22af', + 'nVdash;': '\u22ae', + 'nvDash;': '\u22ad', + 'nvdash;': '\u22ac', + 'nvge;': '\u2265\u20d2', + 'nvgt;': '>\u20d2', + 'nvHarr;': '\u2904', + 'nvinfin;': '\u29de', + 'nvlArr;': '\u2902', + 'nvle;': '\u2264\u20d2', + 'nvlt;': '<\u20d2', + 'nvltrie;': '\u22b4\u20d2', + 'nvrArr;': '\u2903', + 'nvrtrie;': '\u22b5\u20d2', + 'nvsim;': '\u223c\u20d2', + 'nwarhk;': '\u2923', + 'nwArr;': '\u21d6', + 'nwarr;': '\u2196', + 'nwarrow;': '\u2196', + 'nwnear;': '\u2927', + 'Oacute': '\xd3', + 'oacute': '\xf3', + 'Oacute;': '\xd3', + 'oacute;': '\xf3', + 'oast;': '\u229b', + 'ocir;': '\u229a', + 'Ocirc': '\xd4', + 'ocirc': '\xf4', + 'Ocirc;': '\xd4', + 'ocirc;': '\xf4', + 'Ocy;': '\u041e', + 'ocy;': '\u043e', + 'odash;': '\u229d', + 'Odblac;': '\u0150', + 'odblac;': '\u0151', + 'odiv;': '\u2a38', + 'odot;': '\u2299', + 'odsold;': '\u29bc', + 'OElig;': '\u0152', + 'oelig;': '\u0153', + 'ofcir;': '\u29bf', + 'Ofr;': '\U0001d512', + 'ofr;': '\U0001d52c', + 'ogon;': '\u02db', + 'Ograve': '\xd2', + 'ograve': '\xf2', + 'Ograve;': '\xd2', + 'ograve;': '\xf2', + 'ogt;': '\u29c1', + 'ohbar;': '\u29b5', + 'ohm;': '\u03a9', + 'oint;': '\u222e', + 'olarr;': '\u21ba', + 'olcir;': '\u29be', + 'olcross;': '\u29bb', + 'oline;': '\u203e', + 'olt;': '\u29c0', + 'Omacr;': '\u014c', + 'omacr;': '\u014d', + 'Omega;': '\u03a9', + 'omega;': '\u03c9', + 'Omicron;': '\u039f', + 'omicron;': '\u03bf', + 'omid;': '\u29b6', + 'ominus;': '\u2296', + 'Oopf;': '\U0001d546', + 'oopf;': '\U0001d560', + 'opar;': '\u29b7', + 'OpenCurlyDoubleQuote;': '\u201c', + 'OpenCurlyQuote;': '\u2018', + 'operp;': '\u29b9', + 'oplus;': '\u2295', + 'Or;': '\u2a54', + 'or;': '\u2228', + 'orarr;': '\u21bb', + 'ord;': '\u2a5d', + 'order;': '\u2134', + 'orderof;': '\u2134', + 'ordf': '\xaa', + 'ordf;': '\xaa', + 'ordm': '\xba', + 'ordm;': '\xba', + 'origof;': '\u22b6', + 'oror;': '\u2a56', + 'orslope;': '\u2a57', + 'orv;': '\u2a5b', + 'oS;': '\u24c8', + 'Oscr;': '\U0001d4aa', + 'oscr;': '\u2134', + 'Oslash': '\xd8', + 'oslash': '\xf8', + 'Oslash;': '\xd8', + 'oslash;': '\xf8', + 'osol;': '\u2298', + 'Otilde': '\xd5', + 'otilde': '\xf5', + 'Otilde;': '\xd5', + 'otilde;': '\xf5', + 'Otimes;': '\u2a37', + 'otimes;': '\u2297', + 'otimesas;': '\u2a36', + 'Ouml': '\xd6', + 'ouml': '\xf6', + 'Ouml;': '\xd6', + 'ouml;': '\xf6', + 'ovbar;': '\u233d', + 'OverBar;': '\u203e', + 'OverBrace;': '\u23de', + 'OverBracket;': '\u23b4', + 'OverParenthesis;': '\u23dc', + 'par;': '\u2225', + 'para': '\xb6', + 'para;': '\xb6', + 'parallel;': '\u2225', + 'parsim;': '\u2af3', + 'parsl;': '\u2afd', + 'part;': '\u2202', + 'PartialD;': '\u2202', + 'Pcy;': '\u041f', + 'pcy;': '\u043f', + 'percnt;': '%', + 'period;': '.', + 'permil;': '\u2030', + 'perp;': '\u22a5', + 'pertenk;': '\u2031', + 'Pfr;': '\U0001d513', + 'pfr;': '\U0001d52d', + 'Phi;': '\u03a6', + 'phi;': '\u03c6', + 'phiv;': '\u03d5', + 'phmmat;': '\u2133', + 'phone;': '\u260e', + 'Pi;': '\u03a0', + 'pi;': '\u03c0', + 'pitchfork;': '\u22d4', + 'piv;': '\u03d6', + 'planck;': '\u210f', + 'planckh;': '\u210e', + 'plankv;': '\u210f', + 'plus;': '+', + 'plusacir;': '\u2a23', + 'plusb;': '\u229e', + 'pluscir;': '\u2a22', + 'plusdo;': '\u2214', + 'plusdu;': '\u2a25', + 'pluse;': '\u2a72', + 'PlusMinus;': '\xb1', + 'plusmn': '\xb1', + 'plusmn;': '\xb1', + 'plussim;': '\u2a26', + 'plustwo;': '\u2a27', + 'pm;': '\xb1', + 'Poincareplane;': '\u210c', + 'pointint;': '\u2a15', + 'Popf;': '\u2119', + 'popf;': '\U0001d561', + 'pound': '\xa3', + 'pound;': '\xa3', + 'Pr;': '\u2abb', + 'pr;': '\u227a', + 'prap;': '\u2ab7', + 'prcue;': '\u227c', + 'prE;': '\u2ab3', + 'pre;': '\u2aaf', + 'prec;': '\u227a', + 'precapprox;': '\u2ab7', + 'preccurlyeq;': '\u227c', + 'Precedes;': '\u227a', + 'PrecedesEqual;': '\u2aaf', + 'PrecedesSlantEqual;': '\u227c', + 'PrecedesTilde;': '\u227e', + 'preceq;': '\u2aaf', + 'precnapprox;': '\u2ab9', + 'precneqq;': '\u2ab5', + 'precnsim;': '\u22e8', + 'precsim;': '\u227e', + 'Prime;': '\u2033', + 'prime;': '\u2032', + 'primes;': '\u2119', + 'prnap;': '\u2ab9', + 'prnE;': '\u2ab5', + 'prnsim;': '\u22e8', + 'prod;': '\u220f', + 'Product;': '\u220f', + 'profalar;': '\u232e', + 'profline;': '\u2312', + 'profsurf;': '\u2313', + 'prop;': '\u221d', + 'Proportion;': '\u2237', + 'Proportional;': '\u221d', + 'propto;': '\u221d', + 'prsim;': '\u227e', + 'prurel;': '\u22b0', + 'Pscr;': '\U0001d4ab', + 'pscr;': '\U0001d4c5', + 'Psi;': '\u03a8', + 'psi;': '\u03c8', + 'puncsp;': '\u2008', + 'Qfr;': '\U0001d514', + 'qfr;': '\U0001d52e', + 'qint;': '\u2a0c', + 'Qopf;': '\u211a', + 'qopf;': '\U0001d562', + 'qprime;': '\u2057', + 'Qscr;': '\U0001d4ac', + 'qscr;': '\U0001d4c6', + 'quaternions;': '\u210d', + 'quatint;': '\u2a16', + 'quest;': '?', + 'questeq;': '\u225f', + 'QUOT': '"', + 'quot': '"', + 'QUOT;': '"', + 'quot;': '"', + 'rAarr;': '\u21db', + 'race;': '\u223d\u0331', + 'Racute;': '\u0154', + 'racute;': '\u0155', + 'radic;': '\u221a', + 'raemptyv;': '\u29b3', + 'Rang;': '\u27eb', + 'rang;': '\u27e9', + 'rangd;': '\u2992', + 'range;': '\u29a5', + 'rangle;': '\u27e9', + 'raquo': '\xbb', + 'raquo;': '\xbb', + 'Rarr;': '\u21a0', + 'rArr;': '\u21d2', + 'rarr;': '\u2192', + 'rarrap;': '\u2975', + 'rarrb;': '\u21e5', + 'rarrbfs;': '\u2920', + 'rarrc;': '\u2933', + 'rarrfs;': '\u291e', + 'rarrhk;': '\u21aa', + 'rarrlp;': '\u21ac', + 'rarrpl;': '\u2945', + 'rarrsim;': '\u2974', + 'Rarrtl;': '\u2916', + 'rarrtl;': '\u21a3', + 'rarrw;': '\u219d', + 'rAtail;': '\u291c', + 'ratail;': '\u291a', + 'ratio;': '\u2236', + 'rationals;': '\u211a', + 'RBarr;': '\u2910', + 'rBarr;': '\u290f', + 'rbarr;': '\u290d', + 'rbbrk;': '\u2773', + 'rbrace;': '}', + 'rbrack;': ']', + 'rbrke;': '\u298c', + 'rbrksld;': '\u298e', + 'rbrkslu;': '\u2990', + 'Rcaron;': '\u0158', + 'rcaron;': '\u0159', + 'Rcedil;': '\u0156', + 'rcedil;': '\u0157', + 'rceil;': '\u2309', + 'rcub;': '}', + 'Rcy;': '\u0420', + 'rcy;': '\u0440', + 'rdca;': '\u2937', + 'rdldhar;': '\u2969', + 'rdquo;': '\u201d', + 'rdquor;': '\u201d', + 'rdsh;': '\u21b3', + 'Re;': '\u211c', + 'real;': '\u211c', + 'realine;': '\u211b', + 'realpart;': '\u211c', + 'reals;': '\u211d', + 'rect;': '\u25ad', + 'REG': '\xae', + 'reg': '\xae', + 'REG;': '\xae', + 'reg;': '\xae', + 'ReverseElement;': '\u220b', + 'ReverseEquilibrium;': '\u21cb', + 'ReverseUpEquilibrium;': '\u296f', + 'rfisht;': '\u297d', + 'rfloor;': '\u230b', + 'Rfr;': '\u211c', + 'rfr;': '\U0001d52f', + 'rHar;': '\u2964', + 'rhard;': '\u21c1', + 'rharu;': '\u21c0', + 'rharul;': '\u296c', + 'Rho;': '\u03a1', + 'rho;': '\u03c1', + 'rhov;': '\u03f1', + 'RightAngleBracket;': '\u27e9', + 'RightArrow;': '\u2192', + 'Rightarrow;': '\u21d2', + 'rightarrow;': '\u2192', + 'RightArrowBar;': '\u21e5', + 'RightArrowLeftArrow;': '\u21c4', + 'rightarrowtail;': '\u21a3', + 'RightCeiling;': '\u2309', + 'RightDoubleBracket;': '\u27e7', + 'RightDownTeeVector;': '\u295d', + 'RightDownVector;': '\u21c2', + 'RightDownVectorBar;': '\u2955', + 'RightFloor;': '\u230b', + 'rightharpoondown;': '\u21c1', + 'rightharpoonup;': '\u21c0', + 'rightleftarrows;': '\u21c4', + 'rightleftharpoons;': '\u21cc', + 'rightrightarrows;': '\u21c9', + 'rightsquigarrow;': '\u219d', + 'RightTee;': '\u22a2', + 'RightTeeArrow;': '\u21a6', + 'RightTeeVector;': '\u295b', + 'rightthreetimes;': '\u22cc', + 'RightTriangle;': '\u22b3', + 'RightTriangleBar;': '\u29d0', + 'RightTriangleEqual;': '\u22b5', + 'RightUpDownVector;': '\u294f', + 'RightUpTeeVector;': '\u295c', + 'RightUpVector;': '\u21be', + 'RightUpVectorBar;': '\u2954', + 'RightVector;': '\u21c0', + 'RightVectorBar;': '\u2953', + 'ring;': '\u02da', + 'risingdotseq;': '\u2253', + 'rlarr;': '\u21c4', + 'rlhar;': '\u21cc', + 'rlm;': '\u200f', + 'rmoust;': '\u23b1', + 'rmoustache;': '\u23b1', + 'rnmid;': '\u2aee', + 'roang;': '\u27ed', + 'roarr;': '\u21fe', + 'robrk;': '\u27e7', + 'ropar;': '\u2986', + 'Ropf;': '\u211d', + 'ropf;': '\U0001d563', + 'roplus;': '\u2a2e', + 'rotimes;': '\u2a35', + 'RoundImplies;': '\u2970', + 'rpar;': ')', + 'rpargt;': '\u2994', + 'rppolint;': '\u2a12', + 'rrarr;': '\u21c9', + 'Rrightarrow;': '\u21db', + 'rsaquo;': '\u203a', + 'Rscr;': '\u211b', + 'rscr;': '\U0001d4c7', + 'Rsh;': '\u21b1', + 'rsh;': '\u21b1', + 'rsqb;': ']', + 'rsquo;': '\u2019', + 'rsquor;': '\u2019', + 'rthree;': '\u22cc', + 'rtimes;': '\u22ca', + 'rtri;': '\u25b9', + 'rtrie;': '\u22b5', + 'rtrif;': '\u25b8', + 'rtriltri;': '\u29ce', + 'RuleDelayed;': '\u29f4', + 'ruluhar;': '\u2968', + 'rx;': '\u211e', + 'Sacute;': '\u015a', + 'sacute;': '\u015b', + 'sbquo;': '\u201a', + 'Sc;': '\u2abc', + 'sc;': '\u227b', + 'scap;': '\u2ab8', + 'Scaron;': '\u0160', + 'scaron;': '\u0161', + 'sccue;': '\u227d', + 'scE;': '\u2ab4', + 'sce;': '\u2ab0', + 'Scedil;': '\u015e', + 'scedil;': '\u015f', + 'Scirc;': '\u015c', + 'scirc;': '\u015d', + 'scnap;': '\u2aba', + 'scnE;': '\u2ab6', + 'scnsim;': '\u22e9', + 'scpolint;': '\u2a13', + 'scsim;': '\u227f', + 'Scy;': '\u0421', + 'scy;': '\u0441', + 'sdot;': '\u22c5', + 'sdotb;': '\u22a1', + 'sdote;': '\u2a66', + 'searhk;': '\u2925', + 'seArr;': '\u21d8', + 'searr;': '\u2198', + 'searrow;': '\u2198', + 'sect': '\xa7', + 'sect;': '\xa7', + 'semi;': ';', + 'seswar;': '\u2929', + 'setminus;': '\u2216', + 'setmn;': '\u2216', + 'sext;': '\u2736', + 'Sfr;': '\U0001d516', + 'sfr;': '\U0001d530', + 'sfrown;': '\u2322', + 'sharp;': '\u266f', + 'SHCHcy;': '\u0429', + 'shchcy;': '\u0449', + 'SHcy;': '\u0428', + 'shcy;': '\u0448', + 'ShortDownArrow;': '\u2193', + 'ShortLeftArrow;': '\u2190', + 'shortmid;': '\u2223', + 'shortparallel;': '\u2225', + 'ShortRightArrow;': '\u2192', + 'ShortUpArrow;': '\u2191', + 'shy': '\xad', + 'shy;': '\xad', + 'Sigma;': '\u03a3', + 'sigma;': '\u03c3', + 'sigmaf;': '\u03c2', + 'sigmav;': '\u03c2', + 'sim;': '\u223c', + 'simdot;': '\u2a6a', + 'sime;': '\u2243', + 'simeq;': '\u2243', + 'simg;': '\u2a9e', + 'simgE;': '\u2aa0', + 'siml;': '\u2a9d', + 'simlE;': '\u2a9f', + 'simne;': '\u2246', + 'simplus;': '\u2a24', + 'simrarr;': '\u2972', + 'slarr;': '\u2190', + 'SmallCircle;': '\u2218', + 'smallsetminus;': '\u2216', + 'smashp;': '\u2a33', + 'smeparsl;': '\u29e4', + 'smid;': '\u2223', + 'smile;': '\u2323', + 'smt;': '\u2aaa', + 'smte;': '\u2aac', + 'smtes;': '\u2aac\ufe00', + 'SOFTcy;': '\u042c', + 'softcy;': '\u044c', + 'sol;': '/', + 'solb;': '\u29c4', + 'solbar;': '\u233f', + 'Sopf;': '\U0001d54a', + 'sopf;': '\U0001d564', + 'spades;': '\u2660', + 'spadesuit;': '\u2660', + 'spar;': '\u2225', + 'sqcap;': '\u2293', + 'sqcaps;': '\u2293\ufe00', + 'sqcup;': '\u2294', + 'sqcups;': '\u2294\ufe00', + 'Sqrt;': '\u221a', + 'sqsub;': '\u228f', + 'sqsube;': '\u2291', + 'sqsubset;': '\u228f', + 'sqsubseteq;': '\u2291', + 'sqsup;': '\u2290', + 'sqsupe;': '\u2292', + 'sqsupset;': '\u2290', + 'sqsupseteq;': '\u2292', + 'squ;': '\u25a1', + 'Square;': '\u25a1', + 'square;': '\u25a1', + 'SquareIntersection;': '\u2293', + 'SquareSubset;': '\u228f', + 'SquareSubsetEqual;': '\u2291', + 'SquareSuperset;': '\u2290', + 'SquareSupersetEqual;': '\u2292', + 'SquareUnion;': '\u2294', + 'squarf;': '\u25aa', + 'squf;': '\u25aa', + 'srarr;': '\u2192', + 'Sscr;': '\U0001d4ae', + 'sscr;': '\U0001d4c8', + 'ssetmn;': '\u2216', + 'ssmile;': '\u2323', + 'sstarf;': '\u22c6', + 'Star;': '\u22c6', + 'star;': '\u2606', + 'starf;': '\u2605', + 'straightepsilon;': '\u03f5', + 'straightphi;': '\u03d5', + 'strns;': '\xaf', + 'Sub;': '\u22d0', + 'sub;': '\u2282', + 'subdot;': '\u2abd', + 'subE;': '\u2ac5', + 'sube;': '\u2286', + 'subedot;': '\u2ac3', + 'submult;': '\u2ac1', + 'subnE;': '\u2acb', + 'subne;': '\u228a', + 'subplus;': '\u2abf', + 'subrarr;': '\u2979', + 'Subset;': '\u22d0', + 'subset;': '\u2282', + 'subseteq;': '\u2286', + 'subseteqq;': '\u2ac5', + 'SubsetEqual;': '\u2286', + 'subsetneq;': '\u228a', + 'subsetneqq;': '\u2acb', + 'subsim;': '\u2ac7', + 'subsub;': '\u2ad5', + 'subsup;': '\u2ad3', + 'succ;': '\u227b', + 'succapprox;': '\u2ab8', + 'succcurlyeq;': '\u227d', + 'Succeeds;': '\u227b', + 'SucceedsEqual;': '\u2ab0', + 'SucceedsSlantEqual;': '\u227d', + 'SucceedsTilde;': '\u227f', + 'succeq;': '\u2ab0', + 'succnapprox;': '\u2aba', + 'succneqq;': '\u2ab6', + 'succnsim;': '\u22e9', + 'succsim;': '\u227f', + 'SuchThat;': '\u220b', + 'Sum;': '\u2211', + 'sum;': '\u2211', + 'sung;': '\u266a', + 'sup1': '\xb9', + 'sup1;': '\xb9', + 'sup2': '\xb2', + 'sup2;': '\xb2', + 'sup3': '\xb3', + 'sup3;': '\xb3', + 'Sup;': '\u22d1', + 'sup;': '\u2283', + 'supdot;': '\u2abe', + 'supdsub;': '\u2ad8', + 'supE;': '\u2ac6', + 'supe;': '\u2287', + 'supedot;': '\u2ac4', + 'Superset;': '\u2283', + 'SupersetEqual;': '\u2287', + 'suphsol;': '\u27c9', + 'suphsub;': '\u2ad7', + 'suplarr;': '\u297b', + 'supmult;': '\u2ac2', + 'supnE;': '\u2acc', + 'supne;': '\u228b', + 'supplus;': '\u2ac0', + 'Supset;': '\u22d1', + 'supset;': '\u2283', + 'supseteq;': '\u2287', + 'supseteqq;': '\u2ac6', + 'supsetneq;': '\u228b', + 'supsetneqq;': '\u2acc', + 'supsim;': '\u2ac8', + 'supsub;': '\u2ad4', + 'supsup;': '\u2ad6', + 'swarhk;': '\u2926', + 'swArr;': '\u21d9', + 'swarr;': '\u2199', + 'swarrow;': '\u2199', + 'swnwar;': '\u292a', + 'szlig': '\xdf', + 'szlig;': '\xdf', + 'Tab;': '\t', + 'target;': '\u2316', + 'Tau;': '\u03a4', + 'tau;': '\u03c4', + 'tbrk;': '\u23b4', + 'Tcaron;': '\u0164', + 'tcaron;': '\u0165', + 'Tcedil;': '\u0162', + 'tcedil;': '\u0163', + 'Tcy;': '\u0422', + 'tcy;': '\u0442', + 'tdot;': '\u20db', + 'telrec;': '\u2315', + 'Tfr;': '\U0001d517', + 'tfr;': '\U0001d531', + 'there4;': '\u2234', + 'Therefore;': '\u2234', + 'therefore;': '\u2234', + 'Theta;': '\u0398', + 'theta;': '\u03b8', + 'thetasym;': '\u03d1', + 'thetav;': '\u03d1', + 'thickapprox;': '\u2248', + 'thicksim;': '\u223c', + 'ThickSpace;': '\u205f\u200a', + 'thinsp;': '\u2009', + 'ThinSpace;': '\u2009', + 'thkap;': '\u2248', + 'thksim;': '\u223c', + 'THORN': '\xde', + 'thorn': '\xfe', + 'THORN;': '\xde', + 'thorn;': '\xfe', + 'Tilde;': '\u223c', + 'tilde;': '\u02dc', + 'TildeEqual;': '\u2243', + 'TildeFullEqual;': '\u2245', + 'TildeTilde;': '\u2248', + 'times': '\xd7', + 'times;': '\xd7', + 'timesb;': '\u22a0', + 'timesbar;': '\u2a31', + 'timesd;': '\u2a30', + 'tint;': '\u222d', + 'toea;': '\u2928', + 'top;': '\u22a4', + 'topbot;': '\u2336', + 'topcir;': '\u2af1', + 'Topf;': '\U0001d54b', + 'topf;': '\U0001d565', + 'topfork;': '\u2ada', + 'tosa;': '\u2929', + 'tprime;': '\u2034', + 'TRADE;': '\u2122', + 'trade;': '\u2122', + 'triangle;': '\u25b5', + 'triangledown;': '\u25bf', + 'triangleleft;': '\u25c3', + 'trianglelefteq;': '\u22b4', + 'triangleq;': '\u225c', + 'triangleright;': '\u25b9', + 'trianglerighteq;': '\u22b5', + 'tridot;': '\u25ec', + 'trie;': '\u225c', + 'triminus;': '\u2a3a', + 'TripleDot;': '\u20db', + 'triplus;': '\u2a39', + 'trisb;': '\u29cd', + 'tritime;': '\u2a3b', + 'trpezium;': '\u23e2', + 'Tscr;': '\U0001d4af', + 'tscr;': '\U0001d4c9', + 'TScy;': '\u0426', + 'tscy;': '\u0446', + 'TSHcy;': '\u040b', + 'tshcy;': '\u045b', + 'Tstrok;': '\u0166', + 'tstrok;': '\u0167', + 'twixt;': '\u226c', + 'twoheadleftarrow;': '\u219e', + 'twoheadrightarrow;': '\u21a0', + 'Uacute': '\xda', + 'uacute': '\xfa', + 'Uacute;': '\xda', + 'uacute;': '\xfa', + 'Uarr;': '\u219f', + 'uArr;': '\u21d1', + 'uarr;': '\u2191', + 'Uarrocir;': '\u2949', + 'Ubrcy;': '\u040e', + 'ubrcy;': '\u045e', + 'Ubreve;': '\u016c', + 'ubreve;': '\u016d', + 'Ucirc': '\xdb', + 'ucirc': '\xfb', + 'Ucirc;': '\xdb', + 'ucirc;': '\xfb', + 'Ucy;': '\u0423', + 'ucy;': '\u0443', + 'udarr;': '\u21c5', + 'Udblac;': '\u0170', + 'udblac;': '\u0171', + 'udhar;': '\u296e', + 'ufisht;': '\u297e', + 'Ufr;': '\U0001d518', + 'ufr;': '\U0001d532', + 'Ugrave': '\xd9', + 'ugrave': '\xf9', + 'Ugrave;': '\xd9', + 'ugrave;': '\xf9', + 'uHar;': '\u2963', + 'uharl;': '\u21bf', + 'uharr;': '\u21be', + 'uhblk;': '\u2580', + 'ulcorn;': '\u231c', + 'ulcorner;': '\u231c', + 'ulcrop;': '\u230f', + 'ultri;': '\u25f8', + 'Umacr;': '\u016a', + 'umacr;': '\u016b', + 'uml': '\xa8', + 'uml;': '\xa8', + 'UnderBar;': '_', + 'UnderBrace;': '\u23df', + 'UnderBracket;': '\u23b5', + 'UnderParenthesis;': '\u23dd', + 'Union;': '\u22c3', + 'UnionPlus;': '\u228e', + 'Uogon;': '\u0172', + 'uogon;': '\u0173', + 'Uopf;': '\U0001d54c', + 'uopf;': '\U0001d566', + 'UpArrow;': '\u2191', + 'Uparrow;': '\u21d1', + 'uparrow;': '\u2191', + 'UpArrowBar;': '\u2912', + 'UpArrowDownArrow;': '\u21c5', + 'UpDownArrow;': '\u2195', + 'Updownarrow;': '\u21d5', + 'updownarrow;': '\u2195', + 'UpEquilibrium;': '\u296e', + 'upharpoonleft;': '\u21bf', + 'upharpoonright;': '\u21be', + 'uplus;': '\u228e', + 'UpperLeftArrow;': '\u2196', + 'UpperRightArrow;': '\u2197', + 'Upsi;': '\u03d2', + 'upsi;': '\u03c5', + 'upsih;': '\u03d2', + 'Upsilon;': '\u03a5', + 'upsilon;': '\u03c5', + 'UpTee;': '\u22a5', + 'UpTeeArrow;': '\u21a5', + 'upuparrows;': '\u21c8', + 'urcorn;': '\u231d', + 'urcorner;': '\u231d', + 'urcrop;': '\u230e', + 'Uring;': '\u016e', + 'uring;': '\u016f', + 'urtri;': '\u25f9', + 'Uscr;': '\U0001d4b0', + 'uscr;': '\U0001d4ca', + 'utdot;': '\u22f0', + 'Utilde;': '\u0168', + 'utilde;': '\u0169', + 'utri;': '\u25b5', + 'utrif;': '\u25b4', + 'uuarr;': '\u21c8', + 'Uuml': '\xdc', + 'uuml': '\xfc', + 'Uuml;': '\xdc', + 'uuml;': '\xfc', + 'uwangle;': '\u29a7', + 'vangrt;': '\u299c', + 'varepsilon;': '\u03f5', + 'varkappa;': '\u03f0', + 'varnothing;': '\u2205', + 'varphi;': '\u03d5', + 'varpi;': '\u03d6', + 'varpropto;': '\u221d', + 'vArr;': '\u21d5', + 'varr;': '\u2195', + 'varrho;': '\u03f1', + 'varsigma;': '\u03c2', + 'varsubsetneq;': '\u228a\ufe00', + 'varsubsetneqq;': '\u2acb\ufe00', + 'varsupsetneq;': '\u228b\ufe00', + 'varsupsetneqq;': '\u2acc\ufe00', + 'vartheta;': '\u03d1', + 'vartriangleleft;': '\u22b2', + 'vartriangleright;': '\u22b3', + 'Vbar;': '\u2aeb', + 'vBar;': '\u2ae8', + 'vBarv;': '\u2ae9', + 'Vcy;': '\u0412', + 'vcy;': '\u0432', + 'VDash;': '\u22ab', + 'Vdash;': '\u22a9', + 'vDash;': '\u22a8', + 'vdash;': '\u22a2', + 'Vdashl;': '\u2ae6', + 'Vee;': '\u22c1', + 'vee;': '\u2228', + 'veebar;': '\u22bb', + 'veeeq;': '\u225a', + 'vellip;': '\u22ee', + 'Verbar;': '\u2016', + 'verbar;': '|', + 'Vert;': '\u2016', + 'vert;': '|', + 'VerticalBar;': '\u2223', + 'VerticalLine;': '|', + 'VerticalSeparator;': '\u2758', + 'VerticalTilde;': '\u2240', + 'VeryThinSpace;': '\u200a', + 'Vfr;': '\U0001d519', + 'vfr;': '\U0001d533', + 'vltri;': '\u22b2', + 'vnsub;': '\u2282\u20d2', + 'vnsup;': '\u2283\u20d2', + 'Vopf;': '\U0001d54d', + 'vopf;': '\U0001d567', + 'vprop;': '\u221d', + 'vrtri;': '\u22b3', + 'Vscr;': '\U0001d4b1', + 'vscr;': '\U0001d4cb', + 'vsubnE;': '\u2acb\ufe00', + 'vsubne;': '\u228a\ufe00', + 'vsupnE;': '\u2acc\ufe00', + 'vsupne;': '\u228b\ufe00', + 'Vvdash;': '\u22aa', + 'vzigzag;': '\u299a', + 'Wcirc;': '\u0174', + 'wcirc;': '\u0175', + 'wedbar;': '\u2a5f', + 'Wedge;': '\u22c0', + 'wedge;': '\u2227', + 'wedgeq;': '\u2259', + 'weierp;': '\u2118', + 'Wfr;': '\U0001d51a', + 'wfr;': '\U0001d534', + 'Wopf;': '\U0001d54e', + 'wopf;': '\U0001d568', + 'wp;': '\u2118', + 'wr;': '\u2240', + 'wreath;': '\u2240', + 'Wscr;': '\U0001d4b2', + 'wscr;': '\U0001d4cc', + 'xcap;': '\u22c2', + 'xcirc;': '\u25ef', + 'xcup;': '\u22c3', + 'xdtri;': '\u25bd', + 'Xfr;': '\U0001d51b', + 'xfr;': '\U0001d535', + 'xhArr;': '\u27fa', + 'xharr;': '\u27f7', + 'Xi;': '\u039e', + 'xi;': '\u03be', + 'xlArr;': '\u27f8', + 'xlarr;': '\u27f5', + 'xmap;': '\u27fc', + 'xnis;': '\u22fb', + 'xodot;': '\u2a00', + 'Xopf;': '\U0001d54f', + 'xopf;': '\U0001d569', + 'xoplus;': '\u2a01', + 'xotime;': '\u2a02', + 'xrArr;': '\u27f9', + 'xrarr;': '\u27f6', + 'Xscr;': '\U0001d4b3', + 'xscr;': '\U0001d4cd', + 'xsqcup;': '\u2a06', + 'xuplus;': '\u2a04', + 'xutri;': '\u25b3', + 'xvee;': '\u22c1', + 'xwedge;': '\u22c0', + 'Yacute': '\xdd', + 'yacute': '\xfd', + 'Yacute;': '\xdd', + 'yacute;': '\xfd', + 'YAcy;': '\u042f', + 'yacy;': '\u044f', + 'Ycirc;': '\u0176', + 'ycirc;': '\u0177', + 'Ycy;': '\u042b', + 'ycy;': '\u044b', + 'yen': '\xa5', + 'yen;': '\xa5', + 'Yfr;': '\U0001d51c', + 'yfr;': '\U0001d536', + 'YIcy;': '\u0407', + 'yicy;': '\u0457', + 'Yopf;': '\U0001d550', + 'yopf;': '\U0001d56a', + 'Yscr;': '\U0001d4b4', + 'yscr;': '\U0001d4ce', + 'YUcy;': '\u042e', + 'yucy;': '\u044e', + 'yuml': '\xff', + 'Yuml;': '\u0178', + 'yuml;': '\xff', + 'Zacute;': '\u0179', + 'zacute;': '\u017a', + 'Zcaron;': '\u017d', + 'zcaron;': '\u017e', + 'Zcy;': '\u0417', + 'zcy;': '\u0437', + 'Zdot;': '\u017b', + 'zdot;': '\u017c', + 'zeetrf;': '\u2128', + 'ZeroWidthSpace;': '\u200b', + 'Zeta;': '\u0396', + 'zeta;': '\u03b6', + 'Zfr;': '\u2128', + 'zfr;': '\U0001d537', + 'ZHcy;': '\u0416', + 'zhcy;': '\u0436', + 'zigrarr;': '\u21dd', + 'Zopf;': '\u2124', + 'zopf;': '\U0001d56b', + 'Zscr;': '\U0001d4b5', + 'zscr;': '\U0001d4cf', + 'zwj;': '\u200d', + 'zwnj;': '\u200c', + } + +try: + import http.client as compat_http_client +except ImportError: # Python 2 + import httplib as compat_http_client + +try: + from urllib.error import HTTPError as compat_HTTPError +except ImportError: # Python 2 + from urllib2 import HTTPError as compat_HTTPError + +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + +try: # Python 2 + from HTMLParser import HTMLParseError as compat_HTMLParseError +except ImportError: # Python <3.4 + try: + from html.parser import HTMLParseError as compat_HTMLParseError + except ImportError: # Python >3.4 + + # HTMLParseError has been deprecated in Python 3.3 and removed in + # Python 3.5. Introducing dummy exception for Python >3.5 for compatible + # and uniform cross-version exceptiong handling + class compat_HTMLParseError(Exception): + pass + +try: + from subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL +except ImportError: + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + +try: + import http.server as compat_http_server +except ImportError: + import BaseHTTPServer as compat_http_server + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes + from urllib.parse import unquote as compat_urllib_parse_unquote + from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus +except ImportError: # Python 2 + _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') + else re.compile(r'([\x00-\x7f]+)')) + + # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus + # implementations from cpython 3.4.3's stdlib. Python 2's version + # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244) + + def compat_urllib_parse_unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? + string.split + return b'' + if isinstance(string, compat_str): + string = string.encode('utf-8') + bits = string.split(b'%') + if len(bits) == 1: + return string + res = [bits[0]] + append = res.append + for item in bits[1:]: + try: + append(compat_urllib_parse._hextochr[item[:2]]) + append(item[2:]) + except KeyError: + append(b'%') + append(item) + return b''.join(res) + + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + if '%' not in string: + string.split + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + bits = _asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) + return ''.join(res) + + def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return compat_urllib_parse_unquote(string, encoding, errors) + +try: + from urllib.parse import urlencode as compat_urllib_parse_urlencode +except ImportError: # Python 2 + # Python 2 will choke in urlencode on mixture of byte and unicode strings. + # Possible solutions are to either port it from python 3 with all + # the friends or manually ensure input query contains only byte strings. + # We will stick with latter thus recursively encoding the whole query. + def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): + def encode_elem(e): + if isinstance(e, dict): + e = encode_dict(e) + elif isinstance(e, (list, tuple,)): + list_e = encode_list(e) + e = tuple(list_e) if isinstance(e, tuple) else list_e + elif isinstance(e, compat_str): + e = e.encode(encoding) + return e + + def encode_dict(d): + return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) + + def encode_list(l): + return [encode_elem(e) for e in l] + + return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + +try: + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(':', 1) + mediatype, data = data.split(',', 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(';base64'): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = 'text/plain;charset=US-ASCII' + + headers = email.message_from_string( + 'Content-type: %s\nContent-length: %d\n' % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + +try: + compat_basestring = basestring # Python 2 +except NameError: + compat_basestring = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + +try: + from xml.etree.ElementTree import ParseError as compat_xml_parse_error +except ImportError: # Python 2.6 + from xml.parsers.expat import ExpatError as compat_xml_parse_error + + +etree = xml.etree.ElementTree + + +class _TreeBuilder(etree.TreeBuilder): + def doctype(self, name, pubid, system): + pass + + +try: + # xml.etree.ElementTree.Element is a method in Python <=2.6 and + # the following will crash with: + # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types + isinstance(None, xml.etree.ElementTree.Element) + from xml.etree.ElementTree import Element as compat_etree_Element +except TypeError: # Python <=2.6 + from xml.etree.ElementTree import _ElementInterface as compat_etree_Element + +if sys.version_info[0] >= 3: + def compat_etree_fromstring(text): + return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) +else: + # python 2.x tries to encode unicode strings with ascii (see the + # XMLParser._fixtext method) + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=_TreeBuilder()) + parser.feed(text) + return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc + +if hasattr(etree, 'register_namespace'): + compat_etree_register_namespace = etree.register_namespace +else: + def compat_etree_register_namespace(prefix, uri): + """Register a namespace prefix. + The registry is global, and any existing mapping for either the + given prefix or the namespace URI will be removed. + *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and + attributes in this namespace will be serialized with prefix if possible. + ValueError is raised if prefix is reserved or is invalid. + """ + if re.match(r"ns\d+$", prefix): + raise ValueError("Prefix format reserved for internal use") + for k, v in list(etree._namespace_map.items()): + if k == uri or v == prefix: + del etree._namespace_map[k] + etree._namespace_map[uri] = prefix + +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, compat_str + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError('bad query field: %r' % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': + def compat_shlex_quote(s): + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + try: + from shlex import quote as compat_shlex_quote + except ImportError: # Python < 3.3 + def compat_shlex_quote(s): + if re.match(r'^[-_\w./]+$', s): + return s + else: + return "'" + s.replace("'", "'\"'\"'") + "'" + + +try: + args = shlex.split('中文') + assert (isinstance(args, list) + and isinstance(args[0], compat_str) + and args[0] == '中文') + compat_shlex_split = shlex.split +except (AssertionError, UnicodeEncodeError): + # Working around shlex issue with unicode strings on some python 2 + # versions (see http://bugs.python.org/issue1548891) + def compat_shlex_split(s, comments=False, posix=True): + if isinstance(s, compat_str): + s = s.encode('utf-8') + return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) + + +def compat_ord(c): + if type(c) is int: + return c + else: + return ord(c) + + +if sys.version_info >= (3, 0): + compat_getenv = os.getenv + compat_expanduser = os.path.expanduser + + def compat_setenv(key, value, env=os.environ): + env[key] = value +else: + # Environment variables should be decoded with filesystem encoding. + # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) + + def compat_getenv(key, default=None): + from .utils import get_filesystem_encoding + env = os.getenv(key, default) + if env: + env = env.decode(get_filesystem_encoding()) + return env + + def compat_setenv(key, value, env=os.environ): + def encode(v): + from .utils import get_filesystem_encoding + return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v + env[encode(key)] = encode(value) + + # HACK: The default implementations of os.path.expanduser from cpython do not decode + # environment variables with filesystem encoding. We will work around this by + # providing adjusted implementations. + # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib + # for different platforms with correct environment variables decoding. + + if compat_os_name == 'posix': + def compat_expanduser(path): + """Expand ~ and ~user constructions. If user or $HOME is unknown, + do nothing.""" + if not path.startswith('~'): + return path + i = path.find('/', 1) + if i < 0: + i = len(path) + if i == 1: + if 'HOME' not in os.environ: + import pwd + userhome = pwd.getpwuid(os.getuid()).pw_dir + else: + userhome = compat_getenv('HOME') + else: + import pwd + try: + pwent = pwd.getpwnam(path[1:i]) + except KeyError: + return path + userhome = pwent.pw_dir + userhome = userhome.rstrip('/') + return (userhome + path[i:]) or '/' + elif compat_os_name in ('nt', 'ce'): + def compat_expanduser(path): + """Expand ~ and ~user constructs. + + If user or $HOME is unknown, do nothing.""" + if path[:1] != '~': + return path + i, n = 1, len(path) + while i < n and path[i] not in '/\\': + i = i + 1 + + if 'HOME' in os.environ: + userhome = compat_getenv('HOME') + elif 'USERPROFILE' in os.environ: + userhome = compat_getenv('USERPROFILE') + elif 'HOMEPATH' not in os.environ: + return path + else: + try: + drive = compat_getenv('HOMEDRIVE') + except KeyError: + drive = '' + userhome = os.path.join(drive, compat_getenv('HOMEPATH')) + + if i != 1: # ~user + userhome = os.path.join(os.path.dirname(userhome), path[1:i]) + + return userhome + path[i:] + else: + compat_expanduser = os.path.expanduser + + +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return path +else: + compat_realpath = os.path.realpath + + +if sys.version_info < (3, 0): + def compat_print(s): + from .utils import preferredencoding + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) +else: + def compat_print(s): + assert isinstance(s, compat_str) + print(s) + + +if sys.version_info < (3, 0) and sys.platform == 'win32': + def compat_getpass(prompt, *args, **kwargs): + if isinstance(prompt, compat_str): + from .utils import preferredencoding + prompt = prompt.encode(preferredencoding()) + return getpass.getpass(prompt, *args, **kwargs) +else: + compat_getpass = getpass.getpass + +try: + compat_input = raw_input +except NameError: # Python 3 + compat_input = input + +# Python < 2.6.5 require kwargs to be bytes +try: + def _testfunc(x): + pass + _testfunc(**{'x': 0}) +except TypeError: + def compat_kwargs(kwargs): + return dict((bytes(k), v) for k, v in kwargs.items()) +else: + compat_kwargs = lambda kwargs: kwargs + + +try: + compat_numeric_types = (int, float, long, complex) +except NameError: # Python 3 + compat_numeric_types = (int, float, complex) + + +try: + compat_integer_types = (int, long) +except NameError: # Python 3 + compat_integer_types = (int, ) + + +if sys.version_info < (2, 7): + def compat_socket_create_connection(address, timeout, source_address=None): + host, port = address + err = None + for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM): + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(sa) + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('getaddrinfo returns an empty list') +else: + compat_socket_create_connection = socket.create_connection + + +# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 +# See http://bugs.python.org/issue9161 for what is broken +def workaround_optparse_bug9161(): + op = optparse.OptionParser() + og = optparse.OptionGroup(op, 'foo') + try: + og.add_option('-t') + except TypeError: + real_add_option = optparse.OptionGroup.add_option + + def _compat_add_option(self, *args, **kwargs): + enc = lambda v: ( + v.encode('ascii', 'replace') if isinstance(v, compat_str) + else v) + bargs = [enc(a) for a in args] + bkwargs = dict( + (k, enc(v)) for k, v in kwargs.items()) + return real_add_option(self, *bargs, **bkwargs) + optparse.OptionGroup.add_option = _compat_add_option + + +if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 + compat_get_terminal_size = shutil.get_terminal_size +else: + _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) + + def compat_get_terminal_size(fallback=(80, 24)): + columns = compat_getenv('COLUMNS') + if columns: + columns = int(columns) + else: + columns = None + lines = compat_getenv('LINES') + if lines: + lines = int(lines) + else: + lines = None + + if columns is None or lines is None or columns <= 0 or lines <= 0: + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + _lines, _columns = map(int, out.split()) + except Exception: + _columns, _lines = _terminal_size(*fallback) + + if columns is None or columns <= 0: + columns = _columns + if lines is None or lines <= 0: + lines = _lines + return _terminal_size(columns, lines) + +try: + itertools.count(start=0, step=1) + compat_itertools_count = itertools.count +except TypeError: # Python 2.6 + def compat_itertools_count(start=0, step=1): + n = start + while True: + yield n + n += step + +if sys.version_info >= (3, 0): + from tokenize import tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize + + +try: + struct.pack('!I', 0) +except TypeError: + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 + def compat_struct_pack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.pack(spec, *args) + + def compat_struct_unpack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.unpack(spec, *args) + + class compat_Struct(struct.Struct): + def __init__(self, fmt): + if isinstance(fmt, compat_str): + fmt = fmt.encode('ascii') + super(compat_Struct, self).__init__(fmt) +else: + compat_struct_pack = struct.pack + compat_struct_unpack = struct.unpack + if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): + class compat_Struct(struct.Struct): + def unpack(self, string): + if not isinstance(string, buffer): # noqa: F821 + string = buffer(string) # noqa: F821 + return super(compat_Struct, self).unpack(string) + else: + compat_Struct = struct.Struct + + +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip + + +if sys.version_info < (3, 3): + def compat_b64decode(s, *args, **kwargs): + if isinstance(s, compat_str): + s = s.encode('ascii') + return base64.b64decode(s, *args, **kwargs) +else: + compat_b64decode = base64.b64decode + + +if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): + # PyPy2 prior to version 5.4.0 expects byte strings as Windows function + # names, see the original PyPy issue [1] and the youtube-dlc one [2]. + # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name + # 2. https://github.com/ytdl-org/youtube-dl/pull/4392 + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + real = ctypes.WINFUNCTYPE(*args, **kwargs) + + def resf(tpl, *args, **kwargs): + funcname, dll = tpl + return real((str(funcname), dll), *args, **kwargs) + + return resf +else: + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + +__all__ = [ + 'compat_HTMLParseError', + 'compat_HTMLParser', + 'compat_HTTPError', + 'compat_Struct', + 'compat_b64decode', + 'compat_basestring', + 'compat_chr', + 'compat_cookiejar', + 'compat_cookiejar_Cookie', + 'compat_cookies', + 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_Element', + 'compat_etree_fromstring', + 'compat_etree_register_namespace', + 'compat_expanduser', + 'compat_get_terminal_size', + 'compat_getenv', + 'compat_getpass', + 'compat_html_entities', + 'compat_html_entities_html5', + 'compat_http_client', + 'compat_http_server', + 'compat_input', + 'compat_integer_types', + 'compat_itertools_count', + 'compat_kwargs', + 'compat_numeric_types', + 'compat_ord', + 'compat_os_name', + 'compat_parse_qs', + 'compat_print', + 'compat_realpath', + 'compat_setenv', + 'compat_shlex_quote', + 'compat_shlex_split', + 'compat_socket_create_connection', + 'compat_str', + 'compat_struct_pack', + 'compat_struct_unpack', + 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', + 'compat_urllib_error', + 'compat_urllib_parse', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', + 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', + 'compat_xpath', + 'compat_zip', + 'workaround_optparse_bug9161', +] diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py new file mode 100644 index 0000000000..4ae81f516e --- /dev/null +++ b/youtube_dl/downloader/__init__.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +from .common import FileDownloader +from .f4m import F4mFD +from .hls import HlsFD +from .http import HttpFD +from .rtmp import RtmpFD +from .dash import DashSegmentsFD +from .rtsp import RtspFD +from .ism import IsmFD +from .youtube_live_chat import YoutubeLiveChatReplayFD +from .external import ( + get_external_downloader, + FFmpegFD, +) + +from ..utils import ( + determine_protocol, +) + +PROTOCOL_MAP = { + 'rtmp': RtmpFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, + 'mms': RtspFD, + 'rtsp': RtspFD, + 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, + 'ism': IsmFD, + 'youtube_live_chat_replay': YoutubeLiveChatReplayFD, +} + + +def get_suitable_downloader(info_dict, params={}): + """Get the downloader class that can handle the info dict.""" + protocol = determine_protocol(info_dict) + info_dict['protocol'] = protocol + + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): + # return FFmpegFD + + external_downloader = params.get('external_downloader') + if external_downloader is not None: + ed = get_external_downloader(external_downloader) + if ed.can_download(info_dict): + return ed + + if protocol.startswith('m3u8') and info_dict.get('is_live'): + return FFmpegFD + + if protocol == 'm3u8' and params.get('hls_prefer_native') is True: + return HlsFD + + if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False: + return FFmpegFD + + return PROTOCOL_MAP.get(protocol, HttpFD) + + +__all__ = [ + 'get_suitable_downloader', + 'FileDownloader', +] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py new file mode 100644 index 0000000000..31c2864584 --- /dev/null +++ b/youtube_dl/downloader/common.py @@ -0,0 +1,391 @@ +from __future__ import division, unicode_literals + +import os +import re +import sys +import time +import random + +from ..compat import compat_os_name +from ..utils import ( + decodeArgument, + encodeFilename, + error_to_compat_str, + format_bytes, + shell_quote, + timeconvert, +) + + +class FileDownloader(object): + """File Downloader class. + + File downloader objects are the ones responsible of downloading the + actual video file and writing it to disk. + + File downloaders accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. + + Available options: + + verbose: Print additional info to stdout. + quiet: Do not print messages to stdout. + ratelimit: Download speed limit, in bytes/sec. + retries: Number of times to retry for HTTP error 5xx + buffersize: Size of download buffer in bytes. + noresizebuffer: Do not automatically resize the download buffer. + continuedl: Try to continue downloads if possible. + noprogress: Do not print the progress bar. + logtostderr: Log messages to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. + nopart: Do not use temporary .part files. + updatetime: Use the Last-modified header to set output file timestamps. + test: Download only first bytes to test the downloader. + min_filesize: Skip files smaller than this size + max_filesize: Skip files larger than this size + xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. + external_downloader_args: A list of additional command-line arguments for the + external downloader. + hls_use_mpegts: Use the mpegts container for HLS videos. + http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be + useful for bypassing bandwidth throttling imposed by + a webserver (experimental) + + Subclasses of this one must re-define the real_download method. + """ + + _TEST_FILE_SIZE = 10241 + params = None + + def __init__(self, ydl, params): + """Create a FileDownloader object with the given options.""" + self.ydl = ydl + self._progress_hooks = [] + self.params = params + self.add_progress_hook(self.report_progress) + + @staticmethod + def format_seconds(seconds): + (mins, secs) = divmod(seconds, 60) + (hours, mins) = divmod(mins, 60) + if hours > 99: + return '--:--:--' + if hours == 0: + return '%02d:%02d' % (mins, secs) + else: + return '%02d:%02d:%02d' % (hours, mins, secs) + + @staticmethod + def calc_percent(byte_counter, data_len): + if data_len is None: + return None + return float(byte_counter) / float(data_len) * 100.0 + + @staticmethod + def format_percent(percent): + if percent is None: + return '---.-%' + return '%6s' % ('%3.1f%%' % percent) + + @staticmethod + def calc_eta(start, now, total, current): + if total is None: + return None + if now is None: + now = time.time() + dif = now - start + if current == 0 or dif < 0.001: # One millisecond + return None + rate = float(current) / dif + return int((float(total) - float(current)) / rate) + + @staticmethod + def format_eta(eta): + if eta is None: + return '--:--' + return FileDownloader.format_seconds(eta) + + @staticmethod + def calc_speed(start, now, bytes): + dif = now - start + if bytes == 0 or dif < 0.001: # One millisecond + return None + return float(bytes) / dif + + @staticmethod + def format_speed(speed): + if speed is None: + return '%10s' % '---b/s' + return '%10s' % ('%s/s' % format_bytes(speed)) + + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + + @staticmethod + def best_block_size(elapsed_time, bytes): + new_min = max(bytes / 2.0, 1.0) + new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB + if elapsed_time < 0.001: + return int(new_max) + rate = bytes / elapsed_time + if rate > new_max: + return int(new_max) + if rate < new_min: + return int(new_min) + return int(rate) + + @staticmethod + def parse_bytes(bytestr): + """Parse a string indicating a byte quantity into an integer.""" + matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) + if matchobj is None: + return None + number = float(matchobj.group(1)) + multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) + return int(round(number * multiplier)) + + def to_screen(self, *args, **kargs): + self.ydl.to_screen(*args, **kargs) + + def to_stderr(self, message): + self.ydl.to_screen(message) + + def to_console_title(self, message): + self.ydl.to_console_title(message) + + def trouble(self, *args, **kargs): + self.ydl.trouble(*args, **kargs) + + def report_warning(self, *args, **kargs): + self.ydl.report_warning(*args, **kargs) + + def report_error(self, *args, **kargs): + self.ydl.report_error(*args, **kargs) + + def slow_down(self, start_time, now, byte_counter): + """Sleep if the download speed is over the rate limit.""" + rate_limit = self.params.get('ratelimit') + if rate_limit is None or byte_counter == 0: + return + if now is None: + now = time.time() + elapsed = now - start_time + if elapsed <= 0.0: + return + speed = float(byte_counter) / elapsed + if speed > rate_limit: + sleep_time = float(byte_counter) / rate_limit - elapsed + if sleep_time > 0: + time.sleep(sleep_time) + + def temp_name(self, filename): + """Returns a temporary filename for the given filename.""" + if self.params.get('nopart', False) or filename == '-' or \ + (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): + return filename + return filename + '.part' + + def undo_temp_name(self, filename): + if filename.endswith('.part'): + return filename[:-len('.part')] + return filename + + def ytdl_filename(self, filename): + return filename + '.ytdl' + + def try_rename(self, old_filename, new_filename): + try: + if old_filename == new_filename: + return + os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) + except (IOError, OSError) as err: + self.report_error('unable to rename file: %s' % error_to_compat_str(err)) + + def try_utime(self, filename, last_modified_hdr): + """Try to set the last-modified time of the given file.""" + if last_modified_hdr is None: + return + if not os.path.isfile(encodeFilename(filename)): + return + timestr = last_modified_hdr + if timestr is None: + return + filetime = timeconvert(timestr) + if filetime is None: + return filetime + # Ignore obviously invalid dates + if filetime == 0: + return + try: + os.utime(filename, (time.time(), filetime)) + except Exception: + pass + return filetime + + def report_destination(self, filename): + """Report destination filename.""" + self.to_screen('[download] Destination: ' + filename) + + def _report_progress_status(self, msg, is_last_line=False): + fullmsg = '[download] ' + msg + if self.params.get('progress_with_newline', False): + self.to_screen(fullmsg) + else: + if compat_os_name == 'nt': + prev_len = getattr(self, '_report_progress_prev_line_length', + 0) + if prev_len > len(fullmsg): + fullmsg += ' ' * (prev_len - len(fullmsg)) + self._report_progress_prev_line_length = len(fullmsg) + clear_line = '\r' + else: + clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r') + self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) + self.to_console_title('youtube-dlc ' + msg) + + def report_progress(self, s): + if s['status'] == 'finished': + if self.params.get('noprogress', False): + self.to_screen('[download] Download completed') + else: + msg_template = '100%%' + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template += ' of %(_total_bytes_str)s' + if s.get('elapsed') is not None: + s['_elapsed_str'] = self.format_seconds(s['elapsed']) + msg_template += ' in %(_elapsed_str)s' + self._report_progress_status( + msg_template % s, is_last_line=True) + + if self.params.get('noprogress'): + return + + if s['status'] != 'downloading': + return + + if s.get('eta') is not None: + s['_eta_str'] = self.format_eta(s['eta']) + else: + s['_eta_str'] = 'Unknown ETA' + + if s.get('total_bytes') and s.get('downloaded_bytes') is not None: + s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) + elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: + s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) + else: + if s.get('downloaded_bytes') == 0: + s['_percent_str'] = self.format_percent(0) + else: + s['_percent_str'] = 'Unknown %' + + if s.get('speed') is not None: + s['_speed_str'] = self.format_speed(s['speed']) + else: + s['_speed_str'] = 'Unknown speed' + + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' + elif s.get('total_bytes_estimate') is not None: + s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) + msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' + else: + if s.get('downloaded_bytes') is not None: + s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) + if s.get('elapsed'): + s['_elapsed_str'] = self.format_seconds(s['elapsed']) + msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' + else: + msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' + else: + msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' + + self._report_progress_status(msg_template % s) + + def report_resuming_byte(self, resume_len): + """Report attempt to resume at given byte.""" + self.to_screen('[download] Resuming download at byte %s' % resume_len) + + def report_retry(self, err, count, retries): + """Report retry in case of HTTP error 5xx""" + self.to_screen( + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen('[download] %s has already been downloaded' % file_name) + except UnicodeEncodeError: + self.to_screen('[download] The file has already been downloaded') + + def report_unable_to_resume(self): + """Report it was impossible to resume download.""" + self.to_screen('[download] Unable to resume') + + def download(self, filename, info_dict): + """Download to a filename using the info from info_dict + Return True on success and False otherwise + """ + + nooverwrites_and_exists = ( + self.params.get('nooverwrites', False) + and os.path.exists(encodeFilename(filename)) + ) + + if not hasattr(filename, 'write'): + continuedl_and_exists = ( + self.params.get('continuedl', True) + and os.path.isfile(encodeFilename(filename)) + and not self.params.get('nopart', False) + ) + + # Check file already present + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): + self.report_file_already_downloaded(filename) + self._hook_progress({ + 'filename': filename, + 'status': 'finished', + 'total_bytes': os.path.getsize(encodeFilename(filename)), + }) + return True + + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds...' % ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) + time.sleep(sleep_interval) + + return self.real_download(filename, info_dict) + + def real_download(self, filename, info_dict): + """Real download process. Redefine in subclasses.""" + raise NotImplementedError('This method must be implemented by subclasses') + + def _hook_progress(self, status): + for ph in self._progress_hooks: + ph(status) + + def add_progress_hook(self, ph): + # See YoutubeDl.py (search for progress_hooks) for a description of + # this interface + self._progress_hooks.append(ph) + + def _debug_cmd(self, args, exe=None): + if not self.params.get('verbose', False): + return + + str_args = [decodeArgument(a) for a in args] + + if exe is None: + exe = os.path.basename(str_args[0]) + + self.to_screen('[debug] %s command line: %s' % ( + exe, shell_quote(str_args))) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 0000000000..c6d674bc6c --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +from .fragment import FragmentFD +from ..compat import compat_urllib_error +from ..utils import ( + DownloadError, + urljoin, +) + + +class DashSegmentsFD(FragmentFD): + """ + Download segments in a DASH manifest + """ + + FD_NAME = 'dashsegments' + + def real_download(self, filename, info_dict): + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + + ctx = { + 'filename': filename, + 'total_frags': len(fragments), + } + + self._prepare_and_start_frag_download(ctx) + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + frag_index = 0 + for i, fragment in enumerate(fragments): + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + # In DASH, the first segment contains necessary headers to + # generate a valid MP4 file, so always abort for the first segment + fatal = i == 0 or not skip_unavailable_fragments + count = 0 + while count <= fragment_retries: + try: + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) + if not success: + return False + self._append_fragment(ctx, frag_content) + break + except compat_urllib_error.HTTPError as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attempts + # is usually enough) thus allowing to download the whole file successfully. + # To be future-proof we will retry all fragments that fail with any + # HTTP error. + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + except DownloadError: + # Don't retry fragment if error occurred during HTTP downloading + # itself since it has own retry settings + if not fatal: + self.report_skip_fragment(frag_index) + break + raise + + if count > fragment_retries: + if not fatal: + self.report_skip_fragment(frag_index) + continue + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py new file mode 100644 index 0000000000..c31f8910ad --- /dev/null +++ b/youtube_dl/downloader/external.py @@ -0,0 +1,371 @@ +from __future__ import unicode_literals + +import os.path +import re +import subprocess +import sys +import time + +from .common import FileDownloader +from ..compat import ( + compat_setenv, + compat_str, +) +from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS +from ..utils import ( + cli_option, + cli_valueless_option, + cli_bool_option, + cli_configuration_args, + encodeFilename, + encodeArgument, + handle_youtubedl_headers, + check_executable, + is_outdated_version, +) + + +class ExternalFD(FileDownloader): + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + try: + started = time.time() + retval = self._call_downloader(tmpfilename, info_dict) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + # Live stream downloading cancellation should be considered as + # correct and expected termination thus all postprocessing + # should take place + retval = 0 + self.to_screen('[%s] Interrupted by user' % self.get_basename()) + + if retval == 0: + status = { + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + } + if filename != '-': + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.try_rename(tmpfilename, filename) + status.update({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + }) + self._hook_progress(status) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % ( + self.get_basename(), retval)) + return False + + @classmethod + def get_basename(cls): + return cls.__name__[:-2].lower() + + @property + def exe(self): + return self.params.get('external_downloader') + + @classmethod + def available(cls): + return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) + + @classmethod + def supports(cls, info_dict): + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') + + @classmethod + def can_download(cls, info_dict): + return cls.available() and cls.supports(info_dict) + + def _option(self, command_option, param): + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) + + def _configuration_args(self, default=[]): + return cli_configuration_args(self.params, 'external_downloader_args', default) + + def _call_downloader(self, tmpfilename, info_dict): + """ Either overwrite this or implement _make_cmd """ + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + p = subprocess.Popen( + cmd, stderr=subprocess.PIPE) + _, stderr = p.communicate() + if p.returncode != 0: + self.to_stderr(stderr.decode('utf-8', 'replace')) + return p.returncode + + +class CurlFD(ExternalFD): + AVAILABLE_OPT = '-V' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '--location', '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry + cmd += self._option('--max-filesize', 'max_filesize') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + # curl writes the progress to stderr so don't capture it. + p = subprocess.Popen(cmd) + p.communicate() + return p.returncode + + +class AxelFD(ExternalFD): + AVAILABLE_OPT = '-V' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class WgetFD(ExternalFD): + AVAILABLE_OPT = '--version' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry + cmd += self._option('--bind-address', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class Aria2cFD(ExternalFD): + AVAILABLE_OPT = '-v' + + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-c'] + cmd += self._configuration_args([ + '--min-split-size', '1M', '--max-connection-per-server', '4']) + dn = os.path.dirname(tmpfilename) + if dn: + cmd += ['--dir', dn] + cmd += ['--out', os.path.basename(tmpfilename)] + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--interface', 'source_address') + cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') + cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') + cmd += ['--', info_dict['url']] + return cmd + + +class HttpieFD(ExternalFD): + @classmethod + def available(cls): + return check_executable('http', ['--version']) + + def _make_cmd(self, tmpfilename, info_dict): + cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] + return cmd + + +class FFmpegFD(ExternalFD): + @classmethod + def supports(cls, info_dict): + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + + @classmethod + def available(cls): + return FFmpegPostProcessor().available + + def _call_downloader(self, tmpfilename, info_dict): + url = info_dict['url'] + ffpp = FFmpegPostProcessor(downloader=self) + if not ffpp.available: + self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') + return False + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + + seekable = info_dict.get('_seekable') + if seekable is not None: + # setting -seekable prevents ffmpeg from guessing if the server + # supports seeking(by adding the header `Range: bytes=0-`), which + # can cause problems in some cases + # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127 + # http://trac.ffmpeg.org/ticket/6125#comment:10 + args += ['-seekable', '1' if seekable else '0'] + + args += self._configuration_args() + + # start_time = info_dict.get('start_time') or 0 + # if start_time: + # args += ['-ss', compat_str(start_time)] + # end_time = info_dict.get('end_time') + # if end_time: + # args += ['-t', compat_str(end_time - start_time)] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' % self.get_basename()) + + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + compat_setenv('http_proxy', proxy, env=env) + + protocol = info_dict.get('protocol') + + if protocol == 'rtmp': + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + conn = info_dict.get('rtmp_conn') + if player_url is not None: + args += ['-rtmp_swfverify', player_url] + if page_url is not None: + args += ['-rtmp_pageurl', page_url] + if app is not None: + args += ['-rtmp_app', app] + if play_path is not None: + args += ['-rtmp_playpath', play_path] + if tc_url is not None: + args += ['-rtmp_tcurl', tc_url] + if flash_version is not None: + args += ['-rtmp_flashver', flash_version] + if live: + args += ['-rtmp_live', 'live'] + if isinstance(conn, list): + for entry in conn: + args += ['-rtmp_conn', entry] + elif isinstance(conn, compat_str): + args += ['-rtmp_conn', conn] + + args += ['-i', url, '-c', 'copy'] + + if self.params.get('test', False): + args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + + if protocol in ('m3u8', 'm3u8_native'): + if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] + elif protocol == 'rtmp': + args += ['-f', 'flv'] + else: + args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + self._debug_cmd(args) + + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) + try: + retval = proc.wait() + except KeyboardInterrupt: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). + if sys.platform != 'win32': + proc.communicate(b'q') + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + + +_BY_NAME = dict( + (klass.get_basename(), klass) + for name, klass in globals().items() + if name.endswith('FD') and name != 'ExternalFD' +) + + +def list_external_downloaders(): + return sorted(_BY_NAME.keys()) + + +def get_external_downloader(external_downloader): + """ Given the name of the executable, see whether we support the given + downloader . """ + # Drop .exe extension on Windows + bn = os.path.splitext(os.path.basename(external_downloader))[0] + return _BY_NAME[bn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py new file mode 100644 index 0000000000..8dd3c2eeb3 --- /dev/null +++ b/youtube_dl/downloader/f4m.py @@ -0,0 +1,438 @@ +from __future__ import division, unicode_literals + +import io +import itertools +import time + +from .fragment import FragmentFD +from ..compat import ( + compat_b64decode, + compat_etree_fromstring, + compat_urlparse, + compat_urllib_error, + compat_urllib_parse_urlparse, + compat_struct_pack, + compat_struct_unpack, +) +from ..utils import ( + fix_xml_ampersands, + xpath_text, +) + + +class DataTruncatedError(Exception): + pass + + +class FlvReader(io.BytesIO): + """ + Reader for Flv files + The file format is documented in https://www.adobe.com/devnet/f4v.html + """ + + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + + # Utility functions for reading numbers and strings + def read_unsigned_long_long(self): + return compat_struct_unpack('!Q', self.read_bytes(8))[0] + + def read_unsigned_int(self): + return compat_struct_unpack('!I', self.read_bytes(4))[0] + + def read_unsigned_char(self): + return compat_struct_unpack('!B', self.read_bytes(1))[0] + + def read_string(self): + res = b'' + while True: + char = self.read_bytes(1) + if char == b'\x00': + break + res += char + return res + + def read_box_info(self): + """ + Read a box and return the info as a tuple: (box_size, box_type, box_data) + """ + real_size = size = self.read_unsigned_int() + box_type = self.read_bytes(4) + header_end = 8 + if size == 1: + real_size = self.read_unsigned_long_long() + header_end = 16 + return real_size, box_type, self.read_bytes(real_size - header_end) + + def read_asrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + quality_entry_count = self.read_unsigned_char() + # QualityEntryCount + for i in range(quality_entry_count): + self.read_string() + + segment_run_count = self.read_unsigned_int() + segments = [] + for i in range(segment_run_count): + first_segment = self.read_unsigned_int() + fragments_per_segment = self.read_unsigned_int() + segments.append((first_segment, fragments_per_segment)) + + return { + 'segment_run': segments, + } + + def read_afrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + # time scale + self.read_unsigned_int() + + quality_entry_count = self.read_unsigned_char() + # QualitySegmentUrlModifiers + for i in range(quality_entry_count): + self.read_string() + + fragments_count = self.read_unsigned_int() + fragments = [] + for i in range(fragments_count): + first = self.read_unsigned_int() + first_ts = self.read_unsigned_long_long() + duration = self.read_unsigned_int() + if duration == 0: + discontinuity_indicator = self.read_unsigned_char() + else: + discontinuity_indicator = None + fragments.append({ + 'first': first, + 'ts': first_ts, + 'duration': duration, + 'discontinuity_indicator': discontinuity_indicator, + }) + + return { + 'fragments': fragments, + } + + def read_abst(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + + self.read_unsigned_int() # BootstrapinfoVersion + # Profile,Live,Update,Reserved + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 + # time scale + self.read_unsigned_int() + # CurrentMediaTime + self.read_unsigned_long_long() + # SmpteTimeCodeOffset + self.read_unsigned_long_long() + + self.read_string() # MovieIdentifier + server_count = self.read_unsigned_char() + # ServerEntryTable + for i in range(server_count): + self.read_string() + quality_count = self.read_unsigned_char() + # QualityEntryTable + for i in range(quality_count): + self.read_string() + # DrmData + self.read_string() + # MetaData + self.read_string() + + segments_count = self.read_unsigned_char() + segments = [] + for i in range(segments_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'asrt' + segment = FlvReader(box_data).read_asrt() + segments.append(segment) + fragments_run_count = self.read_unsigned_char() + fragments = [] + for i in range(fragments_run_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'afrt' + fragments.append(FlvReader(box_data).read_afrt()) + + return { + 'segments': segments, + 'fragments': fragments, + 'live': live, + } + + def read_bootstrap_info(self): + total_size, box_type, box_data = self.read_box_info() + assert box_type == b'abst' + return FlvReader(box_data).read_abst() + + +def read_bootstrap_info(bootstrap_bytes): + return FlvReader(bootstrap_bytes).read_bootstrap_info() + + +def build_fragments_list(boot_info): + """ Return a list of (segment, fragment) for each fragment in the video """ + res = [] + segment_run_table = boot_info['segments'][0] + fragment_run_entry_table = boot_info['fragments'][0]['fragments'] + first_frag_number = fragment_run_entry_table[0]['first'] + fragments_counter = itertools.count(first_frag_number) + for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 + for _ in range(fragments_count): + res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + + return res + + +def write_unsigned_int(stream, val): + stream.write(compat_struct_pack('!I', val)) + + +def write_unsigned_int_24(stream, val): + stream.write(compat_struct_pack('!I', val)[1:]) + + +def write_flv_header(stream): + """Writes the FLV header to stream""" + # FLV header + stream.write(b'FLV\x01') + stream.write(b'\x05') + stream.write(b'\x00\x00\x00\x09') + stream.write(b'\x00\x00\x00\x00') + + +def write_metadata_tag(stream, metadata): + """Writes optional metadata tag to stream""" + SCRIPT_TAG = b'\x12' + FLV_TAG_HEADER_LEN = 11 + + if metadata: + stream.write(SCRIPT_TAG) + write_unsigned_int_24(stream, len(metadata)) + stream.write(b'\x00\x00\x00\x00\x00\x00\x00') + stream.write(metadata) + write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) + + +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib + and 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url + + +class F4mFD(FragmentFD): + """ + A downloader for f4m manifests or AdobeHDS. + """ + + FD_NAME = 'f4m' + + def _get_unencrypted_media(self, doc): + media = doc.findall(_add_ns('media')) + if not media: + self.report_error('No media found') + for e in (doc.findall(_add_ns('drmAdditionalHeader')) + + doc.findall(_add_ns('drmAdditionalHeaderSet'))): + # If id attribute is missing it's valid for all media nodes + # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute + if 'id' not in e.attrib: + self.report_error('Missing ID in f4m DRM') + media = remove_encrypted_media(media) + if not media: + self.report_error('Unsupported DRM') + return media + + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + # Sometimes non empty inline bootstrap info can be specified along + # with bootstrap url attribute (e.g. dummy inline bootstrap info + # contains whitespace characters in [1]). We will prefer bootstrap + # url over inline bootstrap info when present. + # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m + bootstrap_url = node.get('url') + if bootstrap_url: + bootstrap_url = compat_urlparse.urljoin( + base_url, bootstrap_url) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = compat_b64decode(node.text) + boot_info = read_bootstrap_info(bootstrap) + return boot_info, bootstrap_url + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + requested_bitrate = info_dict.get('tbr') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/ytdl-org/youtube-dl/issues/7823) + manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() + + doc = compat_etree_fromstring(manifest) + formats = [(int(f.attrib.get('bitrate', -1)), f) + for f in self._get_unencrypted_media(doc)] + if requested_bitrate is None or len(formats) == 1: + # get the best format + formats = sorted(formats, key=lambda f: f[0]) + rate, media = formats[-1] + else: + rate, media = list(filter( + lambda f: int(f[0]) == requested_bitrate, formats))[0] + + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + bootstrap_node = doc.find(_add_ns('bootstrapInfo')) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) + live = boot_info['live'] + metadata_node = media.find(_add_ns('metadata')) + if metadata_node is not None: + metadata = compat_b64decode(metadata_node.text) + else: + metadata = None + + fragments_list = build_fragments_list(boot_info) + test = self.params.get('test', False) + if test: + # We only download the first fragment + fragments_list = fragments_list[:1] + total_frags = len(fragments_list) + # For some akamai manifests we'll need to add a query to the fragment url + akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) + + ctx = { + 'filename': filename, + 'total_frags': total_frags, + 'live': live, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] + + if ctx['complete_frags_downloaded_bytes'] == 0: + write_flv_header(dest_stream) + if not live: + write_metadata_tag(dest_stream, metadata) + + base_url_parsed = compat_urllib_parse_urlparse(base_url) + + self._start_frag_download(ctx) + + frag_index = 0 + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + name = 'Seg%d-Frag%d' % (seg_i, frag_i) + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) + if akamai_pv: + query.append(akamai_pv.strip(';')) + if info_dict.get('extra_param_to_segment_url'): + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + try: + success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + if not success: + return False + reader = FlvReader(down_data) + while True: + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and thus + # FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/ytdl-org/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise + if box_type == b'mdat': + self._append_fragment(ctx, box_data) + break + except (compat_urllib_error.HTTPError, ) as err: + if live and (err.code == 404 or err.code == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. + msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise + + if not fragments_list and not test and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py new file mode 100644 index 0000000000..9339b3a62c --- /dev/null +++ b/youtube_dl/downloader/fragment.py @@ -0,0 +1,269 @@ +from __future__ import division, unicode_literals + +import os +import time +import json + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + error_to_compat_str, + encodeFilename, + sanitize_open, + sanitized_Request, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) + keep_fragments: Keep downloaded fragments on disk after downloading is + finished + + For each incomplete fragment download youtube-dlc keeps on disk a special + bookkeeping file with download state and metadata (in future such files will + be used for any incomplete download handled by youtube-dlc). This file is + used to properly handle resuming, check download file consistency and detect + potential errors. The file has a .ytdl extension and represents a standard + JSON file of the following format: + + extractor: + Dictionary of extractor related data. TBD. + + downloader: + Dictionary of downloader related data. May contain following data: + current_fragment: + Dictionary with current (being downloaded) fragment data: + index: 0-based index of current fragment among all fragments + fragment_count: + Total count of fragments + + This feature is experimental and file format may change in future. + """ + + def report_retry_fragment(self, err, frag_index, count, retries): + self.to_screen( + '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...' + % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) + + def report_skip_fragment(self, frag_index): + self.to_screen('[download] Skipping fragment %d...' % frag_index) + + def _prepare_url(self, info_dict, url): + headers = info_dict.get('http_headers') + return sanitized_Request(url, None, headers) if headers else url + + def _prepare_and_start_frag_download(self, ctx): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx) + + @staticmethod + def __do_ytdl_file(ctx): + return not ctx['live'] and not ctx['tmpfilename'] == '-' + + def _read_ytdl_file(self, ctx): + assert 'ytdl_corrupt' not in ctx + stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') + try: + ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] + except Exception: + ctx['ytdl_corrupt'] = True + finally: + stream.close() + + def _write_ytdl_file(self, ctx): + frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') + downloader = { + 'current_fragment': { + 'index': ctx['fragment_index'], + }, + } + if ctx.get('fragment_count') is not None: + downloader['fragment_count'] = ctx['fragment_count'] + frag_index_stream.write(json.dumps({'downloader': downloader})) + frag_index_stream.close() + + def _download_fragment(self, ctx, frag_url, info_dict, headers=None): + fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) + success = ctx['dl'].download(fragment_filename, { + 'url': frag_url, + 'http_headers': headers or info_dict.get('http_headers'), + }) + if not success: + return False, None + down, frag_sanitized = sanitize_open(fragment_filename, 'rb') + ctx['fragment_filename_sanitized'] = frag_sanitized + frag_content = down.read() + down.close() + return True, frag_content + + def _append_fragment(self, ctx, frag_content): + try: + ctx['dest_stream'].write(frag_content) + ctx['dest_stream'].flush() + finally: + if self.__do_ytdl_file(ctx): + self._write_ytdl_file(ctx) + if not self.params.get('keep_fragments', False): + os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) + del ctx['fragment_filename_sanitized'] + + def _prepare_frag_download(self, ctx): + if 'live' not in ctx: + ctx['live'] = False + if not ctx['live']: + total_frags_str = '%d' % ctx['total_frags'] + ad_frags = ctx.get('ad_frags', 0) + if ad_frags: + total_frags_str += ' (not including %d ad)' % ad_frags + else: + total_frags_str = 'unknown (live)' + self.to_screen( + '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit'), + 'retries': self.params.get('retries', 0), + 'nopart': self.params.get('nopart', False), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + open_mode = 'wb' + resume_len = 0 + + # Establish possible resume length + if os.path.isfile(encodeFilename(tmpfilename)): + open_mode = 'ab' + resume_len = os.path.getsize(encodeFilename(tmpfilename)) + + # Should be initialized before ytdl file check + ctx.update({ + 'tmpfilename': tmpfilename, + 'fragment_index': 0, + }) + + if self.__do_ytdl_file(ctx): + if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + self._read_ytdl_file(ctx) + is_corrupt = ctx.get('ytdl_corrupt') is True + is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 + if is_corrupt or is_inconsistent: + message = ( + '.ytdl file is corrupt' if is_corrupt else + 'Inconsistent state of incomplete fragment download') + self.report_warning( + '%s. Restarting from the beginning...' % message) + ctx['fragment_index'] = resume_len = 0 + if 'ytdl_corrupt' in ctx: + del ctx['ytdl_corrupt'] + self._write_ytdl_file(ctx) + else: + self._write_ytdl_file(ctx) + assert ctx['fragment_index'] == 0 + + dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) + + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + # Total complete fragments downloaded so far in bytes + 'complete_frags_downloaded_bytes': resume_len, + }) + + def _start_frag_download(self, ctx): + resume_len = ctx['complete_frags_downloaded_bytes'] + total_frags = ctx['total_frags'] + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': resume_len, + 'fragment_index': ctx['fragment_index'], + 'fragment_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + + start = time.time() + ctx.update({ + 'started': start, + # Amount of fragment's bytes downloaded by the time of the previous + # frag progress hook invocation + 'prev_frag_downloaded_bytes': 0, + }) + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + time_now = time.time() + state['elapsed'] = time_now - start + frag_total_bytes = s.get('total_bytes') or 0 + if not ctx['live']: + estimated_size = ( + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) + / (state['fragment_index'] + 1) * total_frags) + state['total_bytes_estimate'] = estimated_size + + if s['status'] == 'finished': + state['fragment_index'] += 1 + ctx['fragment_index'] = state['fragment_index'] + state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['prev_frag_downloaded_bytes'] = 0 + else: + frag_downloaded_bytes = s['downloaded_bytes'] + state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] + if not ctx['live']: + state['eta'] = self.calc_eta( + start, time_now, estimated_size - resume_len, + state['downloaded_bytes'] - resume_len) + state['speed'] = s.get('speed') or ctx.get('speed') + ctx['speed'] = state['speed'] + ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes + self._hook_progress(state) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx): + ctx['dest_stream'].close() + if self.__do_ytdl_file(ctx): + ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) + if os.path.isfile(ytdl_filename): + os.remove(ytdl_filename) + elapsed = time.time() - ctx['started'] + + if ctx['tmpfilename'] == '-': + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + else: + self.try_rename(ctx['tmpfilename'], ctx['filename']) + downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': downloaded_bytes, + 'total_bytes': downloaded_bytes, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py new file mode 100644 index 0000000000..84bc349288 --- /dev/null +++ b/youtube_dl/downloader/hls.py @@ -0,0 +1,210 @@ +from __future__ import unicode_literals + +import re +import binascii +try: + from Crypto.Cipher import AES + can_decrypt_frag = True +except ImportError: + can_decrypt_frag = False + +from .fragment import FragmentFD +from .external import FFmpegFD + +from ..compat import ( + compat_urllib_error, + compat_urlparse, + compat_struct_pack, +) +from ..utils import ( + parse_m3u8_attributes, + update_url_query, +) + + +class HlsFD(FragmentFD): + """ A limited implementation that does not require ffmpeg """ + + FD_NAME = 'hlsnative' + + @staticmethod + def can_download(manifest, info_dict): + UNSUPPORTED_FEATURES = ( + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] + # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + + # This heuristic also is not correct since segments may not be appended as well. + # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite + # no segments will definitely be appended to the end of the playlist. + # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # # event media playlists [4] + + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + ) + check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] + is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest + check_results.append(can_decrypt_frag or not is_aes128_enc) + check_results.append(not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest)) + check_results.append(not info_dict.get('is_live')) + return all(check_results) + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + s = urlh.read().decode('utf-8', 'ignore') + + if not self.can_download(s, info_dict): + if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): + self.report_error('pycrypto not found. Please install it.') + return False + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + + def is_ad_fragment_start(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) + + media_frags = 0 + ad_frags = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if not line: + continue + if line.startswith('#'): + if is_ad_fragment_start(line): + ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False + continue + if ad_frag_next: + ad_frags += 1 + continue + media_frags += 1 + + ctx = { + 'filename': filename, + 'total_frags': media_frags, + 'ad_frags': ad_frags, + } + + self._prepare_and_start_frag_download(ctx) + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + test = self.params.get('test', False) + + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} + byte_range = {} + frag_index = 0 + ad_frag_next = False + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + if ad_frag_next: + continue + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + frag_url = ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(man_url, line)) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + count = 0 + headers = info_dict.get('http_headers', {}) + if byte_range: + headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end']) + while count <= fragment_retries: + try: + success, frag_content = self._download_fragment( + ctx, frag_url, info_dict, headers) + if not success: + return False + break + except compat_urllib_error.HTTPError as err: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. + # See https://github.com/ytdl-org/youtube-dl/issues/10165, + # https://github.com/ytdl-org/youtube-dl/issues/10448). + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + i += 1 + media_sequence += 1 + self.report_skip_fragment(frag_index) + continue + self.report_error( + 'giving up after %s fragment retries' % fragment_retries) + return False + if decrypt_info['METHOD'] == 'AES-128': + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) + decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( + self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + self._append_fragment(ctx, frag_content) + # We only download the first fragment during the test + if test: + break + i += 1 + media_sequence += 1 + elif line.startswith('#EXT-X-KEY'): + decrypt_url = decrypt_info.get('URI') + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) + if not re.match(r'^https?://', decrypt_info['URI']): + decrypt_info['URI'] = compat_urlparse.urljoin( + man_url, decrypt_info['URI']) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) + if decrypt_url != decrypt_info['URI']: + decrypt_info['KEY'] = None + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) + elif line.startswith('#EXT-X-BYTERANGE'): + splitted_byte_range = line[17:].split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + elif is_ad_fragment_start(line): + ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py new file mode 100644 index 0000000000..5046878dfc --- /dev/null +++ b/youtube_dl/downloader/http.py @@ -0,0 +1,354 @@ +from __future__ import unicode_literals + +import errno +import os +import socket +import time +import random +import re + +from .common import FileDownloader +from ..compat import ( + compat_str, + compat_urllib_error, +) +from ..utils import ( + ContentTooShortError, + encodeFilename, + int_or_none, + sanitize_open, + sanitized_Request, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, +) + + +class HttpFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None + + # Do not include the Accept-Encoding header + headers = {'Youtubedl-no-compression': 'True'} + add_headers = info_dict.get('http_headers') + if add_headers: + headers.update(add_headers) + + is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + info_dict.get('downloader_options', {}).get('http_chunk_size') + or self.params.get('http_chunk_size') or 0) + + ctx.open_mode = 'wb' + ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + ctx.chunk_size = None + + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) + + ctx.is_resume = ctx.resume_len > 0 + + count = 0 + retries = self.params.get('retries', 0) + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + + def establish_connection(): + ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) + if not is_test and chunk_size else chunk_size) + if ctx.resume_len > 0: + range_start = ctx.resume_len + if ctx.is_resume: + self.report_resuming_byte(ctx.resume_len) + ctx.open_mode = 'ab' + elif ctx.chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + request = sanitized_Request(url, None, headers) + if has_range: + set_range(request, range_start, range_end) + # Establish connection + try: + ctx.data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) + if has_range: + content_range = ctx.data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size + # Chunked download and requested piece or + # its part is promised to be served + or content_range_end == range_end + or content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) + return + except (compat_urllib_error.HTTPError, ) as err: + if err.code == 416: + # Unable to resume (requested range not satisfiable) + try: + # Open the connection again without the range header + ctx.data = self.ydl.urlopen( + sanitized_Request(url, None, headers)) + content_length = ctx.data.info()['Content-Length'] + except (compat_urllib_error.HTTPError, ) as err: + if err.code < 500 or err.code >= 600: + raise + else: + # Examine the reported length + if (content_length is not None + and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): + # The file had already been fully downloaded. + # Explanation to the above condition: in issue #175 it was revealed that + # YouTube sometimes adds or removes a few bytes from the end of the file, + # changing the file size slightly and causing problems for some users. So + # I decided to implement a suggested change and consider the file + # completely downloaded if the file size differs less than 100 bytes from + # the one in the hard drive. + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) + self._hook_progress({ + 'filename': ctx.filename, + 'status': 'finished', + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, + }) + raise SucceedDownload() + else: + # The length does not match, we start the download over + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: + # Connection reset is no problem, just retry + raise + raise RetryDownload(err) + + def download(): + data_len = ctx.data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) + return False + + byte_counter = 0 + ctx.resume_len + block_size = ctx.block_size + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + to_stdout = ctx.tmpfilename == '-' + if not to_stdout: + ctx.stream.close() + ctx.stream = None + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if ctx.data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': ctx.data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - ctx.start_time, + }) + + if data_len is not None and byte_counter == data_len: + break + + if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + + if ctx.stream is None: + self.to_stderr('\n') + self.report_error('Did not get any data blocks') + return False + if ctx.tmpfilename != '-': + ctx.stream.close() + + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err + + self.try_rename(ctx.tmpfilename, ctx.filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - ctx.start_time, + }) + + return True + + while count <= retries: + try: + establish_connection() + return download() + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except NextFragment: + continue + except SucceedDownload: + return True + + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py new file mode 100644 index 0000000000..1ca666b4a1 --- /dev/null +++ b/youtube_dl/downloader/ism.py @@ -0,0 +1,259 @@ +from __future__ import unicode_literals + +import time +import binascii +import io + +from .fragment import FragmentFD +from ..compat import ( + compat_Struct, + compat_urllib_error, +) + + +u8 = compat_Struct('>B') +u88 = compat_Struct('>Bx') +u16 = compat_Struct('>H') +u1616 = compat_Struct('>Hxx') +u32 = compat_Struct('>I') +u64 = compat_Struct('>Q') + +s88 = compat_Struct('>bx') +s16 = compat_Struct('>h') +s1616 = compat_Struct('>hxx') +s32 = compat_Struct('>i') + +unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) + +TRACK_ENABLED = 0x1 +TRACK_IN_MOVIE = 0x2 +TRACK_IN_PREVIEW = 0x4 + +SELF_CONTAINED = 0x1 + + +def box(box_type, payload): + return u32.pack(8 + len(payload)) + box_type + payload + + +def full_box(box_type, version, flags, payload): + return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) + + +def write_piff_header(stream, params): + track_id = params['track_id'] + fourcc = params['fourcc'] + duration = params['duration'] + timescale = params.get('timescale', 10000000) + language = params.get('language', 'und') + height = params.get('height', 0) + width = params.get('width', 0) + is_audio = width == 0 and height == 0 + creation_time = modification_time = int(time.time()) + + ftyp_payload = b'isml' # major brand + ftyp_payload += u32.pack(1) # minor version + ftyp_payload += b'piff' + b'iso2' # compatible brands + stream.write(box(b'ftyp', ftyp_payload)) # File Type Box + + mvhd_payload = u64.pack(creation_time) + mvhd_payload += u64.pack(modification_time) + mvhd_payload += u32.pack(timescale) + mvhd_payload += u64.pack(duration) + mvhd_payload += s1616.pack(1) # rate + mvhd_payload += s88.pack(1) # volume + mvhd_payload += u16.pack(0) # reserved + mvhd_payload += u32.pack(0) * 2 # reserved + mvhd_payload += unity_matrix + mvhd_payload += u32.pack(0) * 6 # pre defined + mvhd_payload += u32.pack(0xffffffff) # next track id + moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box + + tkhd_payload = u64.pack(creation_time) + tkhd_payload += u64.pack(modification_time) + tkhd_payload += u32.pack(track_id) # track id + tkhd_payload += u32.pack(0) # reserved + tkhd_payload += u64.pack(duration) + tkhd_payload += u32.pack(0) * 2 # reserved + tkhd_payload += s16.pack(0) # layer + tkhd_payload += s16.pack(0) # alternate group + tkhd_payload += s88.pack(1 if is_audio else 0) # volume + tkhd_payload += u16.pack(0) # reserved + tkhd_payload += unity_matrix + tkhd_payload += u1616.pack(width) + tkhd_payload += u1616.pack(height) + trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box + + mdhd_payload = u64.pack(creation_time) + mdhd_payload += u64.pack(modification_time) + mdhd_payload += u32.pack(timescale) + mdhd_payload += u64.pack(duration) + mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) + mdhd_payload += u16.pack(0) # pre defined + mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box + + hdlr_payload = u32.pack(0) # pre defined + hdlr_payload += b'soun' if is_audio else b'vide' # handler type + hdlr_payload += u32.pack(0) * 3 # reserved + hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name + mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box + + if is_audio: + smhd_payload = s88.pack(0) # balance + smhd_payload += u16.pack(0) # reserved + media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header + else: + vmhd_payload = u16.pack(0) # graphics mode + vmhd_payload += u16.pack(0) * 3 # opcolor + media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header + minf_payload = media_header_box + + dref_payload = u32.pack(1) # entry count + dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box + dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box + minf_payload += box(b'dinf', dinf_payload) # Data Information Box + + stsd_payload = u32.pack(1) # entry count + + sample_entry_payload = u8.pack(0) * 6 # reserved + sample_entry_payload += u16.pack(1) # data reference index + if is_audio: + sample_entry_payload += u32.pack(0) * 2 # reserved + sample_entry_payload += u16.pack(params.get('channels', 2)) + sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u1616.pack(params['sampling_rate']) + + if fourcc == 'AACL': + sample_entry_box = box(b'mp4a', sample_entry_payload) + else: + sample_entry_payload += u16.pack(0) # pre defined + sample_entry_payload += u16.pack(0) # reserved + sample_entry_payload += u32.pack(0) * 3 # pre defined + sample_entry_payload += u16.pack(width) + sample_entry_payload += u16.pack(height) + sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi + sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi + sample_entry_payload += u32.pack(0) # reserved + sample_entry_payload += u16.pack(1) # frame count + sample_entry_payload += u8.pack(0) * 32 # compressor name + sample_entry_payload += u16.pack(0x18) # depth + sample_entry_payload += s16.pack(-1) # pre defined + + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) + if fourcc in ('H264', 'AVC1'): + sps, pps = codec_private_data.split(u32.pack(1))[1:] + avcc_payload = u8.pack(1) # configuration version + avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication + avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete representation (1) + reserved (11111) + length size minus one + avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) + avcc_payload += u16.pack(len(sps)) + avcc_payload += sps + avcc_payload += u8.pack(1) # number of pps + avcc_payload += u16.pack(len(pps)) + avcc_payload += pps + sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record + sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry + stsd_payload += sample_entry_box + + stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box + + stts_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stts', 0, 0, stts_payload) # Decoding Time to Sample Box + + stsc_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stsc', 0, 0, stsc_payload) # Sample To Chunk Box + + stco_payload = u32.pack(0) # entry count + stbl_payload += full_box(b'stco', 0, 0, stco_payload) # Chunk Offset Box + + minf_payload += box(b'stbl', stbl_payload) # Sample Table Box + + mdia_payload += box(b'minf', minf_payload) # Media Information Box + + trak_payload += box(b'mdia', mdia_payload) # Media Box + + moov_payload += box(b'trak', trak_payload) # Track Box + + mehd_payload = u64.pack(duration) + mvex_payload = full_box(b'mehd', 1, 0, mehd_payload) # Movie Extends Header Box + + trex_payload = u32.pack(track_id) # track id + trex_payload += u32.pack(1) # default sample description index + trex_payload += u32.pack(0) # default sample duration + trex_payload += u32.pack(0) # default sample size + trex_payload += u32.pack(0) # default sample flags + mvex_payload += full_box(b'trex', 0, 0, trex_payload) # Track Extends Box + + moov_payload += box(b'mvex', mvex_payload) # Movie Extends Box + stream.write(box(b'moov', moov_payload)) # Movie Box + + +def extract_box_data(data, box_sequence): + data_reader = io.BytesIO(data) + while True: + box_size = u32.unpack(data_reader.read(4))[0] + box_type = data_reader.read(4) + if box_type == box_sequence[0]: + box_data = data_reader.read(box_size - 8) + if len(box_sequence) == 1: + return box_data + return extract_box_data(box_data, box_sequence[1:]) + data_reader.seek(box_size - 8, 1) + + +class IsmFD(FragmentFD): + """ + Download segments in a ISM manifest + """ + + FD_NAME = 'ism' + + def real_download(self, filename, info_dict): + segments = info_dict['fragments'][:1] if self.params.get( + 'test', False) else info_dict['fragments'] + + ctx = { + 'filename': filename, + 'total_frags': len(segments), + } + + self._prepare_and_start_frag_download(ctx) + + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + track_written = False + frag_index = 0 + for i, segment in enumerate(segments): + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + count = 0 + while count <= fragment_retries: + try: + success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + if not success: + return False + if not track_written: + tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) + info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] + write_piff_header(ctx['dest_stream'], info_dict['_download_params']) + track_written = True + self._append_fragment(ctx, frag_content) + break + except compat_urllib_error.HTTPError as err: + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + self.report_skip_fragment(frag_index) + continue + self.report_error('giving up after %s fragment retries' % fragment_retries) + return False + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py new file mode 100644 index 0000000000..fbb7f51b01 --- /dev/null +++ b/youtube_dl/downloader/rtmp.py @@ -0,0 +1,214 @@ +from __future__ import unicode_literals + +import os +import re +import subprocess +import time + +from .common import FileDownloader +from ..compat import compat_str +from ..utils import ( + check_executable, + encodeFilename, + encodeArgument, + get_exe_version, +) + + +def rtmpdump_version(): + return get_exe_version( + 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)') + + +class RtmpFD(FileDownloader): + def real_download(self, filename, info_dict): + def run_rtmpdump(args): + start = time.time() + resume_percent = None + resume_downloaded_data_len = None + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + cursor_in_new_line = True + proc_stderr_closed = False + try: + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len + time_now = time.time() + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'eta': eta, + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + finally: + proc.wait() + if not cursor_in_new_line: + self.to_screen('') + return proc.returncode + + url = info_dict['url'] + player_url = info_dict.get('player_url') + page_url = info_dict.get('page_url') + app = info_dict.get('app') + play_path = info_dict.get('play_path') + tc_url = info_dict.get('tc_url') + flash_version = info_dict.get('flash_version') + live = info_dict.get('rtmp_live', False) + conn = info_dict.get('rtmp_conn') + protocol = info_dict.get('rtmp_protocol') + real_time = info_dict.get('rtmp_real_time', False) + no_resume = info_dict.get('no_resume', False) + continue_dl = self.params.get('continuedl', True) + + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + test = self.params.get('test', False) + + # Check for rtmpdump first + if not check_executable('rtmpdump', ['-h']): + self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.') + return False + + # Download using rtmpdump. rtmpdump returns exit code 2 when + # the connection was interrupted and resuming appears to be + # possible. This is part of rtmpdump's normal usage, AFAIK. + basic_args = [ + 'rtmpdump', '--verbose', '-r', url, + '-o', tmpfilename] + if player_url is not None: + basic_args += ['--swfVfy', player_url] + if page_url is not None: + basic_args += ['--pageUrl', page_url] + if app is not None: + basic_args += ['--app', app] + if play_path is not None: + basic_args += ['--playpath', play_path] + if tc_url is not None: + basic_args += ['--tcUrl', tc_url] + if test: + basic_args += ['--stop', '1'] + if flash_version is not None: + basic_args += ['--flashVer', flash_version] + if live: + basic_args += ['--live'] + if isinstance(conn, list): + for entry in conn: + basic_args += ['--conn', entry] + elif isinstance(conn, compat_str): + basic_args += ['--conn', conn] + if protocol is not None: + basic_args += ['--protocol', protocol] + if real_time: + basic_args += ['--realtime'] + + args = basic_args + if not no_resume and continue_dl and not live: + args += ['--resume'] + if not live and continue_dl: + args += ['--skip', '1'] + + args = [encodeArgument(a) for a in args] + + self._debug_cmd(args, exe='rtmpdump') + + RD_SUCCESS = 0 + RD_FAILED = 1 + RD_INCOMPLETE = 2 + RD_NO_CONNECT = 3 + + started = time.time() + + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') + + if retval == RD_NO_CONNECT: + self.report_error('[rtmpdump] Could not connect to RTMP server.') + return False + + while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: + prevsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) + time.sleep(5.0) # This seems to be needed + args = basic_args + ['--resume'] + if retval == RD_FAILED: + args += ['--skip', '1'] + args = [encodeArgument(a) for a in args] + retval = run_rtmpdump(args) + cursize = os.path.getsize(encodeFilename(tmpfilename)) + if prevsize == cursize and retval == RD_FAILED: + break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024: + self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + retval = RD_SUCCESS + break + if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + 'elapsed': time.time() - started, + }) + return True + else: + self.to_stderr('\n') + self.report_error('rtmpdump exited with code %d' % retval) + return False diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py new file mode 100644 index 0000000000..939358b2a2 --- /dev/null +++ b/youtube_dl/downloader/rtsp.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import os +import subprocess + +from .common import FileDownloader +from ..utils import ( + check_executable, + encodeFilename, +) + + +class RtspFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + if check_executable('mplayer', ['-h']): + args = [ + 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', + '-dumpstream', '-dumpfile', tmpfilename, url] + elif check_executable('mpv', ['-h']): + args = [ + 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] + else: + self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') + return False + + self._debug_cmd(args) + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % (args[0], retval)) + return False diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py new file mode 100644 index 0000000000..4932dd9c52 --- /dev/null +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -0,0 +1,94 @@ +from __future__ import division, unicode_literals + +import re +import json + +from .fragment import FragmentFD + + +class YoutubeLiveChatReplayFD(FragmentFD): + """ Downloads YouTube live chat replays fragment by fragment """ + + FD_NAME = 'youtube_live_chat_replay' + + def real_download(self, filename, info_dict): + video_id = info_dict['video_id'] + self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + + test = self.params.get('test', False) + + ctx = { + 'filename': filename, + 'live': True, + 'total_frags': None, + } + + def dl_fragment(url): + headers = info_dict.get('http_headers', {}) + return self._download_fragment(ctx, url, info_dict, headers) + + def parse_yt_initial_data(data): + window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});' + var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});' + for patt in window_patt, var_patt: + try: + raw_json = re.search(patt, data).group(1) + return json.loads(raw_json) + except AttributeError: + continue + + self._prepare_and_start_frag_download(ctx) + + success, raw_fragment = dl_fragment( + 'https://www.youtube.com/watch?v={}'.format(video_id)) + if not success: + return False + data = parse_yt_initial_data(raw_fragment) + continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + + first = True + offset = None + while continuation_id is not None: + data = None + if first: + url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id) + success, raw_fragment = dl_fragment(url) + if not success: + return False + data = parse_yt_initial_data(raw_fragment) + else: + url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay' + + '?continuation={}'.format(continuation_id) + + '&playerOffsetMs={}'.format(offset - 5000) + + '&hidden=false' + + '&pbj=1') + success, raw_fragment = dl_fragment(url) + if not success: + return False + data = json.loads(raw_fragment)['response'] + + first = False + continuation_id = None + + live_chat_continuation = data['continuationContents']['liveChatContinuation'] + offset = None + processed_fragment = bytearray() + if 'actions' in live_chat_continuation: + for action in live_chat_continuation['actions']: + if 'replayChatItemAction' in action: + replay_chat_item_action = action['replayChatItemAction'] + offset = int(replay_chat_item_action['videoOffsetTimeMsec']) + processed_fragment.extend( + json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + + self._append_fragment(ctx, processed_fragment) + + if test or offset is None: + break + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py new file mode 100644 index 0000000000..18d8dbcd66 --- /dev/null +++ b/youtube_dl/extractor/__init__.py @@ -0,0 +1,46 @@ +from __future__ import unicode_literals + +try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True +except ImportError: + _LAZY_LOADER = False + from .extractors import * + + _ALL_CLASSES = [ + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) + + +def gen_extractor_classes(): + """ Return a list of supported extractors. + The order does matter; the first extractor matched is the one handling the URL. + """ + return _ALL_CLASSES + + +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. + """ + return [klass() for klass in gen_extractor_classes()] + + +def list_extractors(age_limit): + """ + Return a list of extractors that are suitable for the given age, + sorted by extractor ID. + """ + + return sorted( + filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), + key=lambda ie: ie.IE_NAME.lower()) + + +def get_info_extractor(ie_name): + """Returns the info extractor class with the given ie_name""" + return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py new file mode 100644 index 0000000000..6637f4f353 --- /dev/null +++ b/youtube_dl/extractor/abc.py @@ -0,0 +1,193 @@ +from __future__ import unicode_literals + +import hashlib +import hmac +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + int_or_none, + parse_iso8601, + try_get, + unescapeHTML, + update_url_query, +) + + +class ABCIE(InfoExtractor): + IE_NAME = 'abc.net.au' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + + _TESTS = [{ + 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', + 'md5': 'cb3dd03b18455a661071ee1e28344d9f', + 'info_dict': { + 'id': '5868334', + 'ext': 'mp4', + 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', + 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', + }, + 'skip': 'this video has expired', + }, { + 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', + 'md5': 'db2a5369238b51f9811ad815b69dc086', + 'info_dict': { + 'id': 'NvqvPeNZsHU', + 'ext': 'mp4', + 'upload_date': '20150816', + 'uploader': 'ABC News (Australia)', + 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef', + 'uploader_id': 'NewsOnABC', + 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', + }, + 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + mobj = re.search( + r'inline(?PVideo|Audio|YouTube)Data\.push\((?P[^)]+)\);', + webpage) + if mobj is None: + expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?(.+?)', webpage, 'expired', None) + if expired: + raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) + raise ExtractorError('Unable to extract video urls') + + urls_info = self._parse_json( + mobj.group('json_data'), video_id, transform_source=js_to_json) + + if not isinstance(urls_info, list): + urls_info = [urls_info] + + if mobj.group('type') == 'YouTube': + return self.playlist_result([ + self.url_result(url_info['url']) for url_info in urls_info]) + + formats = [{ + 'url': url_info['url'], + 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none', + 'width': int_or_none(url_info.get('width')), + 'height': int_or_none(url_info.get('height')), + 'tbr': int_or_none(url_info.get('bitrate')), + 'filesize': int_or_none(url_info.get('filesize')), + } for url_info in urls_info] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + + +class ABCIViewIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview' + _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] + + # ABC iview programs are normally available for 14 days only. + _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', + 'md5': '67715ce3c78426b11ba167d875ac6abf', + 'info_dict': { + 'id': 'LE1927H001S00', + 'ext': 'mp4', + 'title': "Series 11 Ep 1", + 'series': "Gruen", + 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', + 'upload_date': '20190925', + 'uploader_id': 'abc1', + 'timestamp': 1569445289, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_params = self._download_json( + 'https://iview.abc.net.au/api/programs/' + video_id, video_id) + title = unescapeHTML(video_params.get('title') or video_params['seriesTitle']) + stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream')) + + house_number = video_params.get('episodeHouseNumber') or video_id + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format( + int(time.time()), house_number) + sig = hmac.new( + b'android.content.res.Resources', + path.encode('utf-8'), hashlib.sha256).hexdigest() + token = self._download_webpage( + 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) + + def tokenize_url(url, token): + return update_url_query(url, { + 'hdnea': token, + }) + + for sd in ('720', 'sd', 'sd-low'): + sd_url = try_get( + stream, lambda x: x['streams']['hls'][sd], compat_str) + if not sd_url: + continue + formats = self._extract_m3u8_formats( + tokenize_url(sd_url, token), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if formats: + break + self._sort_formats(formats) + + subtitles = {} + src_vtt = stream.get('captions', {}).get('src-vtt') + if src_vtt: + subtitles['en'] = [{ + 'url': src_vtt, + 'ext': 'vtt', + }] + + is_live = video_params.get('livestream') == '1' + if is_live: + title = self._live_title(title) + + return { + 'id': video_id, + 'title': title, + 'description': video_params.get('description'), + 'thumbnail': video_params.get('thumbnail'), + 'duration': int_or_none(video_params.get('eventDuration')), + 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), + 'series': unescapeHTML(video_params.get('seriesTitle')), + 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'season_number': int_or_none(self._search_regex( + r'\bSeries\s+(\d+)\b', title, 'season number', default=None)), + 'episode_number': int_or_none(self._search_regex( + r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), + 'episode_id': house_number, + 'uploader_id': video_params.get('channel'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py new file mode 100644 index 0000000000..8b407bf9c6 --- /dev/null +++ b/youtube_dl/extractor/abcnews.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import re +import time + +from .amp import AMPIE +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import compat_urlparse + + +class AbcNewsVideoIE(AMPIE): + IE_NAME = 'abcnews:video' + _VALID_URL = r'''(?x) + https?:// + (?: + abcnews\.go\.com/ + (?: + [^/]+/video/(?P[0-9a-z-]+)-| + video/embed\?.*?\bid= + )| + fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ + ) + (?P\d+) + ''' + + _TESTS = [{ + 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', + 'info_dict': { + 'id': '20411932', + 'ext': 'mp4', + 'display_id': 'week-exclusive-irans-foreign-minister-zarif', + 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', + 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', + 'duration': 180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abcnews.go.com/video/embed?id=46979033', + 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_id = mobj.group('id') + info_dict = self._extract_feed_info( + 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + return info_dict + + +class AbcNewsIE(InfoExtractor): + IE_NAME = 'abcnews' + _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' + + _TESTS = [{ + 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + 'info_dict': { + 'id': '10505354', + 'ext': 'flv', + 'display_id': 'dramatic-video-rare-death-job-america', + 'title': 'Occupational Hazards', + 'description': 'Nightline investigates the dangers that lurk at various jobs.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20100428', + 'timestamp': 1272412800, + }, + 'add_ie': ['AbcNewsVideo'], + }, { + 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', + 'info_dict': { + 'id': '38897857', + 'ext': 'mp4', + 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', + 'title': 'Justin Timberlake Drops Hints For Secret Single', + 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', + 'upload_date': '20160515', + 'timestamp': 1463329500, + }, + 'params': { + # m3u8 download + 'skip_download': True, + # The embedded YouTube video is blocked due to copyright issues + 'playlist_items': '1', + }, + 'add_ie': ['AbcNewsVideo'], + }, { + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex( + r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') + full_video_url = compat_urlparse.urljoin(url, video_url) + + youtube_url = YoutubeIE._extract_url(webpage) + + timestamp = None + date_str = self._html_search_regex( + r']+class="timestamp">([^<]+)', + webpage, 'timestamp', fatal=False) + if date_str: + tz_offset = 0 + if date_str.endswith(' ET'): # Eastern Time + tz_offset = -5 + date_str = date_str[:-3] + date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] + for date_format in date_formats: + try: + timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) + except ValueError: + continue + if timestamp is not None: + timestamp -= tz_offset * 3600 + + entry = { + '_type': 'url_transparent', + 'ie_key': AbcNewsVideoIE.ie_key(), + 'url': full_video_url, + 'id': video_id, + 'display_id': display_id, + 'timestamp': timestamp, + } + + if youtube_url: + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] + return self.playlist_result(entries) + + return entry diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py new file mode 100644 index 0000000000..0bc69a64f6 --- /dev/null +++ b/youtube_dl/extractor/abcotvs.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + dict_get, + int_or_none, + try_get, +) + + +class ABCOTVSIE(InfoExtractor): + IE_NAME = 'abcotvs' + IE_DESC = 'ABC Owned Television Stations' + _VALID_URL = r'https?://(?Pabc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P[^/]+))?/(?P\d+)' + _TESTS = [ + { + 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', + 'info_dict': { + 'id': '472548', + 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', + 'ext': 'mp4', + 'title': 'East Bay museum celebrates synthesized music', + 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1421118520, + 'upload_date': '20150113', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://abc7news.com/472581', + 'only_matching': True, + }, + { + 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/', + 'only_matching': True, + }, + ] + _SITE_MAP = { + '6abc': 'wpvi', + 'abc11': 'wtvd', + 'abc13': 'ktrk', + 'abc30': 'kfsn', + 'abc7': 'kabc', + 'abc7chicago': 'wls', + 'abc7news': 'kgo', + 'abc7ny': 'wabc', + } + + def _real_extract(self, url): + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = display_id or video_id + station = self._SITE_MAP[site] + + data = self._download_json( + 'https://api.abcotvs.com/v2/content', display_id, query={ + 'id': video_id, + 'key': 'otv.web.%s.story' % station, + 'station': station, + })['data'] + video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data + video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id)) + title = video.get('title') or video['linkText'] + + formats = [] + m3u8_url = video.get('m3u8') + if m3u8_url: + formats = self._extract_m3u8_formats( + video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False) + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'abr': 128, + 'format_id': 'https', + 'height': 360, + 'url': mp4_url, + 'width': 640, + }) + self._sort_formats(formats) + + image = video.get('image') or {} + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])), + 'thumbnail': dict_get(image, ('source', 'dynamicSource')), + 'timestamp': int_or_none(video.get('date')), + 'duration': int_or_none(video.get('length')), + 'formats': formats, + } + + +class ABCOTVSClipsIE(InfoExtractor): + IE_NAME = 'abcotvs:clips' + _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P\d+)' + _TEST = { + 'url': 'https://clips.abcotvs.com/kabc/video/214814', + 'info_dict': { + 'id': '214814', + 'ext': 'mp4', + 'title': 'SpaceX launch pad explosion destroys rocket, satellite', + 'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b', + 'upload_date': '20160901', + 'timestamp': 1472756695, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0] + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['videoURL'].split('?')[0], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailURL'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('pubDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py new file mode 100644 index 0000000000..34095501cf --- /dev/null +++ b/youtube_dl/extractor/academicearth.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class AcademicEarthCourseIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' + IE_NAME = 'AcademicEarth:Course' + _TEST = { + 'url': 'http://academicearth.org/playlists/laws-of-nature/', + 'info_dict': { + 'id': 'laws-of-nature', + 'title': 'Laws of Nature', + 'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.', + }, + 'playlist_count': 3, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_regex( + r'

]*?>(.*?)

', webpage, 'title') + description = self._html_search_regex( + r'

]*?>(.*?)

', + webpage, 'description', fatal=False) + urls = re.findall( + r'
  • \s*?', + webpage) + entries = [self.url_result(u) for u in urls] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'description': description, + 'entries': entries, + } diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py new file mode 100644 index 0000000000..b17c792d23 --- /dev/null +++ b/youtube_dl/extractor/acast.py @@ -0,0 +1,135 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import functools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + OnDemandPagedList, +) + + +class ACastIE(InfoExtractor): + IE_NAME = 'acast' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P[^/]+)/(?P[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', + 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 'info_dict': { + 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', + 'ext': 'mp3', + 'title': '2. Raggarmordet - Röster ur det förflutna', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'timestamp': 1477346700, + 'upload_date': '20161024', + 'duration': 2766.602563, + 'creator': 'Anton Berg & Martin Johnson', + 'series': 'Spår', + 'episode': '2. Raggarmordet - Röster ur det förflutna', + } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel, display_id = re.match(self._VALID_URL, url).groups() + s = self._download_json( + 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), + display_id) + media_url = s['url'] + if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): + episode_url = s.get('episodeUrl') + if episode_url: + display_id = episode_url + else: + channel, display_id = re.match(self._VALID_URL, s['link']).groups() + cast_data = self._download_json( + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), + display_id)['result'] + e = cast_data['episode'] + title = e.get('name') or s['title'] + return { + 'id': compat_str(e['id']), + 'display_id': display_id, + 'url': media_url, + 'title': title, + 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), + 'thumbnail': e.get('image'), + 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), + 'duration': float_or_none(e.get('duration') or s.get('duration')), + 'filesize': int_or_none(e.get('contentLength')), + 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), + 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), + 'season_number': int_or_none(e.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(e.get('episodeNumber')), + } + + +class ACastChannelIE(InfoExtractor): + IE_NAME = 'acast:channel' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/todayinfocus', + 'info_dict': { + 'id': '4efc5294-5385-4847-98bd-519799ce5786', + 'title': 'Today in Focus', + 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + }, + 'playlist_mincount': 35, + }, { + 'url': 'http://play.acast.com/s/ft-banking-weekly', + 'only_matching': True, + }] + _API_BASE_URL = 'https://play.acast.com/api/' + _PAGE_SIZE = 10 + + @classmethod + def suitable(cls, url): + return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) + + def _fetch_page(self, channel_slug, page): + casts = self._download_json( + self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), + channel_slug, note='Download page %d of channel data' % page) + for cast in casts: + yield self.url_result( + 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), + 'ACast', cast['id']) + + def _real_extract(self, url): + channel_slug = self._match_id(url) + channel_data = self._download_json( + self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_slug), self._PAGE_SIZE) + return self.playlist_result(entries, compat_str( + channel_data['id']), channel_data['name'], channel_data.get('description')) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py new file mode 100644 index 0000000000..c95ad21735 --- /dev/null +++ b/youtube_dl/extractor/adn.py @@ -0,0 +1,207 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import binascii +import json +import os +import random + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_b64decode, + compat_ord, +) +from ..utils import ( + bytes_to_intlist, + bytes_to_long, + ExtractorError, + float_or_none, + intlist_to_bytes, + long_to_bytes, + pkcs1pad, + strip_or_none, + urljoin, +) + + +class ADNIE(InfoExtractor): + IE_DESC = 'Anime Digital Network' + _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TEST = { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'info_dict': { + 'id': '7778', + 'ext': 'mp4', + 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + } + } + _BASE_URL = 'http://animedigitalnetwork.fr' + _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _POS_ALIGN_MAP = { + 'start': 1, + 'end': 3, + } + _LINE_ALIGN_MAP = { + 'middle': 8, + 'end': 4, + } + + @staticmethod + def _ass_subtitles_timecode(seconds): + return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) + + def _get_subtitles(self, sub_path, video_id): + if not sub_path: + return None + + enc_subtitles = self._download_webpage( + urljoin(self._BASE_URL, sub_path), + video_id, 'Downloading subtitles location', fatal=False) or '{}' + subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') + if subtitle_location: + enc_subtitles = self._download_webpage( + urljoin(self._BASE_URL, subtitle_location), + video_id, 'Downloading subtitles data', fatal=False, + headers={'Origin': 'https://animedigitalnetwork.fr'}) + if not enc_subtitles: + return None + + # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), + bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), + bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) + )) + subtitles_json = self._parse_json( + dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(), + None, fatal=False) + if not subtitles_json: + return None + + subtitles = {} + for sub_lang, sub in subtitles_json.items(): + ssa = '''[Script Info] +ScriptType:V4.00 +[V4 Styles] +Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding +Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0 +[Events] +Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' + for current in sub: + start, end, text, line_align, position_align = ( + float_or_none(current.get('startTime')), + float_or_none(current.get('endTime')), + current.get('text'), current.get('lineAlign'), + current.get('positionAlign')) + if start is None or end is None or text is None: + continue + alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0) + ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % ( + self._ass_subtitles_timecode(start), + self._ass_subtitles_timecode(end), + '{\\a%d}' % alignment if alignment != 2 else '', + text.replace('\n', '\\N').replace('', '{\\i1}').replace('', '{\\i0}')) + + if sub_lang == 'vostf': + sub_lang = 'fr' + subtitles.setdefault(sub_lang, []).extend([{ + 'ext': 'json', + 'data': json.dumps(sub), + }, { + 'ext': 'ssa', + 'data': ssa, + }]) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_config = self._parse_json(self._search_regex( + r'playerConfig\s*=\s*({.+});', webpage, + 'player config', default='{}'), video_id, fatal=False) + if not player_config: + config_url = urljoin(self._BASE_URL, self._search_regex( + r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', + webpage, 'config url')) + player_config = self._download_json( + config_url, video_id, + 'Downloading player config JSON metadata')['player'] + + video_info = {} + video_info_str = self._search_regex( + r'videoInfo\s*=\s*({.+});', webpage, + 'video info', fatal=False) + if video_info_str: + video_info = self._parse_json( + video_info_str, video_id, fatal=False) or {} + + options = player_config.get('options') or {} + metas = options.get('metas') or {} + links = player_config.get('links') or {} + sub_path = player_config.get('subtitles') + error = None + if not links: + links_url = player_config.get('linksurl') or options['videoUrl'] + token = options['token'] + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 'e': 60, + 't': token, + })) + padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + n, e = self._RSA_KEY + encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) + authorization = base64.b64encode(encrypted_message).decode() + links_data = self._download_json( + urljoin(self._BASE_URL, links_url), video_id, + 'Downloading links JSON metadata', headers={ + 'Authorization': 'Bearer ' + authorization, + }) + links = links_data.get('links') or {} + metas = metas or links_data.get('meta') or {} + sub_path = sub_path or links_data.get('subtitles') or \ + 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id + sub_path += '&token=' + token + error = links_data.get('error') + title = metas.get('title') or video_info['title'] + + formats = [] + for format_id, qualities in links.items(): + if not isinstance(qualities, dict): + continue + for quality, load_balancer_url in qualities.items(): + load_balancer_data = self._download_json( + load_balancer_url, video_id, + 'Downloading %s %s JSON metadata' % (format_id, quality), + fatal=False) or {} + m3u8_url = load_balancer_data.get('location') + if not m3u8_url: + continue + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False) + if format_id == 'vf': + for f in m3u8_formats: + f['language'] = 'fr' + formats.extend(m3u8_formats) + if not error: + error = options.get('error') + if not formats and error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), + 'thumbnail': video_info.get('image'), + 'formats': formats, + 'subtitles': self.extract_subtitles(sub_path, video_id), + 'episode': metas.get('subtitle') or video_info.get('videoTitle'), + 'series': video_info.get('playlistTitle'), + } diff --git a/youtube_dl/extractor/adobeconnect.py b/youtube_dl/extractor/adobeconnect.py new file mode 100644 index 0000000000..728549eb92 --- /dev/null +++ b/youtube_dl/extractor/adobeconnect.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class AdobeConnectIE(InfoExtractor): + _VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P[\w-]+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'(.+?)', webpage, 'title') + qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) + is_live = qs.get('isLive', ['false'])[0] == 'true' + formats = [] + for con_string in qs['conStrings'][0].split(','): + formats.append({ + 'format_id': con_string.split('://')[0], + 'app': compat_urlparse.quote('?' + con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]), + 'ext': 'flv', + 'play_path': 'mp4:' + qs['streamName'][0], + 'rtmp_conn': 'S:' + qs['ticket'][0], + 'rtmp_live': is_live, + 'url': con_string, + }) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py new file mode 100644 index 0000000000..38dca1b0a4 --- /dev/null +++ b/youtube_dl/extractor/adobepass.py @@ -0,0 +1,1572 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..compat import ( + compat_kwargs, + compat_urlparse, +) +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, + ExtractorError, + NO_DEFAULT, +) + + +MSO_INFO = { + 'DTV': { + 'name': 'DIRECTV', + 'username_field': 'username', + 'password_field': 'password', + }, + 'ATT': { + 'name': 'AT&T U-verse', + 'username_field': 'userid', + 'password_field': 'password', + }, + 'ATTOTT': { + 'name': 'DIRECTV NOW', + 'username_field': 'email', + 'password_field': 'loginpassword', + }, + 'Rogers': { + 'name': 'Rogers', + 'username_field': 'UserName', + 'password_field': 'UserPassword', + }, + 'Comcast_SSO': { + 'name': 'Comcast XFINITY', + 'username_field': 'user', + 'password_field': 'passwd', + }, + 'TWC': { + 'name': 'Time Warner Cable | Spectrum', + 'username_field': 'Ecom_User_ID', + 'password_field': 'Ecom_Password', + }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, + 'Charter_Direct': { + 'name': 'Charter Spectrum', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, + 'Verizon': { + 'name': 'Verizon FiOS', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, + 'thr030': { + 'name': '3 Rivers Communications' + }, + 'com140': { + 'name': 'Access Montana' + }, + 'acecommunications': { + 'name': 'AcenTek' + }, + 'acm010': { + 'name': 'Acme Communications' + }, + 'ada020': { + 'name': 'Adams Cable Service' + }, + 'alb020': { + 'name': 'Albany Mutual Telephone' + }, + 'algona': { + 'name': 'Algona Municipal Utilities' + }, + 'allwest': { + 'name': 'All West Communications' + }, + 'all025': { + 'name': 'Allen\'s Communications' + }, + 'spl010': { + 'name': 'Alliance Communications' + }, + 'all070': { + 'name': 'ALLO Communications' + }, + 'alpine': { + 'name': 'Alpine Communications' + }, + 'hun015': { + 'name': 'American Broadband' + }, + 'nwc010': { + 'name': 'American Broadband Missouri' + }, + 'com130-02': { + 'name': 'American Community Networks' + }, + 'com130-01': { + 'name': 'American Warrior Networks' + }, + 'tom020': { + 'name': 'Amherst Telephone/Tomorrow Valley' + }, + 'tvc020': { + 'name': 'Andycable' + }, + 'arkwest': { + 'name': 'Arkwest Communications' + }, + 'art030': { + 'name': 'Arthur Mutual Telephone Company' + }, + 'arvig': { + 'name': 'Arvig' + }, + 'nttcash010': { + 'name': 'Ashland Home Net' + }, + 'astound': { + 'name': 'Astound (now Wave)' + }, + 'dix030': { + 'name': 'ATC Broadband' + }, + 'ara010': { + 'name': 'ATC Communications' + }, + 'she030-02': { + 'name': 'Ayersville Communications' + }, + 'baldwin': { + 'name': 'Baldwin Lightstream' + }, + 'bal040': { + 'name': 'Ballard TV' + }, + 'cit025': { + 'name': 'Bardstown Cable TV' + }, + 'bay030': { + 'name': 'Bay Country Communications' + }, + 'tel095': { + 'name': 'Beaver Creek Cooperative Telephone' + }, + 'bea020': { + 'name': 'Beaver Valley Cable' + }, + 'bee010': { + 'name': 'Bee Line Cable' + }, + 'wir030': { + 'name': 'Beehive Broadband' + }, + 'bra020': { + 'name': 'BELD' + }, + 'bel020': { + 'name': 'Bellevue Municipal Cable' + }, + 'vol040-01': { + 'name': 'Ben Lomand Connect / BLTV' + }, + 'bev010': { + 'name': 'BEVCOMM' + }, + 'big020': { + 'name': 'Big Sandy Broadband' + }, + 'ble020': { + 'name': 'Bledsoe Telephone Cooperative' + }, + 'bvt010': { + 'name': 'Blue Valley Tele-Communications' + }, + 'bra050': { + 'name': 'Brandenburg Telephone Co.' + }, + 'bte010': { + 'name': 'Bristol Tennessee Essential Services' + }, + 'annearundel': { + 'name': 'Broadstripe' + }, + 'btc010': { + 'name': 'BTC Communications' + }, + 'btc040': { + 'name': 'BTC Vision - Nahunta' + }, + 'bul010': { + 'name': 'Bulloch Telephone Cooperative' + }, + 'but010': { + 'name': 'Butler-Bremer Communications' + }, + 'tel160-csp': { + 'name': 'C Spire SNAP' + }, + 'csicable': { + 'name': 'Cable Services Inc.' + }, + 'cableamerica': { + 'name': 'CableAmerica' + }, + 'cab038': { + 'name': 'CableSouth Media 3' + }, + 'weh010-camtel': { + 'name': 'Cam-Tel Company' + }, + 'car030': { + 'name': 'Cameron Communications' + }, + 'canbytel': { + 'name': 'Canby Telcom' + }, + 'crt020': { + 'name': 'CapRock Tv' + }, + 'car050': { + 'name': 'Carnegie Cable' + }, + 'cas': { + 'name': 'CAS Cable' + }, + 'casscomm': { + 'name': 'CASSCOMM' + }, + 'mid180-02': { + 'name': 'Catalina Broadband Solutions' + }, + 'cccomm': { + 'name': 'CC Communications' + }, + 'nttccde010': { + 'name': 'CDE Lightband' + }, + 'cfunet': { + 'name': 'Cedar Falls Utilities' + }, + 'dem010-01': { + 'name': 'Celect-Bloomer Telephone Area' + }, + 'dem010-02': { + 'name': 'Celect-Bruce Telephone Area' + }, + 'dem010-03': { + 'name': 'Celect-Citizens Connected Area' + }, + 'dem010-04': { + 'name': 'Celect-Elmwood/Spring Valley Area' + }, + 'dem010-06': { + 'name': 'Celect-Mosaic Telecom' + }, + 'dem010-05': { + 'name': 'Celect-West WI Telephone Area' + }, + 'net010-02': { + 'name': 'Cellcom/Nsight Telservices' + }, + 'cen100': { + 'name': 'CentraCom' + }, + 'nttccst010': { + 'name': 'Central Scott / CSTV' + }, + 'cha035': { + 'name': 'Chaparral CableVision' + }, + 'cha050': { + 'name': 'Chariton Valley Communication Corporation, Inc.' + }, + 'cha060': { + 'name': 'Chatmoss Cablevision' + }, + 'nttcche010': { + 'name': 'Cherokee Communications' + }, + 'che050': { + 'name': 'Chesapeake Bay Communications' + }, + 'cimtel': { + 'name': 'Cim-Tel Cable, LLC.' + }, + 'cit180': { + 'name': 'Citizens Cablevision - Floyd, VA' + }, + 'cit210': { + 'name': 'Citizens Cablevision, Inc.' + }, + 'cit040': { + 'name': 'Citizens Fiber' + }, + 'cit250': { + 'name': 'Citizens Mutual' + }, + 'war040': { + 'name': 'Citizens Telephone Corporation' + }, + 'wat025': { + 'name': 'City Of Monroe' + }, + 'wadsworth': { + 'name': 'CityLink' + }, + 'nor100': { + 'name': 'CL Tel' + }, + 'cla010': { + 'name': 'Clarence Telephone and Cedar Communications' + }, + 'ser060': { + 'name': 'Clear Choice Communications' + }, + 'tac020': { + 'name': 'Click! Cable TV' + }, + 'war020': { + 'name': 'CLICK1.NET' + }, + 'cml010': { + 'name': 'CML Telephone Cooperative Association' + }, + 'cns': { + 'name': 'CNS' + }, + 'com160': { + 'name': 'Co-Mo Connect' + }, + 'coa020': { + 'name': 'Coast Communications' + }, + 'coa030': { + 'name': 'Coaxial Cable TV' + }, + 'mid055': { + 'name': 'Cobalt TV (Mid-State Community TV)' + }, + 'col070': { + 'name': 'Columbia Power & Water Systems' + }, + 'col080': { + 'name': 'Columbus Telephone' + }, + 'nor105': { + 'name': 'Communications 1 Cablevision, Inc.' + }, + 'com150': { + 'name': 'Community Cable & Broadband' + }, + 'com020': { + 'name': 'Community Communications Company' + }, + 'coy010': { + 'name': 'commZoom' + }, + 'com025': { + 'name': 'Complete Communication Services' + }, + 'cat020': { + 'name': 'Comporium' + }, + 'com071': { + 'name': 'ComSouth Telesys' + }, + 'consolidatedcable': { + 'name': 'Consolidated' + }, + 'conwaycorp': { + 'name': 'Conway Corporation' + }, + 'coo050': { + 'name': 'Coon Valley Telecommunications Inc' + }, + 'coo080': { + 'name': 'Cooperative Telephone Company' + }, + 'cpt010': { + 'name': 'CP-TEL' + }, + 'cra010': { + 'name': 'Craw-Kan Telephone' + }, + 'crestview': { + 'name': 'Crestview Cable Communications' + }, + 'cross': { + 'name': 'Cross TV' + }, + 'cro030': { + 'name': 'Crosslake Communications' + }, + 'ctc040': { + 'name': 'CTC - Brainerd MN' + }, + 'phe030': { + 'name': 'CTV-Beam - East Alabama' + }, + 'cun010': { + 'name': 'Cunningham Telephone & Cable' + }, + 'dpc010': { + 'name': 'D & P Communications' + }, + 'dak030': { + 'name': 'Dakota Central Telecommunications' + }, + 'nttcdel010': { + 'name': 'Delcambre Telephone LLC' + }, + 'tel160-del': { + 'name': 'Delta Telephone Company' + }, + 'sal040': { + 'name': 'DiamondNet' + }, + 'ind060-dc': { + 'name': 'Direct Communications' + }, + 'doy010': { + 'name': 'Doylestown Cable TV' + }, + 'dic010': { + 'name': 'DRN' + }, + 'dtc020': { + 'name': 'DTC' + }, + 'dtc010': { + 'name': 'DTC Cable (Delhi)' + }, + 'dum010': { + 'name': 'Dumont Telephone Company' + }, + 'dun010': { + 'name': 'Dunkerton Telephone Cooperative' + }, + 'cci010': { + 'name': 'Duo County Telecom' + }, + 'eagle': { + 'name': 'Eagle Communications' + }, + 'weh010-east': { + 'name': 'East Arkansas Cable TV' + }, + 'eatel': { + 'name': 'EATEL Video, LLC' + }, + 'ell010': { + 'name': 'ECTA' + }, + 'emerytelcom': { + 'name': 'Emery Telcom Video LLC' + }, + 'nor200': { + 'name': 'Empire Access' + }, + 'endeavor': { + 'name': 'Endeavor Communications' + }, + 'sun045': { + 'name': 'Enhanced Telecommunications Corporation' + }, + 'mid030': { + 'name': 'enTouch' + }, + 'epb020': { + 'name': 'EPB Smartnet' + }, + 'jea010': { + 'name': 'EPlus Broadband' + }, + 'com065': { + 'name': 'ETC' + }, + 'ete010': { + 'name': 'Etex Communications' + }, + 'fbc-tele': { + 'name': 'F&B Communications' + }, + 'fal010': { + 'name': 'Falcon Broadband' + }, + 'fam010': { + 'name': 'FamilyView CableVision' + }, + 'far020': { + 'name': 'Farmers Mutual Telephone Company' + }, + 'fay010': { + 'name': 'Fayetteville Public Utilities' + }, + 'sal060': { + 'name': 'fibrant' + }, + 'fid010': { + 'name': 'Fidelity Communications' + }, + 'for030': { + 'name': 'FJ Communications' + }, + 'fli020': { + 'name': 'Flint River Communications' + }, + 'far030': { + 'name': 'FMT - Jesup' + }, + 'foo010': { + 'name': 'Foothills Communications' + }, + 'for080': { + 'name': 'Forsyth CableNet' + }, + 'fbcomm': { + 'name': 'Frankfort Plant Board' + }, + 'tel160-fra': { + 'name': 'Franklin Telephone Company' + }, + 'nttcftc010': { + 'name': 'FTC' + }, + 'fullchannel': { + 'name': 'Full Channel, Inc.' + }, + 'gar040': { + 'name': 'Gardonville Cooperative Telephone Association' + }, + 'gbt010': { + 'name': 'GBT Communications, Inc.' + }, + 'tec010': { + 'name': 'Genuine Telecom' + }, + 'clr010': { + 'name': 'Giant Communications' + }, + 'gla010': { + 'name': 'Glasgow EPB' + }, + 'gle010': { + 'name': 'Glenwood Telecommunications' + }, + 'gra060': { + 'name': 'GLW Broadband Inc.' + }, + 'goldenwest': { + 'name': 'Golden West Cablevision' + }, + 'vis030': { + 'name': 'Grantsburg Telcom' + }, + 'gpcom': { + 'name': 'Great Plains Communications' + }, + 'gri010': { + 'name': 'Gridley Cable Inc' + }, + 'hbc010': { + 'name': 'H&B Cable Services' + }, + 'hae010': { + 'name': 'Haefele TV Inc.' + }, + 'htc010': { + 'name': 'Halstad Telephone Company' + }, + 'har005': { + 'name': 'Harlan Municipal Utilities' + }, + 'har020': { + 'name': 'Hart Communications' + }, + 'ced010': { + 'name': 'Hartelco TV' + }, + 'hea040': { + 'name': 'Heart of Iowa Communications Cooperative' + }, + 'htc020': { + 'name': 'Hickory Telephone Company' + }, + 'nttchig010': { + 'name': 'Highland Communication Services' + }, + 'hig030': { + 'name': 'Highland Media' + }, + 'spc010': { + 'name': 'Hilliary Communications' + }, + 'hin020': { + 'name': 'Hinton CATV Co.' + }, + 'hometel': { + 'name': 'HomeTel Entertainment, Inc.' + }, + 'hoodcanal': { + 'name': 'Hood Canal Communications' + }, + 'weh010-hope': { + 'name': 'Hope - Prescott Cable TV' + }, + 'horizoncable': { + 'name': 'Horizon Cable TV, Inc.' + }, + 'hor040': { + 'name': 'Horizon Chillicothe Telephone' + }, + 'htc030': { + 'name': 'HTC Communications Co. - IL' + }, + 'htccomm': { + 'name': 'HTC Communications, Inc. - IA' + }, + 'wal005': { + 'name': 'Huxley Communications' + }, + 'imon': { + 'name': 'ImOn Communications' + }, + 'ind040': { + 'name': 'Independence Telecommunications' + }, + 'rrc010': { + 'name': 'Inland Networks' + }, + 'stc020': { + 'name': 'Innovative Cable TV St Croix' + }, + 'car100': { + 'name': 'Innovative Cable TV St Thomas-St John' + }, + 'icc010': { + 'name': 'Inside Connect Cable' + }, + 'int100': { + 'name': 'Integra Telecom' + }, + 'int050': { + 'name': 'Interstate Telecommunications Coop' + }, + 'irv010': { + 'name': 'Irvine Cable' + }, + 'k2c010': { + 'name': 'K2 Communications' + }, + 'kal010': { + 'name': 'Kalida Telephone Company, Inc.' + }, + 'kal030': { + 'name': 'Kalona Cooperative Telephone Company' + }, + 'kmt010': { + 'name': 'KMTelecom' + }, + 'kpu010': { + 'name': 'KPU Telecommunications' + }, + 'kuh010': { + 'name': 'Kuhn Communications, Inc.' + }, + 'lak130': { + 'name': 'Lakeland Communications' + }, + 'lan010': { + 'name': 'Langco' + }, + 'lau020': { + 'name': 'Laurel Highland Total Communications, Inc.' + }, + 'leh010': { + 'name': 'Lehigh Valley Cooperative Telephone' + }, + 'bra010': { + 'name': 'Limestone Cable/Bracken Cable' + }, + 'loc020': { + 'name': 'LISCO' + }, + 'lit020': { + 'name': 'Litestream' + }, + 'tel140': { + 'name': 'LivCom' + }, + 'loc010': { + 'name': 'LocalTel Communications' + }, + 'weh010-longview': { + 'name': 'Longview - Kilgore Cable TV' + }, + 'lon030': { + 'name': 'Lonsdale Video Ventures, LLC' + }, + 'lns010': { + 'name': 'Lost Nation-Elwood Telephone Co.' + }, + 'nttclpc010': { + 'name': 'LPC Connect' + }, + 'lumos': { + 'name': 'Lumos Networks' + }, + 'madison': { + 'name': 'Madison Communications' + }, + 'mad030': { + 'name': 'Madison County Cable Inc.' + }, + 'nttcmah010': { + 'name': 'Mahaska Communication Group' + }, + 'mar010': { + 'name': 'Marne & Elk Horn Telephone Company' + }, + 'mcc040': { + 'name': 'McClure Telephone Co.' + }, + 'mctv': { + 'name': 'MCTV' + }, + 'merrimac': { + 'name': 'Merrimac Communications Ltd.' + }, + 'metronet': { + 'name': 'Metronet' + }, + 'mhtc': { + 'name': 'MHTC' + }, + 'midhudson': { + 'name': 'Mid-Hudson Cable' + }, + 'midrivers': { + 'name': 'Mid-Rivers Communications' + }, + 'mid045': { + 'name': 'Midstate Communications' + }, + 'mil080': { + 'name': 'Milford Communications' + }, + 'min030': { + 'name': 'MINET' + }, + 'nttcmin010': { + 'name': 'Minford TV' + }, + 'san040-02': { + 'name': 'Mitchell Telecom' + }, + 'mlg010': { + 'name': 'MLGC' + }, + 'mon060': { + 'name': 'Mon-Cre TVE' + }, + 'mou110': { + 'name': 'Mountain Telephone' + }, + 'mou050': { + 'name': 'Mountain Village Cable' + }, + 'mtacomm': { + 'name': 'MTA Communications, LLC' + }, + 'mtc010': { + 'name': 'MTC Cable' + }, + 'med040': { + 'name': 'MTC Technologies' + }, + 'man060': { + 'name': 'MTCC' + }, + 'mtc030': { + 'name': 'MTCO Communications' + }, + 'mul050': { + 'name': 'Mulberry Telecommunications' + }, + 'mur010': { + 'name': 'Murray Electric System' + }, + 'musfiber': { + 'name': 'MUS FiberNET' + }, + 'mpw': { + 'name': 'Muscatine Power & Water' + }, + 'nttcsli010': { + 'name': 'myEVTV.com' + }, + 'nor115': { + 'name': 'NCC' + }, + 'nor260': { + 'name': 'NDTC' + }, + 'nctc': { + 'name': 'Nebraska Central Telecom, Inc.' + }, + 'nel020': { + 'name': 'Nelsonville TV Cable' + }, + 'nem010': { + 'name': 'Nemont' + }, + 'new075': { + 'name': 'New Hope Telephone Cooperative' + }, + 'nor240': { + 'name': 'NICP' + }, + 'cic010': { + 'name': 'NineStar Connect' + }, + 'nktelco': { + 'name': 'NKTelco' + }, + 'nortex': { + 'name': 'Nortex Communications' + }, + 'nor140': { + 'name': 'North Central Telephone Cooperative' + }, + 'nor030': { + 'name': 'Northland Communications' + }, + 'nor075': { + 'name': 'Northwest Communications' + }, + 'nor125': { + 'name': 'Norwood Light Broadband' + }, + 'net010': { + 'name': 'Nsight Telservices' + }, + 'dur010': { + 'name': 'Ntec' + }, + 'nts010': { + 'name': 'NTS Communications' + }, + 'new045': { + 'name': 'NU-Telecom' + }, + 'nulink': { + 'name': 'NuLink' + }, + 'jam030': { + 'name': 'NVC' + }, + 'far035': { + 'name': 'OmniTel Communications' + }, + 'onesource': { + 'name': 'OneSource Communications' + }, + 'cit230': { + 'name': 'Opelika Power Services' + }, + 'daltonutilities': { + 'name': 'OptiLink' + }, + 'mid140': { + 'name': 'OPTURA' + }, + 'ote010': { + 'name': 'OTEC Communication Company' + }, + 'cci020': { + 'name': 'Packerland Broadband' + }, + 'pan010': { + 'name': 'Panora Telco/Guthrie Center Communications' + }, + 'otter': { + 'name': 'Park Region Telephone & Otter Tail Telcom' + }, + 'mid050': { + 'name': 'Partner Communications Cooperative' + }, + 'fib010': { + 'name': 'Pathway' + }, + 'paulbunyan': { + 'name': 'Paul Bunyan Communications' + }, + 'pem020': { + 'name': 'Pembroke Telephone Company' + }, + 'mck010': { + 'name': 'Peoples Rural Telephone Cooperative' + }, + 'pul010': { + 'name': 'PES Energize' + }, + 'phi010': { + 'name': 'Philippi Communications System' + }, + 'phonoscope': { + 'name': 'Phonoscope Cable' + }, + 'pin070': { + 'name': 'Pine Belt Communications, Inc.' + }, + 'weh010-pine': { + 'name': 'Pine Bluff Cable TV' + }, + 'pin060': { + 'name': 'Pineland Telephone Cooperative' + }, + 'cam010': { + 'name': 'Pinpoint Communications' + }, + 'pio060': { + 'name': 'Pioneer Broadband' + }, + 'pioncomm': { + 'name': 'Pioneer Communications' + }, + 'pioneer': { + 'name': 'Pioneer DTV' + }, + 'pla020': { + 'name': 'Plant TiftNet, Inc.' + }, + 'par010': { + 'name': 'PLWC' + }, + 'pro035': { + 'name': 'PMT' + }, + 'vik011': { + 'name': 'Polar Cablevision' + }, + 'pottawatomie': { + 'name': 'Pottawatomie Telephone Co.' + }, + 'premiercomm': { + 'name': 'Premier Communications' + }, + 'psc010': { + 'name': 'PSC' + }, + 'pan020': { + 'name': 'PTCI' + }, + 'qco010': { + 'name': 'QCOL' + }, + 'qua010': { + 'name': 'Quality Cablevision' + }, + 'rad010': { + 'name': 'Radcliffe Telephone Company' + }, + 'car040': { + 'name': 'Rainbow Communications' + }, + 'rai030': { + 'name': 'Rainier Connect' + }, + 'ral010': { + 'name': 'Ralls Technologies' + }, + 'rct010': { + 'name': 'RC Technologies' + }, + 'red040': { + 'name': 'Red River Communications' + }, + 'ree010': { + 'name': 'Reedsburg Utility Commission' + }, + 'mol010': { + 'name': 'Reliance Connects- Oregon' + }, + 'res020': { + 'name': 'Reserve Telecommunications' + }, + 'weh010-resort': { + 'name': 'Resort TV Cable' + }, + 'rld010': { + 'name': 'Richland Grant Telephone Cooperative, Inc.' + }, + 'riv030': { + 'name': 'River Valley Telecommunications Coop' + }, + 'rockportcable': { + 'name': 'Rock Port Cablevision' + }, + 'rsf010': { + 'name': 'RS Fiber' + }, + 'rtc': { + 'name': 'RTC Communication Corp' + }, + 'res040': { + 'name': 'RTC-Reservation Telephone Coop.' + }, + 'rte010': { + 'name': 'RTEC Communications' + }, + 'stc010': { + 'name': 'S&T' + }, + 'san020': { + 'name': 'San Bruno Cable TV' + }, + 'san040-01': { + 'name': 'Santel' + }, + 'sav010': { + 'name': 'SCI Broadband-Savage Communications Inc.' + }, + 'sco050': { + 'name': 'Scottsboro Electric Power Board' + }, + 'scr010': { + 'name': 'Scranton Telephone Company' + }, + 'selco': { + 'name': 'SELCO' + }, + 'she010': { + 'name': 'Shentel' + }, + 'she030': { + 'name': 'Sherwood Mutual Telephone Association, Inc.' + }, + 'ind060-ssc': { + 'name': 'Silver Star Communications' + }, + 'sjoberg': { + 'name': 'Sjoberg\'s Inc.' + }, + 'sou025': { + 'name': 'SKT' + }, + 'sky050': { + 'name': 'SkyBest TV' + }, + 'nttcsmi010': { + 'name': 'Smithville Communications' + }, + 'woo010': { + 'name': 'Solarus' + }, + 'sou075': { + 'name': 'South Central Rural Telephone Cooperative' + }, + 'sou065': { + 'name': 'South Holt Cablevision, Inc.' + }, + 'sou035': { + 'name': 'South Slope Cooperative Communications' + }, + 'spa020': { + 'name': 'Spanish Fork Community Network' + }, + 'spe010': { + 'name': 'Spencer Municipal Utilities' + }, + 'spi005': { + 'name': 'Spillway Communications, Inc.' + }, + 'srt010': { + 'name': 'SRT' + }, + 'cccsmc010': { + 'name': 'St. Maarten Cable TV' + }, + 'sta025': { + 'name': 'Star Communications' + }, + 'sco020': { + 'name': 'STE' + }, + 'uin010': { + 'name': 'STRATA Networks' + }, + 'sum010': { + 'name': 'Sumner Cable TV' + }, + 'pie010': { + 'name': 'Surry TV/PCSI TV' + }, + 'swa010': { + 'name': 'Swayzee Communications' + }, + 'sweetwater': { + 'name': 'Sweetwater Cable Television Co' + }, + 'weh010-talequah': { + 'name': 'Tahlequah Cable TV' + }, + 'tct': { + 'name': 'TCT' + }, + 'tel050': { + 'name': 'Tele-Media Company' + }, + 'com050': { + 'name': 'The Community Agency' + }, + 'thr020': { + 'name': 'Three River' + }, + 'cab140': { + 'name': 'Town & Country Technologies' + }, + 'tra010': { + 'name': 'Trans-Video' + }, + 'tre010': { + 'name': 'Trenton TV Cable Company' + }, + 'tcc': { + 'name': 'Tri County Communications Cooperative' + }, + 'tri025': { + 'name': 'TriCounty Telecom' + }, + 'tri110': { + 'name': 'TrioTel Communications, Inc.' + }, + 'tro010': { + 'name': 'Troy Cablevision, Inc.' + }, + 'tsc': { + 'name': 'TSC' + }, + 'cit220': { + 'name': 'Tullahoma Utilities Board' + }, + 'tvc030': { + 'name': 'TV Cable of Rensselaer' + }, + 'tvc015': { + 'name': 'TVC Cable' + }, + 'cab180': { + 'name': 'TVision' + }, + 'twi040': { + 'name': 'Twin Lakes' + }, + 'tvtinc': { + 'name': 'Twin Valley' + }, + 'uis010': { + 'name': 'Union Telephone Company' + }, + 'uni110': { + 'name': 'United Communications - TN' + }, + 'uni120': { + 'name': 'United Services' + }, + 'uss020': { + 'name': 'US Sonet' + }, + 'cab060': { + 'name': 'USA Communications' + }, + 'she005': { + 'name': 'USA Communications/Shellsburg, IA' + }, + 'val040': { + 'name': 'Valley TeleCom Group' + }, + 'val025': { + 'name': 'Valley Telecommunications' + }, + 'val030': { + 'name': 'Valparaiso Broadband' + }, + 'cla050': { + 'name': 'Vast Broadband' + }, + 'sul015': { + 'name': 'Venture Communications Cooperative, Inc.' + }, + 'ver025': { + 'name': 'Vernon Communications Co-op' + }, + 'weh010-vicksburg': { + 'name': 'Vicksburg Video' + }, + 'vis070': { + 'name': 'Vision Communications' + }, + 'volcanotel': { + 'name': 'Volcano Vision, Inc.' + }, + 'vol040-02': { + 'name': 'VolFirst / BLTV' + }, + 'ver070': { + 'name': 'VTel' + }, + 'nttcvtx010': { + 'name': 'VTX1' + }, + 'bci010-02': { + 'name': 'Vyve Broadband' + }, + 'wab020': { + 'name': 'Wabash Mutual Telephone' + }, + 'waitsfield': { + 'name': 'Waitsfield Cable' + }, + 'wal010': { + 'name': 'Walnut Communications' + }, + 'wavebroadband': { + 'name': 'Wave' + }, + 'wav030': { + 'name': 'Waverly Communications Utility' + }, + 'wbi010': { + 'name': 'WBI' + }, + 'web020': { + 'name': 'Webster-Calhoun Cooperative Telephone Association' + }, + 'wes005': { + 'name': 'West Alabama TV Cable' + }, + 'carolinata': { + 'name': 'West Carolina Communications' + }, + 'wct010': { + 'name': 'West Central Telephone Association' + }, + 'wes110': { + 'name': 'West River Cooperative Telephone Company' + }, + 'ani030': { + 'name': 'WesTel Systems' + }, + 'westianet': { + 'name': 'Western Iowa Networks' + }, + 'nttcwhi010': { + 'name': 'Whidbey Telecom' + }, + 'weh010-white': { + 'name': 'White County Cable TV' + }, + 'wes130': { + 'name': 'Wiatel' + }, + 'wik010': { + 'name': 'Wiktel' + }, + 'wil070': { + 'name': 'Wilkes Communications, Inc./RiverStreet Networks' + }, + 'wil015': { + 'name': 'Wilson Communications' + }, + 'win010': { + 'name': 'Windomnet/SMBS' + }, + 'win090': { + 'name': 'Windstream Cable TV' + }, + 'wcta': { + 'name': 'Winnebago Cooperative Telecom Association' + }, + 'wtc010': { + 'name': 'WTC' + }, + 'wil040': { + 'name': 'WTC Communications, Inc.' + }, + 'wya010': { + 'name': 'Wyandotte Cable' + }, + 'hin020-02': { + 'name': 'X-Stream Services' + }, + 'xit010': { + 'name': 'XIT Communications' + }, + 'yel010': { + 'name': 'Yelcot Communications' + }, + 'mid180-01': { + 'name': 'yondoo' + }, + 'cou060': { + 'name': 'Zito Media' + }, +} + + +class AdobePassIE(InfoExtractor): + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MVPD_CACHE = 'ap-mvpd' + + _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' + + def _download_webpage_handle(self, *args, **kwargs): + headers = self.geo_verification_headers() + headers.update(kwargs.get('headers', {})) + kwargs['headers'] = headers + return super(AdobePassIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '' + etree.tostring(channel).decode() + '' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)' % (tag, tag), xml_str, tag) + + def is_expired(token, date_ele): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) + return token_expires and token_expires <= int(time.time()) + + def post_form(form_page_res, note, data={}): + form_page, urlh = form_page_res + post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') + if not re.match(r'https?://', post_url): + post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + form_data = self._hidden_inputs(form_page) + form_data.update(data) + return self._download_webpage_handle( + post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + def raise_mvpd_required(): + raise ExtractorError( + 'This video is only available for users of participating TV providers. ' + 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' + 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) + + def extract_redirect_url(html, url=None, fatal=False): + # TODO: eliminate code duplication with generic extractor and move + # redirection code into _download_webpage_handle + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' + redirect_url = self._search_regex( + r'(?i)Resume' in mvpd_confirm_page: + post_form(mvpd_confirm_page_res, 'Confirming Login') + elif mso_id == 'Verizon': + # In general, if you're connecting from a Verizon-assigned IP, + # you will not actually pass your credentials. + provider_redirect_page, urlh = provider_redirect_page_res + if 'Please wait ...' in provider_redirect_page: + saml_redirect_url = self._html_search_regex( + r'self\.parent\.location=(["\'])(?P.+?)\1', + provider_redirect_page, + 'SAML Redirect URL', group='url') + saml_login_page = self._download_webpage( + saml_redirect_url, video_id, + 'Downloading SAML Login Page') + else: + saml_login_page_res = post_form( + provider_redirect_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password, + }) + saml_login_page, urlh = saml_login_page_res + if 'Please try again.' in saml_login_page: + raise ExtractorError( + 'We\'re sorry, but either the User ID or Password entered is not correct.') + saml_login_url = self._search_regex( + r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P.+?)\1', + saml_login_page, 'SAML Login URL', group='url') + saml_response_json = self._download_json( + saml_login_url, video_id, 'Downloading SAML Response', + headers={'Content-Type': 'text/xml'}) + self._download_webpage( + saml_response_json['targetValue'], video_id, + 'Confirming Login', data=urlencode_postdata({ + 'SAMLResponse': saml_response_json['SAMLResponse'], + 'RelayState': saml_response_json['RelayState'] + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }) + else: + # Some providers (e.g. DIRECTV NOW) have another meta refresh + # based redirect that should be followed. + provider_redirect_page, urlh = provider_redirect_page_res + provider_refresh_redirect_url = extract_redirect_url( + provider_redirect_page, url=urlh.geturl()) + if provider_refresh_redirect_url: + provider_redirect_page_res = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Provider Redirect Page (meta refresh)') + provider_login_page_res = post_form( + provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) + mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { + mso_info.get('username_field', 'username'): username, + mso_info.get('password_field', 'password'): password, + }) + if mso_id != 'Rogers': + post_form(mvpd_confirm_page_res, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if '\d+)' + _TEST = { + 'url': 'https://tv.adobe.com/embed/22/4153', + 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a', + 'info_dict': { + 'id': '4153', + 'ext': 'flv', + 'title': 'Creating Graphics Optimized for BlackBerry', + 'description': 'md5:eac6e8dced38bdaae51cd94447927459', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20091109', + 'duration': 377, + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0] + return self._parse_video_data(video_data) + + +class AdobeTVIE(AdobeTVBaseIE): + IE_NAME = 'adobetv' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?watch/(?P[^/]+)/(?P[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', + 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', + 'info_dict': { + 'id': '10981', + 'ext': 'mp4', + 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', + 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20110914', + 'duration': 60, + 'view_count': int, + }, + } + + def _real_extract(self, url): + language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + + video_data = self._call_api( + 'episode/get', urlname, { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + 'urlname': urlname, + })[0] + return self._parse_video_data(video_data) + + +class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): + _PAGE_SIZE = 25 + + def _fetch_page(self, display_id, query, page): + page += 1 + query['page'] = page + for element_data in self._call_api( + self._RESOURCE, display_id, query, 'Download Page %d' % page): + yield self._process_data(element_data) + + def _extract_playlist_entries(self, display_id, query): + return OnDemandPagedList(functools.partial( + self._fetch_page, display_id, query), self._PAGE_SIZE) + + +class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:show' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?show/(?P[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', + 'info_dict': { + 'id': '36', + 'title': 'The Complete Picture with Julieanne Kost', + 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', + }, + 'playlist_mincount': 136, + } + _RESOURCE = 'episode' + _process_data = AdobeTVBaseIE._parse_video_data + + def _real_extract(self, url): + language, show_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = { + 'disclosure': 'standard', + 'language': language, + 'show_urlname': show_urlname, + } + + show_data = self._call_api( + 'show/get', show_urlname, query)[0] + + return self.playlist_result( + self._extract_playlist_entries(show_urlname, query), + str_or_none(show_data.get('id')), + show_data.get('show_name'), + show_data.get('show_description')) + + +class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + IE_NAME = 'adobetv:channel' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?channel/(?P[^/]+)(?:/(?P[^/]+))?' + + _TEST = { + 'url': 'http://tv.adobe.com/channel/development', + 'info_dict': { + 'id': 'development', + }, + 'playlist_mincount': 96, + } + _RESOURCE = 'show' + + def _process_data(self, show_data): + return self.url_result( + show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id'))) + + def _real_extract(self, url): + language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = { + 'channel_urlname': channel_urlname, + 'language': language, + } + if category_urlname: + query['category_urlname'] = category_urlname + + return self.playlist_result( + self._extract_playlist_entries(channel_urlname, query), + channel_urlname) + + +class AdobeTVVideoIE(AdobeTVBaseIE): + IE_NAME = 'adobetv:video' + _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' + + _TEST = { + # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners + 'url': 'https://video.tv.adobe.com/v/2456/', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) + title = video_data['title'] + + formats = [] + sources = video_data.get('sources') or [] + for source in sources: + source_src = source.get('src') + if not source_src: + continue + formats.append({ + 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), + 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'height': int_or_none(source.get('height') or None), + 'tbr': int_or_none(source.get('bitrate') or None), + 'width': int_or_none(source.get('width') or None), + 'url': source_src, + }) + self._sort_formats(formats) + + # For both metadata and downloaded files the duration varies among + # formats. I just pick the max one + duration = max(filter(None, [ + float_or_none(source.get('duration'), scale=1000) + for source in sources])) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('video', {}).get('poster'), + 'duration': duration, + 'subtitles': self._parse_subtitles(video_data, 'vttPath'), + } diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py new file mode 100644 index 0000000000..8d1d9ac7da --- /dev/null +++ b/youtube_dl/extractor/adultswim.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .turner import TurnerBaseIE +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_age_limit, + parse_iso8601, + strip_or_none, + try_get, +) + + +class AdultSwimIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P[^/?#]+)(?:/(?P[^/?#]+))?' + + _TESTS = [{ + 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', + 'info_dict': { + 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', + 'title': 'Rick and Morty - Pilot', + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1543294800, + 'upload_date': '20181127', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', + 'info_dict': { + 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', + 'ext': 'mp4', + 'title': 'Decker - Inside Decker: A New Hero', + 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'attack-on-titan', + 'title': 'Attack on Titan', + 'description': 'md5:41caa9416906d90711e31dc00cb7db7e', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', + 'info_dict': { + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': '404 Not Found', + }] + + def _real_extract(self, url): + show_path, episode_path = re.match(self._VALID_URL, url).groups() + display_id = episode_path or show_path + query = '''query { + getShowBySlug(slug:"%s") { + %%s + } +}''' % show_path + if episode_path: + query = query % '''title + getVideoBySlug(slug:"%s") { + _id + auth + description + duration + episodeNumber + launchDate + mediaID + seasonNumber + poster + title + tvRating + }''' % episode_path + ['getVideoBySlug'] + else: + query = query % '''metaDescription + title + videos(first:1000,sort:["episode_number"]) { + edges { + node { + _id + slug + } + } + }''' + show_data = self._download_json( + 'https://www.adultswim.com/api/search', display_id, + data=json.dumps({'query': query}).encode(), + headers={'Content-Type': 'application/json'})['data']['getShowBySlug'] + if episode_path: + video_data = show_data['getVideoBySlug'] + video_id = video_data['_id'] + episode_title = title = video_data['title'] + series = show_data.get('title') + if series: + title = '%s - %s' % (series, title) + info = { + 'id': video_id, + 'title': title, + 'description': strip_or_none(video_data.get('description')), + 'duration': float_or_none(video_data.get('duration')), + 'formats': [], + 'subtitles': {}, + 'age_limit': parse_age_limit(video_data.get('tvRating')), + 'thumbnail': video_data.get('poster'), + 'timestamp': parse_iso8601(video_data.get('launchDate')), + 'series': series, + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode': episode_title, + 'episode_number': int_or_none(video_data.get('episodeNumber')), + } + + auth = video_data.get('auth') + media_id = video_data.get('mediaID') + if media_id: + info.update(self._extract_ngtv_info(media_id, { + # CDN_TOKEN_APP_ID from: + # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js + 'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE', + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': auth, + })) + + if not auth: + extract_data = self._download_json( + 'https://www.adultswim.com/api/shows/v1/videos/' + video_id, + video_id, query={'fields': 'stream'}, fatal=False) or {} + assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or [] + for asset in assets: + asset_url = asset.get('url') + if not asset_url: + continue + ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) + if ext == 'm3u8': + info['formats'].extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + continue + # info['formats'].extend(self._extract_f4m_formats( + # asset_url, video_id, f4m_id='hds', fatal=False)) + elif ext in ('scc', 'ttml', 'vtt'): + info['subtitles'].setdefault('en', []).append({ + 'url': asset_url, + }) + self._sort_formats(info['formats']) + + return info + else: + entries = [] + for edge in show_data.get('videos', {}).get('edges', []): + video = edge.get('node') or {} + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('_id'))) + return self.playlist_result( + entries, show_path, show_data.get('title'), + strip_or_none(show_data.get('metaDescription'))) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py new file mode 100644 index 0000000000..611b948f51 --- /dev/null +++ b/youtube_dl/extractor/aenetworks.py @@ -0,0 +1,247 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .theplatform import ThePlatformIE +from ..utils import ( + extract_attributes, + ExtractorError, + int_or_none, + smuggle_url, + update_url_query, +) +from ..compat import ( + compat_urlparse, +) + + +class AENetworksBaseIE(ThePlatformIE): + _THEPLATFORM_KEY = 'crazyjava' + _THEPLATFORM_SECRET = 's3cr3t' + + def _extract_aen_smil(self, smil_url, video_id, auth=None): + query = {'mbr': 'true'} + if auth: + query['auth'] = auth + TP_SMIL_QUERY = [{ + 'assetTypes': 'high_video_ak', + 'switch': 'hls_high_ak' + }, { + 'assetTypes': 'high_video_s3' + }, { + 'assetTypes': 'high_video_s3', + 'switch': 'hls_ingest_fastly' + }] + formats = [] + subtitles = {} + last_e = None + for q in TP_SMIL_QUERY: + q.update(query) + m_url = update_url_query(smil_url, q) + m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) + except ExtractorError as e: + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + + +class AENetworksIE(AENetworksBaseIE): + IE_NAME = 'aenetworks' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?P + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/ + (?: + shows/(?P[^/]+(?:/[^/]+){0,2})| + movies/(?P[^/]+)(?:/full-movie)?| + specials/(?P[^/]+)/(?:full-special|preview-)| + collections/[^/]+/(?P[^/]+) + ) + ''' + _TESTS = [{ + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'info_dict': { + 'id': '22253814', + 'ext': 'mp4', + 'title': 'Winter is Coming', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', + 'only_matching': True + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', + 'only_matching': True + }, { + 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'only_matching': True + }, { + 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'only_matching': True + }, { + 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', + 'only_matching': True + }] + _DOMAIN_TO_REQUESTOR_ID = { + 'history.com': 'HISTORY', + 'aetv.com': 'AETV', + 'mylifetime.com': 'LIFETIME', + 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', + 'fyi.tv': 'FYI', + } + + def _real_extract(self, url): + domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id or collection_display_id + webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) + if show_path: + url_parts = show_path.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + if entries: + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + else: + # single season + url_parts_len = 2 + if url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + [r"media_url\s*=\s*'(?P[^']+)'", + r'data-media-url=(?P(?:https?:)?//[^\s>]+)', + r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], + webpage, 'video url', group='url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._search_json_ld(webpage, video_id, fatal=False)) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + return info + + +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P[\w+-]+?)-video' + _TESTS = [{ + 'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video', + 'info_dict': { + 'id': '40700995724', + 'ext': 'mp4', + 'title': "History of Valentine’s Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def theplatform_url_result(self, theplatform_url, video_id, query): + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url( + update_url_query(theplatform_url, query), + { + 'sig': { + 'key': self._THEPLATFORM_KEY, + 'secret': self._THEPLATFORM_SECRET, + }, + 'force_smil_url': True + }), + 'ie_key': 'ThePlatform', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/history/videos', + video_id, query={'filter[id]': video_id})['results'][0] + title = result['title'] + info = self._extract_aen_smil(result['publicUrl'], video_id) + info.update({ + 'title': title, + 'description': result.get('description'), + 'duration': int_or_none(result.get('duration')), + 'timestamp': int_or_none(result.get('added'), 1000), + }) + return info diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py new file mode 100644 index 0000000000..6275e5209e --- /dev/null +++ b/youtube_dl/extractor/afreecatv.py @@ -0,0 +1,367 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_xpath +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + url_or_none, + urlencode_postdata, + xpath_text, +) + + +class AfreecaTVIE(InfoExtractor): + IE_NAME = 'afreecatv' + IE_DESC = 'afreecatv.com' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? + (?: + /app/(?:index|read_ucc_bbs)\.cgi| + /player/[Pp]layer\.(?:swf|html) + )\?.*?\bnTitleNo=| + vod\.afreecatv\.com/PLAYER/STATION/ + ) + (?P\d+) + ''' + _NETRC_MACHINE = 'afreecatv' + _TESTS = [{ + 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', + 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', + 'info_dict': { + 'id': '36164052', + 'ext': 'mp4', + 'title': '데일리 에이프릴 요정들의 시상식!', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160503', + }, + 'skip': 'Video is gone', + }, { + 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', + 'info_dict': { + 'id': '36153164', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': 'd8b7c174568da61d774ef0203159bf97', + 'info_dict': { + 'id': '36153164_1', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'upload_date': '20160502', + }, + }, { + 'md5': '58f2ce7f6044e34439ab2d50612ab02b', + 'info_dict': { + 'id': '36153164_2', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'upload_date': '20160502', + }, + }], + 'skip': 'Video is gone', + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', + 'info_dict': { + 'id': '18650793', + 'ext': 'mp4', + 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '윈아디', + 'uploader_id': 'badkids', + 'duration': 107, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', + 'info_dict': { + 'id': '10481652', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'duration': 6492, + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': 'd8b7c174568da61d774ef0203159bf97', + 'info_dict': { + 'id': '20160502_c4c62b9d_174361386_1', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 3601, + }, + }, { + 'md5': '58f2ce7f6044e34439ab2d50612ab02b', + 'info_dict': { + 'id': '20160502_39e739bb_174361386_2', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 2891, + }, + }], + 'params': { + 'skip_download': True, + }, + }, { + # non standard key + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', + 'info_dict': { + 'id': '20170411_BE689A0E_190960999_1_2_h', + 'ext': 'mp4', + 'title': '혼자사는여자집', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '♥이슬이', + 'uploader_id': 'dasl8121', + 'upload_date': '20170411', + 'duration': 213, + }, + 'params': { + 'skip_download': True, + }, + }, { + # PARTIAL_ADULT + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439', + 'info_dict': { + 'id': '20180327_27901457_202289533_1', + 'ext': 'mp4', + 'title': '[생]빨개요♥ (part 1)', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '[SA]서아', + 'uploader_id': 'bjdyrksu', + 'upload_date': '20180327', + 'duration': 3601, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['adult content'], + }, { + 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', + 'only_matching': True, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', + 'only_matching': True, + }] + + @staticmethod + def parse_video_key(key): + video_key = {} + m = re.match(r'^(?P\d{8})_\w+_(?P\d+)$', key) + if m: + video_key['upload_date'] = m.group('upload_date') + video_key['part'] = int(m.group('part')) + return video_key + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, + 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if re.search(r'alert\(["\']This video has been deleted', webpage): + raise ExtractorError( + 'Video %s has been deleted' % video_id, expected=True) + + station_id = self._search_regex( + r'nStationNo\s*=\s*(\d+)', webpage, 'station') + bbs_id = self._search_regex( + r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') + video_id = self._search_regex( + r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) + + partial_view = False + for _ in range(2): + query = { + 'nTitleNo': video_id, + 'nStationNo': station_id, + 'nBbsNo': bbs_id, + } + if partial_view: + query['partialView'] = 'SKIP_ADULT' + video_xml = self._download_xml( + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, 'Downloading video info XML%s' + % (' (skipping adult)' if partial_view else ''), + video_id, headers={ + 'Referer': url, + }, query=query) + + flag = xpath_text(video_xml, './track/flag', 'flag', default=None) + if flag and flag == 'SUCCEED': + break + if flag == 'PARTIAL_ADULT': + self._downloader.report_warning( + 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' + 'Only content suitable for all ages will be downloaded. ' + 'Provide account credentials if you wish to download restricted content.') + partial_view = True + continue + elif flag == 'ADULT': + error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' + else: + error = flag + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + else: + raise ExtractorError('Unable to download video info') + + video_element = video_xml.findall(compat_xpath('./track/video'))[-1] + if video_element is None or video_element.text is None: + raise ExtractorError( + 'Video %s video does not exist' % video_id, expected=True) + + video_url = video_element.text.strip() + + title = xpath_text(video_xml, './track/title', 'title', fatal=True) + + uploader = xpath_text(video_xml, './track/nickname', 'uploader') + uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') + duration = int_or_none(xpath_text( + video_xml, './track/duration', 'duration')) + thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') + + common_entry = { + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + } + + info = common_entry.copy() + info.update({ + 'id': video_id, + 'title': title, + 'duration': duration, + }) + + if not video_url: + entries = [] + file_elements = video_element.findall(compat_xpath('./file')) + one = len(file_elements) == 1 + for file_num, file_element in enumerate(file_elements, start=1): + file_url = url_or_none(file_element.text) + if not file_url: + continue + key = file_element.get('key', '') + upload_date = self._search_regex( + r'^(\d{8})_', key, 'upload date', default=None) + file_duration = int_or_none(file_element.get('duration')) + format_id = key if key else '%s_%s' % (video_id, file_num) + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', + note='Downloading part %d m3u8 information' % file_num) + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + if not formats: + continue + self._sort_formats(formats) + file_info = common_entry.copy() + file_info.update({ + 'id': format_id, + 'title': title if one else '%s (part %d)' % (title, file_num), + 'upload_date': upload_date, + 'duration': file_duration, + 'formats': formats, + }) + entries.append(file_info) + entries_info = info.copy() + entries_info.update({ + '_type': 'multi_video', + 'entries': entries, + }) + return entries_info + + info = { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'thumbnail': thumbnail, + } + + if determine_ext(video_url) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + app, playpath = video_url.split('mp4:') + info.update({ + 'url': app, + 'ext': 'flv', + 'play_path': 'mp4:' + playpath, + 'rtmp_live': True, # downloading won't end without this + }) + + return info diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py new file mode 100644 index 0000000000..9e38136b4b --- /dev/null +++ b/youtube_dl/extractor/airmozilla.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class AirMozillaIE(InfoExtractor): + _VALID_URL = r'https?://air\.mozilla\.org/(?P[0-9a-z-]+)/?' + _TEST = { + 'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/', + 'md5': '8d02f53ee39cf006009180e21df1f3ba', + 'info_dict': { + 'id': '6x4q2w', + 'ext': 'mp4', + 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', + 'thumbnail': r're:https?://.*/poster\.jpg', + 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', + 'timestamp': 1422487800, + 'upload_date': '20150128', + 'location': 'SFO Commons', + 'duration': 3780, + 'view_count': int, + 'categories': ['Main', 'Privacy'], + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id') + + embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id) + jwconfig = self._parse_json(self._search_regex( + r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config'] + + info_dict = self._parse_jwplayer_data(jwconfig, video_id) + view_count = int_or_none(self._html_search_regex( + r'Views since archived: ([0-9]+)', + webpage, 'view count', fatal=False)) + timestamp = parse_iso8601(self._html_search_regex( + r'', webpage), + }) + + return info_dict diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 0000000000..6f241e6837 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, +) + + +class AliExpressLiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P\d+)' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'mp4', + 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', + 'uploader': 'CASIMA Official Store', + 'timestamp': 1500717600, + 'upload_date': '20170722', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py new file mode 100644 index 0000000000..c68be31340 --- /dev/null +++ b/youtube_dl/extractor/aljazeera.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class AlJazeeraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P[^/]+)\.html' + + _TESTS = [{ + 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'info_dict': { + 'id': '3792260579001', + 'ext': 'mp4', + 'title': 'The Slum - Episode 1: Deliverance', + 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', + }, + 'add_ie': ['BrightcoveNew'], + 'skip': 'Not accessible from Travis CI server', + }, { + 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + program_name = self._match_id(url) + webpage = self._download_webpage(url, program_name) + brightcove_id = self._search_regex( + r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py new file mode 100644 index 0000000000..cd533acfc7 --- /dev/null +++ b/youtube_dl/extractor/allocine.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + qualities, + remove_end, + try_get, + unified_timestamp, + url_basename, +) + + +class AllocineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?:article|video|film)/(?:fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P[0-9]+)(?:\.html)?' + + _TESTS = [{ + 'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', + 'md5': '0c9fcf59a841f65635fa300ac43d8269', + 'info_dict': { + 'id': '19546517', + 'display_id': '18635087', + 'ext': 'mp4', + 'title': 'Astérix - Le Domaine des Dieux Teaser VF', + 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 39, + 'timestamp': 1404273600, + 'upload_date': '20140702', + 'view_count': int, + }, + }, { + 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', + 'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', + 'info_dict': { + 'id': '19540403', + 'display_id': '19540403', + 'ext': 'mp4', + 'title': 'Planes 2 Bande-annonce VF', + 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 69, + 'timestamp': 1385659800, + 'upload_date': '20131128', + 'view_count': int, + }, + }, { + 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html', + 'md5': '101250fb127ef9ca3d73186ff22a47ce', + 'info_dict': { + 'id': '19544709', + 'display_id': '19544709', + 'ext': 'mp4', + 'title': 'Dragons 2 - Bande annonce finale VF', + 'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 144, + 'timestamp': 1397589900, + 'upload_date': '20140415', + 'view_count': int, + }, + }, { + 'url': 'http://www.allocine.fr/video/video-19550147/', + 'md5': '3566c0668c0235e2d224fd8edb389f67', + 'info_dict': { + 'id': '19550147', + 'ext': 'mp4', + 'title': 'Faux Raccord N°123 - Les gaffes de Cliffhanger', + 'description': 'md5:bc734b83ffa2d8a12188d9eb48bb6354', + 'thumbnail': r're:http://.*\.jpg', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + formats = [] + quality = qualities(['ld', 'md', 'hd']) + + model = self._html_search_regex( + r'data-model="([^"]+)"', webpage, 'data model', default=None) + if model: + model_data = self._parse_json(model, display_id) + video = model_data['videos'][0] + title = video['title'] + for video_url in video['sources'].values(): + video_id, format_id = url_basename(video_url).split('_')[:2] + formats.append({ + 'format_id': format_id, + 'quality': quality(format_id), + 'url': video_url, + }) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + timestamp = unified_timestamp(try_get( + video, lambda x: x['added_at']['date'], compat_str)) + else: + video_id = display_id + media_data = self._download_json( + 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) + title = remove_end( + self._html_search_regex( + r'(?s)(.+?)', webpage, 'title').strip(), + ' - AlloCiné') + for key, value in media_data['video'].items(): + if not key.endswith('Path'): + continue + format_id = key[:-len('Path')] + formats.append({ + 'format_id': format_id, + 'quality': quality(format_id), + 'url': value, + }) + duration, view_count, timestamp = [None] * 3 + + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py new file mode 100644 index 0000000000..3a6d99f6bf --- /dev/null +++ b/youtube_dl/extractor/alphaporno.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + parse_duration, + parse_filesize, + int_or_none, +) + + +class AlphaPornoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P[^/]+)' + _TEST = { + 'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/', + 'md5': 'feb6d3bba8848cd54467a87ad34bd38e', + 'info_dict': { + 'id': '258807', + 'display_id': 'sensual-striptease-porn-with-samantha-alexandra', + 'ext': 'mp4', + 'title': 'Sensual striptease porn with Samantha Alexandra', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1418694611, + 'upload_date': '20141216', + 'duration': 387, + 'filesize_approx': 54120000, + 'tbr': 1145, + 'categories': list, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None) + + video_url = self._search_regex( + r"video_url\s*:\s*'([^']+)'", webpage, 'video url') + ext = self._html_search_meta( + 'encodingFormat', webpage, 'ext', default='.mp4')[1:] + + title = self._search_regex( + [r'', + r'class="title" itemprop="name">([^<]+)<'], + webpage, 'title') + thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail') + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date')) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration')) + filesize_approx = parse_filesize(self._html_search_meta( + 'contentSize', webpage, 'file size')) + bitrate = int_or_none(self._html_search_meta( + 'bitrate', webpage, 'bitrate')) + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'filesize_approx': filesize_approx, + 'tbr': bitrate, + 'categories': categories, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py new file mode 100644 index 0000000000..6fb3d6c53f --- /dev/null +++ b/youtube_dl/extractor/amcnetworks.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + int_or_none, + parse_age_limit, + try_get, + update_url_query, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', + 'md5': '', + 'info_dict': { + 'id': 's3MX01Nl4vPH', + 'ext': 'mp4', + 'title': 'Maron - Season 4 - Step 1', + 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', + 'age_limit': 17, + 'upload_date': '20160505', + 'timestamp': 1462468831, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires TV provider accounts', + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }, { + 'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', + 'only_matching': True, + }, { + 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + media_url = self._search_regex( + r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', + webpage, 'media url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'link\.theplatform\.com/s/([^?]+)', + media_url, 'theplatform_path'), display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = try_get( + theplatform_metadata, lambda x: x['ratings'][0]['rating']) + auth_required = self._search_regex( + r'window\.authRequired\s*=\s*(true|false);', + webpage, 'auth required') + if auth_required == 'true': + requestor_id = self._search_regex( + r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', + webpage, 'requestor id') + resource = self._get_mvpd_resource( + requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil( + media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'age_limit': parse_age_limit(parse_age_limit(rating)), + }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + series = theplatform_metadata.get(ns + '$show') + season_number = int_or_none( + theplatform_metadata.get(ns + '$season')) + episode = theplatform_metadata.get(ns + '$episodeTitle') + episode_number = int_or_none( + theplatform_metadata.get(ns + '$episode')) + if season_number: + title = 'Season %d - %s' % (season_number, title) + if series: + title = '%s - %s' % (series, title) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + return info diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100644 index 0000000000..9c9d77ae10 --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + js_to_json, + try_get, + unified_strdate, +) + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '5b400b9ee338f922cb06450c', + 'title': 'Weeknight Japanese Suppers', + 'ext': 'mp4', + 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'thumbnail': r're:^https?://', + 'timestamp': 1523664000, + 'upload_date': '20180414', + 'release_date': '20180414', + 'series': "America's Test Kitchen", + 'season_number': 18, + 'episode': 'Weeknight Japanese Suppers', + 'episode_number': 15, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', + webpage, 'initial context'), + video_id, js_to_json) + + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + + zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] + + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) + + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) + + return { + '_type': 'url_transparent', + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'ie_key': 'Zype', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + } diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 0000000000..7ff098cfa0 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_iso8601, + url_or_none, +) + + +class AMPIE(InfoExtractor): + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + feed = self._download_json( + url, None, 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed') + item = feed.get('channel', {}).get('item') + if not item: + raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) + + video_id = item['guid'] + + def get_media_node(name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + thumbnails = [] + media_thumbnail = get_media_node('thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data.get('@attributes', {}) + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail_url, 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = get_media_node('subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data.get('@attributes', {}) + subtitle_href = url_or_none(subtitle.get('href')) + if not subtitle_href: + continue + subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ + 'url': subtitle_href, + 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href), + }) + + formats = [] + media_content = get_media_node('content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: + media = media_data.get('@attributes', {}) + media_url = url_or_none(media.get('url')) + if not media_url: + continue + ext = mimetype2ext(media.get('type')) or determine_ext(media_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), + 'url': media_url, + 'tbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + 'ext': ext, + }) + + self._sort_formats(formats) + + timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + + return { + 'id': video_id, + 'title': get_media_node('title'), + 'description': get_media_node('description'), + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py new file mode 100644 index 0000000000..00ce684d1c --- /dev/null +++ b/youtube_dl/extractor/animeondemand.py @@ -0,0 +1,293 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class AnimeOnDemandIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P\d+)' + _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' + _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' + _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] + _TESTS = [{ + # jap, OmU + 'url': 'https://www.anime-on-demand.de/anime/161', + 'info_dict': { + 'id': '161', + 'title': 'Grimgar, Ashes and Illusions (OmU)', + 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', + }, + 'playlist_mincount': 4, + }, { + # Film wording is used instead of Episode, ger/jap, Dub/OmU + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }, { + # Episodes without titles, jap, OmU + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, + }, { + # Full length film, non-series, ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/185', + 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is only available in German-speaking countries of Europe' % self.IE_NAME) + + login_form = self._form_hidden_inputs('new_user', login_page) + + login_form.update({ + 'user[login]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + }) + + if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): + error = self._search_regex( + r']+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P.+?)

    ', + response, 'error', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + anime_id = self._match_id(url) + + webpage = self._download_webpage(url, anime_id) + + if 'data-playlist=' not in webpage: + self._download_webpage( + self._APPLY_HTML5_URL, anime_id, + 'Activating HTML5 beta', 'Unable to apply HTML5 beta') + webpage = self._download_webpage(url, anime_id) + + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', fatal=True) + + anime_title = self._html_search_regex( + r'(?s)]+itemprop="name"[^>]*>(.+?)', + webpage, 'anime name') + anime_description = self._html_search_regex( + r'(?s)]+itemprop="description"[^>]*>(.+?)', + webpage, 'anime description', default=None) + + entries = [] + + def extract_info(html, video_id, num=None): + title, description = [None] * 2 + formats = [] + + for input_ in re.findall( + r']+class=["\'].*?streamstarter[^>]+>', html): + attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue + + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list and num is not None: + format_id_list.append(compat_str(num)) + format_id = '-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }, fatal=False) + if not playlist: + continue + stream_url = url_or_none(playlist.get('streamurl')) + if stream_url: + rtmp = re.search( + r'^(?Prtmpe?://(?P[^/]+)/(?P.+/))(?Pmp[34]:.+)', + stream_url) + if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue + start_video = playlist.get('startvideo', 0) + playlist = playlist.get('playlist') + if not playlist or not isinstance(playlist, list): + continue + playlist = playlist[start_video] + title = playlist.get('title') + if not title: + continue + description = playlist.get('description') + for source in playlist.get('sources', []): + file_ = source.get('file') + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( + file_, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + continue + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) + + return { + 'title': title, + 'description': description, + 'formats': formats, + } + + def extract_entries(html, video_id, common_info, num=None): + info = extract_info(html, video_id, num) + + if info['formats']: + self._sort_formats(info['formats']) + f = common_info.copy() + f.update(info) + entries.append(f) + + # Extract teaser/trailer only when full episode is not available + if not info['formats']: + m = re.search( + r'data-dialog-header=(["\'])(?P.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', + html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-%s' % (f['id'], m.group('kind').lower()), + 'title': m.group('title'), + 'url': urljoin(url, m.group('href')), + }) + entries.append(f) + + def extract_episodes(html): + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: + continue + + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + + video_id = 'episode-%d' % episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + extract_entries(episode_html, video_id, common_info) + + def extract_film(html, video_id): + common_info = { + 'id': anime_id, + 'title': anime_title, + 'description': anime_description, + } + extract_entries(html, video_id, common_info) + + extract_episodes(webpage) + + if not entries: + extract_film(webpage, anime_id) + + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py new file mode 100644 index 0000000000..84e841035a --- /dev/null +++ b/youtube_dl/extractor/anvato.py @@ -0,0 +1,314 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import hashlib +import json +import random +import re +import time + +from .common import InfoExtractor +from ..aes import aes_encrypt +from ..compat import compat_str +from ..utils import ( + bytes_to_intlist, + determine_ext, + intlist_to_bytes, + int_or_none, + strip_jsonp, + unescapeHTML, + unsmuggle_url, +) + + +def md5_text(s): + if not isinstance(s, compat_str): + s = compat_str(s) + return hashlib.md5(s.encode('utf-8')).hexdigest() + + +class AnvatoIE(InfoExtractor): + _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + + # Copied from anvplayer.min.js + _ANVACK_TABLE = { + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA', + 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP', + 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv', + 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7', + 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR', + 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg', + 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto', + 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY', + 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh', + 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK', + 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D', + 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad', + 'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp', + 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih', + 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR', + 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW', + 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su', + 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q', + 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5', + 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3', + 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI', + 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s', + 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz', + 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg', + 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x', + 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH', + 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX', + 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc', + 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK', + 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7', + 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C', + 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e', + 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1', + 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re', + 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51', + 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho', + 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9', + 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH', + 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F', + 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo', + 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR', + 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa', + 'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk', + 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ', + 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ', + 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m', + 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b', + 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3', + 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK', + 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', + 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F', + 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx', + 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ', + 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH', + 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm', + 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt', + 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl', + 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b', + 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV', + 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg', + 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk', + 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT', + 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa', + 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv', + 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k', + 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI', + 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr', + 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw', + 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K', + 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH', + 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK', + 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu', + 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', + 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK', + 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n', + 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD', + 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk', + 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', + 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', + 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + } + + _MCP_TO_ACCESS_KEY_TABLE = { + 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', + 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', + 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', + 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', + 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', + 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', + 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', + 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' + } + + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' + + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + + _TESTS = [{ + # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 + 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', + 'info_dict': { + 'id': '4465496', + 'ext': 'mp4', + 'title': 'VIDEO: Humpback whale breaches right next to NH boat', + 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.', + 'duration': 22, + 'timestamp': 1534855680, + 'upload_date': '20180821', + 'uploader': 'ANV', + }, + 'params': { + 'skip_download': True, + }, + }, { + # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ + 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', + 'only_matching': True, + }] + + def __init__(self, *args, **kwargs): + super(AnvatoIE, self).__init__(*args, **kwargs) + self.__server_time = None + + def _server_time(self, access_key, video_id): + if self.__server_time is not None: + return self.__server_time + + self.__server_time = int(self._download_json( + self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, + note='Fetching server time')['server_time']) + + return self.__server_time + + def _api_prefix(self, access_key): + return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') + + def _get_video_json(self, access_key, video_id): + # See et() in anvplayer.min.js, which is an alias of getVideoJSON() + video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + server_time = self._server_time(access_key, video_id) + input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + + auth_secret = intlist_to_bytes(aes_encrypt( + bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + + video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + anvrid = md5_text(time.time() * 1000 * random.random())[:30] + payload = { + 'api': { + 'anvrid': anvrid, + 'anvstk': md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))), + 'anvts': server_time, + }, + } + + return self._download_json( + video_data_url, video_id, transform_source=strip_jsonp, + data=json.dumps(payload).encode('utf-8')) + + def _get_anvato_videos(self, access_key, video_id): + video_data = self._get_video_json(access_key, video_id) + + formats = [] + for published_url in video_data['published_urls']: + video_url = published_url['embed_url'] + media_format = published_url.get('format') + ext = determine_ext(video_url) + + if ext == 'smil' or media_format == 'smil': + formats.extend(self._extract_smil_formats(video_url, video_id)) + continue + + tbr = int_or_none(published_url.get('kbps')) + a_format = { + 'url': video_url, + 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), + 'tbr': tbr if tbr != 0 else None, + } + + if media_format == 'm3u8' and tbr is not None: + a_format.update({ + 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'ext': 'mp4', + }) + elif media_format == 'm3u8-variant' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + elif ext == 'mp3' or media_format == 'mp3': + a_format['vcodec'] = 'none' + else: + a_format.update({ + 'width': int_or_none(published_url.get('width')), + 'height': int_or_none(published_url.get('height')), + }) + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + a_caption = { + 'url': caption['url'], + 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None + } + subtitles.setdefault(caption['language'], []).append(a_caption) + + return { + 'id': video_id, + 'formats': formats, + 'title': video_data.get('def_title'), + 'description': video_data.get('def_description'), + 'tags': video_data.get('def_tags', '').split(','), + 'categories': video_data.get('categories'), + 'thumbnail': video_data.get('thumbnail'), + 'timestamp': int_or_none(video_data.get( + 'ts_published') or video_data.get('ts_added')), + 'uploader': video_data.get('mcp_id'), + 'duration': int_or_none(video_data.get('duration')), + 'subtitles': subtitles, + } + + @staticmethod + def _extract_urls(ie, webpage, video_id): + entries = [] + for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) + return entries + + def _extract_anvato_videos(self, webpage, video_id): + anvplayer_data = self._parse_json( + self._html_search_regex( + self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), + video_id) + return self._get_anvato_videos( + anvplayer_data['accessKey'], anvplayer_data['video']) + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + + mobj = re.match(self._VALID_URL, url) + access_key, video_id = mobj.group('access_key_or_mcp', 'id') + if access_key not in self._ANVACK_TABLE: + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( + access_key) or access_key + return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py new file mode 100644 index 0000000000..e87994a6aa --- /dev/null +++ b/youtube_dl/extractor/aol.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class AolIE(InfoExtractor): + IE_NAME = 'aol.com' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)' + + _TESTS = [{ + # video with 5min ID + 'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/', + 'md5': '18ef68f48740e86ae94b98da815eec42', + 'info_dict': { + 'id': '518167793', + 'ext': 'mp4', + 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # video with vidible ID + 'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', + 'info_dict': { + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/', + 'only_matching': True, + }, { + 'url': 'aol-video:5707d6b8e4b090497b04f706', + 'only_matching': True, + }, { + 'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) + + video_data = response['data'] + formats = [] + m3u8_url = url_or_none(video_data.get('videoMasterPlaylist')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = url_or_none(rendition.get('url')) + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + else: + qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query) + f.update({ + 'width': int_or_none(qs.get('w', [None])[0]), + 'height': int_or_none(qs.get('h', [None])[0]), + }) + formats.append(f) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py new file mode 100644 index 0000000000..98ccdaa4a9 --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, + url_or_none, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = url_or_none(source.get('file')) + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py new file mode 100644 index 0000000000..883dcee7aa --- /dev/null +++ b/youtube_dl/extractor/aparat.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + merge_dicts, + mimetype2ext, + url_or_none, +) + + +class AparatIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://www.aparat.com/v/wP8On', + 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', + 'info_dict': { + 'id': 'wP8On', + 'ext': 'mp4', + 'title': 'تیم گلکسی 11 - زومیت', + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, + }, + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) + + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) + + options = self._parse_json( + self._search_regex( + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), + video_id) + + player = options['plugins']['sabaPlayerPlugin'] + + formats = [] + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { + 'id': video_id, + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), + 'formats': formats, + }) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py new file mode 100644 index 0000000000..a84b8b1eba --- /dev/null +++ b/youtube_dl/extractor/appleconnect.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + ExtractorError +) + + +class AppleConnectIE(InfoExtractor): + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' + _TEST = { + 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'md5': 'e7c38568a01ea45402570e6029206723', + 'info_dict': { + 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'ext': 'm4v', + 'title': 'Energy', + 'uploader': 'Drake', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20150710', + 'timestamp': 1436545535, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + try: + video_json = self._html_search_regex( + r'class="auc-video-data">(\{.*?\})', webpage, 'json') + except ExtractorError: + raise ExtractorError('This post doesn\'t contain a video', expected=True) + + video_data = self._parse_json(video_json, video_id) + timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + + return { + 'id': video_id, + 'url': video_data['sslSrc'], + 'title': video_data['title'], + 'description': video_data['description'], + 'uploader': video_data['artistName'], + 'thumbnail': video_data['artworkUrl'], + 'timestamp': timestamp, + 'like_count': like_count, + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 0000000000..b5ed2b88b3 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,283 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' + _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TESTS = [{ + 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', + 'info_dict': { + 'id': '5111', + 'title': 'Man of Steel', + }, + 'playlist': [ + { + 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', + 'info_dict': { + 'id': 'manofsteel-trailer4', + 'ext': 'mov', + 'duration': 111, + 'title': 'Trailer 4', + 'upload_date': '20130523', + 'uploader_id': 'wb', + }, + }, + { + 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', + 'info_dict': { + 'id': 'manofsteel-trailer3', + 'ext': 'mov', + 'duration': 182, + 'title': 'Trailer 3', + 'upload_date': '20130417', + 'uploader_id': 'wb', + }, + }, + { + 'md5': 'd0f1e1150989b9924679b441f3404d48', + 'info_dict': { + 'id': 'manofsteel-trailer', + 'ext': 'mov', + 'duration': 148, + 'title': 'Trailer', + 'upload_date': '20121212', + 'uploader_id': 'wb', + }, + }, + { + 'md5': '5fe08795b943eb2e757fa95cb6def1cb', + 'info_dict': { + 'id': 'manofsteel-teaser', + 'ext': 'mov', + 'duration': 93, + 'title': 'Teaser', + 'upload_date': '20120721', + 'uploader_id': 'wb', + }, + }, + ] + }, { + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': '4489', + 'title': 'Blackthorn', + }, + 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }, { + 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/', + 'only_matching': True, + }] + + _JSON_RE = r'iTunes.playURL\((.*?)\);' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': version[:2], + }) + self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('thumb'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') + + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s) + s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + + def _clean_json(m): + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + s = re.sub(self._JSON_RE, _clean_json, s) + s = '<html>%s</html>' % s + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) + + playlist = [] + for li in doc.findall('./div/ul/li'): + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, 'trailer info') + trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue + title = trailer_info['title'] + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') + + runtime = trailer_info['runtime'] + m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') + + formats = [] + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'format': format['type'], + 'width': int_or_none(format['width']), + 'height': int_or_none(format['height']), + }) + + self._sort_formats(formats) + + playlist.append({ + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'http_headers': { + 'User-Agent': 'QuickTime compatible (youtube-dlc)', + }, + }) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 30, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py new file mode 100644 index 0000000000..c79c58e828 --- /dev/null +++ b/youtube_dl/extractor/archiveorg.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + clean_html, +) + + +class ArchiveOrgIE(InfoExtractor): + IE_NAME = 'archive.org' + IE_DESC = 'archive.org videos' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' + _TESTS = [{ + 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'md5': '8af1d4cf447933ed3c7f4871162602db', + 'info_dict': { + 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'ext': 'ogg', + 'title': '1968 Demo - FJCC Conference Presentation Reel #1', + 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', + 'upload_date': '19681210', + 'uploader': 'SRI International' + } + }, { + 'url': 'https://archive.org/details/Cops1922', + 'md5': '0869000b4ce265e8ca62738b336b268a', + 'info_dict': { + 'id': 'Cops1922', + 'ext': 'mp4', + 'title': 'Buster Keaton\'s "Cops" (1922)', + 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + } + }, { + 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://archive.org/embed/' + video_id, video_id) + jwplayer_playlist = self._parse_json(self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist'), video_id) + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + + def get_optional(metadata, field): + return metadata.get(field, [None])[0] + + metadata = self._download_json( + 'http://archive.org/details/' + video_id, video_id, query={ + 'output': 'json', + })['metadata'] + info.update({ + 'title': get_optional(metadata, 'title') or info.get('title'), + 'description': clean_html(get_optional(metadata, 'description')), + }) + if info.get('_type') != 'playlist': + info.update({ + 'uploader': get_optional(metadata, 'creator'), + 'upload_date': unified_strdate(get_optional(metadata, 'date')), + }) + return info diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py new file mode 100644 index 0000000000..5b7b2dd6d2 --- /dev/null +++ b/youtube_dl/extractor/ard.py @@ -0,0 +1,422 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .generic import GenericIE +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + qualities, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, + update_url_query, + url_or_none, + xpath_text, +) +from ..compat import compat_etree_fromstring + + +class ARDMediathekBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) + + def _parse_media_info(self, media_info, video_id, fsk): + formats = self._extract_formats(media_info, video_id) + + if not formats: + if fsk: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + self.raise_geo_restricted( + 'This video is not available due to geoblocking', + countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'ttml', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': int_or_none(media_info.get('_duration')), + 'thumbnail': media_info.get('_previewImage'), + 'is_live': media_info.get('_isLive') is True, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + if not url_or_none(stream_url): + continue + ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + else: + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + m = re.search( + r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', + stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + + +class ARDMediathekIE(ARDMediathekBaseIE): + IE_NAME = 'ARD:mediathek' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + + _TESTS = [{ + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'info_dict': { + 'id': '44726822', + 'ext': 'mp4', + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'only_matching': True, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + + def _real_extract(self, url): + # determine video id from url + m = re.match(self._VALID_URL, url) + + document_id = None + + numid = re.search(r'documentId=([0-9]+)', url) + if numid: + document_id = video_id = numid.group(1) + else: + video_id = m.group('video_id') + + webpage = self._download_webpage(url, video_id) + + ERRORS = ( + ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'), + ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<', + 'Video %s is no longer available'), + ) + + for pattern, message in ERRORS: + if pattern in webpage: + raise ExtractorError(message % video_id, expected=True) + + if re.search(r'[\?&]rss($|[=&])', url): + doc = compat_etree_fromstring(webpage.encode('utf-8')) + if doc.tag == 'rss': + return GenericIE()._extract_rss(url, video_id, doc) + + title = self._html_search_regex( + [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', + r'<meta name="dcterms\.title" content="(.*?)"/>', + r'<h4 class="headline">(.*?)</h4>', + r'<title[^>]*>(.*?)'], + webpage, 'title') + description = self._html_search_meta( + 'dcterms.abstract', webpage, 'description', default=None) + if description is None: + description = self._html_search_meta( + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

    ', + webpage, 'teaser text', default=None) + + # Thumbnail is sometimes not present. + # It is in the mobile version, but that seems to use a different URL + # structure altogether. + thumbnail = self._og_search_thumbnail(webpage, default=None) + + media_streams = re.findall(r'''(?x) + mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* + "([^"]+)"''', webpage) + + if media_streams: + QUALITIES = qualities(['lo', 'hi', 'hq']) + formats = [] + for furl in set(media_streams): + if furl.endswith('.f4m'): + fid = 'f4m' + else: + fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) + fid = fid_m.group(1) if fid_m else None + formats.append({ + 'quality': QUALITIES(fid), + 'format_id': fid, + 'url': furl, + }) + self._sort_formats(formats) + info = { + 'formats': formats, + } + else: # request JSON file + if not document_id: + video_id = self._search_regex( + r'/play/(?:config|media)/(\d+)', webpage, 'media id') + info = self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, + webpage, video_id) + + info.update({ + 'id': video_id, + 'title': self._live_title(title) if info.get('is_live') else title, + 'description': description, + 'thumbnail': thumbnail, + }) + + return info + + +class ARDIE(InfoExtractor): + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _TESTS = [{ + # available till 14.02.2019 + 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', + 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + 'info_dict': { + 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', + 'id': '102', + 'ext': 'mp4', + 'duration': 4435.0, + 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', + 'upload_date': '20180214', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', + 'only_matching': True, + }, { + 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + player_url = mobj.group('mainurl') + '~playerXml.xml' + doc = self._download_xml(player_url, display_id) + video_node = doc.find('./video') + upload_date = unified_strdate(xpath_text( + video_node, './broadcastDate')) + thumbnail = xpath_text(video_node, './/teaserImage//variant/url') + + formats = [] + for a in video_node.findall('.//asset'): + f = { + 'format_id': a.attrib['type'], + 'width': int_or_none(a.find('./frameWidth').text), + 'height': int_or_none(a.find('./frameHeight').text), + 'vbr': int_or_none(a.find('./bitrateVideo').text), + 'abr': int_or_none(a.find('./bitrateAudio').text), + 'vcodec': a.find('./codecVideo').text, + 'tbr': int_or_none(a.find('./totalBitrate').text), + } + if a.find('./serverPrefix').text: + f['url'] = a.find('./serverPrefix').text + f['playpath'] = a.find('./fileName').text + else: + f['url'] = a.find('./fileName').text + formats.append(f) + self._sort_formats(formats) + + return { + 'id': mobj.group('id'), + 'formats': formats, + 'display_id': display_id, + 'title': video_node.find('./title').text, + 'duration': parse_duration(video_node.find('./duration').text), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + } + + +class ARDBetaMediathekIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'info_dict': { + 'display_id': 'die-robuste-roswita', + 'id': '70153354', + 'title': 'Die robuste Roswita', + 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'duration': 5316, + 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', + 'timestamp': 1577047500, + 'upload_date': '20191222', + 'ext': 'mp4', + }, + }, { + 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'only_matching': True, + }, { + 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + display_id = mobj.group('display_id') + if display_id: + display_id = display_id.rstrip('/') + if not display_id: + display_id = video_id + + player_page = self._download_json( + 'https://api.ardmediathek.de/public-gateway', + display_id, data=json.dumps({ + 'query': '''{ + playerPage(client:"%s", clipId: "%s") { + blockedByFsk + broadcastedOn + maturityContentRating + mediaCollection { + _duration + _geoblocked + _isLive + _mediaArray { + _mediaStreamArray { + _quality + _server + _stream + } + } + _previewImage + _subtitleUrl + _type + } + show { + title + } + synopsis + title + tracking { + atiCustomVars { + contentId + } + } + } +}''' % (mobj.group('client'), video_id), + }).encode(), headers={ + 'Content-Type': 'application/json' + })['data']['playerPage'] + title = player_page['title'] + content_id = str_or_none(try_get( + player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) + media_collection = player_page.get('mediaCollection') or {} + if not media_collection and content_id: + media_collection = self._download_json( + 'https://www.ardmediathek.de/play/media/' + content_id, + content_id, fatal=False) or {} + info = self._parse_media_info( + media_collection, content_id or video_id, + player_page.get('blockedByFsk')) + age_limit = None + description = player_page.get('synopsis') + maturity_content_rating = player_page.get('maturityContentRating') + if maturity_content_rating: + age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) + if not age_limit and description: + age_limit = int_or_none(self._search_regex( + r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + info.update({ + 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), + 'series': try_get(player_page, lambda x: x['show']['title']), + }) + return info diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 0000000000..854f587675 --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + mimetype2ext, + parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + video\.arkena\.com/play2/embed/player\?| + play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P[^/]+)/[^/]+/(?P\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }, { + 'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + # Handle http://video.arkena.com/play2/embed/player URL + if not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('mediaId', [None])[0] + account_id = qs.get('accountId', [None])[0] + if not video_id or not account_id: + raise ExtractorError('Invalid URL', expected=True) + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: + f_url = f.get('Url') + if not f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/ytdl-org/youtube-dl/issues/8118) + continue + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py new file mode 100644 index 0000000000..2bd3bfe8a8 --- /dev/null +++ b/youtube_dl/extractor/arte.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + try_get, + unified_strdate, +) + +# There are different sources of video in arte.tv, the extraction process +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. + + +class ArteTVBaseIE(InfoExtractor): + def _extract_from_json_url(self, json_url, video_id, lang, title=None): + info = self._download_json(json_url, video_id) + player_info = info['videoJsonPlayer'] + + vsr = try_get(player_info, lambda x: x['VSR'], dict) + if not vsr: + error = None + if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': + error = try_get( + player_info, lambda x: x['custom_msg']['msg'], compat_str) + if not error: + error = 'Video %s is not available' % player_info.get('VID') or video_id + raise ExtractorError(error, expected=True) + + upload_date_str = player_info.get('shootingDate') + if not upload_date_str: + upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] + + title = (player_info.get('VTI') or title or player_info['VID']).strip() + subtitle = player_info.get('VSU', '').strip() + if subtitle: + title += ' - %s' % subtitle + + info_dict = { + 'id': player_info['VID'], + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + } + qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) + + LANGS = { + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + 'it': 'E[ITA]', + 'pl': 'E[POL]', + } + + langcode = LANGS.get(lang, lang) + + formats = [] + for format_id, format_dict in vsr.items(): + f = dict(format_dict) + versionCode = f.get('versionCode') + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 6.8 of + # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + + format = { + 'format_id': format_id, + 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, + 'language_preference': lang_pref, + 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'tbr': int_or_none(f.get('bitrate')), + 'quality': qfunc(f.get('quality')), + } + + if f.get('mediaType') == 'rtmp': + format['url'] = f['streamer'] + format['play_path'] = 'mp4:' + f['url'] + format['ext'] = 'flv' + else: + format['url'] = f['url'] + + formats.append(format) + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + info_dict['formats'] = formats + return info_dict + + +class ArteTVPlus7IE(ArteTVBaseIE): + IE_NAME = 'arte.tv:+7' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?P\d{6}-\d{3}-[AF])' + + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }] + + def _real_extract(self, url): + lang, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_from_json_url( + 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), + video_id, lang) + + +class ArteTVEmbedIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:embed' + _VALID_URL = r'''(?x) + https://www\.arte\.tv + /player/v3/index\.php\?json_url= + (?P + https?://api\.arte\.tv/api/player/v1/config/ + (?P[^/]+)/(?P\d{6}-\d{3}-[AF]) + ) + ''' + + _TESTS = [] + + def _real_extract(self, url): + json_url, lang, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVPlaylistIE(ArteTVBaseIE): + IE_NAME = 'arte.tv:playlist' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?PRC-\d{6})' + + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', + 'info_dict': { + 'id': 'RC-016954', + 'title': 'Earn a Living', + 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', + }, + 'playlist_mincount': 6, + }] + + def _real_extract(self, url): + lang, playlist_id = re.match(self._VALID_URL, url).groups() + collection = self._download_json( + 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' + % (lang, playlist_id), playlist_id) + title = collection.get('title') + description = collection.get('shortDescription') or collection.get('teaserText') + entries = [ + self._extract_from_json_url( + video['jsonUrl'], video.get('programId') or playlist_id, lang) + for video in collection['videos'] if video.get('jsonUrl')] + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py new file mode 100644 index 0000000000..0348e680c6 --- /dev/null +++ b/youtube_dl/extractor/asiancrush.py @@ -0,0 +1,145 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import extract_attributes + + +class AsianCrushIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P\d+)v\b' % _VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'md5': 'c3b740e48d0ba002a42c0b72857beae6', + 'info_dict': { + 'id': '1_y4tmjm5r', + 'ext': 'mp4', + 'title': 'Women Who Flirt', + 'description': 'md5:7e986615808bcfb11756eb503a751487', + 'timestamp': 1496936429, + 'upload_date': '20170608', + 'uploader_id': 'craig@crifkin.com', + }, + }, { + 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', + 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/', + 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/010400v/drifters/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + entry_id, partner_id, title = [None] * 3 + + vars = self._parse_json( + self._search_regex( + r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', + default='{}'), video_id, fatal=False) + if vars: + entry_id = vars.get('entry_id') + partner_id = vars.get('partner_id') + title = vars.get('vid_label') + + if not entry_id: + entry_id = self._search_regex( + r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') + + player = self._download_webpage( + 'https://api.%s/embeddedVideoPlayer' % host, video_id, + query={'id': entry_id}) + + kaltura_id = self._search_regex( + r'entry_id["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', player, + 'kaltura id', group='id') + + if not partner_id: + partner_id = self._search_regex( + r'/p(?:artner_id)?/(\d+)', player, 'partner id', + default='513551') + + description = self._html_search_regex( + r'(?s)]+\bclass=["\']description["\'][^>]*>(.+?)', + webpage, 'description', fatal=False) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': KalturaIE.ie_key(), + 'id': video_id, + 'title': title, + 'description': description, + } + + +class AsianCrushPlaylistIE(InfoExtractor): + _VALID_URL = r'%s/series/0+(?P\d+)s\b' % AsianCrushIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'info_dict': { + 'id': '12481', + 'title': 'Scholar Who Walks the Night', + 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + }, + 'playlist_count': 20, + }, { + 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/series/016375s/mononoke/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [] + + for mobj in re.finditer( + r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) + + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py new file mode 100644 index 0000000000..c2cec98452 --- /dev/null +++ b/youtube_dl/extractor/atresplayer.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata, +) + + +class AtresPlayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' + _NETRC_MACHINE = 'atresplayer' + _TESTS = [ + { + 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', + 'info_dict': { + 'id': '5d4aa2c57ed1a88fc715a615', + 'ext': 'mp4', + 'title': 'Capítulo 7: Asuntos pendientes', + 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', + 'duration': 3413, + }, + 'params': { + 'format': 'bestvideo', + }, + 'skip': 'This video is only available for registered users' + }, + { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, + { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }, + ] + _API_BASE = 'https://api.atresplayer.com/' + + def _real_initialize(self): + self._login() + + def _handle_error(self, e, code): + if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: + error = self._parse_json(e.cause.read(), None) + if error.get('error') == 'required_registered': + self.raise_login_required() + raise ExtractorError(error['error_description'], expected=True) + raise + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + self._request_webpage( + self._API_BASE + 'login', None, 'Downloading login page') + + try: + target_url = self._download_json( + 'https://account.atresmedia.com/api/login', None, + 'Logging in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=urlencode_postdata({ + 'username': username, + 'password': password, + }))['targetUrl'] + except ExtractorError as e: + self._handle_error(e, 400) + + self._request_webpage(target_url, None, 'Following Target URL') + + def _real_extract(self, url): + display_id, video_id = re.match(self._VALID_URL, url).groups() + + try: + episode = self._download_json( + self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + except ExtractorError as e: + self._handle_error(e, 403) + + title = episode['titulo'] + + formats = [] + for source in episode.get('sources', []): + src = source.get('src') + if not src: + continue + src_type = source.get('type') + if src_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif src_type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + heartbeat = episode.get('heartbeat') or {} + omniture = episode.get('omniture') or {} + get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + + return { + 'display_id': display_id, + 'id': video_id, + 'title': title, + 'description': episode.get('descripcion'), + 'thumbnail': episode.get('imgPoster'), + 'duration': int_or_none(episode.get('duration')), + 'formats': formats, + 'channel': get_meta('channel'), + 'season': get_meta('season'), + 'episode_number': int_or_none(get_meta('episodeNumber')), + } diff --git a/youtube_dl/extractor/atttechchannel.py b/youtube_dl/extractor/atttechchannel.py new file mode 100644 index 0000000000..8f93fb3534 --- /dev/null +++ b/youtube_dl/extractor/atttechchannel.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class ATTTechChannelIE(InfoExtractor): + _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P.+)' + _TEST = { + 'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', + 'info_dict': { + 'id': '11316', + 'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', + 'ext': 'flv', + 'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use', + 'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140127', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r"url\s*:\s*'(rtmp://[^']+)'", + webpage, 'video URL') + + video_id = self._search_regex( + r'mediaid\s*=\s*(\d+)', + webpage, 'video id', fatal=False) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})', + webpage, 'upload date', fatal=False), False) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'ext': 'flv', + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py new file mode 100644 index 0000000000..95e572d70c --- /dev/null +++ b/youtube_dl/extractor/atvat.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, +) + + +class ATVAtIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P[dv]\d+)' + _TESTS = [{ + 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', + 'md5': 'c3b6b975fb3150fc628572939df205f2', + 'info_dict': { + 'id': '1698447', + 'ext': 'mp4', + 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', + } + }, { + 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_data = self._parse_json(unescapeHTML(self._search_regex( + [r'flashPlayerOptions\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P[^"]+)"'], + webpage, 'player data', group='json')), + display_id)['config']['initial_video'] + + video_id = video_data['id'] + video_title = video_data['title'] + + parts = [] + for part in video_data.get('parts', []): + part_id = part['id'] + part_title = part['title'] + + formats = [] + for source in part.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, part_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source.get('delivery'), + 'url': source_url, + }) + self._sort_formats(formats) + + parts.append({ + 'id': part_id, + 'title': part_title, + 'thumbnail': part.get('preview_image_url'), + 'duration': int_or_none(part.get('duration')), + 'is_live': part.get('is_livestream'), + 'formats': formats, + }) + + return { + '_type': 'multi_video', + 'id': video_id, + 'title': video_title, + 'entries': parts, + } diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py new file mode 100644 index 0000000000..6bd48ef159 --- /dev/null +++ b/youtube_dl/extractor/audimedia.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AudiMediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', + 'md5': '79a8b71c46d49042609795ab59779b66', + 'info_dict': { + 'id': '1565', + 'ext': 'mp4', + 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', + 'description': 'md5:60e5d30a78ced725f7b8d34370762941', + 'upload_date': '20151124', + 'timestamp': 1448354940, + 'duration': 74022, + 'view_count': int, + } + }, { + 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + raw_payload = self._search_regex([ + r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"', + r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"', + r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"', + r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"', + r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})', + ], webpage, 'raw payload') + _, stage_mode, video_id, _ = raw_payload.split('-') + + # TODO: handle s and e stage_mode (live streams and ended live streams) + if stage_mode not in ('s', 'e'): + video_data = self._download_json( + 'https://www.audimedia.tv/api/video/v1/videos/' + video_id, + video_id, query={ + 'embed[]': ['video_versions', 'thumbnail_image'], + })['results'] + formats = [] + + stream_url_hls = video_data.get('stream_url_hls') + if stream_url_hls: + formats.extend(self._extract_m3u8_formats( + stream_url_hls, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + stream_url_hds = video_data.get('stream_url_hds') + if stream_url_hds: + formats.extend(self._extract_f4m_formats( + stream_url_hds + '?hdcore=3.4.0', + video_id, f4m_id='hds', fatal=False)) + + for video_version in video_data.get('video_versions', []): + video_version_url = video_version.get('download_url') or video_version.get('stream_url') + if not video_version_url: + continue + f = { + 'url': video_version_url, + 'width': int_or_none(video_version.get('width')), + 'height': int_or_none(video_version.get('height')), + 'abr': int_or_none(video_version.get('audio_bitrate')), + 'vbr': int_or_none(video_version.get('video_bitrate')), + } + bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) + if bitrate: + f.update({ + 'format_id': 'http-%s' % bitrate, + }) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'description': video_data.get('subtitle'), + 'thumbnail': video_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(video_data.get('publication_date')), + 'duration': int_or_none(video_data.get('duration')), + 'view_count': int_or_none(video_data.get('view_count')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 0000000000..c51837b406 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, +) + + +class AudioBoomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', + 'md5': '7b00192e593ff227e6a315486979a42d', + 'info_dict': { + 'id': '7398103', + 'ext': 'mp3', + 'title': 'Asim Chaudhry', + 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'duration': 4000.99, + 'uploader': 'Sue Perkins: An hour or so with...', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', + } + }, { + 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + clip = None + + clip_store = self._parse_json( + self._html_search_regex( + r'data-new-clip-store=(["\'])(?P{.+?})\1', + webpage, 'clip store', default='{}', group='json'), + video_id, fatal=False) + if clip_store: + clips = clip_store.get('clips') + if clips and isinstance(clips, list) and isinstance(clips[0], dict): + clip = clips[0] + + def from_clip(field): + if clip: + return clip.get(field) + + audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( + 'audio', webpage, 'audio url') + title = from_clip('title') or self._html_search_meta( + ['og:title', 'og:audio:title', 'audio_title'], webpage) + description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) + + duration = float_or_none(from_clip('duration') or self._html_search_meta( + 'weibo:audio:duration', webpage)) + + uploader = from_clip('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') + uploader_url = from_clip('author_url') or self._html_search_meta( + 'audioboo:channel', webpage, 'uploader url') + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py new file mode 100644 index 0000000000..cc7771354c --- /dev/null +++ b/youtube_dl/extractor/audiomack.py @@ -0,0 +1,145 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import time + +from .common import InfoExtractor +from .soundcloud import SoundcloudIE +from ..compat import compat_str +from ..utils import ( + ExtractorError, + url_basename, +) + + +class AudiomackIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + IE_NAME = 'audiomack' + _TESTS = [ + # hosted on audiomack + { + 'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary', + 'info_dict': + { + 'id': '310086', + 'ext': 'mp3', + 'uploader': 'Roosh Williams', + 'title': 'Extraordinary' + } + }, + # audiomack wrapper around soundcloud song + { + 'add_ie': ['Soundcloud'], + 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', + 'info_dict': { + 'id': '258901379', + 'ext': 'mp3', + 'description': 'mamba day freestyle for the legend Kobe Bryant ', + 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + 'uploader': 'ILOVEMAKONNEN', + 'upload_date': '20160414', + } + }, + ] + + def _real_extract(self, url): + # URLs end with [uploader name]/[uploader title] + # this title is whatever the user types in, and is rarely + # the proper song title. Real metadata is in the api response + album_url_tag = self._match_id(url) + + # Request the extended version of the api for extra fields like artist and title + api_response = self._download_json( + 'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % ( + album_url_tag, time.time()), + album_url_tag) + + # API is inconsistent with errors + if 'url' not in api_response or not api_response['url'] or 'error' in api_response: + raise ExtractorError('Invalid url %s' % url) + + # Audiomack wraps a lot of soundcloud tracks in their branded wrapper + # if so, pass the work off to the soundcloud extractor + if SoundcloudIE.suitable(api_response['url']): + return self.url_result(api_response['url'], SoundcloudIE.ie_key()) + + return { + 'id': compat_str(api_response.get('id', album_url_tag)), + 'uploader': api_response.get('artist'), + 'title': api_response.get('title'), + 'url': api_response['url'], + } + + +class AudiomackAlbumIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + IE_NAME = 'audiomack:album' + _TESTS = [ + # Standard album playlist + { + 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', + 'playlist_count': 15, + 'info_dict': + { + 'id': '812251', + 'title': 'Tha Tour: Part 2 (Official Mixtape)' + } + }, + # Album playlist ripped from fakeshoredrive with no metadata + { + 'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project', + 'info_dict': { + 'title': 'PPP (Pistol P Project)', + 'id': '837572', + }, + 'playlist': [{ + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', + 'id': '837577', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }], + 'params': { + 'playliststart': 9, + 'playlistend': 9, + } + } + ] + + def _real_extract(self, url): + # URLs end with [uploader name]/[uploader title] + # this title is whatever the user types in, and is rarely + # the proper song title. Real metadata is in the api response + album_url_tag = self._match_id(url) + result = {'_type': 'playlist', 'entries': []} + # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata + # Therefore we don't know how many songs the album has and must infi-loop until failure + for track_no in itertools.count(): + # Get song's metadata + api_response = self._download_json( + 'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d' + % (album_url_tag, track_no, time.time()), album_url_tag, + note='Querying song information (%d)' % (track_no + 1)) + + # Total failure, only occurs when url is totally wrong + # Won't happen in middle of valid playlist (next case) + if 'url' not in api_response or 'error' in api_response: + raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url)) + # URL is good but song id doesn't exist - usually means end of playlist + elif not api_response['url']: + break + else: + # Pull out the album metadata and add to result (if it exists) + for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: + if apikey in api_response and resultkey not in result: + result[resultkey] = api_response[apikey] + song_id = url_basename(api_response['url']).rpartition('.')[0] + result['entries'].append({ + 'id': compat_str(api_response.get('id', song_id)), + 'uploader': api_response.get('artist'), + 'title': api_response.get('title', song_id), + 'url': api_response['url'], + }) + return result diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py new file mode 100644 index 0000000000..a2603bbffe --- /dev/null +++ b/youtube_dl/extractor/awaan.py @@ -0,0 +1,185 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import base64 + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, +) +from ..utils import ( + int_or_none, + parse_iso8601, + smuggle_url, + unsmuggle_url, + urlencode_postdata, +) + + +class AWAANIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P\d+)/[^/]+(?:/(?P\d+)/(?P\d+))?' + + def _real_extract(self, url): + show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() + if video_id and int(video_id) > 0: + return self.url_result( + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo') + elif season_id and int(season_id) > 0: + return self.url_result(smuggle_url( + 'http://awaan.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'AWAANSeason') + else: + return self.url_result( + 'http://awaan.ae/program/%s' % show_id, 'AWAANSeason') + + +class AWAANBaseIE(InfoExtractor): + def _parse_video_data(self, video_data, video_id, is_live): + title = video_data.get('title_en') or video_data['title_ar'] + img = video_data.get('img') + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description_en') or video_data.get('description_ar'), + 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), + 'is_live': is_live, + } + + +class AWAANVideoIE(AWAANBaseIE): + IE_NAME = 'awaan:video' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', + 'md5': '5f61c33bfc7794315c671a62d43116aa', + 'info_dict': + { + 'id': '17375', + 'ext': 'mp4', + 'title': 'رحلة العمر : الحلقة 1', + 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', + 'duration': 2041, + 'timestamp': 1227504126, + 'upload_date': '20081124', + 'uploader_id': '71', + }, + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, + video_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(video_data, video_id, False) + + embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse_urlencode({ + 'id': video_data['id'], + 'user_id': video_data['user_id'], + 'signature': video_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }) + info.update({ + '_type': 'url_transparent', + 'url': embed_url, + 'ie_key': 'MangomoloVideo', + }) + return info + + +class AWAANLiveIE(AWAANBaseIE): + IE_NAME = 'awaan:live' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P\d+)' + _TEST = { + 'url': 'http://awaan.ae/live/6/dubai-tv', + 'info_dict': { + 'id': '6', + 'ext': 'mp4', + 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20150107', + 'timestamp': 1420588800, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + channel_data = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + channel_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(channel_data, channel_id, True) + + embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({ + 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), + 'signature': channel_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }) + info.update({ + '_type': 'url_transparent', + 'url': embed_url, + 'ie_key': 'MangomoloLive', + }) + return info + + +class AWAANSeasonIE(InfoExtractor): + IE_NAME = 'awaan:season' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P\d+)|season/(?P\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '7910', + 'title': 'محاضرات الشيخ الشعراوي', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + show_id, season_id = re.match(self._VALID_URL, url).groups() + + data = {} + if season_id: + data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + season = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + season_id, headers={'Origin': 'http://awaan.ae'}) + show_id = season['id'] + data['show_id'] = show_id + show = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + show_id, data=urlencode_postdata(data), headers={ + 'Origin': 'http://awaan.ae', + 'Content-Type': 'application/x-www-form-urlencoded' + }) + if not season_id: + season_id = show['default_season'] + for season in show['seasons']: + if season['id'] == season_id: + title = season.get('title_en') or season['title_ar'] + + entries = [] + for video in show['videos']: + video_id = compat_str(video['id']) + entries.append(self.url_result( + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)) + + return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py new file mode 100644 index 0000000000..dccfeaf734 --- /dev/null +++ b/youtube_dl/extractor/aws.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import hashlib +import hmac + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlencode + + +class AWSIE(InfoExtractor): + _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' + _AWS_REGION = 'us-east-1' + + def _aws_execute_api(self, aws_dict, video_id, query=None): + query = query or {} + amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + date = amz_date[:8] + headers = { + 'Accept': 'application/json', + 'Host': self._AWS_PROXY_HOST, + 'X-Amz-Date': amz_date, + 'X-Api-Key': self._AWS_API_KEY + } + session_token = aws_dict.get('session_token') + if session_token: + headers['X-Amz-Security-Token'] = session_token + + def aws_hash(s): + return hashlib.sha256(s.encode('utf-8')).hexdigest() + + # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + canonical_querystring = compat_urllib_parse_urlencode(query) + canonical_headers = '' + for header_name, header_value in sorted(headers.items()): + canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) + signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())]) + canonical_request = '\n'.join([ + 'GET', + aws_dict['uri'], + canonical_querystring, + canonical_headers, + signed_headers, + aws_hash('') + ]) + + # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html + credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request'] + credential_scope = '/'.join(credential_scope_list) + string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)]) + + # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html + def aws_hmac(key, msg): + return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) + + def aws_hmac_digest(key, msg): + return aws_hmac(key, msg).digest() + + def aws_hmac_hexdigest(key, msg): + return aws_hmac(key, msg).hexdigest() + + k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8') + for value in credential_scope_list: + k_signing = aws_hmac_digest(k_signing, value) + + signature = aws_hmac_hexdigest(k_signing, string_to_sign) + + # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html + headers['Authorization'] = ', '.join([ + '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope), + 'SignedHeaders=%s' % signed_headers, + 'Signature=%s' % signature, + ]) + + return self._download_json( + 'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''), + video_id, headers=headers) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py new file mode 100644 index 0000000000..b1e20def53 --- /dev/null +++ b/youtube_dl/extractor/azmedien.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE + + +class AZMedienIE(InfoExtractor): + IE_DESC = 'AZ Medien videos' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?P + telezueri\.ch| + telebaern\.tv| + telem1\.ch + )/ + [^/]+/ + (?P + [^/]+-(?P\d+) + ) + (?: + \#video= + (?P + [_0-9a-z]+ + ) + )? + ''' + + _TESTS = [{ + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'info_dict': { + 'id': '1_anruz3wy', + 'ext': 'mp4', + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', + 'only_matching': True + }] + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _PARTNER_ID = '1719221' + + def _real_extract(self, url): + host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups() + + if not entry_id: + entry_id = self._download_json( + self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ + 'variables': json.dumps({ + 'contextId': 'NewsArticle:' + article_id, + }), + })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] + + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py new file mode 100644 index 0000000000..234a661d34 --- /dev/null +++ b/youtube_dl/extractor/baidu.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class BaiduVideoIE(InfoExtractor): + IE_DESC = '百度视频' + _VALID_URL = r'https?://v\.baidu\.com/(?P[a-z]+)/(?P\d+)\.htm' + _TESTS = [{ + 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', + 'info_dict': { + 'id': '1069', + 'title': '中华小当家 TV版国语', + 'description': 'md5:51be07afe461cf99fa61231421b5397c', + }, + 'playlist_count': 52, + }, { + 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', + 'info_dict': { + 'id': '11595', + 'title': 're:^奔跑吧兄弟', + 'description': 'md5:1bf88bad6d850930f542d51547c089b8', + }, + 'playlist_mincount': 12, + }] + + def _call_api(self, path, category, playlist_id, note): + return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( + path, category, playlist_id), playlist_id, note) + + def _real_extract(self, url): + category, playlist_id = re.match(self._VALID_URL, url).groups() + if category == 'show': + category = 'tvshow' + if category == 'tv': + category = 'tvplay' + + playlist_detail = self._call_api( + 'xqinfo', category, playlist_id, 'Download playlist JSON metadata') + + playlist_title = playlist_detail['title'] + playlist_description = unescapeHTML(playlist_detail.get('intro')) + + episodes_detail = self._call_api( + 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') + + entries = [self.url_result( + episode['url'], video_title=episode['title'] + ) for episode in episodes_detail['videos']] + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py new file mode 100644 index 0000000000..b8a57e6a50 --- /dev/null +++ b/youtube_dl/extractor/bandcamp.py @@ -0,0 +1,417 @@ +from __future__ import unicode_literals + +import random +import re +import time + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + KNOWN_EXTENSIONS, + parse_filesize, + str_or_none, + try_get, + unescapeHTML, + update_url_query, + unified_strdate, + unified_timestamp, + url_or_none, +) + + +class BandcampIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song', + 'md5': 'c557841d5e50261777a6585648adf439', + 'info_dict': { + 'id': '1812978515', + 'ext': 'mp3', + 'title': "youtube-dlc \"'/\\\u00e4\u21ad - youtube-dlc test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, + }, + '_skip': 'There is a limit of 200 free downloads / month for the test song' + }, { + # free download + 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', + 'md5': '853e35bf34aa1d6fe2615ae612564b36', + 'info_dict': { + 'id': '2650410135', + 'ext': 'aiff', + 'title': 'Ben Prunty - Lanius (Battle)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Ben Prunty', + 'timestamp': 1396508491, + 'upload_date': '20140403', + 'release_date': '20140403', + 'duration': 260.877, + 'track': 'Lanius (Battle)', + 'track_number': 1, + 'track_id': '2650410135', + 'artist': 'Ben Prunty', + 'album': 'FTL: Advanced Edition Soundtrack', + }, + }, { + # no free download, mp3 128 + 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire', + 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7', + 'info_dict': { + 'id': '2584466013', + 'ext': 'mp3', + 'title': 'Mastodon - Hail to Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mastodon', + 'timestamp': 1322005399, + 'upload_date': '20111122', + 'release_date': '20040207', + 'duration': 120.79, + 'track': 'Hail to Fire', + 'track_number': 5, + 'track_id': '2584466013', + 'artist': 'Mastodon', + 'album': 'Call of the Mastodon', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + thumbnail = self._html_search_meta('og:image', webpage, default=None) + + track_id = None + track = None + track_number = None + duration = None + + formats = [] + track_info = self._parse_json( + self._search_regex( + r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', + webpage, 'track info', default='{}'), title) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue + ext, abr_str = format_id.split('-', 1) + formats.append({ + 'format_id': format_id, + 'url': self._proto_relative_url(format_url, 'http:'), + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) + track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) + + def extract(key): + return self._search_regex( + r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, + webpage, key, default=None, group='value') + + artist = extract('artist') + album = extract('album_title') + timestamp = unified_timestamp( + extract('publish_date') or extract('album_publish_date')) + release_date = unified_strdate(extract('album_release_date')) + + download_link = self._search_regex( + r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'download link', default=None, group='url') + if download_link: + track_id = self._search_regex( + r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', + webpage, 'track id') + + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') + + blob = self._parse_json( + self._search_regex( + r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, + 'blob', group='blob'), + track_id, transform_source=unescapeHTML) + + info = try_get( + blob, (lambda x: x['digital_items'][0], + lambda x: x['download_items'][0]), dict) + if info: + downloads = info.get('downloads') + if isinstance(downloads, dict): + if not track: + track = info.get('title') + if not artist: + artist = info.get('artist') + if not thumbnail: + thumbnail = info.get('thumb_url') + + download_formats = {} + download_formats_list = blob.get('download_formats') + if isinstance(download_formats_list, list): + for f in blob['download_formats']: + name, ext = f.get('name'), f.get('file_extension') + if all(isinstance(x, compat_str) for x in (name, ext)): + download_formats[name] = ext.strip('.') + + for format_id, f in downloads.items(): + format_url = f.get('url') + if not format_url: + continue + # Stat URL generation algorithm is reverse engineered from + # download_*_bundle_*.js + stat_url = update_url_query( + format_url.replace('/download/', '/statdownload/'), { + '.rand': int(time.time() * 1000 * random.random()), + }) + format_id = f.get('encoding_name') or format_id + stat = self._download_json( + stat_url, track_id, 'Downloading %s JSON' % format_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], + fatal=False) + if not stat: + continue + retry_url = url_or_none(stat.get('retry_url')) + if not retry_url: + continue + formats.append({ + 'url': self._proto_relative_url(retry_url, 'http:'), + 'ext': download_formats.get(format_id), + 'format_id': format_id, + 'format_note': f.get('description'), + 'filesize': parse_filesize(f.get('size_mb')), + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + title = '%s - %s' % (artist, track) if artist else track + + if not duration: + duration = float_or_none(self._html_search_meta( + 'duration', webpage, default=None)) + + return { + 'id': track_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': artist, + 'timestamp': timestamp, + 'release_date': release_date, + 'duration': duration, + 'track': track, + 'track_number': track_number, + 'track_id': track_id, + 'artist': artist, + 'album': album, + 'formats': formats, + } + + +class BandcampAlbumIE(InfoExtractor): + IE_NAME = 'Bandcamp:album' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + + _TESTS = [{ + 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + 'playlist': [ + { + 'md5': '39bc1eded3476e927c724321ddf116cf', + 'info_dict': { + 'id': '1353101989', + 'ext': 'mp3', + 'title': 'Intro', + } + }, + { + 'md5': '1a2c32e2691474643e912cc6cd4bffaa', + 'info_dict': { + 'id': '38097443', + 'ext': 'mp3', + 'title': 'Kero One - Keep It Alive (Blazo remix)', + } + }, + ], + 'info_dict': { + 'title': 'Jazz Format Mixtape vol.1', + 'id': 'jazz-format-mixtape-vol-1', + 'uploader_id': 'blazo', + }, + 'params': { + 'playlistend': 2 + }, + 'skip': 'Bandcamp imposes download limits.' + }, { + 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', + 'info_dict': { + 'title': 'Hierophany of the Open Grave', + 'uploader_id': 'nightbringer', + 'id': 'hierophany-of-the-open-grave', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://dotscale.bandcamp.com', + 'info_dict': { + 'title': 'Loom', + 'id': 'dotscale', + 'uploader_id': 'dotscale', + }, + 'playlist_mincount': 7, + }, { + # with escaped quote in title + 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', + 'info_dict': { + 'title': '"Entropy" EP', + 'uploader_id': 'jstrecords', + 'id': 'entropy-ep', + }, + 'playlist_mincount': 3, + }, { + # not all tracks have songs + 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague', + 'info_dict': { + 'id': 'we-are-the-plague', + 'title': 'WE ARE THE PLAGUE', + 'uploader_id': 'insulters', + }, + 'playlist_count': 2, + }] + + @classmethod + def suitable(cls, url): + return (False + if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) + else super(BandcampAlbumIE, cls).suitable(url)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader_id = mobj.group('subdomain') + album_id = mobj.group('album_id') + playlist_id = album_id or uploader_id + webpage = self._download_webpage(url, playlist_id) + track_elements = re.findall( + r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) + if not track_elements: + raise ExtractorError('The page doesn\'t contain any tracks') + # Only tracks with duration info have songs + entries = [ + self.url_result( + compat_urlparse.urljoin(url, t_path), + ie=BandcampIE.ie_key(), + video_title=self._search_regex( + r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', + elem_content, 'track title', fatal=False)) + for elem_content, t_path in track_elements + if self._html_search_meta('duration', elem_content, default=None)] + + title = self._html_search_regex( + r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + webpage, 'title', fatal=False) + if title: + title = title.replace(r'\"', '"') + return { + '_type': 'playlist', + 'uploader_id': uploader_id, + 'id': playlist_id, + 'title': title, + 'entries': entries, + } + + +class BandcampWeeklyIE(InfoExtractor): + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://bandcamp.com/?show=224', + 'md5': 'b00df799c733cf7e0c567ed187dea0fd', + 'info_dict': { + 'id': '224', + 'ext': 'opus', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', + 'episode_number': 208, + 'episode_id': '224', + } + }, { + 'url': 'https://bandcamp.com/?blah/blah@&show=228', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + blob = self._parse_json( + self._search_regex( + r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, + 'blob', group='blob'), + video_id, transform_source=unescapeHTML) + + show = blob['bcw_show'] + + # This is desired because any invalid show id redirects to `bandcamp.com` + # which happens to expose the latest Bandcamp Weekly episode. + show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + + formats = [] + for format_id, format_url in show['audio_stream'].items(): + if not url_or_none(format_url): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = show.get('audio_title') or 'Bandcamp Weekly' + subtitle = show.get('subtitle') + if subtitle: + title += ' - %s' % subtitle + + episode_number = None + seq = blob.get('bcw_seq') + + if seq and isinstance(seq, list): + try: + episode_number = next( + int_or_none(e.get('episode_number')) + for e in seq + if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) + except StopIteration: + pass + + return { + 'id': video_id, + 'title': title, + 'description': show.get('desc') or show.get('short_desc'), + 'duration': float_or_none(show.get('audio_duration')), + 'is_live': False, + 'release_date': unified_strdate(show.get('published_date')), + 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), + 'episode_number': episode_number, + 'episode_id': compat_str(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py new file mode 100644 index 0000000000..002c39c394 --- /dev/null +++ b/youtube_dl/extractor/bbc.py @@ -0,0 +1,1359 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + ExtractorError, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + parse_duration, + parse_iso8601, + try_get, + unescapeHTML, + url_or_none, + urlencode_postdata, + urljoin, +) +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_urlparse, +) + + +class BBCCoUkIE(InfoExtractor): + IE_NAME = 'bbc.co.uk' + IE_DESC = 'BBC iPlayer' + _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?bbc\.co\.uk/ + (?: + programmes/(?!articles/)| + iplayer(?:/[^/]+)?/(?:episode/|playlist/)| + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + sounds/play/| + events/[^/]+/play/[^/]+/ + ) + (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) + ''' % _ID_REGEX + + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams with even better quality that pc mediaset but fails + # with geolocation in some cases when it's even not geo restricted at all (e.g. + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] + + _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _NAMESPACES = ( + _MEDIASELECTION_NS, + _EMP_PLAYLIST_NS, + ) + + _TESTS = [ + { + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', + 'info_dict': { + 'id': 'b039d07m', + 'ext': 'flv', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Man in Black: Series 3: The Printed Name', + 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", + 'duration': 1800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Voice UK: Series 3: Blind Auditions 5', + 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.', + 'duration': 5100, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', + 'info_dict': { + 'id': 'b03k3pb7', + 'ext': 'flv', + 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", + 'description': '2. Invasion', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p022h44b', + 'note': 'Audio', + 'info_dict': { + 'id': 'p022h44j', + 'ext': 'flv', + 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', + 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", + 'duration': 227, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + # iptv-all mediaset fails with geolocation however there is no geo restriction + # for this programme at all + 'url': 'http://www.bbc.co.uk/programmes/b06rkn85', + 'info_dict': { + 'id': 'b06rkms3', + 'ext': 'flv', + 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1", + 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!", + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Now it\'s really geo-restricted', + }, { + # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', + 'note': 'Audio', + 'info_dict': { + 'id': 'm0007jz9', + 'ext': 'mp4', + 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', + 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", + 'duration': 9840, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/m00005xn', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s', + 'only_matching': True, + }] + + _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + + def _extract_asx_playlist(self, connection, programme_id): + asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') + return [ref.get('href') for ref in asx.findall('./Entry/ref')] + + def _extract_items(self, playlist): + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _findall_ns(self, element, xpath): + elements = [] + for ns in self._NAMESPACES: + elements.extend(element.findall(xpath % ns)) + return elements + + def _extract_medias(self, media_selection): + error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) + if error is None: + media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) + if error is not None: + raise BBCCoUkIE.MediaSelectionError(error.get('id')) + return self._findall_ns(media_selection, './{%s}media') + + def _extract_connections(self, media): + return self._findall_ns(media, './{%s}connection') + + def _get_subtitles(self, media, programme_id): + subtitles = {} + for connection in self._extract_connections(media): + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, compat_etree_Element): + continue + lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + ] + return subtitles + + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + + def _download_media_selector(self, programme_id): + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) + + def _download_media_selector_url(self, url, programme_id=None): + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML', + expected_status=(403, 404)) + return self._process_media_selector(media_selection, programme_id) + + def _process_media_selector(self, media_selection, programme_id): + formats = [] + subtitles = None + urls = [] + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + if re.search(self._USP_RE, href): + usp_formats = self._extract_m3u8_formats( + re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), + programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for f in usp_formats: + if f.get('height') and f['height'] > 720: + continue + formats.append(f) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + if not service and not supplier and bitrate: + format_id += '-%d' % bitrate + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'tbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol in ('http', 'https'): + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + else: + continue + formats.append(fmt) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + return formats, subtitles + + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind not in ('programme', 'radioProgramme'): + continue + programme_id = item.get('vpid') + duration = int_or_none(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + raise + + # fallback to legacy playlist + return self._process_legacy_playlist(playlist_id) + + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind not in ('programme', 'radioProgramme'): + continue + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + description = description_el.text if description_el is not None else None + + def get_programme_id(item): + def get_from_attributes(item): + for p in ('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) + duration = int_or_none(item.get('duration')) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id + + return programme_id, title, description, duration, formats, subtitles + + def _real_extract(self, url): + group_id = self._match_id(url) + + webpage = self._download_webpage(url, group_id, 'Downloading video page') + + error = self._search_regex( + r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + programme_id = None + duration = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', + r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title') + description = self._search_regex( + (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'), + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) + else: + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) + + self._sort_formats(formats) + + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' + + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + # Provides more formats, namely direct mp4 links, but fails on some videos with + # notukerror for non UK (?) users (e.g. + # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] + + _TESTS = [{ + # article with multiple videos embedded with data-playable containing vpids + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', + }, + 'playlist_count': 2, + }, { + # article with multiple videos embedded with data-playable (more videos) + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', + }, + 'playlist_count': 9, + 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + # broken + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BUGGER', + }, + 'playlist_count': 18, + }, { + # single video embedded with data-playable containing vpid + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'duration': 47, + 'timestamp': 1427219242, + 'upload_date': '20150324', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and playlist with f4m and m3u8 as streamingUrl + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'info_dict': { + 'id': '150615_telabyad_kentin_cogu', + 'ext': 'mp4', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', + 'timestamp': 1434397334, + 'upload_date': '20150615', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video embedded with data-playable containing XML playlists (regional section) + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'info_dict': { + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', + 'timestamp': 1434713142, + 'upload_date': '20150619', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'mp4', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1415867444, + 'upload_date': '20141113', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to UK', + }, { + # single video with playlist.sxml URL in playlist param + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with multiple videos embedded with playlist.sxml in playlist param + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', + }, + 'playlist_count': 3, + }, { + # school report article with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 'only_matching': True, + }, { + # single video article embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, + }, { + 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', + 'info_dict': { + 'id': 'p06556y7', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + }, + 'params': { + 'skip_download': True, + } + }, { + # window.__PRELOADED_STATE__ + 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', + 'info_dict': { + 'id': 'b0b9z4vz', + 'ext': 'mp4', + 'title': 'Prom 6: An American in Paris and Turangalila', + 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8', + 'uploader': 'Radio 3', + 'uploader_id': 'bbc_radio_three', + }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], + }] + + @classmethod + def suitable(cls, url): + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) + timestamp = json_ld_info.get('timestamp') + + playlist_title = json_ld_info.get('title') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) + + entries = [] + + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) + if playlists: + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + title = playlist_object['title'] + description = playlist_object.get('summary') + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + else: + # data-playable without vpid but with a playlist.sxml URLs + # in otherSettings.playlist (e.g. + # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + entry = None + for key in ('streaming', 'progressiveDownload'): + playlist_url = playlist.get('%sUrl' % key) + if not playlist_url: + continue + try: + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) + except Exception as e: + # Some playlist URL may fail with 500, at the same time + # the other one may work fine (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + continue + raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) + + if entries: + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], + webpage, 'vpid', default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) + # There are several setPayload calls may be present but the video + # seems to be always related to the first one + morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + webpage, 'morph payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] + for component in components: + if not isinstance(component, dict): + continue + lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) + if not lead_media: + continue + identifiers = lead_media.get('identifiers') + if not identifiers or not isinstance(identifiers, dict): + continue + programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + if not programme_id: + continue + title = lead_media.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = lead_media.get('summary') + uploader = lead_media.get('masterBrand') + uploader_id = lead_media.get('mid') + duration = None + duration_d = lead_media.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + + preload_state = self._parse_json(self._search_regex( + r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if preload_state: + current_programme = preload_state.get('programmes', {}).get('current') or {} + programme_id = current_programme.get('id') + if current_programme and programme_id and current_programme.get('type') == 'playable_item': + title = current_programme.get('titles', {}).get('tertiary') or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + synopses = current_programme.get('synopses') or {} + network = current_programme.get('network') or {} + duration = int_or_none( + current_programme.get('duration', {}).get('value')) + thumbnail = None + image_url = current_programme.get('image_url') + if image_url: + thumbnail = image_url.replace('{recipe}', '1920x1920') + return { + 'id': programme_id, + 'title': title, + 'description': dict_get(synopses, ('long', 'medium', 'short')), + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': network.get('short_title'), + 'uploader_id': network.get('id'), + 'formats': formats, + 'subtitles': subtitles, + } + + bbc3_config = self._parse_json( + self._search_regex( + r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, + 'bbcthree config', default='{}'), + playlist_id, transform_source=js_to_json, fatal=False) + if bbc3_config: + bbc3_playlist = try_get( + bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + dict) + if bbc3_playlist: + playlist_title = bbc3_playlist.get('title') or playlist_title + thumbnail = bbc3_playlist.get('holdingImageURL') + entries = [] + for bbc3_item in bbc3_playlist['items']: + programme_id = bbc3_item.get('versionID') + if not programme_id: + continue + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': playlist_title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], + playlist_id, playlist_title, playlist_description) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = extract_all(r"data-media-meta='({[^']+})'") + + if not medias: + # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( + self._search_regex( + r']+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)', + webpage, 'playlist data'), + playlist_id) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' + _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 6, + 'skip': 'This programme is not currently available on BBC iPlayer', + }, { + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }] + + def _extract_title_and_description(self, webpage): + title = self._search_regex(r'

    ([^<]+)

    ', webpage, 'title', fatal=False) + description = self._search_regex( + r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

    ', + webpage, 'description', fatal=False, group='value') + return title, description + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py new file mode 100644 index 0000000000..86abdae00c --- /dev/null +++ b/youtube_dl/extractor/beampro.py @@ -0,0 +1,194 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + compat_str, + float_or_none, + int_or_none, + parse_iso8601, + try_get, + urljoin, +) + + +class BeamProBaseIE(InfoExtractor): + _API_BASE = 'https://mixer.com/api/v1' + _RATINGS = {'family': 0, 'teen': 13, '18+': 18} + + def _extract_channel_info(self, chan): + user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) + return { + 'uploader': chan.get('token') or try_get( + chan, lambda x: x['user']['username'], compat_str), + 'uploader_id': compat_str(user_id) if user_id else None, + 'age_limit': self._RATINGS.get(chan.get('audience')), + } + + +class BeamProLiveIE(BeamProBaseIE): + IE_NAME = 'Mixer:live' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://mixer.com/niterhayven', + 'info_dict': { + 'id': '261562', + 'ext': 'mp4', + 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', + 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', + 'thumbnail': r're:https://.*\.jpg$', + 'timestamp': 1483477281, + 'upload_date': '20170103', + 'uploader': 'niterhayven', + 'uploader_id': '373396', + 'age_limit': 18, + 'is_live': True, + 'view_count': int, + }, + 'skip': 'niterhayven is offline', + 'params': { + 'skip_download': True, + }, + } + + _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE + + @classmethod + def suitable(cls, url): + return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) + + def _real_extract(self, url): + channel_name = self._match_id(url) + + chan = self._download_json( + '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) + + if chan.get('online') is False: + raise ExtractorError( + '{0} is offline'.format(channel_name), expected=True) + + channel_id = chan['id'] + + def manifest_url(kind): + return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) + + formats = self._extract_m3u8_formats( + manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_smil_formats( + manifest_url('smil'), channel_name, fatal=False)) + self._sort_formats(formats) + + info = { + 'id': compat_str(chan.get('id') or channel_name), + 'title': self._live_title(chan.get('name') or channel_name), + 'description': clean_html(chan.get('description')), + 'thumbnail': try_get( + chan, lambda x: x['thumbnail']['url'], compat_str), + 'timestamp': parse_iso8601(chan.get('updatedAt')), + 'is_live': True, + 'view_count': int_or_none(chan.get('viewersTotal')), + 'formats': formats, + } + info.update(self._extract_channel_info(chan)) + + return info + + +class BeamProVodIE(BeamProBaseIE): + IE_NAME = 'Mixer:vod' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P[^?#&]+)' + _TESTS = [{ + 'url': 'https://mixer.com/willow8714?vod=2259830', + 'md5': 'b2431e6e8347dc92ebafb565d368b76b', + 'info_dict': { + 'id': '2259830', + 'ext': 'mp4', + 'title': 'willow8714\'s Channel', + 'duration': 6828.15, + 'thumbnail': r're:https://.*source\.png$', + 'timestamp': 1494046474, + 'upload_date': '20170506', + 'uploader': 'willow8714', + 'uploader_id': '6085379', + 'age_limit': 13, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', + 'only_matching': True, + }, { + 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', + 'only_matching': True, + }] + + @staticmethod + def _extract_format(vod, vod_type): + if not vod.get('baseUrl'): + return [] + + if vod_type == 'hls': + filename, protocol = 'manifest.m3u8', 'm3u8_native' + elif vod_type == 'raw': + filename, protocol = 'source.mp4', 'https' + else: + assert False + + data = vod.get('data') if isinstance(vod.get('data'), dict) else {} + + format_id = [vod_type] + if isinstance(data.get('Height'), compat_str): + format_id.append('%sp' % data['Height']) + + return [{ + 'url': urljoin(vod['baseUrl'], filename), + 'format_id': '-'.join(format_id), + 'ext': 'mp4', + 'protocol': protocol, + 'width': int_or_none(data.get('Width')), + 'height': int_or_none(data.get('Height')), + 'fps': int_or_none(data.get('Fps')), + 'tbr': int_or_none(data.get('Bitrate'), 1000), + }] + + def _real_extract(self, url): + vod_id = self._match_id(url) + + vod_info = self._download_json( + '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) + + state = vod_info.get('state') + if state != 'AVAILABLE': + raise ExtractorError( + 'VOD %s is not available (state: %s)' % (vod_id, state), + expected=True) + + formats = [] + thumbnail_url = None + + for vod in vod_info['vods']: + vod_type = vod.get('format') + if vod_type in ('hls', 'raw'): + formats.extend(self._extract_format(vod, vod_type)) + elif vod_type == 'thumbnail': + thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') + + self._sort_formats(formats) + + info = { + 'id': vod_id, + 'title': vod_info.get('name') or vod_id, + 'duration': float_or_none(vod_info.get('duration')), + 'thumbnail': thumbnail_url, + 'timestamp': parse_iso8601(vod_info.get('createdAt')), + 'view_count': int_or_none(vod_info.get('viewsTotal')), + 'formats': formats, + } + info.update(self._extract_channel_info(vod_info.get('channel') or {})) + + return info diff --git a/youtube_dl/extractor/beatport.py b/youtube_dl/extractor/beatport.py new file mode 100644 index 0000000000..e607094179 --- /dev/null +++ b/youtube_dl/extractor/beatport.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class BeatportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P[^/]+)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://beatport.com/track/synesthesia-original-mix/5379371', + 'md5': 'b3c34d8639a2f6a7f734382358478887', + 'info_dict': { + 'id': '5379371', + 'display_id': 'synesthesia-original-mix', + 'ext': 'mp4', + 'title': 'Froxic - Synesthesia (Original Mix)', + }, + }, { + 'url': 'https://beatport.com/track/love-and-war-original-mix/3756896', + 'md5': 'e44c3025dfa38c6577fbaeb43da43514', + 'info_dict': { + 'id': '3756896', + 'display_id': 'love-and-war-original-mix', + 'ext': 'mp3', + 'title': 'Wolfgang Gartner - Love & War (Original Mix)', + }, + }, { + 'url': 'https://beatport.com/track/birds-original-mix/4991738', + 'md5': 'a1fd8e8046de3950fd039304c186c05f', + 'info_dict': { + 'id': '4991738', + 'display_id': 'birds-original-mix', + 'ext': 'mp4', + 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + playables = self._parse_json( + self._search_regex( + r'window\.Playables\s*=\s*({.+?});', webpage, + 'playables info', flags=re.DOTALL), + track_id) + + track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + + title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + if track['mix']: + title += ' (' + track['mix'] + ')' + + formats = [] + for ext, info in track['preview'].items(): + if not info['url']: + continue + fmt = { + 'url': info['url'], + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } + if ext == 'mp3': + fmt['preference'] = 0 + fmt['acodec'] = 'mp3' + fmt['abr'] = 96 + fmt['asr'] = 44100 + elif ext == 'mp4': + fmt['preference'] = 1 + fmt['acodec'] = 'aac' + fmt['abr'] = 96 + fmt['asr'] = 44100 + formats.append(fmt) + self._sort_formats(formats) + + images = [] + for name, info in track['images'].items(): + image_url = info.get('url') + if name == 'dynamic' or not image_url: + continue + image = { + 'id': name, + 'url': image_url, + 'height': int_or_none(info.get('height')), + 'width': int_or_none(info.get('width')), + } + images.append(image) + + return { + 'id': compat_str(track.get('id')) or track_id, + 'display_id': track.get('slug') or display_id, + 'title': title, + 'formats': formats, + 'thumbnails': images, + } diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py new file mode 100644 index 0000000000..5788d13baa --- /dev/null +++ b/youtube_dl/extractor/beeg.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class BeegIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P\d+)' + _TESTS = [{ + # api/v6 v1 + 'url': 'http://beeg.com/5416503', + 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', + 'info_dict': { + 'id': '5416503', + 'ext': 'mp4', + 'title': 'Sultry Striptease', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, + 'age_limit': 18, + } + }, { + # api/v6 v2 + 'url': 'https://beeg.com/1941093077?t=911-1391', + 'only_matching': True, + }, { + # api/v6 v2 w/o t + 'url': 'https://beeg.com/1277207756', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/video/5416503', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/5416503', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + beeg_version = self._search_regex( + r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', + default='1546225636701') + + if len(video_id) >= 10: + query = { + 'v': 2, + } + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + t = qs.get('t', [''])[0].split('-') + if len(t) > 1: + query.update({ + 's': t[0], + 'e': t[1], + }) + else: + query = {'v': 1} + + for api_path in ('', 'api.'): + video = self._download_json( + 'https://%sbeeg.com/api/v6/%s/video/%s' + % (api_path, beeg_version, video_id), video_id, + fatal=api_path == 'api.', query=query) + if video: + break + + formats = [] + for format_id, video_url in video.items(): + if not video_url: + continue + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url( + video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), + 'format_id': format_id, + 'height': int(height), + }) + self._sort_formats(formats) + + title = video['title'] + video_id = compat_str(video.get('id') or video_id) + display_id = video.get('code') + description = video.get('desc') + series = video.get('ps_name') + + timestamp = unified_timestamp(video.get('date')) + duration = int_or_none(video.get('duration')) + + tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'series': series, + 'timestamp': timestamp, + 'duration': duration, + 'tags': tags, + 'formats': formats, + 'age_limit': self._rta_search(webpage), + } diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py new file mode 100644 index 0000000000..9bca853b32 --- /dev/null +++ b/youtube_dl/extractor/behindkink.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import url_basename + + +class BehindKinkIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^/#?_]+)' + _TEST = { + 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', + 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', + 'info_dict': { + 'id': '37127', + 'ext': 'mp4', + 'title': 'What are you passionate about – Marley Blaze', + 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4', + 'upload_date': '20141205', + 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r' + (?: + ctv| + tsn| + bnn(?:bloomberg)?| + thecomedynetwork| + discovery| + discoveryvelocity| + sciencechannel| + investigationdiscovery| + animalplanet| + bravo| + mtv| + space| + etalk| + marilyn + )\.ca| + (?:much|cp24)\.com + )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' + _TESTS = [{ + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', + 'info_dict': { + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, + }, + }, { + 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', + 'only_matching': True, + }, { + 'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549', + 'only_matching': True, + }, { + 'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', + 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016', + 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', + 'only_matching': True, + }, { + 'url': 'http://www.etalk.ca/video?videoid=663455', + 'only_matching': True, + }, { + 'url': 'https://www.cp24.com/video?clipId=1982548', + 'only_matching': True, + }] + _DOMAINS = { + 'thecomedynetwork': 'comedy', + 'discoveryvelocity': 'discvel', + 'sciencechannel': 'discsci', + 'investigationdiscovery': 'invdisc', + 'animalplanet': 'aniplan', + 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', + 'marilyn': 'ctv_marilyn', + } + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + domain = domain.split('.')[0] + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id), + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py new file mode 100644 index 0000000000..d7ceaa85e4 --- /dev/null +++ b/youtube_dl/extractor/bet.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from ..utils import unified_strdate + + +class BetIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P.+?)\.html' + _TESTS = [ + { + 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', + 'info_dict': { + 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', + 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', + 'ext': 'flv', + 'title': 'A Conversation With President Obama', + 'description': 'President Obama urges persistence in confronting racism and bias.', + 'duration': 1534, + 'upload_date': '20141208', + 'thumbnail': r're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', + 'info_dict': { + 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', + 'display_id': 'justice-for-ferguson-a-community-reacts', + 'ext': 'flv', + 'title': 'Justice for Ferguson: A Community Reacts', + 'description': 'A BET News special.', + 'duration': 1696, + 'upload_date': '20141125', + 'thumbnail': r're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + ] + + _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + + def _get_feed_query(self, uri): + return { + 'uuid': uri, + } + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid) + + info_dict = videos_info['entries'][0] + + upload_date = unified_strdate(self._html_search_meta('date', webpage)) + description = self._html_search_meta('description', webpage) + + info_dict.update({ + 'display_id': display_id, + 'description': description, + 'upload_date': upload_date, + }) + + return info_dict diff --git a/youtube_dl/extractor/bfi.py b/youtube_dl/extractor/bfi.py new file mode 100644 index 0000000000..60c8944b53 --- /dev/null +++ b/youtube_dl/extractor/bfi.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFIPlayerIE(InfoExtractor): + IE_NAME = 'bfi:player' + _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P[\w-]+)-online' + _TEST = { + 'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online', + 'md5': 'e8783ebd8e061ec4bc6e9501ed547de8', + 'info_dict': { + 'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63', + 'ext': 'mp4', + 'title': 'Computer Doctor', + 'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b', + }, + 'skip': 'BFI Player films cannot be played outside of the UK', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + entries = [] + for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage): + player_attr = extract_attributes(player_el) + ooyala_id = player_attr.get('data-video-id') + if not ooyala_id: + continue + entries.append(self.url_result( + 'ooyala:' + ooyala_id, 'Ooyala', + ooyala_id, player_attr.get('data-label'))) + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py new file mode 100644 index 0000000000..28e3e59f67 --- /dev/null +++ b/youtube_dl/extractor/bigflix.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) + + +class BigflixIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P[0-9]+)' + _TESTS = [{ + # 2 formats + 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070', + 'info_dict': { + 'id': '16070', + 'ext': 'mp4', + 'title': 'Madarasapatinam', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', + 'formats': 'mincount:2', + }, + 'params': { + 'skip_download': True, + } + }, { + # multiple formats + 'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r']+class=["\']pagetitle["\'][^>]*>(.+?)', + webpage, 'title') + + def decode_url(quoted_b64_url): + return compat_b64decode(compat_urllib_parse_unquote( + quoted_b64_url)).decode('utf-8') + + formats = [] + for height, encoded_url in re.findall( + r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage): + video_url = decode_url(encoded_url) + f = { + 'url': video_url, + 'format_id': '%sp' % height, + 'height': int(height), + } + if video_url.startswith('rtmp'): + f['ext'] = 'flv' + formats.append(f) + + file_url = self._search_regex( + r'file=([^&]+)', webpage, 'video url', default=None) + if file_url: + video_url = decode_url(file_url) + if all(f['url'] != video_url for f in formats): + formats.append({ + 'url': decode_url(file_url), + }) + + self._sort_formats(formats) + + description = self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats + } diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py new file mode 100644 index 0000000000..b8dfbd42b4 --- /dev/null +++ b/youtube_dl/extractor/bild.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class BildIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P[^/]+)-(?P\d+)(?:,auto=true)?\.bild\.html' + IE_DESC = 'Bild.de' + _TEST = { + 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', + 'md5': 'dd495cbd99f2413502a1713a1156ac8a', + 'info_dict': { + 'id': '38184146', + 'ext': 'mp4', + 'title': 'Das können die neuen iPads', + 'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 196, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._download_json( + url.split('.bild.html')[0] + ',view=json.bild.html', video_id) + + return { + 'id': video_id, + 'title': unescapeHTML(video_data['title']).strip(), + 'description': unescapeHTML(video_data.get('description')), + 'url': video_data['clipList'][0]['srces'][0]['src'], + 'thumbnail': video_data.get('poster'), + 'duration': int_or_none(video_data.get('durationSec')), + } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py new file mode 100644 index 0000000000..d39ee8ffec --- /dev/null +++ b/youtube_dl/extractor/bilibili.py @@ -0,0 +1,450 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_iso8601, + smuggle_url, + str_or_none, + strip_jsonp, + unified_timestamp, + unsmuggle_url, + urlencode_postdata, +) + + +class BiliBiliIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|bangumi)\.)? + bilibili\.(?:tv|com)/ + (?: + (?: + video/[aA][vV]| + anime/(?P\d+)/play\# + )(?P\d+)| + video/[bB][vV](?P[^/?#&]+) + ) + ''' + + _TESTS = [{ + 'url': 'http://www.bilibili.tv/video/av1074402/', + 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', + 'info_dict': { + 'id': '1074402', + 'ext': 'flv', + 'title': '【金坷垃】金泡沫', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'duration': 308.067, + 'timestamp': 1398012678, + 'upload_date': '20140420', + 'thumbnail': r're:^https?://.+\.jpg', + 'uploader': '菊子桑', + 'uploader_id': '156160', + }, + }, { + # Tested in BiliBiliBangumiIE + 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', + 'only_matching': True, + }, { + 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', + 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'info_dict': { + 'id': '100643', + 'ext': 'mp4', + 'title': 'CHAOS;CHILD', + 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + }, + 'skip': 'Geo-restricted to China', + }, { + # Title with double quotes + 'url': 'http://www.bilibili.com/video/av8903802/', + 'info_dict': { + 'id': '8903802', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + }, + 'playlist': [{ + 'info_dict': { + 'id': '8903802_part1', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }, { + 'info_dict': { + 'id': '8903802_part2', + 'ext': 'flv', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382634, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, + }] + }, { + # new BV video id format + 'url': 'https://www.bilibili.com/video/BV1JE411F741', + 'only_matching': True, + }] + + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' + + def _report_error(self, result): + if 'message' in result: + raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) + elif 'code' in result: + raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) + else: + raise ExtractorError('Can\'t extract Bangumi episode ID') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_bv') + anime_id = mobj.group('anime_id') + webpage = self._download_webpage(url, video_id) + + if 'anime/' not in url: + cid = self._search_regex( + r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', + default=None + ) or compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', + r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters'))['cid'][0] + else: + if 'no_bangumi_tip' not in smuggled_data: + self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dlc with %s' % ( + video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) + headers = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url + } + headers.update(self.geo_verification_headers()) + + js = self._download_json( + 'http://bangumi.bilibili.com/web_api/get_source', video_id, + data=urlencode_postdata({'episode_id': video_id}), + headers=headers) + if 'result' not in js: + self._report_error(js) + cid = js['result']['cid'] + + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) + + entries = [] + + RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') + for num, rendition in enumerate(RENDITIONS, start=1): + payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + + if not video_info: + continue + + if 'durl' not in video_info: + if num < len(RENDITIONS): + continue + self._report_error(video_info) + + for idx, durl in enumerate(video_info['durl']): + formats = [{ + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), + }] + for backup_url in durl.get('backup_url', []): + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url else -3, + }) + + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, + }) + break + + title = self._html_search_regex( + (']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', + '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + group='title') + description = self._html_search_meta('description', webpage) + timestamp = unified_timestamp(self._html_search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', + default=None) or self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) + thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) + + # TODO 'view_count' requires deobfuscating Javascript + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnail': thumbnail, + 'duration': float_or_none(video_info.get('timelength'), scale=1000), + } + + uploader_mobj = re.search( + r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)', + webpage) + if uploader_mobj: + info.update({ + 'uploader': uploader_mobj.group('name'), + 'uploader_id': uploader_mobj.group('id'), + }) + if not info.get('uploader'): + info['uploader'] = self._html_search_meta( + 'author', webpage, 'uploader', default=None) + + for entry in entries: + entry.update(info) + + if len(entries) == 1: + return entries[0] + else: + for idx, entry in enumerate(entries): + entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + + return { + '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'description': description, + 'entries': entries, + } + + +class BiliBiliBangumiIE(InfoExtractor): + _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' + + IE_NAME = 'bangumi.bilibili.com' + IE_DESC = 'BiliBili番剧' + + _TESTS = [{ + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist_count': 26, + }, { + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist': [{ + 'md5': '91da8621454dd58316851c27c68b0c13', + 'info_dict': { + 'id': '40062', + 'ext': 'mp4', + 'title': '混沌武士', + 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', + 'timestamp': 1414538739, + 'upload_date': '20141028', + 'episode': '疾风怒涛 Tempestuous Temperaments', + 'episode_number': 1, + }, + }], + 'params': { + 'playlist_items': '1', + }, + }] + + @classmethod + def suitable(cls, url): + return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) + + def _real_extract(self, url): + bangumi_id = self._match_id(url) + + # Sometimes this API returns a JSONP response + season_info = self._download_json( + 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, + bangumi_id, transform_source=strip_jsonp)['result'] + + entries = [{ + '_type': 'url_transparent', + 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), + 'ie_key': BiliBiliIE.ie_key(), + 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), + 'episode': episode.get('index_title'), + 'episode_number': int_or_none(episode.get('index')), + } for episode in season_info['episodes']] + + entries = sorted(entries, key=lambda entry: entry.get('episode_number')) + + return self.playlist_result( + entries, bangumi_id, + season_info.get('bangumi_title'), season_info.get('evaluate')) + + +class BilibiliAudioBaseIE(InfoExtractor): + def _call_api(self, path, sid, query=None): + if not query: + query = {'sid': sid} + return self._download_json( + 'https://www.bilibili.com/audio/music-service-c/web/' + path, + sid, query=query)['data'] + + +class BilibiliAudioIE(BilibiliAudioBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/au1003142', + 'md5': 'fec4987014ec94ef9e666d4d158ad03b', + 'info_dict': { + 'id': '1003142', + 'ext': 'm4a', + 'title': '【tsukimi】YELLOW / 神山羊', + 'artist': 'tsukimi', + 'comment_count': int, + 'description': 'YELLOW的mp3版!', + 'duration': 183, + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1564836614, + 'upload_date': '20190803', + 'uploader': 'tsukimi-つきみぐー', + 'view_count': int, + }, + } + + def _real_extract(self, url): + au_id = self._match_id(url) + + play_data = self._call_api('url', au_id) + formats = [{ + 'url': play_data['cdns'][0], + 'filesize': int_or_none(play_data.get('size')), + }] + + song = self._call_api('song/info', au_id) + title = song['title'] + statistic = song.get('statistic') or {} + + subtitles = None + lyric = song.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }] + } + + return { + 'id': au_id, + 'title': title, + 'formats': formats, + 'artist': song.get('author'), + 'comment_count': int_or_none(statistic.get('comment')), + 'description': song.get('intro'), + 'duration': int_or_none(song.get('duration')), + 'subtitles': subtitles, + 'thumbnail': song.get('cover'), + 'timestamp': int_or_none(song.get('passtime')), + 'uploader': song.get('uname'), + 'view_count': int_or_none(statistic.get('play')), + } + + +class BilibiliAudioAlbumIE(BilibiliAudioBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)' + _TEST = { + 'url': 'https://www.bilibili.com/audio/am10624', + 'info_dict': { + 'id': '10624', + 'title': '每日新曲推荐(每日11:00更新)', + 'description': '每天11:00更新,为你推送最新音乐', + }, + 'playlist_count': 19, + } + + def _real_extract(self, url): + am_id = self._match_id(url) + + songs = self._call_api( + 'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data'] + + entries = [] + for song in songs: + sid = str_or_none(song.get('id')) + if not sid: + continue + entries.append(self.url_result( + 'https://www.bilibili.com/audio/au' + sid, + BilibiliAudioIE.ie_key(), sid)) + + if entries: + album_data = self._call_api('menu/info', am_id) or {} + album_title = album_data.get('title') + if album_title: + for entry in entries: + entry['album'] = album_title + return self.playlist_result( + entries, am_id, album_title, album_data.get('intro')) + + return self.playlist_result(entries, am_id) + + +class BiliBiliPlayerIE(InfoExtractor): + _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)' + _TEST = { + 'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.bilibili.tv/video/av%s/' % video_id, + ie=BiliBiliIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 0000000000..dc86c57c5d --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + remove_end, +) + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', + }, { + 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', + 'info_dict': { + 'id': 'b4xd0LK3SK', + 'ext': 'mp4', + # TODO: fix url_transparent information overriding + # 'uploader': 'Juan Pablo Echenique', + 'title': 'Comentario Oscar Cáceres', + }, + 'params': { + # empty m3u8 manifest + 'skip_download': True, + }, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + rudo_url = self._search_regex( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage, 'embed URL', None, group='url') + if not rudo_url: + raise ExtractorError('No videos found') + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', + webpage, 'uploader', fatal=False) + + return { + '_type': 'url_transparent', + 'url': rudo_url, + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + } diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 0000000000..17ebbb2576 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .vk import VKIE +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) +from ..utils import int_or_none + + +class BIQLEIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' + _TESTS = [{ + # Youtube embed + 'url': 'https://biqle.ru/watch/-115995369_456239081', + 'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06', + 'info_dict': { + 'id': '8v4f-avW-VI', + 'ext': 'mp4', + 'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer", + 'description': 'Passe-Partout', + 'uploader_id': 'mrsimpsonstef3', + 'uploader': 'Phanolito', + 'upload_date': '20120822', + }, + }, { + 'url': 'http://biqle.org/watch/-44781847_168547604', + 'md5': '7f24e72af1db0edf7c1aaba513174f97', + 'info_dict': { + 'id': '-44781847_168547604', + 'ext': 'mp4', + 'title': 'Ребенок в шоке от автоматической мойки', + 'timestamp': 1396633454, + 'uploader': 'Dmitry Kotov', + 'upload_date': '20140404', + 'uploader_id': '47850140', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._proto_relative_url(self._search_regex( + r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>', + webpage, 'embed url')) + if VKIE.suitable(embed_url): + return self.url_result(embed_url, VKIE.ie_key(), video_id) + + embed_page = self._download_webpage( + embed_url, video_id, headers={'Referer': url}) + video_ext = self._get_cookies(embed_url).get('video_ext') + if video_ext: + video_ext = compat_urllib_parse_unquote(video_ext.value) + if not video_ext: + video_ext = compat_b64decode(self._search_regex( + r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', + embed_page, 'video_ext')).decode() + video_id, sig, _, access_token = video_ext.split(':') + item = self._download_json( + 'https://api.vk.com/method/video.get', video_id, + headers={'User-Agent': 'okhttp/3.4.1'}, query={ + 'access_token': access_token, + 'sig': sig, + 'v': 5.44, + 'videos': video_id, + })['response']['items'][0] + title = item['title'] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + formats.append({ + 'format_id': height + 'p', + 'url': f_url, + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': item.get('description'), + 'duration': int_or_none(item.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('date')), + 'uploader': item.get('owner_id'), + 'view_count': int_or_none(item.get('views')), + } diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py new file mode 100644 index 0000000000..0c773e66e1 --- /dev/null +++ b/youtube_dl/extractor/bitchute.py @@ -0,0 +1,142 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + orderedSet, + unified_strdate, + urlencode_postdata, +) + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + 'upload_date': '20170813', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + }) + + title = self._html_search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + + formats = [ + {'url': format_url} + for format_url in orderedSet(format_urls)] + + if not formats: + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = self._html_search_regex( + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) + + upload_date = unified_strdate(self._search_regex( + r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', + webpage, 'upload date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 0000000000..dc60224d00 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': '6a5cd403418c7b01719248ca97fb0692', + 'info_dict': { + 'id': '2586817', + 'ext': 'webm', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py new file mode 100644 index 0000000000..db5e12b21b --- /dev/null +++ b/youtube_dl/extractor/blinkx.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + remove_start, + int_or_none, +) + + +class BlinkxIE(InfoExtractor): + _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' + IE_NAME = 'blinkx' + + _TEST = { + 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', + 'md5': '337cf7a344663ec79bf93a526a2e06c7', + 'info_dict': { + 'id': 'Da0Gw3xc', + 'ext': 'mp4', + 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', + 'uploader': 'IGN News', + 'upload_date': '20150217', + 'timestamp': 1424215740, + 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', + 'duration': 47.743333, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + display_id = video_id[:8] + + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + + 'video=%s' % video_id) + data_json = self._download_webpage(api_url, display_id) + data = json.loads(data_json)['api']['results'][0] + duration = None + thumbnails = [] + formats = [] + for m in data['media']: + if m['type'] == 'jpg': + thumbnails.append({ + 'url': m['link'], + 'width': int(m['w']), + 'height': int(m['h']), + }) + elif m['type'] == 'original': + duration = float(m['d']) + elif m['type'] == 'youtube': + yt_id = m['link'] + self.to_screen('Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) + elif m['type'] in ('flv', 'mp4'): + vcodec = remove_start(m['vcodec'], 'ff') + acodec = remove_start(m['acodec'], 'ff') + vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) + abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) + tbr = vbr + abr if vbr and abr else None + format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) + formats.append({ + 'format_id': format_id, + 'url': m['link'], + 'vcodec': vcodec, + 'acodec': acodec, + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(m.get('w')), + 'height': int_or_none(m.get('h')), + }) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'fullid': video_id, + 'title': data['title'], + 'formats': formats, + 'uploader': data['channel_name'], + 'timestamp': data['pubdate_epoch'], + 'description': data.get('description'), + 'thumbnails': thumbnails, + 'duration': duration, + } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py new file mode 100644 index 0000000000..2fbfad1ba0 --- /dev/null +++ b/youtube_dl/extractor/bloomberg.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class BloombergIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' + + _TESTS = [{ + 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', + # The md5 checksum changes + 'info_dict': { + 'id': 'qurhIVlJSB6hzkVi229d8g', + 'ext': 'flv', + 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', + 'description': 'md5:a8ba0302912d03d246979735c17d2761', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # video ID in BPlayer(...) + 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', + 'info_dict': { + 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', + 'ext': 'flv', + 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', + 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # data-bmmrid= + 'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money', + 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', + 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, + }] + + def _real_extract(self, url): + name = self._match_id(url) + webpage = self._download_webpage(url, name) + video_id = self._search_regex( + (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'), + webpage, 'id', group='id', default=None) + if not video_id: + bplayer_data = self._parse_json(self._search_regex( + r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + video_id = bplayer_data['id'] + title = re.sub(': Video$', '', self._og_search_title(webpage)) + + embed_info = self._download_json( + 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + formats = [] + for stream in embed_info['streams']: + stream_url = stream.get('url') + if not stream_url: + continue + if stream['muxing_format'] == 'TS': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.extend(self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py new file mode 100644 index 0000000000..6017e83449 --- /dev/null +++ b/youtube_dl/extractor/bokecc.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', + webpage, 'player params', group='query') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', + 'info_dict': { + 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 0000000000..57882fbeea --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' + _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. + 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> + entries = [] + for video in re.findall(r'(?i)(<video[^>]+>)', webpage): + attrs = extract_attributes(video) + + video_id = attrs.get('data-brightcove-video-id') + account_id = attrs.get('data-account') + player_id = attrs.get('data-player') + embed = attrs.get('data-embed') + + if video_id and account_id and player_id and embed: + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + if len(entries) == 0: + return self.url_result(url, 'Generic') + elif len(entries) == 1: + return self.url_result(entries[0], 'BrightcoveNew') + else: + return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py new file mode 100644 index 0000000000..07833532e9 --- /dev/null +++ b/youtube_dl/extractor/bpb.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + determine_ext, +) + + +class BpbIE(InfoExtractor): + IE_DESC = 'Bundeszentrale für politische Bildung' + _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' + + _TEST = { + 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', + # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 + 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', + 'info_dict': { + 'id': '297', + 'ext': 'mp4', + 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', + 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h2 class="white">(.*?)</h2>', webpage, 'title') + video_info_dicts = re.findall( + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) + + formats = [] + for video_info in video_info_dicts: + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' + formats.append({ + 'url': video_url, + 'preference': 10 if quality == 'high' else 0, + 'format_note': quality, + 'format_id': '%s-%s' % (quality, determine_ext(video_url)), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py new file mode 100644 index 0000000000..9bde7f2d82 --- /dev/null +++ b/youtube_dl/extractor/br.py @@ -0,0 +1,311 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_duration, + parse_iso8601, + xpath_element, + xpath_text, +) + + +class BRIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk' + _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' + + _TESTS = [ + { + 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', + 'md5': '83a0477cf0b8451027eb566d88b51106', + 'info_dict': { + 'id': '48f656ef-287e-486f-be86-459122db22cc', + 'ext': 'mp4', + 'title': 'Die böse Überraschung', + 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', + 'duration': 180, + 'uploader': 'Reinhard Weber', + 'upload_date': '20150422', + }, + 'skip': '404 not found', + }, + { + 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', + 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', + 'info_dict': { + 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', + 'ext': 'flv', + 'title': 'Manfred Schreiber ist tot', + 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', + 'duration': 26, + }, + 'skip': '404 not found', + }, + { + 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', + 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', + 'info_dict': { + 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', + 'ext': 'aac', + 'title': 'Kurzweilig und sehr bewegend', + 'description': 'md5:0351996e3283d64adeb38ede91fac54e', + 'duration': 296, + }, + 'skip': '404 not found', + }, + { + 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', + 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', + 'info_dict': { + 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', + 'ext': 'mp4', + 'title': 'Umweltbewusster Häuslebauer', + 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', + 'duration': 116, + } + }, + { + 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', + 'md5': '23bca295f1650d698f94fc570977dae3', + 'info_dict': { + 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', + 'ext': 'mp4', + 'title': 'Folge 1 - Metaphysik', + 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', + 'duration': 893, + 'uploader': 'Eva Maria Steimle', + 'upload_date': '20170208', + } + }, + ] + + def _real_extract(self, url): + base_url, display_id = re.search(self._VALID_URL, url).groups() + page = self._download_webpage(url, display_id) + xml_url = self._search_regex( + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') + xml = self._download_xml(base_url + xml_url, display_id) + + medias = [] + + for xml_media in xml.findall('video') + xml.findall('audio'): + media_id = xml_media.get('externalId') + media = { + 'id': media_id, + 'title': xpath_text(xml_media, 'title', 'title', True), + 'duration': parse_duration(xpath_text(xml_media, 'duration')), + 'formats': self._extract_formats(xpath_element( + xml_media, 'assets'), media_id), + 'thumbnails': self._extract_thumbnails(xpath_element( + xml_media, 'teaserImage/variants'), base_url), + 'description': xpath_text(xml_media, 'desc'), + 'webpage_url': xpath_text(xml_media, 'permalink'), + 'uploader': xpath_text(xml_media, 'author'), + } + broadcast_date = xpath_text(xml_media, 'broadcastDate') + if broadcast_date: + media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) + medias.append(media) + + if len(medias) > 1: + self._downloader.report_warning( + 'found multiple medias; please ' + 'report this with the video URL to http://yt-dl.org/bug') + if not medias: + raise ExtractorError('No media entries found') + return medias[0] + + def _extract_formats(self, assets, media_id): + formats = [] + for asset in assets.findall('asset'): + format_url = xpath_text(asset, ['downloadUrl', 'url']) + asset_type = asset.get('type') + if asset_type.startswith('HDS'): + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) + elif asset_type.startswith('HLS'): + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) + else: + format_info = { + 'ext': xpath_text(asset, 'mediaType'), + 'width': int_or_none(xpath_text(asset, 'frameWidth')), + 'height': int_or_none(xpath_text(asset, 'frameHeight')), + 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), + 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), + 'vcodec': xpath_text(asset, 'codecVideo'), + 'acodec': xpath_text(asset, 'codecAudio'), + 'container': xpath_text(asset, 'mediaType'), + 'filesize': int_or_none(xpath_text(asset, 'size')), + } + format_url = self._proto_relative_url(format_url) + if format_url: + http_format_info = format_info.copy() + http_format_info.update({ + 'url': format_url, + 'format_id': 'http-%s' % asset_type, + }) + formats.append(http_format_info) + server_prefix = xpath_text(asset, 'serverPrefix') + if server_prefix: + rtmp_format_info = format_info.copy() + rtmp_format_info.update({ + 'url': server_prefix, + 'play_path': xpath_text(asset, 'fileName'), + 'format_id': 'rtmp-%s' % asset_type, + }) + formats.append(rtmp_format_info) + self._sort_formats(formats) + return formats + + def _extract_thumbnails(self, variants, base_url): + thumbnails = [{ + 'url': base_url + xpath_text(variant, 'url'), + 'width': int_or_none(xpath_text(variant, 'width')), + 'height': int_or_none(xpath_text(variant, 'height')), + } for variant in variants.findall('variant') if xpath_text(variant, 'url')] + thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) + return thumbnails + + +class BRMediathekIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + + _TESTS = [{ + 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', + 'md5': 'fdc3d485835966d1622587d08ba632ec', + 'info_dict': { + 'id': 'av:5a1e6a6e8fce6d001871cc8e', + 'ext': 'mp4', + 'title': 'Die Sendung vom 28.11.2017', + 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', + 'timestamp': 1511942766, + 'upload_date': '20171129', + } + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + + clip = self._download_json( + 'https://proxy-base.master.mango.express/graphql', + clip_id, data=json.dumps({ + "query": """{ + viewer { + clip(id: "%s") { + title + description + duration + createdAt + ageRestriction + videoFiles { + edges { + node { + publicLocation + fileSize + videoProfile { + width + height + bitrate + encoding + } + } + } + } + captionFiles { + edges { + node { + publicLocation + } + } + } + teaserImages { + edges { + node { + imageFiles { + edges { + node { + publicLocation + width + height + } + } + } + } + } + } + } + } +}""" % clip_id}).encode(), headers={ + 'Content-Type': 'application/json', + })['data']['viewer']['clip'] + title = clip['title'] + + formats = [] + for edge in clip.get('videoFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + ext = determine_ext(n_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + n_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + video_profile = node.get('videoProfile', {}) + tbr = int_or_none(video_profile.get('bitrate')) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': n_url, + 'width': int_or_none(video_profile.get('width')), + 'height': int_or_none(video_profile.get('height')), + 'tbr': tbr, + 'filesize': int_or_none(node.get('fileSize')), + }) + self._sort_formats(formats) + + subtitles = {} + for edge in clip.get('captionFiles', {}).get('edges', []): + node = edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + subtitles.setdefault('de', []).append({ + 'url': n_url, + }) + + thumbnails = [] + for edge in clip.get('teaserImages', {}).get('edges', []): + for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): + node = image_edge.get('node', {}) + n_url = node.get('publicLocation') + if not n_url: + continue + thumbnails.append({ + 'url': n_url, + 'width': int_or_none(node.get('width')), + 'height': int_or_none(node.get('height')), + }) + + return { + 'id': clip_id, + 'title': title, + 'description': clip.get('description'), + 'duration': int_or_none(clip.get('duration')), + 'timestamp': parse_iso8601(clip.get('createdAt')), + 'age_limit': int_or_none(clip.get('ageRestriction')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 0000000000..b9715df00b --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, + int_or_none, +) + + +class BravoTVIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', + 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', + 'info_dict': { + 'id': 'epL0pmK1kQlT', + 'ext': 'mp4', + 'title': 'The Top Chef Season 16 Winner Is...', + 'description': 'Find out who takes the title of Top Chef!', + 'uploader': 'NBCU-BRAV', + 'upload_date': '20190314', + 'timestamp': 1552591860, + } + }, { + 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + settings = self._parse_json(self._search_regex( + r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), + display_id) + info = {} + query = { + 'mbr': 'true', + } + account_pid, release_pid = [None] * 2 + tve = settings.get('ls_tve') + if tve: + query['manifest'] = 'm3u' + mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) + if mobj: + account_pid, tp_path = mobj.groups() + release_pid = tp_path.strip('/').split('/')[-1] + else: + account_pid = 'HNK2IC' + tp_path = release_pid = tve['release_pid'] + if tve.get('entitlement') == 'auth': + adobe_pass = settings.get('tve_adobe_auth', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'bravo'), + tve['title'], release_pid, tve.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + else: + shared_playlist = settings['ls_playlist'] + account_pid = shared_playlist['account_pid'] + metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] + tp_path = release_pid = metadata.get('release_pid') + if not release_pid: + release_pid = metadata['guid'] + tp_path = 'media/guid/2140479951/' + release_pid + info.update({ + 'title': metadata['title'], + 'description': metadata.get('description'), + 'season_number': int_or_none(metadata.get('season_num')), + 'episode_number': int_or_none(metadata.get('episode_num')), + }) + query['switch'] = 'progressive' + info.update({ + '_type': 'url_transparent', + 'id': release_pid, + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path), + query), {'force_smil_url': True}), + 'ie_key': 'ThePlatform', + }) + return info diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py new file mode 100644 index 0000000000..68c7cf2bba --- /dev/null +++ b/youtube_dl/extractor/breakcom.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + int_or_none, + url_or_none, +) + + +class BreakIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', + 'info_dict': { + 'id': '2468056', + 'ext': 'mp4', + 'title': 'When Girls Act Like D-Bags', + 'age_limit': 13, + }, + }, { + # youtube embed + 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', + 'info_dict': { + 'id': 'RrrDLdeL2HQ', + 'ext': 'mp4', + 'title': 'Whale Watching Boat Crashing Into San Diego Dock', + 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', + 'upload_date': '20160331', + 'uploader': 'Steve Holden', + 'uploader_id': 'sdholden07', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage(url, display_id) + + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + content = self._parse_json( + self._search_regex( + r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, + 'content'), + display_id) + + formats = [] + for video in content: + video_url = url_or_none(video.get('url')) + if not video_url: + continue + bitrate = int_or_none(self._search_regex( + r'(\d+)_kbps', video_url, 'tbr', default=None)) + formats.append({ + 'url': video_url, + 'format_id': 'http-%d' % bitrate if bitrate else 'http', + 'tbr': bitrate, + }) + self._sort_formats(formats) + + title = self._search_regex( + (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') + + def get(key, name): + return int_or_none(self._search_regex( + r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, + default=None)) + + age_limit = get('ratings', 'age limit') + video_id = video_id or get('pid', 'video id') or display_id + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py new file mode 100644 index 0000000000..2aa9f4782e --- /dev/null +++ b/youtube_dl/extractor/brightcove.py @@ -0,0 +1,677 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re +import struct + +from .adobepass import AdobePassIE +from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, + compat_xml_parse_error, +) +from ..utils import ( + clean_html, + extract_attributes, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + float_or_none, + int_or_none, + js_to_json, + mimetype2ext, + parse_iso8601, + smuggle_url, + str_or_none, + unescapeHTML, + unsmuggle_url, + UnsupportedError, + update_url_query, + url_or_none, +) + + +class BrightcoveLegacyIE(InfoExtractor): + IE_NAME = 'brightcove:legacy' + _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' + + _TESTS = [ + { + # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', + 'md5': '5423e113865d26e40624dce2e4b45d95', + 'note': 'Test Brightcove downloads and detection in GenericIE', + 'info_dict': { + 'id': '2371591881001', + 'ext': 'mp4', + 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + 'uploader': '8TV', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': '1589608506001', + }, + 'skip': 'The player has been deactivated by the content owner', + }, + { + # From http://medianetwork.oracle.com/video/player/1785452137001 + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', + 'info_dict': { + 'id': '1785452137001', + 'ext': 'flv', + 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', + 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': '1460825906', + }, + 'skip': 'video not playable', + }, + { + # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ + 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', + 'info_dict': { + 'id': '2750934548001', + 'ext': 'mp4', + 'title': 'This Bracelet Acts as a Personal Thermostat', + 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', + # 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': '1130468786001', + }, + }, + { + # test that the default referer works + # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ + 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', + 'info_dict': { + 'id': '2878862109001', + 'ext': 'mp4', + 'title': 'Lost in Motion II', + 'description': 'md5:363109c02998fee92ec02211bd8000df', + 'uploader': 'National Ballet of Canada', + }, + 'skip': 'Video gone', + }, + { + # test flv videos served by akamaihd.net + # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + # The md5 checksum changes on each download + 'info_dict': { + 'id': '3750436379001', + 'ext': 'flv', + 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'uploader': 'RBTV Old (do not use)', + 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': '710858724001', + }, + 'skip': 'Video gone', + }, + { + # playlist with 'videoList' + # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', + 'info_dict': { + 'title': 'Sealife', + 'id': '3550319591001', + }, + 'playlist_mincount': 7, + 'skip': 'Unsupported URL', + }, + { + # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + 'skip': 'Unsupported URL', + }, + { + # playerID inferred from bcpid + # from http://www.un.org/chinese/News/story.asp?NewsID=27724 + 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', + 'only_matching': True, # Tested in GenericIE + } + ] + + @classmethod + def _build_brighcove_url(cls, object_str): + """ + Build a Brightcove url from a xml string containing + <object class="BrightcoveExperience">{params}</object> + """ + + # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553 + object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', + lambda m: m.group(1) + '/>', object_str) + # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608 + object_str = object_str.replace('<--', '<!--') + # remove namespace to simplify extraction + object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) + object_str = fix_xml_ampersands(object_str) + + try: + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) + except compat_xml_parse_error: + return + + fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') + if fv_el is not None: + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + else: + flashvars = {} + + data_url = object_doc.attrib.get('data', '') + data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) + + def find_param(name): + if name in flashvars: + return flashvars[name] + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return data_url_params.get(name) + + params = {} + + playerID = find_param('playerID') or find_param('playerId') + if playerID is None: + raise ExtractorError('Cannot find player ID') + params['playerID'] = playerID + + playerKey = find_param('playerKey') + # Not all pages define this value + if playerKey is not None: + params['playerKey'] = playerKey + # These fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') + if videoPlayer is not None: + if isinstance(videoPlayer, list): + videoPlayer = videoPlayer[0] + videoPlayer = videoPlayer.strip() + # UUID is also possible for videoPlayer (e.g. + # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd + # or http://www8.hp.com/cn/zh/home.html) + if not (re.match( + r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', + videoPlayer) or videoPlayer.startswith('ref:')): + return None + params['@videoPlayer'] = videoPlayer + linkBase = find_param('linkBaseURL') + if linkBase is not None: + params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove <object /> XML + # } + m = re.search( + r'''(?x)customBC\.createVideo\( + .*? # skipping width and height + ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID + ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P<videoID>\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): + return update_url_query( + 'http://c.brightcove.com/services/viewer/htmlFederated', params) + + @classmethod + def _extract_brightcove_url(cls, webpage): + """Try to extract the brightcove url from the webpage, returns None + if it can't be found + """ + urls = cls._extract_brightcove_urls(webpage) + return urls[0] if urls else None + + @classmethod + def _extract_brightcove_urls(cls, webpage): + """Return a list of all Brightcove URLs from the webpage """ + + url_m = re.search( + r'''(?x) + <meta\s+ + (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+ + content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 + ''', webpage) + if url_m: + url = unescapeHTML(url_m.group('url')) + # Some sites don't add it, we can't download with this url, for example: + # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ + if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: + return [url] + + matches = re.findall( + r'''(?sx)<object + (?: + [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | + [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ + ).+?>\s*</object>''', + webpage) + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) + if matches: + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in matches])) + return [src for _, src in re.findall( + r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + # Change the 'videoId' and others field to '@videoPlayer' + url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) + # Change bckey (used by bcove.me urls) to playerKey + url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) + mobj = re.match(self._VALID_URL, url) + query_str = mobj.group('query') + query = compat_urlparse.parse_qs(query_str) + + videoPlayer = query.get('@videoPlayer') + if videoPlayer: + # We set the original url as the default 'Referer' header + referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) + video_id = videoPlayer[0] + if 'playerID' not in query: + mobj = re.search(r'/bcpid(\d+)', url) + if mobj is not None: + query['playerID'] = [mobj.group(1)] + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] + else: + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + headers = {} + if referer: + headers['Referer'] = referer + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', + player_page, 'player key', fatal=False) + if player_key: + enc_pub_id = player_key.split(',')[1].replace('~', '=') + publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + if referer: + brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + # TODO: figure out if it's possible to extract playlistId from playerKey + # elif 'playerKey' in query: + # player_key = query['playerKey'] + # return self._get_playlist_info(player_key[0]) + raise UnsupportedError(url) + + +class BrightcoveNewIE(AdobePassIE): + IE_NAME = 'brightcove:new' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)' + _TESTS = [{ + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'mp4', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'duration': 165.768, + 'timestamp': 1441391203, + 'upload_date': '20150904', + 'uploader_id': '929656772001', + 'formats': 'mincount:20', + }, + }, { + # with rtmp streams + 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', + 'info_dict': { + 'id': '4279049078001', + 'ext': 'mp4', + 'title': 'Titansgrave: Chapter 0', + 'description': 'Titansgrave: Chapter 0', + 'duration': 1242.058, + 'timestamp': 1433556729, + 'upload_date': '20150606', + 'uploader_id': '4036320279001', + 'formats': 'mincount:39', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + # playlist stream + 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', + 'info_dict': { + 'id': '5718313430001', + 'title': 'No Audio Playlist', + }, + 'playlist_count': 7, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', + 'only_matching': True, + }, { + # ref: prefixed video id + 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', + 'only_matching': True, + }, { + # non numeric ref: prefixed video id + 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', + 'only_matching': True, + }, { + # unavailable video without message but with error_code + 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(ie, webpage): + urls = BrightcoveNewIE._extract_urls(ie, webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(ie, webpage): + # Reference: + # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript + # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html + # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player + + entries = [] + + # Look for iframe embeds [1] + for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) + + # Look for <video> tags [2] and embed_in_page embeds [3] + # [2] looks like: + for video, script_tag, account_id, player_id, embed in re.findall( + r'''(?isx) + (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (?:.*? + (<script[^>]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + ) + )? + ''', webpage): + attrs = extract_attributes(video) + + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. + if not script_tag and not ie._is_valid_url( + bc_url, video_id, 'possible brightcove video'): + continue + + entries.append(bc_url) + + return entries + + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): + title = json_data['name'].strip() + + formats = [] + for source in json_data.get('sources', []): + container = source.get('container') + ext = mimetype2ext(source.get('type')) + src = source.get('src') + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + continue + elif ext == 'm3u8' or container == 'M2TS': + if not src: + continue + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) + else: + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) + f = { + 'tbr': tbr, + 'filesize': int_or_none(source.get('size')), + 'container': container, + 'ext': ext or container.lower(), + } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) + + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'source_preference': 0 if src else -1, + }) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + formats.append(f) + if not formats: + # for sonyliv.com DRM protected videos + s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') + if s3_source_url: + formats.append({ + 'url': s3_source_url, + 'format_id': 'source', + }) + + errors = json_data.get('errors') + if not formats and errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + + self._sort_formats(formats) + + for f in formats: + f.setdefault('http_headers', {}).update(headers) + + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('kind') != 'captions': + continue + text_track_url = url_or_none(text_track.get('src')) + if not text_track_url: + continue + lang = (str_or_none(text_track.get('srclang')) + or str_or_none(text_track.get('label')) or 'en').lower() + subtitles.setdefault(lang, []).append({ + 'url': text_track_url, + }) + + is_live = False + duration = float_or_none(json_data.get('duration'), 1000) + if duration is not None and duration <= 0: + is_live = True + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': clean_html(json_data.get('description')), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'duration': duration, + 'timestamp': parse_iso8601(json_data.get('published_at')), + 'uploader_id': json_data.get('account_id'), + 'formats': formats, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), + 'is_live': is_live, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + 'ip_blocks': smuggled_data.get('geo_ip_blocks'), + }) + + account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups() + + policy_key_id = '%s_%s' % (account_id, player_id) + policy_key = self._downloader.cache.load('brightcove', policy_key_id) + policy_key_extracted = False + store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + + def extract_policy_key(): + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') + + store_pk(policy_key) + return policy_key + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) + headers = {} + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + }) + + for _ in range(2): + if not policy_key: + policy_key = extract_policy_key() + policy_key_extracted = True + headers['Accept'] = 'application/json;pk=%s' % policy_key + try: + json_data = self._download_json(api_url, video_id, headers=headers) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: + policy_key = None + store_pk(None) + continue + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + if content_type == 'playlist': + return self.playlist_result( + [self._parse_brightcove_metadata(vid, vid.get('id'), headers) + for vid in json_data.get('videos', []) if vid.get('id')], + json_data.get('id'), json_data.get('name'), + json_data.get('description')) + + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py new file mode 100644 index 0000000000..73a57b1e4d --- /dev/null +++ b/youtube_dl/extractor/businessinsider.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ffed3e1e12a6f950aa2f7d83851b497a', + 'info_dict': { + 'id': 'cjGDb0X9', + 'ext': 'mp4', + 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", + 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', + 'upload_date': '20160611', + 'timestamp': 1465675620, + }, + }, { + 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'md5': '43f438dbc6da0b89f5ac42f68529d84a', + 'info_dict': { + 'id': '5zJwd4FK', + 'ext': 'mp4', + 'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort', + 'description': 'md5:2af8975825d38a4fed24717bbe51db49', + 'upload_date': '20170705', + 'timestamp': 1499270528, + }, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})', + r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py new file mode 100644 index 0000000000..ec411091ef --- /dev/null +++ b/youtube_dl/extractor/buzzfeed.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .facebook import FacebookIE + + +class BuzzFeedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia', + 'info_dict': { + 'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss', + 'title': 'This Angry Ram Destroys A Punching Bag Like A Boss', + 'description': 'Rambro!', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'aVCR29aE_OQ', + 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', + 'upload_date': '20141024', + 'uploader_id': 'Buddhanz1', + 'uploader': 'Angry Ram', + } + }] + }, { + 'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia', + 'params': { + 'skip_download': True, # Got enough YouTube download tests + }, + 'info_dict': { + 'id': 'look-at-this-cute-dog-omg', + 'description': 're:Munchkin the Teddy Bear is back ?!', + 'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'mVmBL8B-In0', + 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', + 'upload_date': '20141124', + 'uploader_id': 'CindysMunchkin', + 'uploader': 're:^Munchkin the', + }, + }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + all_buckets = re.findall( + r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'', + webpage) + + entries = [] + for bd_json in all_buckets: + bd = json.loads(bd_json) + video = bd.get('video') or bd.get('progload_video') + if not video: + continue + entries.append(self.url_result(video['url'])) + + facebook_urls = FacebookIE._extract_urls(webpage) + entries.extend([ + self.url_result(facebook_url) + for facebook_url in facebook_urls]) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py new file mode 100644 index 0000000000..0b11bf11fc --- /dev/null +++ b/youtube_dl/extractor/byutv.py @@ -0,0 +1,117 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + merge_dicts, + parse_duration, + url_or_none, +) + + +class BYUtvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' + _TESTS = [{ + # ooyalaVOD + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', + 'info_dict': { + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', + 'display_id': 'studio-c-season-5-episode-5', + 'ext': 'mp4', + 'title': 'Season 5 Episode 5', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', + 'duration': 1486.486, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + # dvr + 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', + 'info_dict': { + 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', + 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', + 'ext': 'mp4', + 'title': 'Pacific vs. BYU (4/12/19)', + 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', + 'duration': 11645, + }, + 'params': { + 'skip_download': True + }, + }, { + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', + display_id, query={ + 'contentid': video_id, + 'channel': 'byutv', + 'x-byutv-context': 'web$US', + }, headers={ + 'x-byutv-context': 'web$US', + 'x-byutv-platformkey': 'xsaaw9c7y5', + }) + + ep = video.get('ooyalaVOD') + if ep: + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'display_id': display_id, + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + + info = {} + formats = [] + for format_id, ep in video.items(): + if not isinstance(ep, dict): + continue + video_url = url_or_none(ep.get('videoUrl')) + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + merge_dicts(info, { + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + }) + self._sort_formats(formats) + + return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'formats': formats, + }) diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py new file mode 100644 index 0000000000..cac8fdcba4 --- /dev/null +++ b/youtube_dl/extractor/c56.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' + IE_NAME = '56.com' + _TESTS = [{ + 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', + 'md5': 'e59995ac63d0457783ea05f93f12a866', + 'info_dict': { + 'id': '93440716', + 'ext': 'flv', + 'title': '网事知多少 第32期:车怒', + 'duration': 283.813, + }, + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + text_id = mobj.group('textid') + + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + + page = self._download_json( + 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + + info = page['info'] + + formats = [ + { + 'format_id': f['type'], + 'filesize': int(f['filesize']), + 'url': f['url'] + } for f in info['rfiles'] + ] + self._sort_formats(formats) + + return { + 'id': info['vid'], + 'title': info['Subject'], + 'duration': int(info['duration']) / 1000.0, + 'formats': formats, + 'thumbnail': info.get('bimg') or info.get('img'), + } diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py new file mode 100644 index 0000000000..8f0c6c545c --- /dev/null +++ b/youtube_dl/extractor/camdemy.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + clean_html, + parse_duration, + str_to_int, + unified_strdate, +) + + +class CamdemyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _TESTS = [{ + # single file + 'url': 'http://www.camdemy.com/media/5181/', + 'md5': '5a5562b6a98b37873119102e052e311b', + 'info_dict': { + 'id': '5181', + 'ext': 'mp4', + 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'ss11spring', + 'duration': 1591, + 'upload_date': '20130114', + 'view_count': int, + } + }, { + # With non-empty description + # webpage returns "No permission or not login" + 'url': 'http://www.camdemy.com/media/13885', + 'md5': '4576a3bb2581f86c61044822adbd1249', + 'info_dict': { + 'id': '13885', + 'ext': 'mp4', + 'title': 'EverCam + Camdemy QuickStart', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', + 'creator': 'evercam', + 'duration': 318, + } + }, { + # External source (YouTube) + 'url': 'http://www.camdemy.com/media/14842', + 'info_dict': { + 'id': '2vsYQzNIsJo', + 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', + 'upload_date': '20130211', + 'uploader': 'Hun Kim', + 'uploader_id': 'hunkimtutorials', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + src_from = self._html_search_regex( + r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') + if src_from: + return self.url_result(src_from) + + oembed_obj = self._download_json( + 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + + title = oembed_obj['title'] + thumb_url = oembed_obj['thumbnail_url'] + video_folder = compat_urlparse.urljoin(thumb_url, 'video/') + file_list_doc = self._download_xml( + compat_urlparse.urljoin(video_folder, 'fileList.xml'), + video_id, 'Downloading filelist XML') + file_name = file_list_doc.find('./video/item/fileName').text + video_url = compat_urlparse.urljoin(video_folder, file_name) + + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumb_url, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, + 'view_count': view_count, + } + + +class CamdemyFolderIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)' + _TESTS = [{ + # links with trailing slash + 'url': 'http://www.camdemy.com/folder/450', + 'info_dict': { + 'id': '450', + 'title': '信號與系統 2012 & 2011 (Signals and Systems)', + }, + 'playlist_mincount': 145 + }, { + # links without trailing slash + # and multi-page + 'url': 'http://www.camdemy.com/folder/853', + 'info_dict': { + 'id': '853', + 'title': '科學計算 - 使用 Matlab' + }, + 'playlist_mincount': 20 + }, { + # with displayMode parameter. For testing the codes to add parameters + 'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg', + 'info_dict': { + 'id': '853', + 'title': '科學計算 - 使用 Matlab' + }, + 'playlist_mincount': 20 + }] + + def _real_extract(self, url): + folder_id = self._match_id(url) + + # Add displayMode=list so that all links are displayed in a single page + parsed_url = list(compat_urlparse.urlparse(url)) + query = dict(compat_urlparse.parse_qsl(parsed_url[4])) + query.update({'displayMode': 'list'}) + parsed_url[4] = compat_urllib_parse_urlencode(query) + final_url = compat_urlparse.urlunparse(parsed_url) + + page = self._download_webpage(final_url, folder_id) + matches = re.findall(r"href='(/media/\d+/?)'", page) + + entries = [self.url_result('http://www.camdemy.com' + media_path) + for media_path in matches] + + folder_title = self._html_search_meta('keywords', page) + + return self.playlist_result(entries, folder_id, folder_title) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 0000000000..1eb81b75e9 --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + 'age_limit': 18 + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = url_or_none(media.get('location')) + if not media_url: + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'preference': -1, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + 'age_limit': 18 + } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py new file mode 100644 index 0000000000..b3be3bdcf7 --- /dev/null +++ b/youtube_dl/extractor/camtube.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + 'age_limit': 18 + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + 'age_limit': 18 + } diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py new file mode 100644 index 0000000000..bbc5205fde --- /dev/null +++ b/youtube_dl/extractor/camwithher.py @@ -0,0 +1,89 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' + + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex( + r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') + + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') + description = self._html_search_regex( + r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) + + return { + 'id': flv_id, + 'url': rtmp_url, + 'ext': 'flv', + 'no_resume': True, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, + 'age_limit': 18 + } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 0000000000..407cc80848 --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class Canalc2IE(InfoExtractor): + IE_NAME = 'canalc2.tv' + _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.canalc2.tv/video/12163', + 'md5': '060158428b650f896c542dfbb3d6487f', + 'info_dict': { + 'id': '12163', + 'ext': 'mp4', + 'title': 'Terrasses du Numérique', + 'duration': 122, + }, + }, { + 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.canalc2.tv/video/%s' % video_id, video_id) + + title = self._html_search_regex( + r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>', + webpage, 'title') + + formats = [] + for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): + if video_url.startswith('rtmp://'): + rtmp = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url) + formats.append({ + 'url': rtmp.group('url'), + 'format_id': 'rtmp', + 'ext': 'flv', + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + 'page_url': url, + }) + else: + formats.append({ + 'url': video_url, + 'format_id': 'http', + }) + + if formats: + info = { + 'formats': formats, + } + else: + info = self._parse_html5_media_entries(url, webpage, url)[0] + + self._sort_formats(info['formats']) + + info.update({ + 'id': video_id, + 'title': title, + 'duration': parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)), + }) + return info diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py new file mode 100644 index 0000000000..51c11cb7e1 --- /dev/null +++ b/youtube_dl/extractor/canalplus.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + # ExtractorError, + # HEADRequest, + int_or_none, + qualities, + unified_strdate, +) + + +class CanalplusIE(InfoExtractor): + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' + _SITE_ID_MAP = { + 'mycanal': 'cplus', + 'piwiplus': 'teletoon', + } + + # Only works for direct mp4 URLs + _GEO_COUNTRIES = ['FR'] + + _TESTS = [{ + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', + 'info_dict': { + 'id': '1397061', + 'display_id': 'lolywood', + 'ext': 'mp4', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', + }, + }, { + # geo restricted, bypassed + 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', + 'info_dict': { + 'id': '1108190', + 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', + 'ext': 'mp4', + 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', + 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', + 'upload_date': '20140724', + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }] + + def _real_extract(self, url): + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + + site_id = self._SITE_ID_MAP[site] + + info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) + video_data = self._download_json(info_url, video_id, 'Downloading video JSON') + + if isinstance(video_data, list): + video_data = [video for video in video_data if video.get('ID') == video_id][0] + media = video_data['MEDIA'] + infos = video_data['INFOS'] + + preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) + + # _, fmt_url = next(iter(media['VIDEOS'].items())) + # if '/geo' in fmt_url.lower(): + # response = self._request_webpage( + # HEADRequest(fmt_url), video_id, + # 'Checking if the video is georestricted') + # if '/blocage' in response.geturl(): + # raise ExtractorError( + # 'The video is not available in your country', + # expected=True) + + formats = [] + for format_id, format_url in media['VIDEOS'].items(): + if not format_url: + continue + if format_id == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + elif format_id == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js + 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', + 'format_id': format_id, + 'preference': preference(format_id), + }) + self._sort_formats(formats) + + thumbnails = [{ + 'id': image_id, + 'url': image_url, + } for image_id, image_url in media.get('images', {}).items()] + + titrage = infos['TITRAGE'] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': '%s - %s' % (titrage['TITRE'], + titrage['SOUS_TITRE']), + 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), + 'thumbnails': thumbnails, + 'description': infos.get('DESCRIPTION'), + 'duration': int_or_none(infos.get('DURATION')), + 'view_count': int_or_none(infos.get('NB_VUES')), + 'like_count': int_or_none(infos.get('NB_LIKES')), + 'comment_count': int_or_none(infos.get('NB_COMMENTS')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py new file mode 100644 index 0000000000..8667a0d045 --- /dev/null +++ b/youtube_dl/extractor/canvas.py @@ -0,0 +1,368 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + strip_or_none, + float_or_none, + int_or_none, + merge_dicts, + parse_iso8601, + str_or_none, + url_or_none, +) + + +class CanvasIE(InfoExtractor): + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'md5': '68993eda72ef62386a15ea2cf3c93107', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'mp4', + 'title': 'Nachtwacht: De Greystook', + 'description': 'Nachtwacht: De Greystook', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.04, + }, + 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], + }, { + 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'only_matching': True, + }] + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8', + } + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site_id'), mobj.group('id') + + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) + + # New API endpoint + if not data: + token = self._download_json( + '%s/tokens' % self._REST_API_BASE, video_id, + 'Downloading token', data=b'', + headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + data = self._download_json( + '%s/videos/%s' % (self._REST_API_BASE, video_id), + video_id, 'Downloading video JSON', fatal=False, query={ + 'vrtPlayerToken': token, + 'client': '%s@PROD' % site_id, + }, expected_status=400) + message = data.get('message') + if message and not data.get('title'): + if data.get('code') == 'AUTHENTICATION_REQUIRED': + self.raise_login_required(message) + raise ExtractorError(message, expected=True) + + title = data['title'] + description = data.get('description') + + formats = [] + for target in data['targetUrls']: + format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) + if not format_url or not format_type: + continue + format_type = format_type.upper() + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], + m3u8_id=format_type, fatal=False)) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id=format_type, fatal=False)) + elif format_type == 'HSS': + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + self._sort_formats(formats) + + subtitles = {} + subtitle_urls = data.get('subtitleUrls') + if isinstance(subtitle_urls, list): + for subtitle in subtitle_urls: + subtitle_url = subtitle.get('url') + if subtitle_url and subtitle.get('type') == 'CLOSED': + subtitles.setdefault('nl', []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + 'subtitles': subtitles, + } + + +class CanvasEenIE(InfoExtractor): + IE_DESC = 'canvas.be and een.be' + _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', + 'md5': 'ed66976748d12350b118455979cca293', + 'info_dict': { + 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', + 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', + 'ext': 'flv', + 'title': 'De afspraak veilt voor de Warmste Week', + 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 49.02, + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # with subtitles + 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', + 'info_dict': { + 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', + 'display_id': 'pieter-0167', + 'ext': 'mp4', + 'title': 'Pieter 0167', + 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2553.08, + 'subtitles': { + 'nl': [{ + 'ext': 'vtt', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Pagina niet gevonden', + }, { + 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', + 'info_dict': { + 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', + 'display_id': 'emma-pakt-thilly-aan', + 'ext': 'mp4', + 'title': 'Emma pakt Thilly aan', + 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 118.24, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['is not a supported codec'], + }, { + 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, display_id = mobj.group('site_id'), mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(self._search_regex( + r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None)) + + video_id = self._html_search_regex( + r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + } + + +class VrtNUIE(GigyaBaseIE): + IE_DESC = 'VrtNU.be' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # Available via old API endpoint + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'info_dict': { + 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'ext': 'mp4', + 'title': 'De zwarte weduwe', + 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'duration': 1457.04, + 'thumbnail': r're:^https?://.*\.jpg$', + 'season': 'Season 1', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '<snip>', + 'password': '<snip>', + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # Only available via new API endpoint + 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', + 'info_dict': { + 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', + 'ext': 'mp4', + 'title': 'Aflevering 5', + 'description': 'Wie valt door de mand tijdens een missie?', + 'duration': 2967.06, + 'season': 'Season 1', + 'season_number': 1, + 'episode_number': 5, + }, + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '<snip>', + 'password': '<snip>', + }, + 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], + }] + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' + _CONTEXT_ID = 'R3595707040' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + auth_data = { + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + } + + auth_info = self._gigya_login(auth_data) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + # When requesting a token, no actual token is returned, but the + # necessary cookies are set. + self._request_webpage( + 'https://token.vrt.be', + None, note='Requesting a token', errnote='Could not get a token', + headers={ + 'Content-Type': 'application/json', + 'Referer': 'https://www.vrt.be/vrtnu/', + }, + data=json.dumps({ + 'uid': auth_info['UID'], + 'uidsig': auth_info['UIDSignature'], + 'ts': auth_info['signatureTimestamp'], + 'email': auth_info['profile']['email'], + }).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e + else: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, display_id) + + info = self._search_json_ld(webpage, display_id, default={}) + + # title is optional here since it may be extracted by extractor + # that is delegated from here + title = strip_or_none(self._html_search_regex( + r'(?ms)<h1 class="content__heading">(.+?)</h1>', + webpage, 'title', default=None)) + + description = self._html_search_regex( + r'(?ms)<div class="content__description">(.+?)</div>', + webpage, 'description', default=None) + + season = self._html_search_regex( + [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* + <span>seizoen\ (.+?)</span>\s* + </div>''', + r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], + webpage, 'season', default=None) + + season_number = int_or_none(season) + + episode_number = int_or_none(self._html_search_regex( + r'''(?xms)<div\ class="content__episode">\s* + <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> + </div>''', + webpage, 'episode_number', default=None)) + + release_date = parse_iso8601(self._html_search_regex( + r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', + webpage, 'release_date', default=None)) + + # If there's a ? or a # in the URL, remove them and everything after + clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') + securevideo_url = clean_url + '.mssecurevideo.json' + + try: + video = self._download_json(securevideo_url, display_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_login_required() + raise + + # We are dealing with a '../<show>.relevant' URL + redirect_url = video.get('url') + if redirect_url: + return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + + # There is only one entry, but with an unknown key, so just get + # the first one + video_id = list(video.values())[0].get('videoid') + + return merge_dicts(info, { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season': season, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_date': release_date, + }) diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py new file mode 100644 index 0000000000..b57b86af7e --- /dev/null +++ b/youtube_dl/extractor/carambatv.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + try_get, +) + +from .videomore import VideomoreIE + + +class CarambaTVIE(InfoExtractor): + _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://video1.carambatv.ru/v/191910501', + 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2678.31, + }, + }, { + 'url': 'carambatv:191910501', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, + video_id) + + title = video['title'] + + base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id + + formats = [{ + 'url': base_url + f['fn'], + 'height': int_or_none(f.get('height')), + 'format_id': '%sp' % f['height'] if f.get('height') else None, + } for f in video['qualities'] if f.get('fn')] + self._sort_formats(formats) + + thumbnail = video.get('splash') + duration = float_or_none(try_get( + video, lambda x: x['annotations'][0]['end_time'], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class CarambaTVPageIE(InfoExtractor): + _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', + 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86', + 'info_dict': { + 'id': '475222', + 'ext': 'flv', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': r're:^https?://.*\.jpg', + # duration reported by videomore is incorrect + 'duration': int, + }, + 'add_ie': [VideomoreIE.ie_key()], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + videomore_url = VideomoreIE._extract_url(webpage) + if not videomore_url: + videomore_id = self._search_regex( + r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', + default=None) + if videomore_id: + videomore_url = 'videomore:%s' % videomore_id + if videomore_url: + title = self._og_search_title(webpage) + return { + '_type': 'url_transparent', + 'url': videomore_url, + 'ie_key': VideomoreIE.ie_key(), + 'title': title, + } + + video_url = self._og_search_property('video:iframe', webpage, default=None) + + if not video_url: + video_id = self._search_regex( + r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', + webpage, 'video id') + video_url = 'carambatv:%s' % video_id + + return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py new file mode 100644 index 0000000000..48b33617f2 --- /dev/null +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .turner import TurnerBaseIE +from ..utils import int_or_none + + +class CartoonNetworkIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' + _TEST = { + 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', + 'info_dict': { + 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', + 'ext': 'mp4', + 'title': 'How to Draw Upgrade', + 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): + metadata_re = '' + if content_re: + metadata_re = r'|video_metadata\.content_' + content_re + return self._search_regex( + r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + webpage, name, fatal=fatal) + + media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) + title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) + + info = self._extract_ngtv_info( + media_id, {'networkId': 'cartoonnetwork'}, { + 'url': url, + 'site_name': 'CartoonNetwork', + 'auth_required': find_field('authType', 'auth type') != 'unauth', + }) + + series = find_field( + 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'series': series, + 'episode': title, + }) + + for field in ('season', 'episode'): + field_name = field + 'Number' + info[field + '_number'] = int_or_none(find_field( + field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) + + return info diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py new file mode 100644 index 0000000000..fd5ec6033b --- /dev/null +++ b/youtube_dl/extractor/cbc.py @@ -0,0 +1,497 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import json +import re +from xml.sax.saxutils import escape + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + js_to_json, + smuggle_url, + try_get, + xpath_text, + xpath_element, + xpath_with_ns, + find_xpath_attr, + orderedSet, + parse_duration, + parse_iso8601, + parse_age_limit, + strip_or_none, + int_or_none, + ExtractorError, +) + + +class CBCIE(InfoExtractor): + IE_NAME = 'cbc.ca' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + # with mediaId + 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', + 'md5': '97e24d09672fc4cf56256d6faa6c25bc', + 'info_dict': { + 'id': '2682904050', + 'ext': 'mp4', + 'title': 'Don Cherry – All-Stars', + 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', + 'timestamp': 1454463000, + 'upload_date': '20160203', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Geo-restricted to Canada', + }, { + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca + 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', + 'md5': '0274a90b51a9b4971fe005c63f592f12', + 'info_dict': { + 'id': '2487345465', + 'ext': 'mp4', + 'title': 'Robin Williams freestyles on 90 Minutes Live', + 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', + 'upload_date': '19780210', + 'uploader': 'CBCC-NEW', + 'timestamp': 255977160, + }, + }, { + # multiple iframes + 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', + 'playlist': [{ + 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', + 'info_dict': { + 'id': '2680832926', + 'ext': 'mp4', + 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', + 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', + 'upload_date': '20160201', + 'timestamp': 1454342820, + 'uploader': 'CBCC-NEW', + }, + }, { + 'md5': '415a0e3f586113894174dfb31aa5bb1a', + 'info_dict': { + 'id': '2658915080', + 'ext': 'mp4', + 'title': 'Fly like an eagle!', + 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', + 'upload_date': '20150315', + 'timestamp': 1426443984, + 'uploader': 'CBCC-NEW', + }, + }], + 'skip': 'Geo-restricted to Canada', + }, { + # multiple CBC.APP.Caffeine.initInstance(...) + 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', + 'info_dict': { + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', + }, + 'playlist_mincount': 6, + }] + + @classmethod + def suitable(cls, url): + return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + + def _extract_player_init(self, player_init, display_id): + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'<title>([^<]+)', webpage, 'title', fatal=False) + entries = [ + self._extract_player_init(player_init, display_id) + for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + media_ids = [] + for media_id_re in ( + r']+src="[^"]+?mediaId=(\d+)"', + r']+\bid=["\']player-(\d+)', + r'guid["\']\s*:\s*["\'](\d+)'): + media_ids.extend(re.findall(media_id_re, webpage)) + entries.extend([ + self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + for media_id in orderedSet(media_ids)]) + return self.playlist_result( + entries, display_id, strip_or_none(title), + self._og_search_description(webpage)) + + +class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P\d+)' + _TESTS = [{ + 'url': 'http://www.cbc.ca/player/play/2683190193', + 'md5': '64d25f841ddf4ddb28a235338af32e2c', + 'info_dict': { + 'id': '2683190193', + 'ext': 'mp4', + 'title': 'Gerry Runs a Sweat Shop', + 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', + 'timestamp': 1455071400, + 'upload_date': '20160210', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Geo-restricted to Canada', + }, { + # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', + }, + }, { + 'url': 'http://www.cbc.ca/player/play/2164402062', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'info_dict': { + 'id': '2164402062', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { + 'force_smil_url': True + }), + 'id': video_id, + } + + +class CBCWatchBaseIE(InfoExtractor): + _device_id = None + _device_token = None + _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' + _NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', + } + _GEO_COUNTRIES = ['CA'] + _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' + _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' + _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcwatch' + + def _signature(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._API_KEY} + resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + # token + query = { + 'access_token': access_token, + 'apikey': self._API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) + return resp['signature'] + + def _call_api(self, path, video_id): + url = path if path.startswith('http') else self._API_BASE_URL + path + for _ in range(2): + try: + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + # Device token has expired, re-acquiring device token + self._register_device() + continue + raise + error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') + if error_message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) + return result + + def _real_initialize(self): + if self._valid_device_token(): + return + device = self._downloader.cache.load( + 'cbcwatch', self._cache_device_key()) or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if self._valid_device_token(): + return + self._register_device() + + def _valid_device_token(self): + return self._device_id and self._device_token + + def _cache_device_key(self): + email, _ = self._get_login_info() + return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' + + def _register_device(self): + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, 'Acquiring device token', + data=b'web') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + email, password = self._get_login_info() + if email and password: + signature = self._signature(email, password) + data = '{0}{1}web'.format( + escape(signature), escape(self._device_id)).encode() + url = self._API_BASE_URL + 'device/login' + result = self._download_xml( + url, None, data=data, + headers={'content-type': 'application/xml'}) + self._device_token = xpath_text(result, 'token', fatal=True) + else: + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', self._cache_device_key(), { + 'id': self._device_id, + 'token': self._device_token, + }) + + def _parse_rss_feed(self, rss): + channel = xpath_element(rss, 'channel', fatal=True) + + def _add_ns(path): + return xpath_with_ns(path, self._NS_MAP) + + entries = [] + for item in channel.findall('item'): + guid = xpath_text(item, 'guid', fatal=True) + title = xpath_text(item, 'title', fatal=True) + + media_group = xpath_element(item, _add_ns('media:group'), fatal=True) + content = xpath_element(media_group, _add_ns('media:content'), fatal=True) + content_url = content.attrib['url'] + + thumbnails = [] + for thumbnail in media_group.findall(_add_ns('media:thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('profile'), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + timestamp = None + release_date = find_xpath_attr( + item, _add_ns('media:credit'), 'role', 'releaseDate') + if release_date is not None: + timestamp = parse_iso8601(release_date.text) + + entries.append({ + '_type': 'url_transparent', + 'url': content_url, + 'id': guid, + 'title': title, + 'description': xpath_text(item, 'description'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), + 'episode': xpath_text(item, _add_ns('clearleap:episode')), + 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), + 'series': xpath_text(item, _add_ns('clearleap:series')), + 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), + 'thumbnails': thumbnails, + 'ie_key': 'CBCWatchVideo', + }) + + return self.playlist_result( + entries, xpath_text(channel, 'guid'), + xpath_text(channel, 'title'), + xpath_text(channel, 'description')) + + +class CBCWatchVideoIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch:video' + _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self._call_api(url, video_id) + + m3u8_url = xpath_text(result, 'url', fatal=True) + formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) + if len(formats) < 2: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + for f in formats: + format_id = f.get('format_id') + if format_id.startswith('AAC'): + f['acodec'] = 'aac' + elif format_id.startswith('AC3'): + f['acodec'] = 'ac-3' + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + rss = xpath_element(result, 'rss') + if rss: + info.update(self._parse_rss_feed(rss)['entries'][0]) + del info['url'] + del info['_type'] + del info['ie_key'] + return info + + +class CBCWatchIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch' + _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P[0-9a-f-]+)' + _TESTS = [{ + # geo-restricted to Canada, bypassable + 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', + 'info_dict': { + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', + 'ext': 'mp4', + 'title': 'Customer (Dis)Service', + 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', + 'upload_date': '20160219', + 'timestamp': 1455840000, + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + }, { + # geo-restricted to Canada, bypassable + 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', + 'info_dict': { + 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', + 'title': 'Arthur', + 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + rss = self._call_api('web/browse/' + video_id, video_id) + return self._parse_rss_feed(rss) + + +class CBCOlympicsIE(InfoExtractor): + IE_NAME = 'cbc.ca:olympics' + _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._hidden_inputs(webpage)['videoId'] + video_doc = self._download_xml( + 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) + title = xpath_text(video_doc, 'title', fatal=True) + is_live = xpath_text(video_doc, 'kind') == 'Live' + if is_live: + title = self._live_title(title) + + formats = [] + for video_source in video_doc.findall('videoSources/videoSource'): + uri = xpath_text(video_source, 'uri') + if not uri: + continue + tokenize = self._download_json( + 'https://olympics.cbc.ca/api/api-akamai/tokenize', + video_id, data=json.dumps({ + 'VideoSource': uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': url, + # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js + 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie + }, fatal=False) + if not tokenize: + continue + content_url = tokenize['ContentUrl'] + video_source_format = video_source.get('format') + if video_source_format == 'IIS': + formats.extend(self._extract_ism_formats( + content_url, video_id, ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py new file mode 100644 index 0000000000..4a19a73d2f --- /dev/null +++ b/youtube_dl/extractor/cbs.py @@ -0,0 +1,112 @@ +from __future__ import unicode_literals + +from .theplatform import ThePlatformFeedIE +from ..utils import ( + ExtractorError, + int_or_none, + find_xpath_attr, + xpath_element, + xpath_text, + update_url_query, +) + + +class CBSBaseIE(ThePlatformFeedIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + subtitles = {} + for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: + cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) + if cc_e is not None: + cc_url = cc_e.get('value') + if cc_url: + subtitles.setdefault(subtitles_lang, []).append({ + 'ext': ext, + 'url': cc_url, + }) + return subtitles + + +class CBSIE(CBSBaseIE): + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + + _TESTS = [{ + 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'info_dict': { + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'ext': 'mp4', + 'title': 'Connect Chat feat. Garth Brooks', + 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, + }] + + def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': site, 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) + tp_release_url = 'http://link.theplatform.com/s/' + tp_path + + asset_types = [] + subtitles = {} + formats = [] + last_e = None + for item in items_data.findall('.//item'): + asset_type = xpath_text(item, 'assetType') + if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'assetTypes': asset_type, + } + if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'): + query['formats'] = 'MPEG4,M3U' + elif asset_type in ('RTMP', 'WIFI', '3G'): + query['formats'] = 'MPEG4,FLV' + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + except ExtractorError as e: + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e + self._sort_formats(formats) + + info = self._extract_theplatform_metadata(tp_path, content_id) + info.update({ + 'id': content_id, + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info + + def _real_extract(self, url): + content_id = self._match_id(url) + return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py new file mode 100644 index 0000000000..6596e98a69 --- /dev/null +++ b/youtube_dl/extractor/cbsinteractive.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .cbs import CBSIE +from ..utils import int_or_none + + +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P[^/?]+)' + _TESTS = [{ + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'info_dict': { + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', + 'uploader': 'Sarah Mitroff', + 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', + 'info_dict': { + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 'mp4', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', + }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, + }] + + MPX_ACCOUNTS = { + 'cnet': 2198311517, + 'zdnet': 2387448114, + } + + def _real_extract(self, url): + site, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + + data_json = self._html_search_regex( + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", + webpage, 'data json') + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] + + video_id = vdata['mpxRefId'] + + title = vdata['title'] + author = vdata.get('author') + if author: + uploader = '%s %s' % (author['firstName'], author['lastName']) + uploader_id = author.get('id') + else: + uploader = None + uploader_id = None + + info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) + info.update({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'duration': int_or_none(vdata.get('duration')), + 'uploader': uploader, + 'uploader_id': uploader_id, + }) + return info diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py new file mode 100644 index 0000000000..90852a9ef9 --- /dev/null +++ b/youtube_dl/extractor/cbslocal.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .anvato import AnvatoIE +from .sendtonews import SendtoNewsIE +from ..compat import compat_urlparse +from ..utils import ( + parse_iso8601, + unified_timestamp, +) + + +class CBSLocalIE(AnvatoIE): + _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P[0-9a-z-]+)' + + _TESTS = [{ + # Anvato backend + 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\KCBSTV', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\AOL', + 'Syndication\\Yahoo', + 'Syndication\\Tribune', + 'Syndication\\Curb.tv', + 'Content\\News' + ], + 'tags': ['CBS 2 News Evening'], + }, + }, { + # SendtoNews embed + 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588', + }, + 'playlist_count': 9, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sendtonews_url = SendtoNewsIE._extract_url(webpage) + if sendtonews_url: + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) + + timestamp = unified_timestamp(self._html_search_regex( + r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, + 'released date', default=None)) or parse_iso8601( + self._html_search_meta('uploadDate', webpage)) + + info_dict.update({ + 'display_id': display_id, + 'timestamp': timestamp, + }) + + return info_dict diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py new file mode 100644 index 0000000000..345debcf0c --- /dev/null +++ b/youtube_dl/extractor/cbsnews.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import zlib + +from .common import InfoExtractor +from .cbs import CBSIE +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) +from ..utils import ( + parse_duration, +) + + +class CBSNewsEmbedIE(CBSIE): + IE_NAME = 'cbsnews:embed' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P.+)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', + 'only_matching': True, + }] + + def _real_extract(self, url): + item = self._parse_json(zlib.decompress(compat_b64decode( + compat_urllib_parse_unquote(self._match_id(url))), + -zlib.MAX_WBITS), None)['video']['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + +class CBSNewsIE(CBSIE): + IE_NAME = 'cbsnews' + IE_DESC = 'CBS News' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\da-z_-]+)' + + _TESTS = [ + { + # 60 minutes + 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', + 'info_dict': { + 'id': 'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4', + 'ext': 'flv', + 'title': 'Artificial Intelligence, real-life applications', + 'description': 'md5:a7aaf27f1b4777244de8b0b442289304', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 317, + 'uploader': 'CBSI-NEW', + 'timestamp': 1476046464, + 'upload_date': '20161009', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', + 'info_dict': { + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', + 'ext': 'mp4', + 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '20140404', + 'timestamp': 1396650660, + 'uploader': 'CBSI-NEW', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 205, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # 48 hours + 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', + 'info_dict': { + 'title': 'Cold as Ice', + 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', + }, + 'playlist_mincount': 7, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for embed_url in re.findall(r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): + entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) + if entries: + return self.playlist_result( + entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), + playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + item = self._parse_json(self._html_search_regex( + r'CBSNEWS\.defaultPayload\s*=\s*({.+})', + webpage, 'video JSON info'), display_id)['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + +class CBSNewsLiveVideoIE(InfoExtractor): + IE_NAME = 'cbsnews:livevideo' + IE_DESC = 'CBS News Live Videos' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[^/?#]+)' + + # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples + _TEST = { + 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', + 'info_dict': { + 'id': 'clinton-sanders-prepare-to-face-off-in-nh', + 'ext': 'mp4', + 'title': 'Clinton, Sanders Prepare To Face Off In NH', + 'duration': 334, + }, + 'skip': 'Video gone', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + video_info = self._download_json( + 'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={ + 'device': 'desktop', + 'dvr_slug': display_id, + }) + + formats = self._extract_akamai_formats(video_info['url'], display_id) + self._sort_formats(formats) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': video_info['headline'], + 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), + 'duration': parse_duration(video_info.get('segmentDur')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py new file mode 100644 index 0000000000..83b7647624 --- /dev/null +++ b/youtube_dl/extractor/cbssports.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals + +from .cbs import CBSBaseIE + + +class CBSSportsIE(CBSBaseIE): + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', + 'info_dict': { + 'id': '1214315075735', + 'ext': 'mp4', + 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', + 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', + 'timestamp': 1524111457, + 'upload_date': '20180419', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'only_matching': True, + }] + + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], + webpage, 'video id') + return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py new file mode 100644 index 0000000000..36e6dff72c --- /dev/null +++ b/youtube_dl/extractor/ccc.py @@ -0,0 +1,111 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, + url_or_none, +) + + +class CCCIE(InfoExtractor): + IE_NAME = 'media.ccc.de' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', + 'md5': '3a1eda8f3a29515d27f5adb967d7e740', + 'info_dict': { + 'id': '1839', + 'ext': 'mp4', + 'title': 'Introduction to Processor Design', + 'creator': 'byterazor', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20131228', + 'timestamp': 1388188800, + 'duration': 3710, + 'tags': list, + } + }, { + 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) + + formats = [] + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None + ) + formats.append({ + 'format_id': format_id, + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + return { + 'id': event_id, + 'display_id': display_id, + 'title': event_data['title'], + 'creator': try_get(event_data, lambda x: ', '.join(x['persons'])), + 'description': event_data.get('description'), + 'thumbnail': event_data.get('thumb_url'), + 'timestamp': parse_iso8601(event_data.get('date')), + 'duration': int_or_none(event_data.get('length')), + 'tags': event_data.get('tags'), + 'formats': formats, + } + + +class CCCPlaylistIE(InfoExtractor): + IE_NAME = 'media.ccc.de:lists' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/c/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://media.ccc.de/c/30c3', + 'info_dict': { + 'title': '30C3', + 'id': '30c3', + }, + 'playlist_count': 135, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url).lower() + + conf = self._download_json( + 'https://media.ccc.de/public/conferences/' + playlist_id, + playlist_id) + + entries = [] + for e in conf['events']: + event_url = url_or_none(e.get('frontend_link')) + if event_url: + entries.append(self.url_result(event_url, ie=CCCIE.ie_key())) + + return self.playlist_result(entries, playlist_id, conf.get('title')) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py new file mode 100644 index 0000000000..544647f92b --- /dev/null +++ b/youtube_dl/extractor/ccma.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + parse_iso8601, + parse_resolution, + url_or_none, +) + + +class CCMAIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?Pvideo|audio)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + 'md5': '7296ca43977c8ea4469e719c609b0871', + 'info_dict': { + 'id': '5630208', + 'ext': 'mp4', + 'title': 'L\'espot de La Marató de TV3', + 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', + 'timestamp': 1470918540, + 'upload_date': '20160811', + } + }, { + 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + 'md5': 'fa3e38f269329a278271276330261425', + 'info_dict': { + 'id': '943685', + 'ext': 'mp3', + 'title': 'El Consell de Savis analitza el derbi', + 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', + 'upload_date': '20171205', + 'timestamp': 1512507300, + } + }] + + def _real_extract(self, url): + media_type, media_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'media': media_type, + 'idint': media_id, + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = url_or_none(format_.get('file')) + if not format_url: + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) + self._sort_formats(formats) + + informacio = media['informacio'] + title = informacio['titol'] + durada = informacio.get('durada', {}) + duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) + timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + + subtitles = {} + subtitols = media.get('subtitols', {}) + if subtitols: + sub_url = subtitols.get('url') + if sub_url: + subtitles.setdefault( + subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + 'url': sub_url, + }) + + thumbnails = [] + imatges = media.get('imatges', {}) + if imatges: + thumbnail_url = imatges.get('url') + if thumbnail_url: + thumbnails = [{ + 'url': thumbnail_url, + 'width': int_or_none(imatges.get('amplada')), + 'height': int_or_none(imatges.get('alcada')), + }] + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(informacio.get('descripcio')), + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py new file mode 100644 index 0000000000..c76f361c68 --- /dev/null +++ b/youtube_dl/extractor/cctv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, + unified_timestamp, +) + + +class CCTVIE(InfoExtractor): + IE_DESC = '央视网' + _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)' + _TESTS = [{ + # fo.addVariable("videoCenterId","id") + 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml', + 'md5': 'd61ec00a493e09da810bf406a078f691', + 'info_dict': { + 'id': '5ecdbeab623f4973b40ff25f18b174e8', + 'ext': 'mp4', + 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)', + 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95', + 'duration': 98, + 'uploader': 'songjunjie', + 'timestamp': 1455279956, + 'upload_date': '20160212', + }, + }, { + # var guid = "id" + 'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml', + 'info_dict': { + 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae', + 'ext': 'mp4', + 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', + 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', + 'duration': 37, + 'uploader': 'shujun', + 'timestamp': 1454677291, + 'upload_date': '20160205', + }, + 'params': { + 'skip_download': True, + }, + }, { + # changePlayer('id') + 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', + 'info_dict': { + 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', + 'ext': 'mp4', + 'title': 'NHnews008 ANNUAL POLITICAL SEASON', + 'description': 'Four Comprehensives', + 'duration': 60, + 'uploader': 'zhangyunlei', + 'timestamp': 1425385521, + 'upload_date': '20150303', + }, + 'params': { + 'skip_download': True, + }, + }, { + # loadvideo('id') + 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', + 'info_dict': { + 'id': 'b15f009ff45c43968b9af583fc2e04b2', + 'ext': 'mp4', + 'title': 'Путь,усыпанный космеями Серия 1', + 'description': 'Путь, усыпанный космеями', + 'duration': 2645, + 'uploader': 'renxue', + 'timestamp': 1477479241, + 'upload_date': '20161026', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var initMyAray = 'id' + 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', + 'info_dict': { + 'id': 'a194cfa7f18c426b823d876668325946', + 'ext': 'mp4', + 'title': '小泽征尔音乐塾 音乐梦想无国界', + 'duration': 2173, + 'timestamp': 1369248264, + 'upload_date': '20130522', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var ids = ["id"] + 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', + 'info_dict': { + 'id': 'a8606119a4884588a79d81c02abecc16', + 'ext': 'mp3', + 'title': '来自维也纳的新年贺礼', + 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', + 'duration': 1578, + 'uploader': 'djy', + 'timestamp': 1482942419, + 'upload_date': '20161228', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', + 'only_matching': True, + }, { + 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', + r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', + r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', + r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', + r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], + webpage, 'video id') + + data = self._download_json( + 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, + query={ + 'pid': video_id, + 'url': url, + 'idl': 32, + 'idlr': 32, + 'modifyed': 'false', + }) + + title = data['title'] + + formats = [] + + video = data.get('video') + if isinstance(video, dict): + for quality, chapters_key in enumerate(('lowChapters', 'chapters')): + video_url = try_get( + video, lambda x: x[chapters_key][0]['url'], compat_str) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + 'quality': quality, + 'preference': -1, + }) + + hls_url = try_get(data, lambda x: x['hls_url'], compat_str) + if hls_url: + hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + uploader = data.get('editer_name') + description = self._html_search_meta( + 'description', webpage, default=None) + timestamp = unified_timestamp(data.get('f_pgmtime')) + duration = float_or_none(try_get(video, lambda x: x['totalLength'])) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100644 index 0000000000..0c3af23d58 --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,182 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import codecs +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + multipart_encode, + parse_duration, + random_birthday, + urljoin, +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _BASE_URL = 'http://www.cda.pl/' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'description': 'md5:269ccd135d550da90d1662651fcb9772', + 'thumbnail': r're:^https?://.*\.jpg$', + 'average_rating': float, + 'duration': 39, + 'age_limit': 0, + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'crash404', + 'view_count': int, + 'average_rating': float, + 'duration': 137, + 'age_limit': 0, + } + }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('cda.pl', 'cda.player', 'html5') + webpage = self._download_webpage( + self._BASE_URL + '/video/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + need_confirm_age = False + if self._html_search_regex(r'(]+action="/a/validatebirth")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + + formats = [] + + uploader = self._search_regex(r'''(?x) + <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> + (?:<\1[^>]*>[^<]*|(?!)(?:.|\n))*? + <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P[^<]+) + ''', webpage, 'uploader', default=None, group='uploader') + view_count = self._search_regex( + r'Odsłony:(?:\s| )*([0-9]+)', webpage, + 'view_count', default=None) + average_rating = self._search_regex( + r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', + webpage, 'rating', fatal=False, group='rating_value') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + 'view_count': int_or_none(view_count), + 'average_rating': float_or_none(average_rating), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, + } + + def extract_format(page, version): + json_str = self._html_search_regex( + r'player_data=(\\?["\'])(?P.+?)\1', page, + '%s player_json' % version, fatal=False, group='player_data') + if not json_str: + return + player_data = self._parse_json( + json_str, '%s player_data' % version, fatal=False) + if not player_data: + return + video = player_data.get('video') + if not video or 'file' not in video: + self.report_warning('Unable to extract %s version information' % version) + return + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + f = { + 'url': video['file'], + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(video.get('duration')) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( + self._BASE_URL + href, video_id, + 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py new file mode 100644 index 0000000000..7cb4efb746 --- /dev/null +++ b/youtube_dl/extractor/ceskatelevize.py @@ -0,0 +1,289 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + sanitized_Request, + unescapeHTML, + update_url_query, + urlencode_postdata, + USER_AGENTS, +) + + +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494877246241', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Život v Grónsku', + 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'info_dict': { + 'id': '61924494877028507', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 81.3, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to Czech Republic', + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s

    ' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + type_ = None + episode_id = None + + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') + + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') + + data = { + 'playlist[0][type]': type_, + 'playlist[0][id]': episode_id, + 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestSource': 'iVysilani', + } + + entries = [] + + for user_agent in (None, USER_AGENTS['Safari']): + req = sanitized_Request( + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=urlencode_postdata(data)) + + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + if user_agent: + req.add_header('User-Agent', user_agent) + req.add_header('Referer', url) + + playlistpage = self._download_json(req, playlist_id, fatal=False) + + if not playlistpage: + continue + + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) + req.add_header('Referer', url) + + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) + + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + + playlist = playlist.get('playlist') + if not isinstance(playlist, list): + continue + + playlist_len = len(playlist) + + for num, item in enumerate(playlist): + is_live = item.get('type') == 'LIVE' + formats = [] + for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'drmOnly=true' in stream_url: + continue + if 'playerType=flash' in stream_url: + stream_formats = self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % format_id, fatal=False) + else: + stream_formats = self._extract_mpd_formats( + stream_url, playlist_id, + mpd_id='dash-%s' % format_id, fatal=False) + # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031 + if format_id == 'audioDescription': + for f in stream_formats: + f['source_preference'] = -10 + formats.extend(stream_formats) + + if user_agent and len(entries) == playlist_len: + entries[num]['formats'].extend(formats) + continue + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + + entries.append({ + 'id': item_id, + 'title': final_title, + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + }) + + for e in entries: + self._sort_formats(e['formats']) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield '{0} --> {1}'.format(start, stop) + else: + yield line + + return '\r\n'.join(_fix_subtitle(subtitles)) + + +class CeskaTelevizePoradyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' + _TESTS = [{ + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = update_url_query(unescapeHTML(self._search_regex( + (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={ + 'autoStart': 'true', + }) + + return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py new file mode 100644 index 0000000000..09cacf6d3b --- /dev/null +++ b/youtube_dl/extractor/channel9.py @@ -0,0 +1,262 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_iso8601, + qualities, + unescapeHTML, +) + + +class Channel9IE(InfoExtractor): + IE_DESC = 'Channel 9' + IE_NAME = 'channel9' + _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P.+?)(?P/RSS)?/?(?:[?#&]|$)' + + _TESTS = [{ + 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + 'md5': '32083d4eaf1946db6d454313f44510ca', + 'info_dict': { + 'id': '6c413323-383a-49dc-88f9-a22800cab024', + 'ext': 'wmv', + 'title': 'Developer Kick-Off Session: Stuff We Love', + 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', + 'duration': 4576, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1377717420, + 'upload_date': '20130828', + 'session_code': 'KOS002', + 'session_room': 'Arena 1A', + 'session_speakers': 'count:5', + }, + }, { + 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', + 'info_dict': { + 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', + 'ext': 'wmv', + 'title': 'Self-service BI with Power BI - nuclear testing', + 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', + 'duration': 1540, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1386381991, + 'upload_date': '20131207', + 'authors': ['Mike Wilmot'], + }, + }, { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', + 'duration': 5646, + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20150930', + 'timestamp': 1443640735, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'info_dict': { + 'id': 'Events/DEVintersection/DEVintersection-2016', + 'title': 'DEVintersection 2016 Orlando Sessions', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', + 'only_matching': True, + }, { + 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', + 'only_matching': True, + }] + + _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', + webpage) + + def _extract_list(self, video_id, rss_url=None): + if not rss_url: + rss_url = self._RSS_URL % video_id + rss = self._download_xml(rss_url, video_id, 'Downloading RSS') + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, video_id, title_text) + + def _real_extract(self, url): + content_path, rss = re.match(self._VALID_URL, url).groups() + + if rss: + return self._extract_list(content_path, url) + + webpage = self._download_webpage( + url, content_path, 'Downloading web page') + + episode_data = self._search_regex( + r"data-episode='([^']+)'", webpage, 'episode data', default=None) + if episode_data: + episode_data = self._parse_json(unescapeHTML( + episode_data), content_path) + content_id = episode_data['contentId'] + is_session = '/Sessions(' in episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' + if is_session: + content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' + else: + content_url += 'Authors,Body&$expand=Authors' + content_data = self._download_json(content_url, content_id) + title = content_data['Title'] + + QUALITIES = ( + 'mp3', + 'wmv', 'mp4', + 'wmv-low', 'mp4-low', + 'wmv-mid', 'mp4-mid', + 'wmv-high', 'mp4-high', + ) + + quality_key = qualities(QUALITIES) + + def quality(quality_id, format_url): + return (len(QUALITIES) if '_Source.' in format_url + else quality_key(quality_id)) + + formats = [] + urls = set() + + SITE_QUALITIES = { + 'MP3': 'mp3', + 'MP4': 'mp4', + 'Low Quality WMV': 'wmv-low', + 'Low Quality MP4': 'mp4-low', + 'Mid Quality WMV': 'wmv-mid', + 'Mid Quality MP4': 'mp4-mid', + 'High Quality WMV': 'wmv-high', + 'High Quality MP4': 'mp4-high', + } + + formats_select = self._search_regex( + r'(?s)]+name=["\']format[^>]+>(.+?)]+\bvalue=(["\'])(?P(?:(?!\1).)+)\1[^>]*>\s*(?P[^<]+?)\s*<', + formats_select): + format_url = mobj.group('url') + if format_url in urls: + continue + urls.add(format_url) + format_id = mobj.group('format') + quality_id = SITE_QUALITIES.get(format_id, format_id) + formats.append({ + 'url': format_url, + 'format_id': quality_id, + 'quality': quality(quality_id, format_url), + 'vcodec': 'none' if quality_id == 'mp3' else None, + }) + + API_QUALITIES = { + 'VideoMP4Low': 'mp4-low', + 'VideoWMV': 'wmv-mid', + 'VideoMP4Medium': 'mp4-mid', + 'VideoMP4High': 'mp4-high', + 'VideoWMVHQ': 'wmv-hq', + } + + for format_id, q in API_QUALITIES.items(): + q_url = content_data.get(format_id) + if not q_url or q_url in urls: + continue + urls.add(q_url) + formats.append({ + 'url': q_url, + 'format_id': q, + 'quality': quality(q, q_url), + }) + + self._sort_formats(formats) + + slides = content_data.get('Slides') + zip_file = content_data.get('ZipFile') + + if not formats and not slides and not zip_file: + raise ExtractorError( + 'None of recording, slides or zip are available for %s' % content_path) + + subtitles = {} + for caption in content_data.get('Captions', []): + caption_url = caption.get('Url') + if not caption_url: + continue + subtitles.setdefault(caption.get('Language', 'en'), []).append({ + 'url': caption_url, + 'ext': 'vtt', + }) + + common = { + 'id': content_id, + 'title': title, + 'description': clean_html(content_data.get('Description') or content_data.get('Body')), + 'thumbnail': content_data.get('VideoPlayerPreviewImage'), + 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), + 'timestamp': parse_iso8601(content_data.get('PublishedDate')), + 'avg_rating': int_or_none(content_data.get('Rating')), + 'rating_count': int_or_none(content_data.get('RatingCount')), + 'view_count': int_or_none(content_data.get('Views')), + 'comment_count': int_or_none(content_data.get('CommentCount')), + 'subtitles': subtitles, + } + if is_session: + speakers = [] + for s in content_data.get('Speakers', []): + speaker_name = s.get('FullName') + if not speaker_name: + continue + speakers.append(speaker_name) + + common.update({ + 'session_code': content_data.get('Code'), + 'session_room': content_data.get('Room'), + 'session_speakers': speakers, + }) + else: + authors = [] + for a in content_data.get('Authors', []): + author_name = a.get('DisplayName') + if not author_name: + continue + authors.append(author_name) + common['authors'] = authors + + contents = [] + + if slides: + d = common.copy() + d.update({'title': title + '-Slides', 'url': slides}) + contents.append(d) + + if zip_file: + d = common.copy() + d.update({'title': title + '-Zip', 'url': zip_file}) + contents.append(d) + + if formats: + d = common.copy() + d.update({'title': title, 'formats': formats}) + contents.append(d) + return self.playlist_result(contents) + else: + return self._extract_list(content_path) diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py new file mode 100644 index 0000000000..42c9af263e --- /dev/null +++ b/youtube_dl/extractor/charlierose.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class CharlieRoseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P\d+)' + _TESTS = [{ + 'url': 'https://charlierose.com/videos/27996', + 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', + 'info_dict': { + 'id': '27996', + 'ext': 'mp4', + 'title': 'Remembering Zaha Hadid', + 'thumbnail': r're:^https?://.*\.jpg\?\d+', + 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, + }, + }, { + 'url': 'https://charlierose.com/videos/27996', + 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, + }] + + _PLAYER_BASE = 'https://charlierose.com/video/player/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) + + title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') + + info_dict = self._parse_html5_media_entries( + self._PLAYER_BASE % video_id, webpage, video_id, + m3u8_entry_protocol='m3u8_native')[0] + + self._sort_formats(info_dict['formats']) + self._remove_duplicate_formats(info_dict['formats']) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + }) + + return info_dict diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py new file mode 100644 index 0000000000..a459dcb8d5 --- /dev/null +++ b/youtube_dl/extractor/chaturbate.py @@ -0,0 +1,109 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + lowercase_escape, + url_or_none, +) + + +class ChaturbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.chaturbate.com/siswet19/', + 'info_dict': { + 'id': 'siswet19', + 'ext': 'mp4', + 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Room is offline', + }, { + 'url': 'https://chaturbate.com/fullvideo/?b=caylin', + 'only_matching': True, + }, { + 'url': 'https://en.chaturbate.com/siswet19/', + 'only_matching': True, + }] + + _ROOM_OFFLINE = 'Room is currently offline' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://chaturbate.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) + + found_m3u8_urls = [] + + data = self._parse_json( + self._search_regex( + r'initialRoomDossier\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data', default='{}', group='value'), + video_id, transform_source=lowercase_escape, fatal=False) + if data: + m3u8_url = url_or_none(data.get('hls_source')) + if m3u8_url: + found_m3u8_urls.append(m3u8_url) + + if not found_m3u8_urls: + for m in re.finditer( + r'(\\u002[27])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(lowercase_escape(m.group('url'))) + + if not found_m3u8_urls: + for m in re.finditer( + r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(m.group('url')) + + m3u8_urls = [] + for found_m3u8_url in found_m3u8_urls: + m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '') + for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): + if m3u8_url not in m3u8_urls: + m3u8_urls.append(m3u8_url) + + if not m3u8_urls: + error = self._search_regex( + [r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', + r']+id=(["\'])defchat\1[^>]*>\s*

    (?P[^<]+)<'], + webpage, 'error', group='error', default=None) + if not error: + if any(p in webpage for p in ( + self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): + error = self._ROOM_OFFLINE + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('Unable to find stream URL') + + formats = [] + for m3u8_url in m3u8_urls: + for known_id in ('fast', 'slow'): + if '_%s' % known_id in m3u8_url: + m3u8_id = known_id + break + else: + m3u8_id = None + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + # ffmpeg skips segments for fast m3u8 + preference=-10 if m3u8_id == 'fast' else None, + m3u8_id=m3u8_id, fatal=False, live=True)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, + 'age_limit': self._rta_search(webpage), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py new file mode 100644 index 0000000000..5aac212991 --- /dev/null +++ b/youtube_dl/extractor/chilloutzone.py @@ -0,0 +1,96 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import compat_b64decode +from ..utils import ( + clean_html, + ExtractorError +) + + +class ChilloutzoneIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w|-]+)\.html' + _TESTS = [{ + 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', + 'md5': 'a76f3457e813ea0037e5244f509e66d1', + 'info_dict': { + 'id': 'enemene-meck-alle-katzen-weg', + 'ext': 'mp4', + 'title': 'Enemene Meck - Alle Katzen weg', + 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', + }, + }, { + 'note': 'Video hosted at YouTube', + 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', + 'info_dict': { + 'id': '1YVQaAgHyRU', + 'ext': 'mp4', + 'title': '16 Photos Taken 1 Second Before Disaster', + 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', + 'uploader': 'BuzzFeedVideo', + 'uploader_id': 'BuzzFeedVideo', + 'upload_date': '20131105', + }, + }, { + 'note': 'Video hosted at Vimeo', + 'url': 'http://www.chilloutzone.net/video/icon-blending.html', + 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', + 'info_dict': { + 'id': '85523671', + 'ext': 'mp4', + 'title': 'The Sunday Times - Icons', + 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', + 'uploader': 'Us', + 'uploader_id': 'usfilms', + 'upload_date': '20140131' + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + base64_video_info = self._html_search_regex( + r'var cozVidData = "(.+?)";', webpage, 'video data') + decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') + video_info_dict = json.loads(decoded_video_info) + + # get video information from dict + video_url = video_info_dict['mediaUrl'] + description = clean_html(video_info_dict.get('description')) + title = video_info_dict['title'] + native_platform = video_info_dict['nativePlatform'] + native_video_id = video_info_dict['nativeVideoId'] + source_priority = video_info_dict['sourcePriority'] + + # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) + if native_platform is None: + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or + # the own CDN + if source_priority == 'native': + if native_platform == 'youtube': + return self.url_result(native_video_id, ie='Youtube') + if native_platform == 'vimeo': + return self.url_result( + 'http://vimeo.com/' + native_video_id, ie='Vimeo') + + if not video_url: + raise ExtractorError('No video found') + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 0000000000..8d75cdf199 --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import parse_duration + + +class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://chirb.it/be2abG', + 'info_dict': { + 'id': 'be2abG', + 'ext': 'mp3', + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 'duration': 306, + 'uploader': 'Gerryaudio', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + data_fd = self._search_regex( + r'data-fd=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') + + title = self._search_regex( + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description = self._search_regex( + r'

    Description

    \s*]*>([^<]+)', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) + uploader = self._search_regex( + r'id=["\']chirbit-username["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + + return { + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + } + + +class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' + _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'info_dict': { + 'id': 'ScarletBeauty', + }, + 'playlist_mincount': 3, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) + for _, video_id in re.findall(r']+id=([\'"])copy-btn-(?P[0-9a-zA-Z]+)\1', webpage)] + + return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py new file mode 100644 index 0000000000..b861d54b0a --- /dev/null +++ b/youtube_dl/extractor/cinchcast.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + xpath_text, +) + + +class CinchcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', + 'info_dict': { + 'id': '5258197', + 'ext': 'mp3', + 'title': 'Train Your Brain to Up Your Game with Coach Mandy', + 'upload_date': '20130816', + }, + }, { + # Actual test is run in generic, look for undergroundwellness + 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + doc = self._download_xml( + 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, + video_id) + + item = doc.find('.//item') + title = xpath_text(item, './title', fatal=True) + date_str = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}date') + upload_date = unified_strdate(date_str, day_first=False) + # duration is present but wrong + formats = [{ + 'format_id': 'main', + 'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'], + }] + backup_url = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}backupContent') + if backup_url: + formats.append({ + 'preference': 2, # seems to be more reliable + 'format_id': 'backup', + 'url': backup_url, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cinemax.py b/youtube_dl/extractor/cinemax.py new file mode 100644 index 0000000000..7f89d33de9 --- /dev/null +++ b/youtube_dl/extractor/cinemax.py @@ -0,0 +1,29 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .hbo import HBOBaseIE + + +class CinemaxIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P[^/]+/video/[0-9a-z-]+-(?P\d+))' + _TESTS = [{ + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', + 'md5': '82e0734bba8aa7ef526c9dd00cf35a05', + 'info_dict': { + 'id': '20126903', + 'ext': 'mp4', + 'title': 'S1 Ep 1: Recap', + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], + }, { + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py new file mode 100644 index 0000000000..da404e4dc5 --- /dev/null +++ b/youtube_dl/extractor/ciscolive.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveBaseIE(InfoExtractor): + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer, note=None): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) + + def _parse_rf_item(self, rf_item): + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, + 'description': description, + 'duration': duration, + 'creator': presenter_name, + 'location': location, + 'series': event_name, + } + + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P[^/?&]+)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS', + 'only_matching': True, + }] + + def _real_extract(self, url): + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Search query', + }, + 'playlist_count': 5, + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + + def _real_extract(self, url): + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py new file mode 100644 index 0000000000..505bdbe164 --- /dev/null +++ b/youtube_dl/extractor/cjsw.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unescapeHTML, +) + + +class CJSWIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P[^/]+)/episode/(?P\d+)' + _TESTS = [{ + 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', + 'ext': 'mp3', + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) + + webpage = self._download_webpage(url, episode_id) + + title = unescapeHTML(self._search_regex( + (r']+class=["\']episode-header__title["\'][^>]*>(?P[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + + formats = [{ + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), + 'vcodec': 'none', + }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'formats': formats, + 'series': series, + 'episode_id': episode_id, + } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py new file mode 100644 index 0000000000..f2ca7a337d --- /dev/null +++ b/youtube_dl/extractor/cliphunter.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + url_or_none, +) + + +class CliphunterIE(InfoExtractor): + IE_NAME = 'cliphunter' + + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ + (?P<id>[0-9]+)/ + (?P<seo>.+?)(?:$|[#\?]) + ''' + _TESTS = [{ + 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', + 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', + 'info_dict': { + 'id': '1012420', + 'ext': 'flv', + 'title': 'Fun Jynx Maze solo', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', + 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', + 'info_dict': { + 'id': '2019449', + 'ext': 'mp4', + 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._search_regex( + r'mediaTitle = "([^"]+)"', webpage, 'title') + + gexo_files = self._parse_json( + self._search_regex( + r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), + video_id) + + formats = [] + for format_id, f in gexo_files.items(): + video_url = url_or_none(f.get('url')) + if not video_url: + continue + fmt = f.get('fmt') + height = f.get('h') + format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'width': int_or_none(f.get('w')), + 'height': int_or_none(height), + 'tbr': int_or_none(f.get('br')), + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r"var\s+mov_thumb\s*=\s*'([^']+)';", + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'age_limit': self._rta_search(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py new file mode 100644 index 0000000000..a1a7a774c6 --- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title.*>(.+?)', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py new file mode 100644 index 0000000000..d55b26d59f --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .onet import OnetBaseIE + + +class ClipRsIE(OnetBaseIE): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': '1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id + + return info_dict diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py new file mode 100644 index 0000000000..6cdb42f5a4 --- /dev/null +++ b/youtube_dl/extractor/clipsyndicate.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, + fix_xml_ampersands +) + + +class ClipsyndicateIE(InfoExtractor): + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + + _TESTS = [{ + 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + 'md5': '4d7d549451bad625e0ff3d7bd56d776c', + 'info_dict': { + 'id': '4629301', + 'ext': 'mp4', + 'title': 'Brick Briscoe', + 'duration': 612, + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, { + 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + js_player = self._download_webpage( + 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, + video_id, 'Downlaoding player') + # it includes a required token + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') + + pdoc = self._download_xml( + 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, + video_id, 'Downloading video info', + transform_source=fix_xml_ampersands) + + track_doc = pdoc.find('trackList/track') + + def find_param(name): + node = find_xpath_attr(track_doc, './/param', 'name', name) + if node is not None: + return node.attrib['value'] + + return { + 'id': video_id, + 'title': find_param('title'), + 'url': track_doc.find('location').text, + 'thumbnail': find_param('thumbnail'), + 'duration': int(find_param('duration')), + } diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py new file mode 100644 index 0000000000..26243d52d5 --- /dev/null +++ b/youtube_dl/extractor/closertotruth.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') + + title = self._search_regex( + r'(.+?)\s*\|\s*.+?', webpage, 'video title') + + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) + + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': title + } diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py new file mode 100644 index 0000000000..2fdcfbb3af --- /dev/null +++ b/youtube_dl/extractor/cloudflarestream.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re + +from .common import InfoExtractor + + +class CloudflareStreamIE(InfoExtractor): + _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' + _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE + _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:watch\.)?%s/| + %s + ) + (?P<id>%s) + ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) + _TESTS = [{ + 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', + 'only_matching': True, + }, { + 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', + 'only_matching': True, + }, { + 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' + base_url = 'https://%s/%s/' % (domain, video_id) + if '.' in video_id: + video_id = self._parse_json(base64.urlsafe_b64decode( + video_id.split('.')[1]), video_id)['sub'] + manifest_base_url = base_url + 'manifest/video.' + + formats = self._extract_m3u8_formats( + manifest_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', + 'formats': formats, + } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py new file mode 100644 index 0000000000..85ca20eccd --- /dev/null +++ b/youtube_dl/extractor/cloudy.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + unified_strdate, +) + + +class CloudyIE(InfoExtractor): + _IE_DESC = 'cloudy.ec' + _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'https://www.cloudy.ec/v/af511e2527aac', + 'md5': '29832b05028ead1b58be86bf319397ca', + 'info_dict': { + 'id': 'af511e2527aac', + 'ext': 'mp4', + 'title': 'Funny Cats and Animals Compilation june 2013', + 'upload_date': '20130913', + 'view_count': int, + } + }, { + 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.cloudy.ec/embed.php', video_id, query={ + 'id': video_id, + 'playerPage': 1, + 'autoplay': 1, + }) + + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + webpage = self._download_webpage( + 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) + + if webpage: + info.update({ + 'title': self._search_regex( + r'<h\d[^>]*>([^<]+)<', webpage, 'title'), + 'upload_date': unified_strdate(self._search_regex( + r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, + 'upload date', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r'([\d,.]+) views<', webpage, 'view count', fatal=False)), + }) + + if not info.get('title'): + info['title'] = video_id + + info['id'] = video_id + + return info diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py new file mode 100644 index 0000000000..98f9cb5969 --- /dev/null +++ b/youtube_dl/extractor/clubic.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + qualities, +) + + +class ClubicIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', + 'md5': '1592b694ba586036efac1776b0b43cd3', + 'info_dict': { + 'id': '448474', + 'ext': 'mp4', + 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', + 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', + 'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$', + } + }, { + 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_page = self._download_webpage(player_url, video_id) + + config = self._parse_json(self._search_regex( + r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, + 'configuration'), video_id) + + video_info = config['videoInfo'] + sources = config['sources'] + quality_order = qualities(['sd', 'hq']) + + formats = [{ + 'format_id': src['streamQuality'], + 'url': src['src'], + 'quality': quality_order(src['streamQuality']), + } for src in sources] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'description': clean_html(video_info.get('description')), + 'thumbnail': config.get('poster'), + } diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py new file mode 100644 index 0000000000..06d04de139 --- /dev/null +++ b/youtube_dl/extractor/clyp.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + unified_timestamp, +) + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', + }, + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! (Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = metadata['Title'] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = unified_timestamp(metadata.get('DateCreated')) + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py new file mode 100644 index 0000000000..e701fbeab8 --- /dev/null +++ b/youtube_dl/extractor/cmt.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .mtv import MTVIE + + +class CMTIE(MTVIE): + IE_NAME = 'cmt.com' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', + 'info_dict': { + 'id': '989124', + 'ext': 'mp4', + 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + 'description': 'Blame It All On My Roots', + }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, + }, { + 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', + 'only_matching': True, + }] + + def _extract_mgid(self, webpage): + mgid = self._search_regex( + r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', + webpage, 'mgid', group='mgid', default=None) + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py new file mode 100644 index 0000000000..6889b0f401 --- /dev/null +++ b/youtube_dl/extractor/cnbc.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 0000000000..774b710558 --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .turner import TurnerBaseIE +from ..utils import url_basename + + +class CNNIE(TurnerBaseIE): + _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' + + _TESTS = [{ + 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + 'md5': '3e6121ea48df7e2259fe73a0628605c4', + 'info_dict': { + 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'ext': 'mp4', + 'title': 'Nadal wins 8th French Open title', + 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + 'duration': 135, + 'upload_date': '20130609', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', + 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', + 'info_dict': { + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'ext': 'mp4', + 'title': "Student's epic speech stuns new freshmen", + 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", + 'upload_date': '20130821', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', + 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'info_dict': { + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'ext': 'mp4', + 'title': 'Nashville Ep. 1: Hand crafted skateboards', + 'description': 'md5:e7223a503315c9f150acac52e76de086', + 'upload_date': '20141222', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, + }, { + 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', + 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, + }] + + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + + def _real_extract(self, url): + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 'media_src': config['media_src'], + } + }) + + +class CNNBlogsIE(InfoExtractor): + _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' + _TEST = { + 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', + 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', + 'info_dict': { + 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', + 'ext': 'mp4', + 'title': 'Criminalizing journalism?', + 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', + 'upload_date': '20140209', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') + return self.url_result(cnn_url, CNNIE.ie_key()) + + +class CNNArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' + _TEST = { + 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', + 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', + 'info_dict': { + 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', + 'ext': 'mp4', + 'title': 'Obama: Cyberattack not an act of war', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', + 'upload_date': '20141221', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py new file mode 100644 index 0000000000..d08b909a68 --- /dev/null +++ b/youtube_dl/extractor/comedycentral.py @@ -0,0 +1,142 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor + + +class ComedyCentralIE(MTVServicesInfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) + /(?P<title>.*)''' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'info_dict': { + 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'ext': 'mp4', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', + 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', + }, + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'only_matching': True, + }] + + +class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (?:full-episodes|shows(?=/[^/]+/full-episodes)) + /(?P<id>[^?]+)''' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', + 'info_dict': { + 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', + 'title': 'November 28, 2016 - Ryan Speedo Green', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') + videos_info = self._get_videos_info(mgid) + return videos_info + + +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + + _TESTS = [{ + 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': r're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + }, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'only_matching': True, + }] + + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'info_dict': { + 'id': 'local_playlist-f99b626bdfe13568579a', + 'ext': 'flv', + 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', + 'only_matching': True, + }, { + 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mrss_url = self._search_regex( + r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'mrss url', group='url') + + return self._get_videos_info_from_url(mrss_url, video_id) + + +class ComedyCentralShortnameIE(InfoExtractor): + _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' + _TESTS = [{ + 'url': ':tds', + 'only_matching': True, + }, { + 'url': ':thedailyshow', + 'only_matching': True, + }, { + 'url': ':theopposition', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + shortcut_map = { + 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + } + return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py new file mode 100644 index 0000000000..c1ea5d8460 --- /dev/null +++ b/youtube_dl/extractor/common.py @@ -0,0 +1,3013 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import datetime +import hashlib +import json +import netrc +import os +import random +import re +import socket +import sys +import time +import math + +from ..compat import ( + compat_cookiejar_Cookie, + compat_cookies, + compat_etree_Element, + compat_etree_fromstring, + compat_getpass, + compat_integer_types, + compat_http_client, + compat_os_name, + compat_str, + compat_urllib_error, + compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, + compat_urllib_request, + compat_urlparse, + compat_xml_parse_error, +) +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) +from ..utils import ( + NO_DEFAULT, + age_restricted, + base_url, + bug_reports_message, + clean_html, + compiled_regex_type, + determine_ext, + determine_protocol, + dict_get, + error_to_compat_str, + ExtractorError, + extract_attributes, + fix_xml_ampersands, + float_or_none, + GeoRestrictedError, + GeoUtils, + int_or_none, + js_to_json, + JSON_LD_RE, + mimetype2ext, + orderedSet, + parse_bitrate, + parse_codecs, + parse_duration, + parse_iso8601, + parse_m3u8_attributes, + parse_resolution, + RegexNotFoundError, + sanitized_Request, + sanitize_filename, + str_or_none, + strip_or_none, + unescapeHTML, + unified_strdate, + unified_timestamp, + update_Request, + update_url_query, + urljoin, + url_basename, + url_or_none, + xpath_element, + xpath_text, + xpath_with_ns, +) + + +class InfoExtractor(object): + """Information Extractor class. + + Information extractors are the classes that, given a URL, extract + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the YoutubeDL. The YoutubeDL processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The type field determines the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: + + id: Video identifier. + title: Video title, unescaped. + + Additionally, it must contain either a formats entry or a url one: + + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: + * url The mandatory URL representing the media: + for plain file media - HTTP URL of this file, + for RTMP - RTMP URL, + for HLS - URL of the M3U8 media playlist, + for HDS - URL of the F4M manifest, + for DASH + - HTTP URL to plain file media (in case of + unfragmented media) + - URL of the MPD manifest or base URL + representing the media if MPD manifest + is parsed from a string (in case of + fragmented media) + for MSS - URL of the ISM manifest. + * manifest_url + The URL of the manifest file in case of + fragmented media: + for HLS - URL of the M3U8 master playlist, + for HDS - URL of the F4M manifest, + for DASH - URL of the MPD manifest, + for MSS - URL of the ISM manifest. + * ext Will be calculated from URL if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz + * vbr Average video bitrate in KBit/s + * fps Frame rate + * vcodec Name of the video codec in use + * container Name of the container format + * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes + * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "rtmpe", + "m3u8", "m3u8_native" or "http_dash_segments". + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url + * "duration" (optional, int or float) + * "filesize" (optional, int) + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field, regardless of all other values. + -1 for default (order by other properties), + -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. + * no_resume The server does not support resuming the + (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader + + url: Final video URL. + ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). + + The following fields are optional: + + alt_title: A secondary title of the video. + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "id" (optional, string) - Thumbnail format ID + * "url" + * "preference" (optional, int) - quality of the image + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height}", + deprecated) + * "filesize" (optional, int) + thumbnail: Full URL to a video thumbnail image. + description: Full video description. + uploader: Full name of the video uploader. + license: License name the video is licensed under. + creator: The creator of the video. + release_date: The date (YYYYMMDD) when the video was released. + timestamp: UNIX timestamp of the moment the video became available. + upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. + uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. + location: Physical location where the video was filmed. + subtitles: The available subtitles as a dictionary in the format + {tag: subformats}. "tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: + * "data": The subtitles file contents + * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions + duration: Length of the video in seconds, as an integer or float. + view_count: How many users have watched the video on the platform. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video + average_rating: Average rating give by users, the scale used depends on the webpage + comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. + age_limit: Age restriction for the video, as an integer (years) + webpage_url: The URL to the video webpage, if given to youtube-dlc it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) + + The following fields should only be used when the video belongs to some logical + chapter or section: + + chapter: Name or title of the chapter the video belongs to. + chapter_number: Number of the chapter the video belongs to, as an integer. + chapter_id: Id of the chapter the video belongs to, as a unicode string. + + The following fields should only be used when the video is an episode of some + series, programme or podcast: + + series: Title of the series or programme the video episode belongs to. + season: Title of the season the video episode belongs to. + season_number: Number of the season the video episode belongs to, as an integer. + season_id: Id of the season the video episode belongs to, as a unicode string. + episode: Title of the video episode. Unlike mandatory video title field, + this field should denote the exact title of the video episode + without any kind of decoration. + episode_number: Number of the video episode within a season, as an integer. + episode_id: Id of the video episode, as a unicode string. + + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + + Unless mentioned otherwise, the fields should be Unicode strings. + + Unless mentioned otherwise, None is equivalent to absence of information. + + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for examples multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + + Subclasses of this one should re-define the _real_initialize() and + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. + + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. + + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. + """ + + _ready = False + _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None + _WORKING = True + + def __init__(self, downloader=None): + """Constructor. Receives an optional downloader.""" + self._ready = False + self._x_forwarded_for_ip = None + self.set_downloader(downloader) + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None + + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return compat_str(m.group('id')) + + @classmethod + def working(cls): + """Getter method for _WORKING.""" + return cls._WORKING + + def initialize(self): + """Initializes an instance (authentication, etc).""" + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) + if not self._ready: + self._real_initialize() + self._ready = True + + def _initialize_geo_bypass(self, geo_bypass_context): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. + + You may also manually call it from extractor's code if geo bypass + information is not available beforehand (e.g. obtained during + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + + """ + if not self._x_forwarded_for_ip: + + # Geo bypass mechanism is explicitly disabled by user + if not self._downloader.params.get('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. + + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self._downloader.params.get('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s as X-Forwarded-For.' + % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self._downloader.params.get('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country.upper())) + + def extract(self, url): + """Extracts URL information and returns it in list of dicts.""" + try: + for _ in range(2): + try: + self.initialize() + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise + except ExtractorError: + raise + except compat_http_client.IncompleteRead as e: + raise ExtractorError('A network error has occurred.', cause=e, expected=True) + except (KeyError, StopIteration) as e: + raise ExtractorError('An extractor error has occurred.', cause=e) + + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) + and self._GEO_BYPASS + and self._downloader.params.get('geo_bypass', True) + and not self._x_forwarded_for_ip + and countries): + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + return True + return False + + def set_downloader(self, downloader): + """Sets the downloader for this IE.""" + self._downloader = downloader + + def _real_initialize(self): + """Real initialization process. Redefine in subclasses.""" + pass + + def _real_extract(self, url): + """Real extraction process. Redefine in subclasses.""" + pass + + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return compat_str(cls.__name__[:-2]) + + @property + def IE_NAME(self): + return compat_str(type(self).__name__[:-2]) + + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ + if note is None: + self.report_download_webpage(video_id) + elif note is not False: + if video_id is None: + self.to_screen('%s' % (note,)) + else: + self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: + if query: + url_or_request = update_url_query(url_or_request, query) + if data is not None or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) + try: + return self._downloader.urlopen(url_or_request) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err + return err.fp + + if errnote is False: + return False + if errnote is None: + errnote = 'Unable to download webpage' + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self._downloader.report_warning(errmsg) + return False + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. + """ + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + if urlh is False: + assert not fatal + return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + return (content, urlh) + + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' + else: + encoding = 'utf-8' + + return encoding + + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked' in content + and 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'' + + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root', default=None) + if xml_root is None: + # Probably need to authenticate + login_res = self._login(webpage_url, display_id) + if login_res is None: + self.report_warning('Could not login.') + else: + start_page = login_res + # Grab the url from the authenticated page + xml_root = self._html_search_regex( + PLAYER_REGEX, start_page, 'xml root') + + xml_name = self._html_search_regex( + r'', webpage): + url = self._search_regex( + r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + + if not url: + url = self._og_search_url(webpage) + + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) + + player_id = mobj.group('player_id') + if not display_id: + display_id = player_id + if player_id: + player_page = self._download_webpage( + url, display_id, note='Downloading player page', + errnote='Could not download player page') + video_id = self._search_regex( + r'\d+)' + _TEST = { + 'url': 'http://www.pearvideo.com/video_1076290', + 'info_dict': { + 'id': '1076290', + 'ext': 'mp4', + 'title': '小浣熊在主人家玻璃上滚石头:没砸', + 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', + 'timestamp': 1494275280, + 'upload_date': '20170508', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + quality = qualities( + ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + + formats = [{ + 'url': mobj.group('url'), + 'format_id': mobj.group('id'), + 'quality': quality(mobj.group('id')), + } for mobj in re.finditer( + r'(?P[a-zA-Z]+)Url\s*=\s*(["\'])(?P(?:https?:)?//.+?)\2', + webpage)] + self._sort_formats(formats) + + title = self._search_regex( + (r']+\bclass=(["\'])video-tt\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-title=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', group='value') + description = self._search_regex( + (r']+\bclass=(["\'])summary\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-summary=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'description', default=None, + group='value') or self._html_search_meta('Description', webpage) + timestamp = unified_timestamp(self._search_regex( + r']+\bclass=["\']date["\'][^>]*>([^<]+)', + webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py new file mode 100644 index 0000000000..48fb954169 --- /dev/null +++ b/youtube_dl/extractor/peertube.py @@ -0,0 +1,600 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + # Taken from https://instances.joinpeertube.org/instances + peertube\.rainbowswingers\.net| + tube\.stanisic\.nl| + peer\.suiri\.us| + medias\.libox\.fr| + videomensoif\.ynh\.fr| + peertube\.travelpandas\.eu| + peertube\.rachetjay\.fr| + peertube\.montecsys\.fr| + tube\.eskuero\.me| + peer\.tube| + peertube\.umeahackerspace\.se| + tube\.nx-pod\.de| + video\.monsieurbidouille\.fr| + tube\.openalgeria\.org| + vid\.lelux\.fi| + video\.anormallostpod\.ovh| + tube\.crapaud-fou\.org| + peertube\.stemy\.me| + lostpod\.space| + exode\.me| + peertube\.snargol\.com| + vis\.ion\.ovh| + videosdulib\.re| + v\.mbius\.io| + videos\.judrey\.eu| + peertube\.osureplayviewer\.xyz| + peertube\.mathieufamily\.ovh| + www\.videos-libr\.es| + fightforinfo\.com| + peertube\.fediverse\.ru| + peertube\.oiseauroch\.fr| + video\.nesven\.eu| + v\.bearvideo\.win| + video\.qoto\.org| + justporn\.cc| + video\.vny\.fr| + peervideo\.club| + tube\.taker\.fr| + peertube\.chantierlibre\.org| + tube\.ipfixe\.info| + tube\.kicou\.info| + tube\.dodsorf\.as| + videobit\.cc| + video\.yukari\.moe| + videos\.elbinario\.net| + hkvideo\.live| + pt\.tux\.tf| + www\.hkvideo\.live| + FIGHTFORINFO\.com| + pt\.765racing\.com| + peertube\.gnumeria\.eu\.org| + nordenmedia\.com| + peertube\.co\.uk| + tube\.darfweb\.eu| + tube\.kalah-france\.org| + 0ch\.in| + vod\.mochi\.academy| + film\.node9\.org| + peertube\.hatthieves\.es| + video\.fitchfamily\.org| + peertube\.ddns\.net| + video\.ifuncle\.kr| + video\.fdlibre\.eu| + tube\.22decembre\.eu| + peertube\.harmoniescreatives\.com| + tube\.fabrigli\.fr| + video\.thedwyers\.co| + video\.bruitbruit\.com| + peertube\.foxfam\.club| + peer\.philoxweb\.be| + videos\.bugs\.social| + peertube\.malbert\.xyz| + peertube\.bilange\.ca| + libretube\.net| + diytelevision\.com| + peertube\.fedilab\.app| + libre\.video| + video\.mstddntfdn\.online| + us\.tv| + peertube\.sl-network\.fr| + peertube\.dynlinux\.io| + peertube\.david\.durieux\.family| + peertube\.linuxrocks\.online| + peerwatch\.xyz| + v\.kretschmann\.social| + tube\.otter\.sh| + yt\.is\.nota\.live| + tube\.dragonpsi\.xyz| + peertube\.boneheadmedia\.com| + videos\.funkwhale\.audio| + watch\.44con\.com| + peertube\.gcaillaut\.fr| + peertube\.icu| + pony\.tube| + spacepub\.space| + tube\.stbr\.io| + v\.mom-gay\.faith| + tube\.port0\.xyz| + peertube\.simounet\.net| + play\.jergefelt\.se| + peertube\.zeteo\.me| + tube\.danq\.me| + peertube\.kerenon\.com| + tube\.fab-l3\.org| + tube\.calculate\.social| + peertube\.mckillop\.org| + tube\.netzspielplatz\.de| + vod\.ksite\.de| + peertube\.laas\.fr| + tube\.govital\.net| + peertube\.stephenson\.cc| + bistule\.nohost\.me| + peertube\.kajalinifi\.de| + video\.ploud\.jp| + video\.omniatv\.com| + peertube\.ffs2play\.fr| + peertube\.leboulaire\.ovh| + peertube\.tronic-studio\.com| + peertube\.public\.cat| + peertube\.metalbanana\.net| + video\.1000i100\.fr| + peertube\.alter-nativ-voll\.de| + tube\.pasa\.tf| + tube\.worldofhauru\.xyz| + pt\.kamp\.site| + peertube\.teleassist\.fr| + videos\.mleduc\.xyz| + conf\.tube| + media\.privacyinternational\.org| + pt\.forty-two\.nl| + video\.halle-leaks\.de| + video\.grosskopfgames\.de| + peertube\.schaeferit\.de| + peertube\.jackbot\.fr| + tube\.extinctionrebellion\.fr| + peertube\.f-si\.org| + video\.subak\.ovh| + videos\.koweb\.fr| + peertube\.zergy\.net| + peertube\.roflcopter\.fr| + peertube\.floss-marketing-school\.com| + vloggers\.social| + peertube\.iriseden\.eu| + videos\.ubuntu-paris\.org| + peertube\.mastodon\.host| + armstube\.com| + peertube\.s2s\.video| + peertube\.lol| + tube\.open-plug\.eu| + open\.tube| + peertube\.ch| + peertube\.normandie-libre\.fr| + peertube\.slat\.org| + video\.lacaveatonton\.ovh| + peertube\.uno| + peertube\.servebeer\.com| + peertube\.fedi\.quebec| + tube\.h3z\.jp| + tube\.plus200\.com| + peertube\.eric\.ovh| + tube\.metadocs\.cc| + tube\.unmondemeilleur\.eu| + gouttedeau\.space| + video\.antirep\.net| + nrop\.cant\.at| + tube\.ksl-bmx\.de| + tube\.plaf\.fr| + tube\.tchncs\.de| + video\.devinberg\.com| + hitchtube\.fr| + peertube\.kosebamse\.com| + yunopeertube\.myddns\.me| + peertube\.varney\.fr| + peertube\.anon-kenkai\.com| + tube\.maiti\.info| + tubee\.fr| + videos\.dinofly\.com| + toobnix\.org| + videotape\.me| + voca\.tube| + video\.heromuster\.com| + video\.lemediatv\.fr| + video\.up\.edu\.ph| + balafon\.video| + video\.ivel\.fr| + thickrips\.cloud| + pt\.laurentkruger\.fr| + video\.monarch-pass\.net| + peertube\.artica\.center| + video\.alternanet\.fr| + indymotion\.fr| + fanvid\.stopthatimp\.net| + video\.farci\.org| + v\.lesterpig\.com| + video\.okaris\.de| + tube\.pawelko\.net| + peertube\.mablr\.org| + tube\.fede\.re| + pytu\.be| + evertron\.tv| + devtube\.dev-wiki\.de| + raptube\.antipub\.org| + video\.selea\.se| + peertube\.mygaia\.org| + video\.oh14\.de| + peertube\.livingutopia\.org| + peertube\.the-penguin\.de| + tube\.thechangebook\.org| + tube\.anjara\.eu| + pt\.pube\.tk| + video\.samedi\.pm| + mplayer\.demouliere\.eu| + widemus\.de| + peertube\.me| + peertube\.zapashcanon\.fr| + video\.latavernedejohnjohn\.fr| + peertube\.pcservice46\.fr| + peertube\.mazzonetto\.eu| + video\.irem\.univ-paris-diderot\.fr| + video\.livecchi\.cloud| + alttube\.fr| + video\.coop\.tools| + video\.cabane-libre\.org| + peertube\.openstreetmap\.fr| + videos\.alolise\.org| + irrsinn\.video| + video\.antopie\.org| + scitech\.video| + tube2\.nemsia\.org| + video\.amic37\.fr| + peertube\.freeforge\.eu| + video\.arbitrarion\.com| + video\.datsemultimedia\.com| + stoptrackingus\.tv| + peertube\.ricostrongxxx\.com| + docker\.videos\.lecygnenoir\.info| + peertube\.togart\.de| + tube\.postblue\.info| + videos\.domainepublic\.net| + peertube\.cyber-tribal\.com| + video\.gresille\.org| + peertube\.dsmouse\.net| + cinema\.yunohost\.support| + tube\.theocevaer\.fr| + repro\.video| + tube\.4aem\.com| + quaziinc\.com| + peertube\.metawurst\.space| + videos\.wakapo\.com| + video\.ploud\.fr| + video\.freeradical\.zone| + tube\.valinor\.fr| + refuznik\.video| + pt\.kircheneuenburg\.de| + peertube\.asrun\.eu| + peertube\.lagob\.fr| + videos\.side-ways\.net| + 91video\.online| + video\.valme\.io| + video\.taboulisme\.com| + videos-libr\.es| + tv\.mooh\.fr| + nuage\.acostey\.fr| + video\.monsieur-a\.fr| + peertube\.librelois\.fr| + videos\.pair2jeux\.tube| + videos\.pueseso\.club| + peer\.mathdacloud\.ovh| + media\.assassinate-you\.net| + vidcommons\.org| + ptube\.rousset\.nom\.fr| + tube\.cyano\.at| + videos\.squat\.net| + video\.iphodase\.fr| + peertube\.makotoworkshop\.org| + peertube\.serveur\.slv-valbonne\.fr| + vault\.mle\.party| + hostyour\.tv| + videos\.hack2g2\.fr| + libre\.tube| + pire\.artisanlogiciel\.net| + videos\.numerique-en-commun\.fr| + video\.netsyms\.com| + video\.die-partei\.social| + video\.writeas\.org| + peertube\.swarm\.solvingmaz\.es| + tube\.pericoloso\.ovh| + watching\.cypherpunk\.observer| + videos\.adhocmusic\.com| + tube\.rfc1149\.net| + peertube\.librelabucm\.org| + videos\.numericoop\.fr| + peertube\.koehn\.com| + peertube\.anarchmusicall\.net| + tube\.kampftoast\.de| + vid\.y-y\.li| + peertube\.xtenz\.xyz| + diode\.zone| + tube\.egf\.mn| + peertube\.nomagic\.uk| + visionon\.tv| + videos\.koumoul\.com| + video\.rastapuls\.com| + video\.mantlepro\.com| + video\.deadsuperhero\.com| + peertube\.musicstudio\.pro| + peertube\.we-keys\.fr| + artitube\.artifaille\.fr| + peertube\.ethernia\.net| + tube\.midov\.pl| + peertube\.fr| + watch\.snoot\.tube| + peertube\.donnadieu\.fr| + argos\.aquilenet\.fr| + tube\.nemsia\.org| + tube\.bruniau\.net| + videos\.darckoune\.moe| + tube\.traydent\.info| + dev\.videos\.lecygnenoir\.info| + peertube\.nayya\.org| + peertube\.live| + peertube\.mofgao\.space| + video\.lequerrec\.eu| + peertube\.amicale\.net| + aperi\.tube| + tube\.ac-lyon\.fr| + video\.lw1\.at| + www\.yiny\.org| + videos\.pofilo\.fr| + tube\.lou\.lt| + choob\.h\.etbus\.ch| + tube\.hoga\.fr| + peertube\.heberge\.fr| + video\.obermui\.de| + videos\.cloudfrancois\.fr| + betamax\.video| + video\.typica\.us| + tube\.piweb\.be| + video\.blender\.org| + peertube\.cat| + tube\.kdy\.ch| + pe\.ertu\.be| + peertube\.social| + videos\.lescommuns\.org| + tv\.datamol\.org| + videonaute\.fr| + dialup\.express| + peertube\.nogafa\.org| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.bootlicker\.party| + skeptikon\.fr| + video\.blueline\.mg| + tube\.homecomputing\.fr| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + tube\.kher\.nl| + peertube\.qtg\.fr| + video\.migennes\.net| + tube\.p2p\.legal| + troll\.tv| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.cemea\.org| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + sikke\.fi| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + videos\.benpro\.fr| + peertube\.parleur\.net| + peertube\.heraut\.eu| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + tube\.mochi\.academy| + media\.zat\.im| + video\.colibris-outilslibres\.org| + tube\.svnet\.fr| + peertube\.video| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' + _VALID_URL = r'''(?x) + (?: + peertube:(?P[^:]+):| + https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', + 'info_dict': { + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'ext': 'mp4', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P[^/]+)/videos/(?:watch|embed)/(?P%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): + return + subtitles = {} + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._call_api( + host, video_id, '', note='Downloading video JSON') + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) + + description = None + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + 'subtitles': subtitles + } diff --git a/youtube_dl/extractor/people.py b/youtube_dl/extractor/people.py new file mode 100644 index 0000000000..6ca95715ee --- /dev/null +++ b/youtube_dl/extractor/people.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class PeopleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' + + _TEST = { + 'url': 'http://www.people.com/people/videos/0,,20995451,00.html', + 'info_dict': { + 'id': 'ref:20995451', + 'ext': 'mp4', + 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', + 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 246.318, + 'timestamp': 1458720585, + 'upload_date': '20160323', + 'uploader_id': '416418724', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + + def _real_extract(self, url): + return self.url_result( + 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s' + % self._match_id(url), 'BrightcoveNew') diff --git a/youtube_dl/extractor/performgroup.py b/youtube_dl/extractor/performgroup.py new file mode 100644 index 0000000000..26942bfb3f --- /dev/null +++ b/youtube_dl/extractor/performgroup.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PerformGroupIE(InfoExtractor): + _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})' + _TESTS = [{ + # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html + 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', + 'md5': '259cb03d142e2e52471e8837ecacb29f', + 'info_dict': { + 'id': 'xgrwobuzumes1lwjxtcdpwgxd', + 'ext': 'mp4', + 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', + 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', + 'timestamp': 1511533477, + 'upload_date': '20171124', + } + }] + + def _call_api(self, service, auth_token, content_id, referer_url): + return self._download_json( + 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + content_id, headers={ + 'Referer': referer_url, + 'Origin': 'http://player.performgroup.com', + }, query={ + '_fmt': 'json', + }) + + def _real_extract(self, url): + player_id, auth_token = re.search(self._VALID_URL, url).groups() + bootstrap = self._call_api('bootstrap', auth_token, player_id, url) + video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0] + video_id = video['uuid'] + vod = self._call_api('vod', auth_token, video_id, url) + media = vod['videos']['video'][0]['media'] + + formats = [] + hls_url = media.get('hls', {}).get('url') + if hls_url: + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + hds_url = media.get('hds', {}).get('url') + if hds_url: + formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)) + + for c in media.get('content', []): + c_url = c.get('url') + if not c_url: + continue + tbr = int_or_none(c.get('bitrate'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % tbr + formats.append({ + 'format_id': format_id, + 'url': c_url, + 'tbr': tbr, + 'width': int_or_none(c.get('width')), + 'height': int_or_none(c.get('height')), + 'filesize': int_or_none(c.get('fileSize')), + 'vcodec': c.get('type'), + 'fps': int_or_none(c.get('videoFrameRate')), + 'vbr': int_or_none(c.get('videoRate'), 1000), + 'abr': int_or_none(c.get('audioRate'), 1000), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video['title'], + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': int_or_none(video.get('publishedTime'), 1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py new file mode 100644 index 0000000000..b15906390d --- /dev/null +++ b/youtube_dl/extractor/periscope.py @@ -0,0 +1,189 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + unescapeHTML, +) + + +class PeriscopeBaseIE(InfoExtractor): + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) + + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast.get('status') or 'Periscope Broadcast' + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + is_live = broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return m3u8_formats + + +class PeriscopeIE(PeriscopeBaseIE): + IE_DESC = 'Periscope' + IE_NAME = 'periscope' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' + # Alive example URLs can be found here https://www.periscope.tv/ + _TESTS = [{ + 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', + 'md5': '65b57957972e503fcbbaeed8f4fa04ca', + 'info_dict': { + 'id': '56102209', + 'ext': 'mp4', + 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', + 'timestamp': 1438978559, + 'upload_date': '20150807', + 'uploader': 'Bec Boop', + 'uploader_id': '1465763', + }, + 'skip': 'Expires in 24 hours', + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', + 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + token = self._match_id(url) + + stream = self._call_api( + 'accessVideoPublic', {'broadcast_id': token}, token) + + broadcast = stream['broadcast'] + info = self._parse_broadcast_data(broadcast, token) + + state = broadcast.get('state').lower() + width = int_or_none(broadcast.get('width')) + height = int_or_none(broadcast.get('height')) + + def add_width_and_height(f): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + video_urls = set() + formats = [] + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): + video_url = stream.get(format_id + '_url') + if not video_url or video_url in video_urls: + continue + video_urls.add(video_url) + if format_id != 'rtmp': + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) + formats.extend(m3u8_formats) + continue + rtmp_format = { + 'url': video_url, + 'ext': 'flv' if format_id == 'rtmp' else 'mp4', + } + self._add_width_and_height(rtmp_format) + formats.append(rtmp_format) + self._sort_formats(formats) + + info['formats'] = formats + return info + + +class PeriscopeUserIE(PeriscopeBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + 'description': 'md5:6cf4ec8047768098da58e446e82c82f0', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_name = self._match_id(url) + + webpage = self._download_webpage(url, user_name) + + data_store = self._parse_json( + unescapeHTML(self._search_regex( + r'data-store=(["\'])(?P<data>.+?)\1', + webpage, 'data store', default='{}', group='data')), + user_name) + + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name + description = user.get('description') + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) + for broadcast_id in broadcast_ids] + + return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py new file mode 100644 index 0000000000..03da64b116 --- /dev/null +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + urljoin, +) + + +class PhilharmonieDeParisIE(InfoExtractor): + IE_DESC = 'Philharmonie de Paris' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { + 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', + 'info_dict': { + 'id': '1032066', + 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', + 'only_matching': True, + }, { + 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', + 'only_matching': True, + }, { + 'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'only_matching': True, + }, { + 'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'only_matching': True, + }] + _LIVE_URL = 'https://live.philharmoniedeparis.fr' + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) + + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: + continue + format_urls.add(format_url) + m3u8_url = urljoin(self._LIVE_URL, format_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats: + return + self._sort_formats(formats) + return { + 'title': title, + 'formats': formats, + } + + thumbnail = urljoin(self._LIVE_URL, config.get('image')) + + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + 'thumbnail': thumbnail, + }) + return info + + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) + + return self.playlist_result(entries, video_id, config.get('title')) diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py new file mode 100644 index 0000000000..8d52ad3b4a --- /dev/null +++ b/youtube_dl/extractor/phoenix.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PhoenixIE(InfoExtractor): + IE_NAME = 'phoenix.de' + _VALID_URL = r'''https?://(?:www\.)?phoenix.de/\D+(?P<id>\d+)\.html''' + _TESTS = [ + { + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/unsere-welt-in-zukunft---stadt-a-1283620.html', + 'md5': '5e765e838aa3531c745a4f5b249ee3e3', + 'info_dict': { + 'id': '0OB4HFc43Ns', + 'ext': 'mp4', + 'title': 'Unsere Welt in Zukunft - Stadt', + 'description': 'md5:9bfb6fd498814538f953b2dcad7ce044', + 'upload_date': '20190912', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', + } + }, + { + 'url': 'https://www.phoenix.de/drohnenangriffe-in-saudi-arabien-a-1286995.html?ref=aktuelles', + 'only_matching': True, + }, + # an older page: https://www.phoenix.de/sendungen/gespraeche/phoenix-persoenlich/im-dialog-a-177727.html + # seems to not have an embedded video, even though it's uploaded on youtube: https://www.youtube.com/watch?v=4GxnoUHvOkM + ] + + def extract_from_json_api(self, video_id, api_url): + doc = self._download_json( + api_url, video_id, + note="Downloading webpage metadata", + errnote="Failed to load webpage metadata") + + for a in doc["absaetze"]: + if a["typ"] == "video-youtube": + return { + '_type': 'url_transparent', + 'id': a["id"], + 'title': doc["titel"], + 'url': "https://www.youtube.com/watch?v=%s" % a["id"], + 'ie_key': 'Youtube', + } + raise ExtractorError("No downloadable video found", expected=True) + + def _real_extract(self, url): + page_id = self._match_id(url) + api_url = 'https://www.phoenix.de/response/id/%s' % page_id + return self.extract_from_json_api(page_id, api_url) diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py new file mode 100644 index 0000000000..6c8bbe1d95 --- /dev/null +++ b/youtube_dl/extractor/photobucket.py @@ -0,0 +1,46 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class PhotobucketIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _TEST = { + 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', + 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', + 'info_dict': { + 'id': 'zpsc0c3b9fa', + 'ext': 'mp4', + 'timestamp': 1367669341, + 'upload_date': '20130504', + 'uploader': 'rachaneronas', + 'title': 'Tired of Link Building? Try BacklinkMyDomain.com!', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + video_extension = mobj.group('ext') + + webpage = self._download_webpage(url, video_id) + + # Extract URL, uploader, and title from webpage + self.report_extraction(video_id) + info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', + webpage, 'info json') + info = json.loads(info_json) + url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + return { + 'id': video_id, + 'url': url, + 'uploader': info['username'], + 'timestamp': info['creationDate'], + 'title': info['title'], + 'ext': video_extension, + 'thumbnail': info['thumbUrl'], + } diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py new file mode 100644 index 0000000000..8099ef1d6b --- /dev/null +++ b/youtube_dl/extractor/picarto.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PicartoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?' + _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'skip': 'Stream is offline', + } + + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) + + if metadata.get('online') is False: + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Downloading load balancing info') + + token = mobj.group('token') or 'public' + params = { + 'con': int(time.time() * 1000), + 'token': token, + } + + prefered_edge = cdn_data.get('preferedEdge') + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue + self._sort_formats(formats) + + mature = metadata.get('adult') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + + return { + 'id': channel_id, + 'title': self._live_title(metadata.get('title') or channel_id), + 'is_live': True, + 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), + 'channel': channel_id, + 'channel_url': 'https://picarto.tv/%s' % channel_id, + 'age_limit': age_limit, + 'formats': formats, + } + + +class PicartoVodIE(InfoExtractor): + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', + 'info_dict': { + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', + 'ext': 'mp4', + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py new file mode 100644 index 0000000000..88b6859b01 --- /dev/null +++ b/youtube_dl/extractor/piksel.py @@ -0,0 +1,138 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + dict_get, + int_or_none, + unescapeHTML, + parse_iso8601, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P<id>[a-z0-9_]+)' + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/ums2867l', + 'md5': '34e34c8d89dc2559976a6079db531e85', + 'info_dict': { + 'id': 'ums2867l', + 'ext': 'mp4', + 'title': 'GX-005 with Caption', + 'timestamp': 1481335659, + 'upload_date': '20161210' + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204' + } + }, + { + # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ + 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-de-program-uuid=[\'"]([a-z0-9]+)', + webpage, 'program uuid', default=display_id) + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') + response = self._download_json( + 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, + video_id, query={ + 'v': video_id + })['response'] + failure = response.get('failure') + if failure: + raise ExtractorError(response['failure']['reason'], expected=True) + video_data = response['WsProgramResponse']['program']['asset'] + title = video_data['title'] + + formats = [] + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + for asset_file in video_data.get('assetFiles', []): + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + continue + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + format_id = ['http'] + if tbr: + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + if caption_url: + subtitles.setdefault(caption.get('locale', 'en'), []).append({ + 'url': caption_url}) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py new file mode 100644 index 0000000000..9f3501f777 --- /dev/null +++ b/youtube_dl/extractor/pinkbike.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + remove_start, + str_to_int, + unified_strdate, +) + + +class PinkbikeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.pinkbike.com/video/402811/', + 'md5': '4814b8ca7651034cd87e3361d5c2155a', + 'info_dict': { + 'id': '402811', + 'ext': 'mp4', + 'title': 'Brandon Semenuk - RAW 100', + 'description': 'Official release: www.redbull.ca/rupertwalker', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 100, + 'upload_date': '20150406', + 'uploader': 'revelco', + 'location': 'Victoria, British Columbia, Canada', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.pinkbike.com/video/%s' % video_id, video_id) + + formats = [] + for _, format_id, src in re.findall( + r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + }) + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') + description = self._html_search_regex( + r'(?s)id="media-description"[^>]*>(.+?)<', + webpage, 'description', default=None) or remove_start( + self._og_search_description(webpage), title + '. ') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) + + uploader = self._search_regex( + r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage, + 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class="fullTime"[^>]+title="([^"]+)"', + webpage, 'upload date', fatal=False)) + + location = self._html_search_regex( + r'(?s)<dt>Location</dt>\s*<dd>(.+?)<', + webpage, 'location', fatal=False) + + def extract_count(webpage, label): + return str_to_int(self._search_regex( + r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label, + webpage, label, fatal=False)) + + view_count = extract_count(webpage, 'Views') + comment_count = extract_count(webpage, 'Comments') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader': uploader, + 'location': location, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats + } diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 0000000000..e86c65396b --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + xpath_text, + qualities, +) + + +class PladformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + out\.pladform\.ru/player| + static\.pladform\.ru/player\.swf + ) + \?.*\bvideoid=| + video\.pladform\.ru/catalog/video/videoid/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', + 'info_dict': { + 'id': '3777899', + 'ext': 'mp4', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3190, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + + video = self._download_xml( + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) + + if video.tag == 'error': + fail(video.text) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', fatal=True) + description = self._search_regex( + r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py new file mode 100644 index 0000000000..23c8256b59 --- /dev/null +++ b/youtube_dl/extractor/platzi.py @@ -0,0 +1,224 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class PlatziBaseIE(InfoExtractor): + _LOGIN_URL = 'https://platzi.com/login/' + _NETRC_MACHINE = 'platzi' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username, + 'password': password, + }) + + urlh = self._request_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + # login succeeded + if 'platzi.com/login' not in urlh.geturl(): + return + + login_error = self._webpage_read_content( + urlh, self._LOGIN_URL, None, 'Downloading login error page') + + login = self._parse_json( + self._search_regex( + r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), + None) + + for kind in ('error', 'password', 'nonFields'): + error = str_or_none(login.get('%sError' % kind)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class PlatziIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P<id>\d+)-[^/?\#&]+ + ''' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + data = self._parse_json( + self._search_regex( + # client_data may contain "};" so that we have to try more + # strict regex first + (r'client_data\s*=\s*({.+?})\s*;\s*\n', + r'client_data\s*=\s*({.+?})\s*;'), + webpage, 'client data'), + lecture_id) + + material = data['initialState']['material'] + desc = material['description'] + title = desc['title'] + + formats = [] + for server_id, server in material['videos'].items(): + if not isinstance(server, dict): + continue + for format_id in ('hls', 'dash'): + format_url = url_or_none(server.get(format_id)) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, lecture_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % server_id, + fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, lecture_id, mpd_id=format_id, + note='Downloading %s MPD manifest' % server_id, + fatal=False)) + self._sort_formats(formats) + + content = str_or_none(desc.get('content')) + description = (clean_html(compat_b64decode(content).decode('utf-8')) + if content else None) + duration = int_or_none(material.get('duration'), invscale=60) + + return { + 'id': lecture_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + } + + +class PlatziCourseIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/(?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://platzi.com/clases/next-js/', + 'info_dict': { + 'id': '1311', + 'title': 'Curso de Next.js', + }, + 'playlist_count': 22, + }, { + 'url': 'https://courses.platzi.com/classes/communication-codestream/', + 'info_dict': { + 'id': '1367', + 'title': 'Codestream Course', + }, + 'playlist_count': 14, + }] + + @classmethod + def suitable(cls, url): + return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + webpage = self._download_webpage(url, course_name) + + props = self._parse_json( + self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), + course_name)['initialProps'] + + entries = [] + for chapter_num, chapter in enumerate(props['concepts'], 1): + if not isinstance(chapter, dict): + continue + materials = chapter.get('materials') + if not materials or not isinstance(materials, list): + continue + chapter_title = chapter.get('title') + chapter_id = str_or_none(chapter.get('id')) + for material in materials: + if not isinstance(material, dict): + continue + if material.get('material_type') != 'video': + continue + video_url = urljoin(url, material.get('url')) + if not video_url: + continue + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'title': str_or_none(material.get('name')), + 'id': str_or_none(material.get('id')), + 'ie_key': PlatziIE.ie_key(), + 'chapter': chapter_title, + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + }) + + course_id = compat_str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: x['course']['name'], compat_str) + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py new file mode 100644 index 0000000000..e766ccca32 --- /dev/null +++ b/youtube_dl/extractor/playfm.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class PlayFMIE(InfoExtractor): + IE_NAME = 'play.fm' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' + + _TEST = { + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', + 'md5': 'c505f8307825a245d0c7ad1850001f22', + 'info_dict': { + 'id': '71276', + 'ext': 'mp3', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', + 'view_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + slug = mobj.group('slug') + + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) + + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('message')), + expected=True) + + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + } diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py new file mode 100644 index 0000000000..1e30ab23a0 --- /dev/null +++ b/youtube_dl/extractor/playplustv.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': email, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = re.match(self._VALID_URL, url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py new file mode 100644 index 0000000000..ddfc6f1486 --- /dev/null +++ b/youtube_dl/extractor/plays.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PlaysTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' + _TESTS = [{ + 'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', + 'md5': 'dfeac1198506652b5257a62762cec7bc', + 'info_dict': { + 'id': '56af17f56c95335490', + 'ext': 'mp4', + 'title': 'Bjergsen - When you outplay the Azir wall', + 'description': 'Posted by Bjergsen', + } + }, { + 'url': 'https://plays.tv/embeds/56af17f56c95335490', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://plays.tv/video/%s' % video_id, video_id) + + info = self._search_json_ld(webpage, video_id,) + + mpd_url, sources = re.search( + r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', + webpage).groups() + formats = self._extract_mpd_formats( + self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') + for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): + formats.append({ + 'url': self._proto_relative_url(format_url), + 'format_id': 'http-' + format_id, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'description': self._og_search_description(webpage), + 'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + }) + + return info diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py new file mode 100644 index 0000000000..4c5f57919b --- /dev/null +++ b/youtube_dl/extractor/playtvak.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) + + +class PlaytvakIE(InfoExtractor): + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'description': 'md5:4436e61b7df227a093778efb7e373571', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, + } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info_url = self._html_search_regex( + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + + parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + item = None + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': + item = i + break + if not item: + raise ExtractorError('No suitable stream found') + + quality = qualities(('low', 'middle', 'high')) + + formats = [] + for fmt in item['video']: + video_url = fmt.get('file') + if not video_url: + continue + + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ('mp4', 'webm'): + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + preference = -1 + elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests + continue + else: # Other formats not supported yet + continue + + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) + self._sort_formats(formats) + + title = item['title'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': item.get('image'), + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + } diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py new file mode 100644 index 0000000000..4aef186ea2 --- /dev/null +++ b/youtube_dl/extractor/playvid.py @@ -0,0 +1,99 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, +) +from ..utils import ( + clean_html, + ExtractorError, +) + + +class PlayvidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' + _TESTS = [{ + 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', + 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', + 'info_dict': { + 'id': 'RnmBNgtrrJu', + 'ext': 'mp4', + 'title': 'md5:9256d01c6317e3f703848b5906880dc8', + 'duration': 82, + 'age_limit': 18, + }, + 'skip': 'Video removed due to ToS', + }, { + 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', + 'md5': '39d49df503ad7b8f23a4432cbf046477', + 'info_dict': { + 'id': 'hwb0GpNkzgH', + 'ext': 'mp4', + 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', + 'age_limit': 18, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m_error = re.search( + r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage) + if m_error: + raise ExtractorError(clean_html(m_error.group('msg')), expected=True) + + video_title = None + duration = None + video_thumbnail = None + formats = [] + + # most of the information is stored in the flashvars + flashvars = self._html_search_regex( + r'flashvars="(.+?)"', webpage, 'flashvars') + + infos = compat_urllib_parse_unquote(flashvars).split(r'&') + for info in infos: + videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) + if videovars_match: + key = videovars_match.group(1) + val = videovars_match.group(2) + + if key == 'title': + video_title = compat_urllib_parse_unquote_plus(val) + if key == 'duration': + try: + duration = int(val) + except ValueError: + pass + if key == 'big_thumb': + video_thumbnail = val + + videourl_match = re.match( + r'^video_urls\]\[(?P<resolution>[0-9]+)p', key) + if videourl_match: + height = int(videourl_match.group('resolution')) + formats.append({ + 'height': height, + 'url': val, + }) + self._sort_formats(formats) + + # Extract title - should be in the flashvars; if not, look elsewhere + if video_title is None: + video_title = self._html_search_regex( + r'<title>(.*?)</title', webpage, 'title') + + return { + 'id': video_id, + 'formats': formats, + 'title': video_title, + 'thumbnail': video_thumbnail, + 'duration': duration, + 'description': None, + 'age_limit': 18 + } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py new file mode 100644 index 0000000000..4d96a10a71 --- /dev/null +++ b/youtube_dl/extractor/playwire.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 145.94, + }, + }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing + 'url': 'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0 + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py new file mode 100644 index 0000000000..abd08bc285 --- /dev/null +++ b/youtube_dl/extractor/pluralsight.py @@ -0,0 +1,501 @@ +from __future__ import unicode_literals + +import collections +import json +import os +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + qualities, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PluralsightBaseIE(InfoExtractor): + _API_BASE = 'https://app.pluralsight.com' + + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' +query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + + +class PluralsightIE(PluralsightBaseIE): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' + _LOGIN_URL = 'https://app.pluralsight.com/id/' + + _NETRC_MACHINE = 'pluralsight' + + _TESTS = [{ + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + }, { + 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', + 'only_matching': True, + }, { + # available without pluralsight account + 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, + }] + + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username, + 'Password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error = self._search_regex( + r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): + BLOCKED = 'Your account has been blocked due to suspicious activity' + if BLOCKED in response: + raise ExtractorError( + 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' + % MUST_AGREE, expected=True) + + raise ExtractorError('Unable to log in') + + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + author = qs.get('author', [None])[0] + name = qs.get('name', [None])[0] + clip_idx = qs.get('clip', [None])[0] + course_name = qs.get('course', [None])[0] + + if any(not f for f in (author, name, clip_idx, course_name,)): + raise ExtractorError('Invalid URL', expected=True) + + display_id = '%s-%s' % (name, clip_idx) + + course = self._download_course(course_name, url, display_id) + + collection = course['modules'] + + clip = None + + for module_ in collection: + if name in (module_.get('moduleName'), module_.get('name')): + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + clip_index = clip_.get('index') + if clip_index is None: + continue + if compat_str(clip_index) == clip_idx: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, + } + + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + + ALLOWED_QUALITIES = ( + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), + ) + + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/ytdl-org/youtube-dl/issues/7766) + widescreen = course.get('supportsWideScreenVideoFormats') is True + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + allowed_quality.qualities.append(best_quality) + + # In order to minimize the number of calls to ViewClip API and reduce + # the probability of being throttled or banned by Pluralsight we will request + # only single format until formats listing was explicitly requested. + if self._downloader.params.get('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self._downloader.params.get('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, (best_quality, )), ) + allowed_qualities = guess_allowed_qualities() + + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + clip_post = { + 'author': author, + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), + } + format_id = '%s-%s' % (ext, quality) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + + # Pluralsight tracks multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see + # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead + # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). + # To somewhat reduce the probability of these consequences + # we will sleep random amount of time before each call to ViewClip. + self._sleep( + random.randint(2, 5), display_id, + '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + + if not viewclip: + continue + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + + self._sort_formats(formats) + + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? + subtitles = self.extract_subtitles( + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) + + return { + 'id': clip_id, + 'title': title, + 'duration': duration, + 'creator': author, + 'formats': formats, + 'subtitles': subtitles, + } + + +class PluralsightCourseIE(PluralsightBaseIE): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' + _TESTS = [{ + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + }, { + # available without pluralsight account + 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + # TODO: PSM cookie + + course = self._download_course(course_id, url, course_id) + + title = course['title'] + course_name = course['name'] + course_data = course['modules'] + description = course.get('description') or course.get('shortDescription') + + entries = [] + for num, module in enumerate(course_data, 1): + author = module.get('author') + module_name = module.get('name') + if not author or not module_name: + continue + for clip in module.get('clips', []): + clip_index = int_or_none(clip.get('index')) + if clip_index is None: + continue + clip_url = update_url_query( + '%s/player' % self._API_BASE, query={ + 'mode': 'live', + 'course': course_name, + 'author': author, + 'name': module_name, + 'clip': clip_index, + }) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) + + return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py new file mode 100644 index 0000000000..e782e3f1fd --- /dev/null +++ b/youtube_dl/extractor/podomatic.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PodomaticIE(InfoExtractor): + IE_NAME = 'podomatic' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' + + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + channel = mobj.group('channel') or mobj.group('channel_2') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, 'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + if not video_url: + video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int_or_none(data.get('length'), 1000) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py new file mode 100644 index 0000000000..80222d4283 --- /dev/null +++ b/youtube_dl/extractor/pokemon.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', + 'info_dict': { + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', + 'ext': 'mp4', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', + 'info_dict': { + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', + 'ext': 'mp4', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 0000000000..978d6f813b --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,180 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, + compat_urlparse +) +from ..utils import ( + extract_attributes, + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = self._html_search_regex( + r'<title>([^<]+) - [^<]+ - [^<]+', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py new file mode 100644 index 0000000000..7bf7f98587 --- /dev/null +++ b/youtube_dl/extractor/popcorntimes.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_chr, +) +from ..utils import int_or_none + + +class PopcorntimesIE(InfoExtractor): + _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', + 'md5': '93f210991ad94ba8c3485950a2453257', + 'info_dict': { + 'id': 'A1XCFvz', + 'display_id': 'haensel-und-gretel-opera-fantasy', + 'ext': 'mp4', + 'title': 'Hänsel und Gretel', + 'description': 'md5:1b8146791726342e7b22ce8125cf6945', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'John Paul', + 'release_date': '19541009', + 'duration': 4260, + 'tbr': 5380, + 'width': 720, + 'height': 540, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'

    ([^<]+)', webpage, 'title', + default=None) or self._html_search_meta( + 'ya:ovs:original_name', webpage, 'title', fatal=True) + + loc = self._search_regex( + r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', + group='value') + + loc_b64 = '' + for c in loc: + c_ord = ord(c) + if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): + upper = ord('Z') if c_ord <= ord('Z') else ord('z') + c_ord += 13 + if upper < c_ord: + c_ord -= 26 + loc_b64 += compat_chr(c_ord) + + video_url = compat_b64decode(loc_b64).decode('utf-8') + + description = self._html_search_regex( + r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py new file mode 100644 index 0000000000..9f834fb6ce --- /dev/null +++ b/youtube_dl/extractor/popcorntv.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r']+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)]+itemprop=["\']description[^>]*>(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py new file mode 100644 index 0000000000..20eac647a8 --- /dev/null +++ b/youtube_dl/extractor/porn91.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, + ExtractorError, +) + + +class Porn91IE(InfoExtractor): + IE_NAME = '91porn' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P[\w\d]+)' + + _TEST = { + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': '7fcdb5349354f40d41689bd0fa8db05a', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'ext': 'mp4', + 'duration': 431, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('91porn.com', 'language', 'cn_CN') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) + + if '作为游客,你每天只可观看10个视频' in webpage: + raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + + title = self._search_regex( + r'
    ([^<]+)
    ', webpage, 'title') + title = title.replace('\n', '') + + video_link_url = self._search_regex( + r']+id=["\']fm-video_link[^>]+>([^<]+)', + webpage, 'video link') + videopage = self._download_webpage(video_link_url, video_id) + + info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + + comment_count = int_or_none(self._search_regex( + r'留言:\s*\s*(\d+)', webpage, 'comment count', fatal=False)) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'duration': duration, + 'comment_count': comment_count, + 'age_limit': self._rta_search(webpage), + }) + + return info_dict diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py new file mode 100644 index 0000000000..5726cab3ae --- /dev/null +++ b/youtube_dl/extractor/porncom.py @@ -0,0 +1,103 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'([^<]+)', r']*>([^<]+)

    '), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r']+href="(/download/[^"]+)">[^<]*?(\d+)p]*>(\d+\s*[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + (r'Views:\s*\s*\s*([\d,.]+)', + r'class=["\']views["\'][^>]*>

    ([\d,.]+)'), webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + (r'(?s)%s:\s*\s*(.+?)' % kind.capitalize(), + r'(?s)]*>%s:(.+?)

    ' % kind.capitalize()), + webpage, kind, fatal=False) + return re.findall(r']+>([^<]+)', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py new file mode 100644 index 0000000000..c6052ac9f9 --- /dev/null +++ b/youtube_dl/extractor/pornhd.py @@ -0,0 +1,121 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + merge_dicts, + urljoin, +) + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P\d+)(?:/(?P.+))?' + _TESTS = [{ + 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', + 'info_dict': { + 'id': '9864', + 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'ext': 'mp4', + 'title': 'Restroom selfie masturbation', + 'description': 'md5:3748420395e03e31ac96857a8f125b2b', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', + 'info_dict': { + 'id': '1962', + 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'ext': 'mp4', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', + 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id or video_id) + + title = self._html_search_regex( + [r']+class=["\']video-name["\'][^>]*>([^<]+)', + r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') + + sources = self._parse_json(js_to_json(self._search_regex( + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", + webpage, 'sources', default='{}')), video_id) + + info = {} + if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: + message = self._html_search_regex( + r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class=["\']video-description[^>]+>(?P.+?)', + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, + 'thumbnail', default=None, group='url') + + like_count = int_or_none(self._search_regex( + (r'(\d+)
    \s*likes', + r'(\d+)\s*]+>(?: |\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + + return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'like_count': like_count, + 'formats': formats, + 'age_limit': 18, + }) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py new file mode 100644 index 0000000000..3567a32839 --- /dev/null +++ b/youtube_dl/extractor/pornhub.py @@ -0,0 +1,611 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import itertools +import operator +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urllib_request, +) +from .openload import PhantomJSwrapper +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + NO_DEFAULT, + orderedSet, + remove_quotes, + str_to_int, + url_or_none, +) + + +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P[\da-z]+) + ''' + _TESTS = [{ + 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', + 'md5': '1e19b41231a02eba417839222ac9d58e', + 'info_dict': { + 'id': '648719015', + 'ext': 'mp4', + 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', + 'uploader': 'Babes', + 'upload_date': '20130628', + 'duration': 361, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + }, { + # non-ASCII title + 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'uploader': 'Unknown', + 'upload_date': '20150213', + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', + 'only_matching': True, + }, { + # removed at the request of cam4.com + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + webpage) + + def _extract_count(self, pattern, webpage, name): + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') + + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise ExtractorError( + 'PornHub Premium requires authentication.' + ' You may want to use --cookies.', + expected=True) + + self._set_cookie(host, 'age_verified', '1') + + def dl_webpage(platform): + self._set_cookie(host, 'platform', platform) + return self._download_webpage( + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) + + webpage = dl_webpage('pc') + + error_msg = self._html_search_regex( + r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + webpage, 'error message', default=None, group='error') + if error_msg: + error_msg = re.sub(r'\s+', ' ', error_msg) + raise ExtractorError( + 'PornHub said: %s' % error_msg, + expected=True, video_id=video_id) + + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. + title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)]+class=["\']title["\'][^>]*>(?P.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title') + + video_urls = [] + video_urls_set = set() + subtitles = {} + + flashvars = self._parse_json( + self._search_regex( + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + video_id) + if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) + else: + thumbnail, duration = [None] * 2 + + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): + assignments = self._search_regex( + pattern, webpage, 'encoded url', default=default) + if not assignments: + return {} + + assignments = assignments.split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + return js_vars + + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + default=None) + if js_vars: + for key, format_url in js_vars.items(): + if any(key.startswith(p) for p in FORMAT_PREFIXES): + add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') + add_video_url(js_vars['mediastring']) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + upload_date = None + formats = [] + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + ext = determine_ext(video_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) + + video_uploader = self._html_search_regex( + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + webpage, 'uploader', fatal=False) + + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') + like_count = self._extract_count( + r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count( + r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + comment_count = self._extract_count( + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') + + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'formats': formats, + 'age_limit': 18, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'subtitles': subtitles, + } + + +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_entries(self, webpage, host): + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/ytdl-org/youtube-dl/issues/11594). + container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + + return [ + self.url_result( + 'http://www.%s/%s' % (host, video_url), + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + + entries = self._extract_entries(webpage, host) + + playlist = self._parse_json( + self._search_regex( + r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, + 'playlist', default='{}'), + playlist_id, fatal=False) + title = playlist.get('title') or self._search_regex( + r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) + + return self.playlist_result( + entries, playlist_id, title, playlist.get('description')) + + +class PornHubUserIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph', + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious', + 'info_dict': { + 'id': 'liz-vicious', + }, + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/users/russianveet69', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/channels/povd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('id') + return self.url_result( + '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), + video_id=user_id) + + +class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): + @staticmethod + def _has_more(webpage): + return re.search( + r'''(?x) + <li[^>]+\bclass=["\']page_next| + <link[^>]+\brel=["\']next| + <button[^>]+\bid=["\']moreDataBtn + ''', webpage) is not None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + page = int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + + entries = [] + for page_num in (page, ) if page is not None else itertools.count(1): + try: + webpage = self._download_webpage( + url, item_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + raise + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + entries.extend(page_entries) + if not self._has_more(webpage): + break + + return self.playlist_result(orderedSet(entries), item_id) + + +class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph/videos', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 149, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 40, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'channels/povd/videos', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', + 'only_matching': True, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', + 'only_matching': True, + }, { + # Longest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', + 'only_matching': True, + }, { + # Newest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/search?search=123', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video?page=2', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/playlist/44121572', + 'info_dict': { + 'id': 'playlist/44121572', + }, + 'playlist_mincount': 132, + }, { + 'url': 'https://www.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False + if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) + else super(PornHubPagedVideoListIE, cls).suitable(url)) + + +class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'info_dict': { + 'id': 'jenny-blighe', + }, + 'playlist_mincount': 129, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py new file mode 100644 index 0000000000..1b5b9a320d --- /dev/null +++ b/youtube_dl/extractor/pornotube.py @@ -0,0 +1,85 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PornotubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science', + 'md5': '60fc5a4f0d93a97968fc7999d98260c9', + 'info_dict': { + 'id': '4964', + 'ext': 'mp4', + 'upload_date': '20141203', + 'title': 'Weird Hot and Wet Science', + 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', + 'categories': ['Adult Humor', 'Blondes'], + 'uploader': 'Alpha Blue Archives', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1417582800, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] + + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] + + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) + + info = self._download_json( + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] + + timestamp = int_or_none(info.get('publishDate'), scale=1000) + uploader = info.get('studios', [{}])[0].get('name') + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': info.get('description'), + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'thumbnail': thumbnail, + 'categories': categories, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py new file mode 100644 index 0000000000..b6b71069d3 --- /dev/null +++ b/youtube_dl/extractor/pornovoisines.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + + +class PornoVoisinesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' + + _TEST = { + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', + 'info_dict': { + 'id': '919', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': 'Recherche appartement', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140925', + 'duration': 120, + 'view_count': int, + 'average_rating': float, + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], + 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + # The webpage has a bug - there's no space between "thumb" and src= + thumbnail = self._html_search_regex( + r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') + + upload_date = unified_strdate(self._search_regex( + r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') + view_count = int_or_none(self._search_regex( + r'(\d+) vues', webpage, 'view count', fatal=False)) + average_rating = self._search_regex( + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) + if categories: + categories = [category.strip() for category in categories.split(',')] + + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'categories': categories, + 'age_limit': 18, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py new file mode 100644 index 0000000000..2831368b6a --- /dev/null +++ b/youtube_dl/extractor/pornoxo.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + str_to_int, +) + + +class PornoXOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' + _TEST = { + 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', + 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', + 'info_dict': { + 'id': '7564', + 'ext': 'flv', + 'title': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.groups() + + webpage = self._download_webpage(url, video_id) + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') + + view_count = str_to_int(self._html_search_regex( + r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) + + categories_str = self._html_search_regex( + r'<meta name="description" content=".*featuring\s*([^"]+)"', + webpage, 'categories', fatal=False) + categories = ( + None if categories_str is None + else categories_str.split(',')) + + video_data.update({ + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), + 'categories': categories, + 'view_count': view_count, + 'age_limit': 18, + }) + + return video_data diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py new file mode 100644 index 0000000000..b5c279203b --- /dev/null +++ b/youtube_dl/extractor/presstv.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import remove_start + + +class PressTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' + + _TEST = { + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', + 'info_dict': { + 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', + 'ext': 'mp4', + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + # extract video URL from webpage + video_url = self._hidden_inputs(webpage)['inpPlayback'] + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + _formats = [ + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') + ] + + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] + + # extract video metadata + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + upload_date = '%04d%02d%02d' % ( + int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), + ) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py new file mode 100644 index 0000000000..e470882922 --- /dev/null +++ b/youtube_dl/extractor/prosiebensat1.py @@ -0,0 +1,500 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from hashlib import sha1 +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + merge_dicts, + unified_strdate, +) + + +class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_BYPASS = False + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = [] + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + protocols = self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') + if server_token: + urls = (self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': server_token, + 'video_id': clip_id, + }, fatal=False) or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': float_or_none(video.get('duration')), + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): + IE_NAME = 'prosiebensat1' + IE_DESC = 'ProSiebenSat.1 Digital' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?:beta\.)? + (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video + ) + /(?P<id>.+) + ''' + + _TESTS = [ + { + # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 + # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction + 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', + 'info_dict': { + 'id': '2104602', + 'ext': 'mp4', + 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', + 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', + 'upload_date': '20131231', + 'duration': 5845.04, + 'series': 'CIRCUS HALLIGALLI', + 'season_number': 2, + 'episode': 'Episode 18 - Staffel 2', + 'episode_number': 18, + }, + }, + { + 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', + 'info_dict': { + 'id': '2570327', + 'ext': 'mp4', + 'title': 'Lady-Umstyling für Audrina', + 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', + 'upload_date': '20131014', + 'duration': 606.76, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Seems to be broken', + }, + { + 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', + 'info_dict': { + 'id': '2429369', + 'ext': 'mp4', + 'title': 'Countdown für die Autowerkstatt', + 'description': 'md5:809fc051a457b5d8666013bc40698817', + 'upload_date': '20140223', + 'duration': 2595.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', + 'info_dict': { + 'id': '2904997', + 'ext': 'mp4', + 'title': 'Sexy laufen in Ugg Boots', + 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', + 'upload_date': '20140122', + 'duration': 245.32, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', + 'info_dict': { + 'id': '2906572', + 'ext': 'mp4', + 'title': 'Im Interview: Kai Wiesinger', + 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', + 'upload_date': '20140203', + 'duration': 522.56, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', + 'info_dict': { + 'id': '2992323', + 'ext': 'mp4', + 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', + 'description': 'md5:2669cde3febe9bce13904f701e774eb6', + 'upload_date': '20141014', + 'duration': 2410.44, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', + 'info_dict': { + 'id': '3004256', + 'ext': 'mp4', + 'title': 'Schalke: Tönnies möchte Raul zurück', + 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', + 'upload_date': '20140226', + 'duration': 228.96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', + 'info_dict': { + 'id': '2572814', + 'ext': 'mp4', + 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', + 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'timestamp': 1382041620, + 'upload_date': '20131017', + 'duration': 469.88, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', + 'info_dict': { + 'id': '2156342', + 'ext': 'mp4', + 'title': 'Kurztrips zum Valentinstag', + 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', + 'duration': 307.24, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', + 'info_dict': { + 'id': '439664', + 'title': 'Episode 8 - Ganze Folge - Playlist', + 'description': 'md5:63b8963e71f481782aeea877658dec84', + }, + 'playlist_count': 2, + 'skip': 'This video is unavailable', + }, + { + # title in <h2 class="subtitle"> + 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', + 'info_dict': { + 'id': '4895826', + 'ext': 'mp4', + 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', + 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', + 'upload_date': '20170302', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'geo restricted to Germany', + }, + { + # geo restricted to Germany + 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, + { + 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', + 'only_matching': True, + }, + { + 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', + 'only_matching': True, + }, + ] + + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + _CLIENT_NAME = 'kolibri-2.0.19-splec4' + + _ACCESS_ID = 'x_prosiebenmaxx-de' + _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' + _IV = 'Aeluchoc6aevechuipiexeeboowedaok' + + _CLIPID_REGEXES = [ + r'"clip_id"\s*:\s+"(\d+)"', + r'clipid: "(\d+)"', + r'clip[iI]d=(\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', + r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId"\s*:\s*"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', + ] + _TITLE_REGEXES = [ + r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', + r'<header class="clearfix">\s*<h3>(.+?)</h3>', + r'<!-- start video -->\s*<h1>(.+?)</h1>', + r'<h1 class="att-name">\s*(.+?)</h1>', + r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', + r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', + r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', + r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', + ] + _DESCRIPTION_REGEXES = [ + r'<p itemprop="description">\s*(.+?)</p>', + r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', + r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', + r'<p class="att-description">\s*(.+?)\s*</p>', + r'<p class="video-description" itemprop="description">\s*(.+?)</p>', + r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', + ] + _UPLOAD_DATE_REGEXES = [ + r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', + r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', + r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', + r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', + ] + _PAGE_TYPE_REGEXES = [ + r'<meta name="page_type" content="([^"]+)">', + r"'itemType'\s*:\s*'([^']*)'", + ] + _PLAYLIST_ID_REGEXES = [ + r'content[iI]d=(\d+)', + r"'itemId'\s*:\s*'([^']*)'", + ] + _PLAYLIST_CLIP_REGEXES = [ + r'(?s)data-qvt=.+?<a href="([^"]+)"', + ] + + def _extract_clip(self, url, webpage): + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) + info = self._extract_video_info(url, clip_id) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate( + self._html_search_meta('og:published_time', webpage, + 'upload date', default=None) + or self._html_search_regex(self._UPLOAD_DATE_REGEXES, + webpage, 'upload date', default=None)) + + json_ld = self._search_json_ld(webpage, clip_id, default={}) + + return merge_dicts(info, { + 'id': clip_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + }, json_ld) + + def _extract_playlist(self, url, webpage): + playlist_id = self._html_search_regex( + self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_type = self._search_regex( + self._PAGE_TYPE_REGEXES, webpage, + 'page type', default='clip').lower() + if page_type == 'clip': + return self._extract_clip(url, webpage) + elif page_type == 'playlist': + return self._extract_playlist(url, webpage) + else: + raise ExtractorError( + 'Unsupported page type %s' % page_type, expected=True) diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py new file mode 100644 index 0000000000..ca71665e0f --- /dev/null +++ b/youtube_dl/extractor/puhutv.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PuhuTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle' + IE_NAME = 'puhutv' + _TESTS = [{ + # film + 'url': 'https://puhutv.com/sut-kardesler-izle', + 'md5': 'a347470371d56e1585d1b2c8dab01c96', + 'info_dict': { + 'id': '5085', + 'display_id': 'sut-kardesler', + 'ext': 'mp4', + 'title': 'Süt Kardeşler', + 'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4832.44, + 'creator': 'Arzu Film', + 'timestamp': 1561062602, + 'upload_date': '20190620', + 'release_year': 1976, + 'view_count': int, + 'tags': list, + }, + }, { + # episode, geo restricted, bypassable with --geo-verification-proxy + 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle', + 'only_matching': True, + }, { + # 4k, with subtitles + 'url': 'https://puhutv.com/dip-1-bolum-izle', + 'only_matching': True, + }] + _SUBTITLE_LANGS = { + 'English': 'en', + 'Deutsch': 'de', + 'عربى': 'ar' + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-izle' % display_id), + display_id)['data'] + + video_id = compat_str(info['id']) + show = info.get('title') or {} + title = info.get('name') or show['name'] + if info.get('display_name'): + title = '%s %s' % (title, info['display_name']) + + try: + videos = self._download_json( + 'https://puhutv.com/api/assets/%s/videos' % video_id, + display_id, 'Downloading video JSON', + headers=self.geo_verification_headers()) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self.raise_geo_restricted() + raise + + urls = [] + formats = [] + + for video in videos['data']['videos']: + media_url = url_or_none(video.get('url')) + if not media_url or media_url in urls: + continue + urls.append(media_url) + + playlist = video.get('is_playlist') + if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + + quality = int_or_none(video.get('quality')) + f = { + 'url': media_url, + 'ext': 'mp4', + 'height': quality + } + video_format = video.get('video_format') + is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False + if is_hls: + format_id = 'hls' + f['protocol'] = 'm3u8_native' + elif video_format == 'mp4': + format_id = 'http' + else: + continue + if quality: + format_id += '-%sp' % quality + f['format_id'] = format_id + formats.append(f) + self._sort_formats(formats) + + creator = try_get( + show, lambda x: x['producer']['name'], compat_str) + + content = info.get('content') or {} + + images = try_get( + content, lambda x: x['images']['wide'], dict) or {} + thumbnails = [] + for image_id, image_url in images.items(): + if not isinstance(image_url, compat_str): + continue + if not image_url.startswith(('http', '//')): + image_url = 'https://%s' % image_url + t = parse_resolution(image_id) + t.update({ + 'id': image_id, + 'url': image_url + }) + thumbnails.append(t) + + tags = [] + for genre in show.get('genres') or []: + if not isinstance(genre, dict): + continue + genre_name = genre.get('name') + if genre_name and isinstance(genre_name, compat_str): + tags.append(genre_name) + + subtitles = {} + for subtitle in content.get('subtitles') or []: + if not isinstance(subtitle, dict): + continue + lang = subtitle.get('language') + sub_url = url_or_none(subtitle.get('url') or subtitle.get('file')) + if not lang or not isinstance(lang, compat_str) or not sub_url: + continue + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'url': sub_url + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': info.get('description') or show.get('description'), + 'season_id': str_or_none(info.get('season_id')), + 'season_number': int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'release_year': int_or_none(show.get('released_at')), + 'timestamp': unified_timestamp(info.get('created_at')), + 'creator': creator, + 'view_count': int_or_none(content.get('watch_count')), + 'duration': float_or_none(content.get('duration_in_ms'), 1000), + 'tags': tags, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class PuhuTVSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay' + IE_NAME = 'puhutv:serie' + _TESTS = [{ + 'url': 'https://puhutv.com/deniz-yildizi-detay', + 'info_dict': { + 'title': 'Deniz Yıldızı', + 'id': 'deniz-yildizi', + }, + 'playlist_mincount': 205, + }, { + # a film detail page which is using same url with serie page + 'url': 'https://puhutv.com/kaybedenler-kulubu-detay', + 'only_matching': True, + }] + + def _extract_entries(self, seasons): + for season in seasons: + season_id = season.get('id') + if not season_id: + continue + page = 1 + has_more = True + while has_more is True: + season = self._download_json( + 'https://galadriel.puhutv.com/seasons/%s' % season_id, + season_id, 'Downloading page %s' % page, query={ + 'page': page, + 'per': 40, + }) + episodes = season.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + slug_path = str_or_none(ep.get('slugPath')) + if not slug_path: + continue + video_id = str_or_none(int_or_none(ep.get('id'))) + yield self.url_result( + 'https://puhutv.com/%s' % slug_path, + ie=PuhuTVIE.ie_key(), video_id=video_id, + video_title=ep.get('name') or ep.get('eventLabel')) + page += 1 + has_more = season.get('hasMore') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + info = self._download_json( + urljoin(url, '/api/slug/%s-detay' % playlist_id), + playlist_id)['data'] + + seasons = info.get('seasons') + if seasons: + return self.playlist_result( + self._extract_entries(seasons), playlist_id, info.get('name')) + + # For films, these are using same url with series + video_id = info.get('slug') or info['assets'][0]['slug'] + return self.url_result( + 'https://puhutv.com/%s-izle' % video_id, + PuhuTVIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py new file mode 100644 index 0000000000..80091b85f8 --- /dev/null +++ b/youtube_dl/extractor/puls4.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .prosiebensat1 import ProSiebenSat1BaseIE +from ..utils import ( + unified_strdate, + parse_duration, + compat_str, +) + + +class Puls4IE(ProSiebenSat1BaseIE): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)' + _TESTS = [{ + 'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118', + 'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03', + 'info_dict': { + 'id': '118118', + 'ext': 'flv', + 'title': 'Tobias Homberger von myclubs im #2min2miotalk', + 'description': 'md5:f9def7c5e8745d6026d8885487d91955', + 'upload_date': '20160830', + 'uploader': 'PULS_4', + }, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer', + 'only_matching': True, + }, { + 'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598', + 'only_matching': True, + }] + _TOKEN = 'puls4' + _SALT = '01!kaNgaiNgah1Ie4AeSha' + _CLIENT_NAME = '' + + def _real_extract(self, url): + path = self._match_id(url) + content_path = self._download_json( + 'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url'] + media = self._download_json( + 'http://www.puls4.com' + content_path, + content_path)['mediaCurrent'] + player_content = media['playerContent'] + info = self._extract_video_info(url, player_content['id']) + info.update({ + 'id': compat_str(media['objectId']), + 'title': player_content['title'], + 'description': media.get('description'), + 'thumbnail': media.get('previewLink'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(player_content.get('duration')), + 'episode': player_content.get('episodePartName'), + 'show': media.get('channel'), + 'season_id': player_content.get('seasonId'), + 'uploader': player_content.get('sourceCompany'), + }) + return info diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py new file mode 100644 index 0000000000..b8ac93a62c --- /dev/null +++ b/youtube_dl/extractor/pyvideo.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class PyvideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' + + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', + }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category = mobj.group('category') + video_id = mobj.group('id') + + entries = [] + + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) + + if data: + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) + + return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py new file mode 100644 index 0000000000..084308aeb8 --- /dev/null +++ b/youtube_dl/extractor/qqmusic.py @@ -0,0 +1,369 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re +import time + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + strip_jsonp, + unescapeHTML, +) + + +class QQMusicIE(InfoExtractor): + IE_NAME = 'qqmusic' + IE_DESC = 'QQ音乐' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', + 'info_dict': { + 'id': '004295Et37taLD', + 'ext': 'mp3', + 'title': '可惜没如果', + 'release_date': '20141227', + 'creator': '林俊杰', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'release_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'note': 'lyrics not in .lrc format', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', + 'info_dict': { + 'id': '001JyApY11tIp6', + 'ext': 'mp3', + 'title': 'Shadows Over Transylvania', + 'release_date': '19970225', + 'creator': 'Dark Funeral', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }] + + _FORMATS = { + 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, + 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, + 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} + } + + # Reference: m_r_GetRUin() in top_player.js + # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js + @staticmethod + def m_r_get_ruin(): + curMs = int(time.time() * 1000) % 1000 + return int(round(random.random() * 2147483647) * curMs % 1E10) + + def _real_extract(self, url): + mid = self._match_id(url) + + detail_info_page = self._download_webpage( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, + mid, note='Download song detail info', + errnote='Unable to get song detail info', encoding='gbk') + + song_name = self._html_search_regex( + r"songname:\s*'([^']+)'", detail_info_page, 'song name') + + publish_time = self._html_search_regex( + r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page, + 'publish time', default=None) + if publish_time: + publish_time = publish_time.replace('-', '') + + singer = self._html_search_regex( + r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None) + + lrc_content = self._html_search_regex( + r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>', + detail_info_page, 'LRC lyrics', default=None) + if lrc_content: + lrc_content = lrc_content.replace('\\n', '\n') + + thumbnail_url = None + albummid = self._search_regex( + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], + detail_info_page, 'album mid', default=None) + if albummid: + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ + % (albummid[-2:-1], albummid[-1], albummid) + + guid = self.m_r_get_ruin() + + vkey = self._download_json( + 'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid, + mid, note='Retrieve vkey', errnote='Unable to get vkey', + transform_source=strip_jsonp)['key'] + + formats = [] + for format_id, details in self._FORMATS.items(): + formats.append({ + 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' + % (details['prefix'], mid, details['ext'], vkey, guid), + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) + self._check_formats(formats, mid) + self._sort_formats(formats) + + actual_lrc_lyrics = ''.join( + line + '\n' for line in re.findall( + r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content)) + + info_dict = { + 'id': mid, + 'formats': formats, + 'title': song_name, + 'release_date': publish_time, + 'creator': singer, + 'description': lrc_content, + 'thumbnail': thumbnail_url + } + if actual_lrc_lyrics: + info_dict['subtitles'] = { + 'origin': [{ + 'ext': 'lrc', + 'data': actual_lrc_lyrics, + }] + } + return info_dict + + +class QQPlaylistBaseIE(InfoExtractor): + @staticmethod + def qq_static_url(category, mid): + return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) + + def get_singer_all_songs(self, singmid, num): + return self._download_webpage( + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, + query={ + 'format': 'json', + 'inCharset': 'utf8', + 'outCharset': 'utf-8', + 'platform': 'yqq', + 'needNewCode': 0, + 'singermid': singmid, + 'order': 'listen', + 'begin': 0, + 'num': num, + 'songstatus': 1, + }) + + def get_entries_from_page(self, singmid): + entries = [] + + default_num = 1 + json_text = self.get_singer_all_songs(singmid, default_num) + json_obj_all_songs = self._parse_json(json_text, singmid) + + if json_obj_all_songs['code'] == 0: + total = json_obj_all_songs['data']['total'] + json_text = self.get_singer_all_songs(singmid, total) + json_obj_all_songs = self._parse_json(json_text, singmid) + + for item in json_obj_all_songs['data']['list']: + if item['musicData'].get('songmid') is not None: + songmid = item['musicData']['songmid'] + entries.append(self.url_result( + r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) + + return entries + + +class QQMusicSingerIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:singer' + IE_DESC = 'QQ音乐 - 歌手' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' + _TEST = { + 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', + 'info_dict': { + 'id': '001BLpXF2DyJe2', + 'title': '林俊杰', + 'description': 'md5:870ec08f7d8547c29c93010899103751', + }, + 'playlist_mincount': 12, + } + + def _real_extract(self, url): + mid = self._match_id(url) + + entries = self.get_entries_from_page(mid) + singer_page = self._download_webpage(url, mid, 'Download singer page') + singer_name = self._html_search_regex( + r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) + singer_desc = None + + if mid: + singer_desc_page = self._download_xml( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, + 'Donwload singer description XML', + query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, + headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) + + singer_desc = singer_desc_page.find('./data/info/desc').text + + return self.playlist_result(entries, mid, singer_name, singer_desc) + + +class QQMusicAlbumIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:album' + IE_DESC = 'QQ音乐 - 专辑' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', + 'info_dict': { + 'id': '000gXCTb2AhRR1', + 'title': '我们都是这样长大的', + 'description': 'md5:179c5dce203a5931970d306aa9607ea6', + }, + 'playlist_count': 4, + }, { + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:a48823755615508a95080e81b51ba729', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + mid = self._match_id(url) + + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] + + entries = [ + self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album.get('name') + album_detail = album.get('desc') + if album_detail is not None: + album_detail = album_detail.strip() + + return self.playlist_result(entries, mid, album_name, album_detail) + + +class QQMusicToplistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:toplist' + IE_DESC = 'QQ音乐 - 排行榜' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', + 'info_dict': { + 'id': '123', + 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', + 'info_dict': { + 'id': '3', + 'title': '巅峰榜·欧美', + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'info_dict': { + 'id': '106', + 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) + + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] + + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') + return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + 'skip': 'playlist gone', + }, { + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', + 'info_dict': { + 'id': '1374105607', + 'title': '易入人心的华语民谣', + 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, + transform_source=strip_jsonp) + if not len(list_json.get('cdlist', [])): + if list_json.get('code'): + raise ExtractorError( + 'QQ Music said: error %d in fetching playlist info' % list_json['code'], + expected=True) + raise ExtractorError('Unable to get playlist info') + + cdlist = list_json['cdlist'][0] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] + + list_name = cdlist.get('dissname') + list_description = clean_html(unescapeHTML(cdlist.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py new file mode 100644 index 0000000000..e2202d6036 --- /dev/null +++ b/youtube_dl/extractor/r7.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P<id>[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'description': 'md5:01812008664be76a6479aa58ec865b72', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) + + title = video['title'] + + formats = [] + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) + self._sort_formats(formats) + + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } + + +class R7ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py new file mode 100644 index 0000000000..2c35f98451 --- /dev/null +++ b/youtube_dl/extractor/radiobremen.py @@ -0,0 +1,63 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class RadioBremenIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' + IE_NAME = 'radiobremen' + + _TEST = { + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', + 'info_dict': { + 'id': '141876', + 'ext': 'mp4', + 'duration': 178, + 'width': 512, + 'title': 'Druck auf Patrick Öztürk', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id + meta_doc = self._download_webpage( + meta_url, video_id, 'Downloading metadata') + title = self._html_search_regex( + r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title') + description = self._html_search_regex( + r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False) + duration = parse_duration(self._html_search_regex( + r'Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>', + meta_doc, 'duration', fatal=False)) + + page_doc = self._download_webpage( + url, video_id, 'Downloading video information') + mobj = re.search( + r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)", + page_doc) + video_url = ( + "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" % + (video_id, video_id, mobj.group("secret"), mobj.group('width'))) + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'width': int(mobj.group('width')), + }] + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': mobj.group('thumbnail'), + } diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py new file mode 100644 index 0000000000..a28b1a24c4 --- /dev/null +++ b/youtube_dl/extractor/radiocanada.py @@ -0,0 +1,171 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, + } + ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None + + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ + 'appCode': app_code, + 'idMedia': video_id, + }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas'] + + def get_meta(name): + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text + + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/pull/18609). + if get_meta('protectionType'): + self.report_warning('This video is probably DRM protected.') + + query = { + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + } + if self._claims: + query['claims'] = self._claims + v_data = self._call_api('validation/v2/', video_id, app_code, query) + v_url = v_data.get('url') + if not v_url: + error = v_data['message'] + if error == "Le contenu sélectionné n'est pas disponible dans votre pays": + raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) + if error == 'Le contenu sélectionné est disponible seulement en premium': + self.raise_login_required(error) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') + if closed_caption_url: + subtitles['fr'] = [{ + 'url': closed_caption_url, + 'ext': determine_ext(closed_caption_url, 'vtt'), + }] + + return { + 'id': video_id, + 'title': get_meta('Title') or get_meta('AV-nomEmission'), + 'description': get_meta('Description') or get_meta('ShortDescription'), + 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), + 'duration': int_or_none(get_meta('length')), + 'series': get_meta('Emission'), + 'season_number': int_or_none('SrcSaison'), + 'episode_number': int_or_none('SrcEpisode'), + 'upload_date': unified_strdate(get_meta('Date')), + 'subtitles': subtitles, + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_info(*re.match(self._VALID_URL, url).groups()) + + +class RadioCanadaAudioVideoIE(InfoExtractor): + IE_NAME = 'radiocanada:audiovideo' + _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', + 'info_dict': { + 'id': '7527184', + 'ext': 'mp4', + 'title': 'Barack Obama au Vietnam', + 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', + 'upload_date': '20160523', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py new file mode 100644 index 0000000000..2c06c8b1e4 --- /dev/null +++ b/youtube_dl/extractor/radiode.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RadioDeIE(InfoExtractor): + IE_NAME = 'radio.de' + _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' + _TEST = { + 'url': 'http://ndr2.radio.de/', + 'info_dict': { + 'id': 'ndr2', + 'ext': 'mp3', + 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:591c49c702db1a33751625ebfb67f273', + 'thumbnail': r're:^https?://.*\.png', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + radio_id = self._match_id(url) + webpage = self._download_webpage(url, radio_id) + jscode = self._search_regex( + r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n", + webpage, 'broadcast') + + broadcast = self._parse_json(jscode, radio_id) + title = self._live_title(broadcast['name']) + description = broadcast.get('description') or broadcast.get('shortDescription') + thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100') + + formats = [{ + 'url': stream['streamUrl'], + 'ext': stream['streamContentFormat'].lower(), + 'acodec': stream['streamContentFormat'], + 'abr': stream['bitRate'], + 'asr': stream['sampleRate'] + } for stream in broadcast['streamUrls']] + self._sort_formats(formats) + + return { + 'id': radio_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py new file mode 100644 index 0000000000..a8afc00146 --- /dev/null +++ b/youtube_dl/extractor/radiofrance.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RadioFranceIE(InfoExtractor): + _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + IE_NAME = 'radiofrance' + + _TEST = { + 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', + 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', + 'info_dict': { + 'id': 'one-one', + 'ext': 'ogg', + 'title': 'One to one', + 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + 'uploader': 'Thomas Hercouët', + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, 'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit">  © (.*?)</div>', + webpage, 'uploader', fatal=False) + + formats_str = self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, 'audio URLs') + formats = [ + { + 'format_id': fm[0], + 'url': fm[1], + 'vcodec': 'none', + 'preference': i, + } + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py new file mode 100644 index 0000000000..3f74f0c01f --- /dev/null +++ b/youtube_dl/extractor/radiojavan.py @@ -0,0 +1,83 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_resolution, + str_to_int, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class RadioJavanIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' + _TEST = { + 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', + 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', + 'info_dict': { + 'id': 'chaartaar-ashoobam', + 'ext': 'mp4', + 'title': 'Chaartaar - Ashoobam', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'upload_date': '20150215', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + download_host = self._download_json( + 'https://www.radiojavan.com/videos/video_host', video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }).get('host', 'https://host1.rjmusicmedia.com') + + webpage = self._download_webpage(url, video_id) + + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + upload_date = unified_strdate(self._search_regex( + r'class="date_added">Date added: ([^<]+)<', + webpage, 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class="views">Plays: ([\d,]+)', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) likes', + webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) dislikes', + webpage, 'dislike count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py new file mode 100644 index 0000000000..207a6c247f --- /dev/null +++ b/youtube_dl/extractor/rai.py @@ -0,0 +1,502 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_str, +) +from ..utils import ( + ExtractorError, + determine_ext, + find_xpath_attr, + fix_xml_ampersands, + GeoRestrictedError, + int_or_none, + parse_duration, + strip_or_none, + try_get, + unescapeHTML, + unified_strdate, + unified_timestamp, + update_url_query, + urljoin, + xpath_text, +) + + +class RaiBaseIE(InfoExtractor): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + + formats = [] + geoprotection = None + is_live = None + duration = None + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) + if media_url == 'http://download.rai.it/video_no_available.mp4': + continue + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m' or platform == 'flash': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) + + @staticmethod + def _extract_subtitles(url, subtitle_url): + subtitles = {} + if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = urljoin(url, subtitle_url) + STL_EXT = '.stl' + SRT_EXT = '.srt' + subtitles['it'] = [{ + 'ext': 'stl', + 'url': subtitle_url, + }] + if subtitle_url.endswith(STL_EXT): + srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT + subtitles['it'].append({ + 'ext': 'srt', + 'url': srt_url, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', + }, + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') + + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') + + title = media['name'] + + video = media['video'] + + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': value.replace('[RESOLUTION]', '600x400') + }) + + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], compat_str)) + + subtitles = self._extract_subtitles(url, video.get('subtitles')) + + info = { + 'id': video_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, + 'subtitles': subtitles, + } + + info.update(relinker_info) + return info + + +class RaiPlayLiveIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6eca31500550f9376819f174e5644754', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, + webpage, 'content id') + + return { + '_type': 'url_transparent', + 'ie_key': RaiPlayIE.ie_key(), + 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + 'id': video_id, + 'display_id': display_id, + } + + +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) + + entries = [] + for mobj in re.finditer( + r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _TESTS = [{ + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1758, + 'upload_date': '20140612', + } + }, { + # with ContentItem in many metas + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + } + }, { + # with ContentItem in og:url + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103', + } + }, { + # drawMediaRaiTV(...) + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20141221', + }, + }, { + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'params': { + 'skip_download': True, + }, + }, { + # HLS live stream with ContentItem in og:url + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Direct MMS URL + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', + 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, + }] + + def _extract_from_content_id(self, content_id, url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': [{ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }] + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(url, thumbnail_url), + }) + + subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + + info = { + 'id': content_id, + 'title': title, + 'description': strip_or_none(media.get('desc')), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'subtitles': subtitles, + } + + info.update(relinker_info) + + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + if content_item_id: + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py new file mode 100644 index 0000000000..5411ece218 --- /dev/null +++ b/youtube_dl/extractor/raywenderlich.py @@ -0,0 +1,179 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + merge_dicts, + try_get, + unescapeHTML, + unified_timestamp, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<course_id>[^/]+)/lessons/(?P<id>\d+) + ''' + + _TESTS = [{ + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Introduction', + 'description': 'md5:804d031b3efa9fcb49777d512d74f722', + 'timestamp': 1513906277, + 'upload_date': '20171222', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'only_matching': True, + }] + + @staticmethod + def _extract_video_id(data, lesson_id): + if not data: + return + groups = try_get(data, lambda x: x['groups'], list) or [] + if not groups: + return + for group in groups: + if not isinstance(group, dict): + continue + contents = try_get(data, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + ordinal = int_or_none(content.get('ordinal')) + if ordinal != lesson_id: + continue + video_id = content.get('identifier') + if video_id: + return compat_str(video_id) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + display_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, display_id) + + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image', webpage, 'thumbnail') + + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for subscribers', + expected=True) + + info = { + 'thumbnail': thumbnail, + } + + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) + + if not vimeo_id: + data = self._parse_json( + self._search_regex( + r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, + 'data collection', default='{}', group='data'), + display_id, transform_source=unescapeHTML, fatal=False) + video_id = self._extract_video_id( + data, lesson_id) or self._search_regex( + r'/videos/(\d+)/', thumbnail, 'video id') + headers = { + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', default=None) + if csrf_token: + headers['X-CSRF-Token'] = csrf_token + video = self._download_json( + 'https://videos.raywenderlich.com/api/v1/videos/%s.json' + % video_id, display_id, headers=headers)['video'] + vimeo_id = video['clips'][0]['provider_id'] + info.update({ + '_type': 'url_transparent', + 'title': video.get('name'), + 'description': video.get('description') or video.get( + 'meta_description'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('created_at')), + }) + + return merge_dicts(info, self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id)) + + +class RayWenderlichCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + videos\.raywenderlich\.com/courses| + (?:www\.)?raywenderlich\.com + )/ + (?P<id>[^/]+) + ''' + + _TEST = { + 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', + 'info_dict': { + 'title': 'Testing in iOS', + 'id': '3530-testing-in-ios', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 29, + } + + @classmethod + def suitable(cls, url): + return False if RayWenderlichIE.suitable(url) else super( + RayWenderlichCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + entries = [] + lesson_urls = set() + for lesson_url in re.findall( + r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): + if lesson_url in lesson_urls: + continue + lesson_urls.add(lesson_url) + entries.append(self.url_result( + urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py new file mode 100644 index 0000000000..ae7413fb5e --- /dev/null +++ b/youtube_dl/extractor/rbmaradio.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + int_or_none, + unified_timestamp, + update_url_query, +) + + +class RBMARadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', + 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', + 'info_dict': { + 'id': 'ford-lopatin-live-at-primavera-sound-2011', + 'ext': 'mp3', + 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') + + webpage = self._download_webpage(url, episode_id) + + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] + + title = episode['title'] + + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 192, 256)] + self._check_formats(formats, episode_id) + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) + + return { + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py new file mode 100644 index 0000000000..8c016a77de --- /dev/null +++ b/youtube_dl/extractor/rds.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + js_to_json, +) +from ..compat import compat_str + + +class RDSIE(InfoExtractor): + IE_DESC = 'RDS.ca' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' + + _TESTS = [{ + 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + 'info_dict': { + 'id': '604333', + 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'ext': 'flv', + 'title': 'Fowler Jr. prend la direction de Jacksonville', + 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', + 'timestamp': 1430397346, + 'upload_date': '20150430', + 'duration': 154.354, + 'age_limit': 0, + } + }, { + 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( + [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', + r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], + webpage, 'thumbnail', fatal=False) + timestamp = parse_iso8601(self._search_regex( + r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"', + webpage, 'upload date', fatal=False)) + duration = parse_duration(self._search_regex( + r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"', + webpage, 'duration', fatal=False)) + age_limit = self._family_friendly_search(webpage) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + 'url': '9c9media:rds_web:%s' % video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py new file mode 100644 index 0000000000..dbe1aaded5 --- /dev/null +++ b/youtube_dl/extractor/redbulltv.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live)/(?P<id>AP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', + 'info_dict': { + 'id': 'AP-1Q6XCDTAN1W11', + 'ext': 'mp4', + 'title': 'ABC of... WRC - ABC of... S1E6', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', + 'info_dict': { + 'id': 'AP-1PMHKJFCW1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', + 'duration': 904.6, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + session = self._download_json( + 'https://api.redbull.tv/v3/session', video_id, + note='Downloading access token', query={ + 'category': 'personal_computer', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + token = session['token'] + + try: + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, + video_id, note='Downloading video information', + headers={'Authorization': token} + ) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + error_message = self._parse_json( + e.cause.read().decode(), video_id)['error'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise + + title = video['title'].strip() + + formats = self._extract_m3u8_formats( + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + subtitles = {} + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) + + subheading = video.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': video.get('long_description') or video.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + 'formats': formats, + 'subtitles': subtitles, + } + + +class RedBullTVRrnContentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)/(?:video|live)/rrn:content:[^:]+:(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._og_search_url(webpage) + + return self.url_result( + video_url, ie=RedBullTVIE.ie_key(), + video_id=RedBullTVIE._match_id(video_url)) diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py new file mode 100644 index 0000000000..663f622b37 --- /dev/null +++ b/youtube_dl/extractor/reddit.py @@ -0,0 +1,130 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + url_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '0a070c53eba7ec4534d95a5a1259e253', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # imgur @ old reddit + 'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') + + video_id = self._match_id(url) + + data = self._download_json( + url + '/.json', video_id)[0]['data']['children'][0]['data'] + + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnail': url_or_none(data.get('thumbnail')), + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py new file mode 100644 index 0000000000..2d2f6a98c9 --- /dev/null +++ b/youtube_dl/extractor/redtube.py @@ -0,0 +1,133 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + str_to_int, + unified_strdate, + url_or_none, +) + + +class RedTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.redtube.com/66418', + 'md5': 'fc08071233725f26b8f014dba9590005', + 'info_dict': { + 'id': '66418', + 'ext': 'mp4', + 'title': 'Sucked on a toilet', + 'upload_date': '20110811', + 'duration': 596, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.redtube.com/%s' % video_id, video_id) + + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), + (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + medias = self._parse_json( + self._search_regex( + r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', + webpage, 'upload date', default=None)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', + r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), + webpage, 'view count', default=None)) + + # No self-labeling, but they describe themselves as + # "Home of Videos Porno" + age_limit = 18 + + return merge_dicts(info, { + 'id': video_id, + 'ext': 'mp4', + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'age_limit': age_limit, + 'formats': formats, + }) diff --git a/youtube_dl/extractor/regiotv.py b/youtube_dl/extractor/regiotv.py new file mode 100644 index 0000000000..e250a52f06 --- /dev/null +++ b/youtube_dl/extractor/regiotv.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + sanitized_Request, + xpath_text, + xpath_with_ns, +) + + +class RegioTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.regio-tv.de/video/395808.html', + 'info_dict': { + 'id': '395808', + 'ext': 'mp4', + 'title': 'Wir in Ludwigsburg', + 'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!', + } + }, { + 'url': 'http://www.regio-tv.de/video/395808', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + key = self._search_regex( + r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key') + title = self._og_search_title(webpage) + + SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>' + + request = sanitized_Request( + 'http://v.telvi.de/', + SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8')) + video_data = self._download_xml(request, video_id, 'Downloading video XML') + + NS_MAP = { + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', + 'soap': 'http://schemas.xmlsoap.org/soap/envelope/', + } + + video_url = xpath_text( + video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True) + thumbnail = xpath_text( + video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail') + description = self._og_search_description( + webpage) or self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py new file mode 100644 index 0000000000..7c8909d953 --- /dev/null +++ b/youtube_dl/extractor/rentv.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + url_or_none, +) + + +class RENTVIE(InfoExtractor): + _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://ren.tv/video/epizod/118577', + 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': '118577', + 'ext': 'mp4', + 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', + } + }, { + 'url': 'http://ren.tv/player/118577', + 'only_matching': True, + }, { + 'url': 'rentv:118577', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] + formats = [] + for video in config['src']: + src = url_or_none(video.get('src')) + if not src: + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), + 'formats': formats, + } + + +class RENTVArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', + 'md5': 'ebd63c4680b167693745ab91343df1d6', + 'info_dict': { + 'id': '136472', + 'ext': 'mp4', + 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', + 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', + } + }, { + # TODO: invalid m3u8 + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + entries = [] + for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): + media_id = config_profile.get('mediaid') + if not media_id: + continue + media_id = compat_str(media_id) + entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) + return self.playlist_result(entries, display_id) diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py new file mode 100644 index 0000000000..d47fb45ca5 --- /dev/null +++ b/youtube_dl/extractor/restudy.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RestudyIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.restudy.dk/video/play/id/1637', + 'info_dict': { + 'id': '1637', + 'ext': 'flv', + 'title': 'Leiden-frosteffekt', + 'description': 'Denne video er et eksperiment med flydende kvælstof.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + + formats = self._extract_smil_formats( + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, + video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py new file mode 100644 index 0000000000..9dc482d216 --- /dev/null +++ b/youtube_dl/extractor/reuters.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + int_or_none, + unescapeHTML, +) + + +class ReutersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', + 'md5': '8015113643a0b12838f160b0b81cc2ee', + 'info_dict': { + 'id': '368575562', + 'ext': 'mp4', + 'title': 'San Francisco police chief resigns', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) + video_data = js_to_json(self._search_regex( + r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', + webpage, 'video data')) + + def get_json_value(key, fatal=False): + return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + + title = unescapeHTML(get_json_value('title', fatal=True)) + mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + + mas_data = self._download_json( + 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), + video_id, transform_source=js_to_json) + formats = [] + for f in mas_data: + f_url = f.get('url') + if not f_url: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + container = f.get('container') + ext = '3gp' if method == 'mobile' else container + formats.append({ + 'format_id': ext, + 'url': f_url, + 'ext': ext, + 'container': container if method != 'mobile' else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': get_json_value('thumb'), + 'duration': int_or_none(get_json_value('seconds')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py new file mode 100644 index 0000000000..4cb99c244c --- /dev/null +++ b/youtube_dl/extractor/reverbnation.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + qualities, + str_or_none, +) + + +class ReverbNationIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _TESTS = [{ + 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', + 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', + 'info_dict': { + 'id': '16965047', + 'ext': 'mp3', + 'title': 'MONA LISA', + 'uploader': 'ALKILADOS', + 'uploader_id': '216429', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + + api_res = self._download_json( + 'https://api.reverbnation.com/song/%s' % song_id, + song_id, + note='Downloading information of song %s' % song_id + ) + + THUMBNAILS = ('thumbnail', 'image') + quality = qualities(THUMBNAILS) + thumbnails = [] + for thumb_key in THUMBNAILS: + if api_res.get(thumb_key): + thumbnails.append({ + 'url': api_res[thumb_key], + 'preference': quality(thumb_key) + }) + + return { + 'id': song_id, + 'title': api_res['name'], + 'url': api_res['url'], + 'uploader': api_res.get('artist', {}).get('name'), + 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), + 'thumbnails': thumbnails, + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py new file mode 100644 index 0000000000..f855719ac2 --- /dev/null +++ b/youtube_dl/extractor/rice.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_iso8601, + ExtractorError, +) + + +class RICEIE(InfoExtractor): + _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' + _TEST = { + 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', + 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', + 'info_dict': { + 'id': 'YEWIvbhb40aqdjMD1ALSqw', + 'ext': 'mp4', + 'title': 'Active Learning in Archeology', + 'upload_date': '20140616', + 'timestamp': 1402926346, + } + } + _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): + raise ExtractorError('Invalid URL', expected=True) + + portal_id = qs['PortalID'][0] + playlist_id = qs['DestinationID'][0] + content_id = qs['ContentID'][0] + + content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ + 'portalId': portal_id, + 'playlistId': playlist_id, + 'contentId': content_id + }) + metadata = xpath_element(content_data, './/metaData', fatal=True) + title = xpath_text(metadata, 'primaryTitle', fatal=True) + encodings = xpath_element(content_data, './/encodings', fatal=True) + player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ + 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), + 'contentId': content_id, + }) + + common_fmt = {} + dimensions = xpath_text(encodings, 'dimensions') + if dimensions: + wh = dimensions.split('x') + if len(wh) == 2: + common_fmt.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + + formats = [] + rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) + if rtsp_path: + fmt = { + 'url': rtsp_path, + 'format_id': 'rtsp', + } + fmt.update(common_fmt) + formats.append(fmt) + for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): + video_url = xpath_text(source, self._xpath_ns('File', self._NS)) + if not video_url: + continue + if '.m3u8' in video_url: + formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0], + } + fmt.update(common_fmt) + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + }) + formats.append(fmt) + self._sort_formats(formats) + + thumbnails = [] + for content_asset in content_data.findall('.//contentAssets'): + asset_type = xpath_text(content_asset, 'type') + if asset_type == 'image': + image_url = xpath_text(content_asset, 'httpPath') + if not image_url: + continue + thumbnails.append({ + 'id': xpath_text(content_asset, 'ID'), + 'url': image_url, + }) + + return { + 'id': content_id, + 'title': title, + 'description': xpath_text(metadata, 'abstract'), + 'duration': int_or_none(xpath_text(metadata, 'duration')), + 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py new file mode 100644 index 0000000000..c3623edcc7 --- /dev/null +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import smuggle_url + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))' + + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', + 'info_dict': { + 'id': '5983675500001', + 'ext': 'mp4', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', + 'uploader_id': '1969646226001', + 'upload_date': '20181226', + 'timestamp': 1545861635, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'only available for a week', + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('live_id') + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py new file mode 100644 index 0000000000..69934ef2b0 --- /dev/null +++ b/youtube_dl/extractor/ro220.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/', + 'md5': '03af18b73a07b4088753930db7a34add', + 'info_dict': { + 'id': 'LYV6doKo7f', + 'ext': 'mp4', + 'title': 'Luati-le Banii sez 4 ep 1', + 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + url = compat_urllib_parse_unquote(self._search_regex( + r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url')) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = [{ + 'format_id': 'sd', + 'url': url, + 'ext': 'mp4', + }] + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/rockstargames.py b/youtube_dl/extractor/rockstargames.py new file mode 100644 index 0000000000..cd6904bc93 --- /dev/null +++ b/youtube_dl/extractor/rockstargames.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class RockstarGamesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.rockstargames.com/videos/video/11544/', + 'md5': '03b5caa6e357a4bd50e3143fc03e5733', + 'info_dict': { + 'id': '11544', + 'ext': 'mp4', + 'title': 'Further Adventures in Finance and Felony Trailer', + 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1464876000, + 'upload_date': '20160602', + } + }, { + 'url': 'http://www.rockstargames.com/videos#/?video=48', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.rockstargames.com/videoplayer/videos/get-video.json', + video_id, query={ + 'id': video_id, + 'locale': 'en_us', + })['video'] + + title = video['title'] + + formats = [] + for video in video['files_processed']['video/mp4']: + if not video.get('src'): + continue + resolution = video.get('resolution') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', resolution or '', 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(video['src']), + 'format_id': resolution, + 'height': height, + }) + + if not formats: + youtube_id = video.get('youtube_id') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('screencap')), + 'timestamp': parse_iso8601(video.get('created')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 0000000000..8883639b29 --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '9156', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'only_matching': True, + }] + _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _real_initialize(self): + if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): + return + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + api_episode_url = self._EPISODE_BASE_URL + display_id + + try: + m3u8_url = self._download_json( + api_episode_url + '/videos', display_id, + 'Downloading video JSON metadata')['data'][0]['attributes']['url'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + raise + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + episode = self._download_json( + api_episode_url, display_id, + 'Downloading episode JSON metadata')['data'][0] + attributes = episode['attributes'] + title = attributes.get('title') or attributes['display_title'] + video_id = compat_str(episode['id']) + + thumbnails = [] + for image in episode.get('included', {}).get('images', []): + if image.get('type') == 'episode_image': + img_attributes = image.get('attributes') or {} + for k in ('thumb', 'small', 'medium', 'large'): + img_url = img_attributes.get(k) + if img_url: + thumbnails.append({ + 'id': k, + 'url': img_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': attributes.get('description') or attributes.get('caption'), + 'thumbnails': thumbnails, + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(episode.get('uuid')), + 'formats': formats, + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + } diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py new file mode 100644 index 0000000000..14c8e82369 --- /dev/null +++ b/youtube_dl/extractor/rottentomatoes.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE + + +class RottenTomatoesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + 'info_dict': { + 'id': '11028566', + 'ext': 'mp4', + 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + + return { + '_type': 'url_transparent', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, + 'title': self._og_search_title(webpage), + } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 0000000000..65284643b4 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + + _TEST = { + 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', + 'info_dict': { + 'id': 'passionpittakeawalklive', + 'ext': 'flv', + 'title': 'Take A Walk (live)', + 'uploader': 'Passion Pit', + 'uploader_id': 'passionpit', + 'upload_date': '20120928', + 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info = self._download_json(info_url, filename) + + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return { + 'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py new file mode 100644 index 0000000000..fccf69401d --- /dev/null +++ b/youtube_dl/extractor/rozhlas.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py new file mode 100644 index 0000000000..3b0f3080b1 --- /dev/null +++ b/youtube_dl/extractor/rtbf.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, +) + + +class RTBFIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }] + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + + def _real_extract(self, url): + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' + formats = [] + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py new file mode 100644 index 0000000000..1fbc72915e --- /dev/null +++ b/youtube_dl/extractor/rte.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + parse_iso8601, + str_or_none, + try_get, + unescapeHTML, + url_or_none, + ExtractorError, +) + + +class RteBaseIE(InfoExtractor): + def _real_extract(self, url): + item_id = self._match_id(url) + + info_dict = {} + formats = [] + + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) + + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise + + # NB the string values in the JSON are stored using XML escaping(!) + show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) + + self._sort_formats(formats) + + info_dict['formats'] = formats + return info_dict + + +class RteIE(RteBaseIE): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 'iWitness', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, + }, + } + + +class RteRadioIE(RteBaseIE): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have two distinct specifier formats, + # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>: + # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_ + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An <id> uniquely defines an individual recording, and is the only part we require. + _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)' + + _TESTS = [{ + # Old-style player URL; HLS and RTMPE formats + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + }, { + # New-style player URL; RTMPE formats only + 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', + 'info_dict': { + 'id': '3250678', + 'ext': 'flv', + 'title': 'The Lyric Concert with Paul Herriott', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': '', + 'timestamp': 1333742400, + 'upload_date': '20120406', + 'duration': 7199.016, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py new file mode 100644 index 0000000000..70f000ca8a --- /dev/null +++ b/youtube_dl/extractor/rtl2.py @@ -0,0 +1,207 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_b64decode, + compat_ord, + compat_str, +) +from ..utils import ( + bytes_to_intlist, + ExtractorError, + intlist_to_bytes, + int_or_none, + strip_or_none, +) + + +class RTL2IE(InfoExtractor): + IE_NAME = 'rtl2' + _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', + 'info_dict': { + 'id': 'folge-203-0', + 'ext': 'f4v', + 'title': 'GRIP sucht den Sommerkönig', + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', + 'info_dict': { + 'id': 'anna-erwischt-alex', + 'ext': 'mp4', + 'title': 'Anna erwischt Alex!', + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }] + + def _real_extract(self, url): + vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups() + if not vico_id: + webpage = self._download_webpage(url, display_id) + + mobj = re.search( + r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + + info = self._download_json( + 'https://service.rtl2.de/api-player-vipo/video.php', + display_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) + video_info = info['video'] + title = video_info['titel'] + + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'preference': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + } + + +class RTL2YouBaseIE(InfoExtractor): + _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' + + +class RTL2YouIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you' + _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', + 'info_dict': { + 'id': '15740', + 'ext': 'mp4', + 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', + 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', + 'age_limit': 12, + }, + }, { + 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', + 'only_matching': True, + }] + _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' + _GEO_COUNTRIES = ['DE'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) + + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') + stream_url = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(compat_b64decode(data)), + bytes_to_intlist(self._AES_KEY), + bytes_to_intlist(compat_b64decode(iv)) + )) + if b'rtl2_you_video_not_found' in stream_url: + raise ExtractorError('video not found', expected=True) + + formats = self._extract_m3u8_formats( + stream_url[:-compat_ord(stream_url[-1])].decode(), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + video_data = self._download_json( + self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) + + series = video_data.get('formatTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(video_data.get('description')), + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), + 'series': series, + 'episode': episode, + 'age_limit': int_or_none(video_data.get('minimumAge')), + } + + +class RTL2YouSeriesIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you:series' + _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://you.rtl2.de/videos/115/dragon-ball', + 'info_dict': { + 'id': '115', + }, + 'playlist_mincount': 5, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'videos', + series_id, query={ + 'formatId': series_id, + 'limit': 1000000000, + }) + + entries = [] + for video in stream_data.get('videos', []): + video_id = compat_str(video['videoId']) + if not video_id: + continue + entries.append(self.url_result( + 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), + 'RTL2You', video_id)) + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py new file mode 100644 index 0000000000..fadca8c175 --- /dev/null +++ b/youtube_dl/extractor/rtlnl.py @@ -0,0 +1,126 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class RtlNlIE(InfoExtractor): + IE_NAME = 'rtl.nl' + IE_DESC = 'rtl.nl and rtlxl.nl' + _VALID_URL = r'''(?x) + https?://(?:(?:www|static)\.)? + (?: + rtlxl\.nl/[^\#]*\#!/[^/]+/| + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/) + ) + (?P<id>[0-9a-f-]+)''' + + _TESTS = [{ + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', + 'info_dict': { + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, + }, + }, { + # best format available a3t + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'md5': 'dea7474214af1271d91ef332fb8be7ea', + 'info_dict': { + 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', + 'ext': 'mp4', + 'timestamp': 1424039400, + 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'upload_date': '20150215', + 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', + } + }, { + # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275) + # best format available nettv + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, + }, { + # encrypted m3u8 streams, georestricted + 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', + 'only_matching': True, + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', + 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, + }, { + 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', + 'only_matching': True, + }, { + 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', + 'only_matching': True, + }] + + def _real_extract(self, url): + uuid = self._match_id(url) + info = self._download_json( + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, + uuid) + + material = info['material'][0] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - %s' % subtitle + description = material.get('synopsis') + + meta = info.get('meta', {}) + + videopath = material['videopath'] + m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath + + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + thumbnails = [] + + for p in ('poster_base_url', '"thumb_base_url"'): + if not meta.get(p): + continue + + thumbnails.append({ + 'url': self._proto_relative_url(meta[p] + uuid), + 'width': int_or_none(self._search_regex( + r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), + 'height': int_or_none(self._search_regex( + r'/sz=[0-9]+x([0-9]+)', + meta[p], 'thumbnail height', fatal=False)) + }) + + return { + 'id': uuid, + 'title': title, + 'formats': formats, + 'timestamp': material['original_date'], + 'description': description, + 'duration': parse_duration(material.get('duration')), + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py new file mode 100644 index 0000000000..02986f4426 --- /dev/null +++ b/youtube_dl/extractor/rtp.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, +) + + +class RTPIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' + _TESTS = [{ + 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', + 'md5': 'e736ce0c665e459ddb818546220b4ef8', + 'info_dict': { + 'id': 'e174042', + 'ext': 'mp3', + 'title': 'Paixões Cruzadas', + 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + 'twitter:title', webpage, display_name='title', fatal=True) + + config = self._parse_json(self._search_regex( + r'(?s)RTPPlayer\(({.+?})\);', webpage, + 'player config'), video_id, js_to_json) + file_url = config['file'] + ext = determine_ext(file_url) + if ext == 'm3u8': + file_key = config.get('fileKey') + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=file_key) + if file_key: + formats.append({ + 'url': 'https://cdn-ondemand.rtp.pt' + file_key, + 'preference': 1, + }) + self._sort_formats(formats) + else: + formats = [{ + 'url': file_url, + 'ext': ext, + }] + if config.get('mediaType') == 'audio': + for f in formats: + f['vcodec'] = 'none' + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py new file mode 100644 index 0000000000..48f17b828c --- /dev/null +++ b/youtube_dl/extractor/rts.py @@ -0,0 +1,230 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .srgssr import SRGSSRIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, + determine_ext, +) + + +class RTSIE(SRGSSRIE): + IE_DESC = 'RTS.ch' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + + _TESTS = [ + { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'info_dict': { + 'id': '3449373', + 'display_id': 'les-enfants-terribles', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + }, + { + 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', + 'info_dict': { + 'id': '5624065', + 'title': 'Passe-moi les jumelles', + }, + 'playlist_mincount': 4, + }, + { + 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', + 'info_dict': { + 'id': '5745975', + 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', + 'ext': 'mp4', + 'duration': 48, + 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', + 'description': 'Hockey - Playoff', + 'uploader': 'Hockey', + 'upload_date': '20140403', + 'timestamp': 1396556882, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Blocked outside Switzerland', + }, + { + 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', + 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'info_dict': { + 'id': '5745356', + 'display_id': 'londres-cachee-par-un-epais-smog', + 'ext': 'mp4', + 'duration': 33, + 'title': 'Londres cachée par un épais smog', + 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', + 'uploader': 'L\'actu en vidéo', + 'upload_date': '20140403', + 'timestamp': 1396537322, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + }, + { + 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', + 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', + 'info_dict': { + 'id': '5706148', + 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', + 'ext': 'mp3', + 'duration': 123, + 'title': '"Urban Hippie", de Damien Krisl', + 'description': 'Des Hippies super glam.', + 'upload_date': '20140403', + 'timestamp': 1396551600, + }, + }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + media_id = m.group('rts_id') or m.group('id') + display_id = m.group('display_id') or media_id + + def download_json(internal_id): + return self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, + display_id) + + all_info = download_json(media_id) + + # media_id extracted out of URL is not always a real id + if 'video' not in all_info and 'audio' not in all_info: + entries = [] + + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with videos on rhs + videos = re.findall( + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', + page) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) + + internal_id = self._html_search_regex( + r'<(?:video|audio) data-id="([0-9]+)"', page, + 'internal video id') + all_info = download_json(internal_id) + + media_type = 'video' if 'video' in all_info else 'audio' + + # check for errors + self.get_media_data('rts', media_type, media_id) + + info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] + + title = info['title'] + + def extract_bitrate(url): + return int_or_none(self._search_regex( + r'-([0-9]+)k\.', url, 'bitrate', default=None)) + + formats = [] + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: + continue + if format_id == 'hls_sd' and 'hls' in streams: + continue + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'tbr': extract_bitrate(format_url), + }) + + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': 'http://download-video.rts.ch/' + media_url, + 'tbr': rate or extract_bitrate(media_url), + }) + + self._check_formats(formats, media_id) + self._sort_formats(formats) + + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + + return { + 'id': media_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': info.get('intro'), + 'duration': duration, + 'view_count': int_or_none(info.get('plays')), + 'uploader': info.get('programName'), + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), + } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py new file mode 100644 index 0000000000..ce9db06294 --- /dev/null +++ b/youtube_dl/extractor/rtve.py @@ -0,0 +1,292 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re +import time + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_struct_unpack, +) +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + remove_end, + remove_start, + sanitized_Request, + std_headers, +) + + +def _decrypt_url(png): + encrypted_data = compat_b64decode(png) + text_index = encrypted_data.find(b'tEXt') + text_chunk = encrypted_data[text_index - 4:] + length = compat_struct_unpack('!I', text_chunk[:4])[0] + # Use bytearray to get integers when iterating in both python 2.x and 3.x + data = bytearray(text_chunk[8:8 + length]) + data = [chr(b) for b in data if b != 0] + hash_index = data.index('#') + alphabet_data = data[:hash_index] + url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] + + alphabet = [] + e = 0 + d = 0 + for l in alphabet_data: + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in url_data: + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + return url + + +class RTVEALaCartaIE(InfoExtractor): + IE_NAME = 'rtve.es:alacarta' + IE_DESC = 'RTVE a la carta' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'info_dict': { + 'id': '2491869', + 'ext': 'mp4', + 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', + 'duration': 5024.566, + }, + }, { + 'note': 'Live stream', + 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', + 'info_dict': { + 'id': '1694255', + 'ext': 'flv', + 'title': 'TODO', + }, + 'skip': 'The f4m manifest can\'t be used yet', + }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104 ', + 'duration': 3222.0, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { + 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', + 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, + }] + + def _real_initialize(self): + user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') + manager_info = self._download_json( + 'http://www.rtve.es/odin/loki/' + user_agent_b64, + None, 'Fetching manager info') + self._manager = manager_info['manager'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + if info['state'] == 'DESPU': + raise ExtractorError('The video is no longer available', expected=True) + title = info['title'] + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) + png_request = sanitized_Request(png_url) + png_request.add_header('Referer', url) + png = self._download_webpage(png_request, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + ext = determine_ext(video_url) + + formats = [] + if not video_url.endswith('.f4m') and ext != 'm3u8': + if '?' not in video_url: + video_url = video_url.replace('resources/', 'auth/resources/') + video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': video_url, + }) + self._sort_formats(formats) + + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('image'), + 'page_url': url, + 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + + +class RTVEInfantilIE(InfoExtractor): + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '915319587b33720b8e0357caaa6617e6', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'duration': 357.958, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + + webpage = self._download_webpage(url, video_id) + vidplayer_id = self._search_regex( + r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') + + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + + return { + 'id': video_id, + 'ext': 'mp4', + 'title': info['title'], + 'url': video_url, + 'thumbnail': info.get('image'), + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + +class RTVELiveIE(InfoExtractor): + IE_NAME = 'rtve.es:live' + IE_DESC = 'RTVE.es live streams' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/directo/la-1/', + 'info_dict': { + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + }, + 'params': { + 'skip_download': 'live stream', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + start_time = time.gmtime() + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') + title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) + + vidplayer_id = self._search_regex( + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + m3u8_url = _decrypt_url(png) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + + _TEST = { + 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py new file mode 100644 index 0000000000..6a00f70072 --- /dev/null +++ b/youtube_dl/extractor/rtvnh.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class RTVNHIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rtvnh.nl/video/131946', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', + 'info_dict': { + 'id': '131946', + 'ext': 'mp4', + 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', + 'thumbnail': r're:^https?:.*\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta = self._parse_json(self._download_webpage( + 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + + status = meta.get('status') + if status != 200: + raise ExtractorError( + '%s returned error code %d' % (self.IE_NAME, status), expected=True) + + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) + + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': meta['title'].strip(), + 'thumbnail': meta.get('image'), + 'formats': formats + } diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py new file mode 100644 index 0000000000..6573b260da --- /dev/null +++ b/youtube_dl/extractor/rtvs.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'md5': '85e2c55cf988403b70cac24f5c086dc6', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'playlist url', group='url') + + data = self._download_json( + playlist_url, video_id, 'Downloading playlist')[0] + return self._parse_jwplayer_data(data, video_id=video_id) diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py new file mode 100644 index 0000000000..3c8053a261 --- /dev/null +++ b/youtube_dl/extractor/ruhd.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RUHDIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _TEST = { + 'url': 'http://www.ruhd.ru/play.php?vid=207', + 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', + 'info_dict': { + 'id': '207', + 'ext': 'divx', + 'title': 'КОТ бааааам', + 'description': 'классный кот)', + 'thumbnail': r're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'<param name="src" value="([^"]+)"', webpage, 'video url') + title = self._html_search_regex( + r'<title>([^<]+)   RUHD\.ru - Видео Высокого качества №1 в России!', + webpage, 'title') + description = self._html_search_regex( + r'(?s)
    (.+?)', + webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'[\da-z]{32})' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '1d24f180fac7a02f3900712e5a5764d6', + 'info_dict': { + 'id': '3eac3b4561676c17df9132a9a1e62e3e', + 'ext': 'mp4', + 'title': 'Раненный кенгуру забежал в аптеку', + 'description': 'http://www.ntdtv.ru ', + 'duration': 81, + 'uploader': 'NTDRussian', + 'uploader_id': '29790', + 'timestamp': 1381943602, + 'upload_date': '20131016', + 'age_limit': 0, + }, + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + + @staticmethod + def _extract_urls(webpage): + return [mobj.group('url') for mobj in re.finditer( + r']+?src=(["\'])(?P(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_and_extract_info(video_id) + info['formats'] = self._download_and_extract_formats(video_id) + return info + + +class RutubeEmbedIE(RutubeBaseIE): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'timestamp': 1387830582, + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix

    восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://rutube.ru/play/embed/8083783', + 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + # Query may contain private videos token and should be passed to API + # requests (see #19163) + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + options = self._download_api_options(embed_id, query) + video_id = options['effective_video'] + formats = self._extract_formats(options, video_id) + info = self._download_and_extract_info(video_id, query) + info.update({ + 'extractor_key': 'Rutube', + 'formats': formats, + }) + return info + + +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = url_or_none(result.get('video_url')) + if not video_url: + continue + entry = self._extract_info(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channels' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/tags/video/1800/', + 'info_dict': { + 'id': '1800', + }, + 'playlist_mincount': 68, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + + +class RutubeMovieIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:movie' + IE_DESC = 'Rutube movies' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P\d+)' + _TESTS = [] + + _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + + def _real_extract(self, url): + movie_id = self._match_id(url) + movie = self._download_json( + self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) + + +class RutubePersonIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:person' + IE_DESC = 'Rutube person videos' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/video/person/313878/', + 'info_dict': { + 'id': '313878', + }, + 'playlist_mincount': 37, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py new file mode 100644 index 0000000000..d2713c19a0 --- /dev/null +++ b/youtube_dl/extractor/rutv.py @@ -0,0 +1,211 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none +) + + +class RUTVIE(InfoExtractor): + IE_DESC = 'RUTV.RU' + _VALID_URL = r'''(?x) + https?:// + (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ + (?P + flash\d+v/container\.swf\?id=| + iframe/(?Pswf|video|live)/id/| + index/iframe/cast_id/ + ) + (?P\d+) + ''' + + _TESTS = [ + { + 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724', + 'info_dict': { + 'id': '774471', + 'ext': 'mp4', + 'title': 'Монологи на все времена', + 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5', + 'duration': 2906, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638', + 'info_dict': { + 'id': '774016', + 'ext': 'mp4', + 'title': 'Чужой в семье Сталина', + 'description': '', + 'duration': 2539, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000', + 'info_dict': { + 'id': '766888', + 'ext': 'mp4', + 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', + 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', + 'duration': 279, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169', + 'info_dict': { + 'id': '771852', + 'ext': 'mp4', + 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет', + 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8', + 'duration': 3096, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014', + 'info_dict': { + 'id': '51499', + 'ext': 'flv', + 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', + 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', + }, + 'skip': 'Translation has finished', + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', + 'info_dict': { + 'id': '21', + 'ext': 'mp4', + 'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', + 'only_matching': True, + }, + ] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']+?src=(["\'])(?Phttps?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + if mobj: + return mobj.group('url') + + mobj = re.search( + r']+?property=(["\'])og:video\1[^>]+?content=(["\'])(?Phttps?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + video_path = mobj.group('path') + + if re.match(r'flash\d+v', video_path): + video_type = 'video' + elif video_path.startswith('iframe'): + video_type = mobj.group('type') + if video_type == 'swf': + video_type = 'video' + elif video_path.startswith('index/iframe/cast_id'): + video_type = 'live' + + is_live = video_type == 'live' + + json_data = self._download_json( + 'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), + video_id, 'Downloading JSON') + + if json_data['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True) + + playlist = json_data['data']['playlist'] + medialist = playlist['medialist'] + media = medialist[0] + + if media['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) + + view_count = playlist.get('count_views') + priority_transport = playlist['priority_transport'] + + thumbnail = media['picture'] + width = int_or_none(media['width']) + height = int_or_none(media['height']) + description = media['anons'] + title = media['title'] + duration = int_or_none(media.get('duration')) + + formats = [] + + for transport, links in media['sources'].items(): + for quality, url in links.items(): + preference = -1 if priority_transport == transport else -2 + if transport == 'rtmp': + mobj = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?P.+)$', url) + if not mobj: + continue + fmt = { + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': 'http://player.rutv.ru', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', + 'rtmp_live': True, + 'ext': 'flv', + 'vbr': int(quality), + 'preference': preference, + } + elif transport == 'm3u8': + formats.extend(self._extract_m3u8_formats( + url, video_id, 'mp4', preference=preference, m3u8_id='hls')) + continue + else: + fmt = { + 'url': url + } + fmt.update({ + 'width': width, + 'height': height, + 'format_id': '%s-%s' % (transport, quality), + }) + formats.append(fmt) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'duration': duration, + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py new file mode 100644 index 0000000000..f984040aa0 --- /dev/null +++ b/youtube_dl/extractor/ruutu.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, +) + + +class RuutuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P\d+)' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/video/2058907', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': '2058907', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.ruutu.fi/video/2057306', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': '2057306', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.supla.fi/supla/2231370', + 'md5': 'df14e782d49a2c0df03d3be2a54ef949', + 'info_dict': { + 'id': '2231370', + 'ext': 'mp4', + 'title': 'Osa 1: Mikael Jungner', + 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + }, + # Episode where is "NOT-USED", but has other + # downloadable sources available. + { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, + { + # audio podcast + 'url': 'https://www.supla.fi/supla/3382410', + 'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', + 'info_dict': { + 'id': '3382410', + 'ext': 'mp3', + 'title': 'Mikä ihmeen poltergeist?', + 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + 'expected_warnings': ['HTTP Error 502: Bad Gateway'], + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_xml = self._download_xml( + 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + query={'id': video_id}) + + formats = [] + processed_urls = [] + + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if (not video_url or video_url in processed_urls + or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): + continue + processed_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mpd': + # video-only and audio-only streams are of different + # duration resulting in out of sync issue + continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp3' or child.tag == 'AudioMediaFile': + formats.append({ + 'format_id': 'audio', + 'url': video_url, + 'vcodec': 'none', + }) + else: + proto = compat_urllib_parse_urlparse(video_url).scheme + if not child.tag.startswith('HTTP') and proto != 'rtmp': + continue + preference = -1 if proto == 'rtmp' else 1 + label = child.get('label') + tbr = int_or_none(child.get('bitrate')) + format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto + if not self._is_valid_url(video_url, video_id, format_id): + continue + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + extract_formats(video_xml.find('./Clip')) + + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if not formats and drm: + raise ExtractorError('This video is DRM protected.', expected=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/ruv.py b/youtube_dl/extractor/ruv.py new file mode 100644 index 0000000000..8f3cc40956 --- /dev/null +++ b/youtube_dl/extractor/ruv.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unified_timestamp, +) + + +class RuvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P[^/]+(?:/\d+)?)' + _TESTS = [{ + # m3u8 + 'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516', + 'md5': '66347652f4e13e71936817102acc1724', + 'info_dict': { + 'id': '1144499', + 'display_id': 'fh-valur/20170516', + 'ext': 'mp4', + 'title': 'FH - Valur', + 'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', + 'timestamp': 1494963600, + 'upload_date': '20170516', + }, + }, { + # mp3 + 'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', + 'md5': '395ea250c8a13e5fdb39d4670ef85378', + 'info_dict': { + 'id': '1153630', + 'display_id': 'morgunutvarpid/20170619', + 'ext': 'mp3', + 'title': 'Morgunútvarpið', + 'description': 'md5:a4cf1202c0a1645ca096b06525915418', + 'timestamp': 1497855000, + 'upload_date': '20170619', + }, + }, { + 'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', + 'only_matching': True, + }, { + 'url': 'http://www.ruv.is/node/1151854', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' + + media_url = self._html_search_regex( + FIELD_RE % 'src', webpage, 'video URL', group='url') + + video_id = self._search_regex( + r']+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', + webpage, 'video id', default=display_id) + + ext = determine_ext(media_url) + + if ext == 'm3u8': + formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + elif ext == 'mp3': + formats = [{ + 'format_id': 'mp3', + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{ + 'url': media_url, + }] + + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 0000000000..2cc6651224 --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,264 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + update_url_query, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://learning.oreilly.com/api/v1' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + _, urlh = self._download_webpage_handle( + 'https://learning.oreilly.com/accounts/login-check/', None, + 'Downloading login page') + + def is_logged(urlh): + return 'learning.oreilly.com/home/' in urlh.geturl() + + if is_logged(urlh): + self.LOGGED_IN = True + return + + redirect_url = urlh.geturl() + parsed_url = compat_urlparse.urlparse(redirect_url) + qs = compat_parse_qs(parsed_url.query) + next_uri = compat_urlparse.urljoin( + 'https://api.oreilly.com', qs['next'][0]) + + auth, urlh = self._download_json_handle( + 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', + data=json.dumps({ + 'email': username, + 'password': password, + 'redirect_uri': next_uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': redirect_url, + }, expected_status=400) + + credentials = auth.get('credentials') + if (not auth.get('logged_in') and not auth.get('redirect_uri') + and credentials): + raise ExtractorError( + 'Unable to login: %s' % credentials, expected=True) + + # oreilly serves two same instances of the following cookies + # in Set-Cookie header and expects first one to be actually set + for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): + self._apply_first_set_cookie_header(urlh, cookie) + + _, urlh = self._download_webpage_handle( + auth.get('redirect_uri') or next_uri, None, 'Completing login',) + + if is_logged(urlh): + self.LOGGED_IN = True + return + + raise ExtractorError('Unable to log in') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) + ) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': 'dcc5a425e79f2564148652616af1f2a3', + 'info_dict': { + 'id': '0_qbqx90ic', + 'ext': 'mp4', + 'title': 'Introduction to Hadoop Fundamentals LiveLessons', + 'timestamp': 1437758058, + 'upload_date': '20150724', + 'uploader_id': 'stork', + }, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', + 'only_matching': True, + }] + + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') + + query = { + 'wid': '_%s' % partner_id, + 'uiconf_id': ui_id, + 'flashvars[referenceId]': reference_id, + } + + if self.LOGGED_IN: + kaltura_session = self._download_json( + '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), + video_id, 'Downloading kaltura session JSON', + 'Unable to download kaltura session JSON', fatal=False, + headers={'Accept': 'application/json'}) + if kaltura_session: + session = kaltura_session.get('session') + if session: + query['flashvars[ks]'] = session + + return self.url_result(update_url_query( + 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), + 'Kaltura') + + +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| + techbus\.safaribooksonline\.com + ) + /(?P[^/]+) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }, { + 'url': 'http://techbus.safaribooksonline.com/9780134426365', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, SafariApiIE.ie_key()) + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py new file mode 100644 index 0000000000..49a9b313a8 --- /dev/null +++ b/youtube_dl/extractor/sapo.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', + 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py new file mode 100644 index 0000000000..21e44b69ab --- /dev/null +++ b/youtube_dl/extractor/savefrom.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re + +from .common import InfoExtractor + + +class SaveFromIE(InfoExtractor): + IE_NAME = 'savefrom.net' + _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P.*)$' + + _TEST = { + 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', + 'info_dict': { + 'id': 'UlVRAPW2WJY', + 'ext': 'mp4', + 'title': 'About Team Radical MMA | MMA Fighting', + 'upload_date': '20120816', + 'uploader': 'Howcast', + 'uploader_id': 'Howcast', + 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*', + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = os.path.splitext(url.split('/')[-1])[0] + + return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py new file mode 100644 index 0000000000..0e623ff7b7 --- /dev/null +++ b/youtube_dl/extractor/sbs.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) + + +class SBSIE(InfoExtractor): + IE_DESC = 'sbs.com.au' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P[0-9]+)' + + _TESTS = [{ + # Original URL is handled by the generic IE which finds the iframe: + # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation + 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', + 'md5': '3150cf278965eeabb5b4cea1c963fe0a', + 'info_dict': { + 'id': '320403011771', + 'ext': 'mp4', + 'title': 'Dingo Conservation (The Feed)', + 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', + }, + }, { + 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', + 'only_matching': True, + }, { + 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) + + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' + video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + + urls = player_params['releaseUrls'] + theplatform_url = (urls.get('progressive') or urls.get('html') + or urls.get('standard') or player_params['relatedItemsURL']) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'id': video_id, + 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + } diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py new file mode 100644 index 0000000000..69a0d01f39 --- /dev/null +++ b/youtube_dl/extractor/screencast.py @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, +) + + +class ScreencastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://www.screencast.com/t/3ZEjQXlT', + 'md5': '917df1c13798a3e96211dd1561fded83', + 'info_dict': { + 'id': '3ZEjQXlT', + 'ext': 'm4v', + 'title': 'Color Measurement with Ocean Optics Spectrometers', + 'description': 'md5:240369cde69d8bed61349a199c5fb153', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/V2uXehPJa1ZI', + 'md5': 'e8e4b375a7660a9e7e35c33973410d34', + 'info_dict': { + 'id': 'V2uXehPJa1ZI', + 'ext': 'mov', + 'title': 'The Amadeus Spectrometer', + 'description': 're:^In this video, our friends at.*To learn more about Amadeus, visit', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/aAB3iowa', + 'md5': 'dedb2734ed00c9755761ccaee88527cd', + 'info_dict': { + 'id': 'aAB3iowa', + 'ext': 'mp4', + 'title': 'Google Earth Export', + 'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/X3ddTrYh', + 'md5': '669ee55ff9c51988b4ebc0877cc8b159', + 'info_dict': { + 'id': 'X3ddTrYh', + 'ext': 'wmv', + 'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression', + 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'(?:(?!\1).)+)\1', + webpage, 'video url', default=None, group='url') + + if video_url is None: + video_url = self._html_search_meta( + 'og:video', webpage, default=None) + + if video_url is None: + raise ExtractorError('Cannot find video') + + title = self._og_search_title(webpage, default=None) + if title is None: + title = self._html_search_regex( + [r'Title: ([^<]+)
    ', + r'class="tabSeperator">>(.+?)<', + r'([^<]+)'], + webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage, default=None) + if description is None: + description = self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py new file mode 100644 index 0000000000..b5e76c9af0 --- /dev/null +++ b/youtube_dl/extractor/screencastomatic.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class ScreencastOMaticIE(InfoExtractor): + _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', + 'md5': '483583cb80d92588f15ccbedd90f0c18', + 'info_dict': { + 'id': 'c2lD3BeOPl', + 'ext': 'mp4', + 'title': 'Welcome to 3-4 Philosophy @ DECV!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', + 'duration': 369.163, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + jwplayer_data = self._parse_json( + self._search_regex( + r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), + video_id, transform_source=js_to_json) + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict.update({ + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + }) + return info_dict diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py new file mode 100644 index 0000000000..b40b4c4afd --- /dev/null +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -0,0 +1,152 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import hashlib +import re + +from .aws import AWSIE +from .anvato import AnvatoIE +from .common import InfoExtractor +from ..utils import ( + smuggle_url, + urlencode_postdata, + xpath_text, +) + + +class ScrippsNetworksWatchIE(AWSIE): + IE_NAME = 'scrippsnetworks:watch' + _VALID_URL = r'''(?x) + https?:// + watch\. + (?Pgeniuskitchen)\.com/ + (?: + player\.[A-Z0-9]+\.html\#| + show/(?:[^/]+/){2}| + player/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', + 'info_dict': { + 'id': '4194875', + 'ext': 'mp4', + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', + 'uploader': 'ANV', + 'upload_date': '20171011', + 'timestamp': 1507698000, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [AnvatoIE.ie_key()], + }] + + _SNI_TABLE = { + 'geniuskitchen': 'genius', + } + + _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' + + _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site', 'id') + + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') + token = self._download_json( + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, + headers={ + 'Accept': '*/*', + 'Content-Type': 'application/x-amz-json-1.1', + 'Referer': url, + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), + 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + })['Token'] + + sts = self._download_xml( + 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ + 'Action': 'AssumeRoleWithWebIdentity', + 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', + 'RoleSessionName': 'web-identity', + 'Version': '2011-06-15', + 'WebIdentityToken': token, + }), headers={ + 'Referer': url, + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) + + def get(key): + return xpath_text( + sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, + fatal=True) + + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] + + return self.url_result( + smuggle_url( + 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, + {'geo_countries': ['US']}), + AnvatoIE.ie_key(), video_id=mcp_id) + + +class ScrippsNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pcookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', + 'info_dict': { + 'id': '0260338', + 'ext': 'mp4', + 'title': 'The Best of the Best', + 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', + 'timestamp': 1475678834, + 'upload_date': '20161005', + 'uploader': 'SCNI-SCND', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', + 'only_matching': True, + }, { + 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', + 'only_matching': True, + }, { + 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', + 'only_matching': True, + }, { + 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'only_matching': True, + }, { + 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', + 'only_matching': True, + }] + _ACCOUNT_MAP = { + 'cookingchanneltv': 2433005105, + 'discovery': 2706091867, + 'diynetwork': 2433004575, + 'foodnetwork': 2433005105, + 'hgtv': 2433004575, + 'travelchannel': 2433005739, + } + _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' + + def _real_extract(self, url): + site, guid = re.match(self._VALID_URL, url).groups() + return self.url_result(smuggle_url( + self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), + {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dl/extractor/scte.py b/youtube_dl/extractor/scte.py new file mode 100644 index 0000000000..ca1de63b62 --- /dev/null +++ b/youtube_dl/extractor/scte.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + urlencode_postdata, +) + + +class SCTEBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx' + _NETRC_MACHINE = 'scte' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_popup = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']welcome\b', r'>Sign Out<')) + + # already logged in + if is_logged(login_popup): + return + + login_form = self._hidden_inputs(login_popup) + + login_form.update({ + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on', + }) + + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + if '|pageRedirect|' not in response and not is_logged(response): + error = self._html_search_regex( + r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484', + 'info_dict': { + 'title': 'Introduction to DOCSIS Engineering Professional', + 'id': '31484', + }, + 'playlist_count': 5, + 'skip': 'Requires account credentials', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'

    (.+?)

    ', webpage, 'title') + + context_id = self._search_regex(r'context-(\d+)', webpage, video_id) + content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id + context = decode_packed_codes(self._download_webpage( + '%smobile/data.js' % content_base, video_id)) + + data = self._parse_xml( + self._search_regex( + r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"), + video_id) + + entries = [] + for asset in data.findall('.//asset'): + asset_url = asset.get('url') + if not asset_url or not asset_url.endswith('.mp4'): + continue + asset_id = self._search_regex( + r'video_([^_]+)_', asset_url, 'asset id', default=None) + if not asset_id: + continue + entries.append({ + 'id': asset_id, + 'title': title, + 'url': content_base + asset_url, + }) + + return self.playlist_result(entries, video_id, title) + + +class SCTECourseIE(SCTEBaseIE): + _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3639', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3073', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + title = self._search_regex( + r'

    (.+?)

    ', webpage, 'title', default=None) + + entries = [] + for mobj in re.finditer( + r'''(?x) + ]+ + href=(["\']) + (?P + https?://learning\.scte\.org/mod/ + (?Pscorm|subcourse)/view\.php?(?:(?!\1).)*? + \bid=\d+ + ) + ''', + webpage): + item_url = mobj.group('url') + if item_url == url: + continue + ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm' + else SCTECourseIE.ie_key()) + entries.append(self.url_result(item_url, ie=ie)) + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py new file mode 100644 index 0000000000..7872dc80d9 --- /dev/null +++ b/youtube_dl/extractor/seeker.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' + _TESTS = [{ + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '897d44bbe0d8986a2ead96de565a92db', + 'info_dict': { + 'id': 'Elrn3gnY', + 'ext': 'mp4', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '0497b9f20495174be73ae136949707d2', + 'info_dict': { + 'id': 'FihYQ8AE', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', + }, + } + ], + 'info_dict': { + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py new file mode 100644 index 0000000000..db5ef8b57b --- /dev/null +++ b/youtube_dl/extractor/senateisvp.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unsmuggle_url, +) +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class SenateISVPIE(InfoExtractor): + _COMM_MAP = [ + ['ag', '76440', 'http://ag-f.akamaihd.net'], + ['aging', '76442', 'http://aging-f.akamaihd.net'], + ['approps', '76441', 'http://approps-f.akamaihd.net'], + ['armed', '76445', 'http://armed-f.akamaihd.net'], + ['banking', '76446', 'http://banking-f.akamaihd.net'], + ['budget', '76447', 'http://budget-f.akamaihd.net'], + ['cecc', '76486', 'http://srs-f.akamaihd.net'], + ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], + ['csce', '75229', 'http://srs-f.akamaihd.net'], + ['dpc', '76590', 'http://dpc-f.akamaihd.net'], + ['energy', '76448', 'http://energy-f.akamaihd.net'], + ['epw', '76478', 'http://epw-f.akamaihd.net'], + ['ethics', '76449', 'http://ethics-f.akamaihd.net'], + ['finance', '76450', 'http://finance-f.akamaihd.net'], + ['foreign', '76451', 'http://foreign-f.akamaihd.net'], + ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], + ['help', '76452', 'http://help-f.akamaihd.net'], + ['indian', '76455', 'http://indian-f.akamaihd.net'], + ['intel', '76456', 'http://intel-f.akamaihd.net'], + ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], + ['jccic', '85180', 'http://jccic-f.akamaihd.net'], + ['jec', '76458', 'http://jec-f.akamaihd.net'], + ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'], + ['rpc', '76591', 'http://rpc-f.akamaihd.net'], + ['rules', '76460', 'http://rules-f.akamaihd.net'], + ['saa', '76489', 'http://srs-f.akamaihd.net'], + ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], + ['srs', '75229', 'http://srs-f.akamaihd.net'], + ['uscc', '76487', 'http://srs-f.akamaihd.net'], + ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], + ['arch', '', 'http://ussenate-f.akamaihd.net/'] + ] + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P.+)' + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"]+src=['\"](?Phttps?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _get_info_for_comm(self, committee): + for entry in self._COMM_MAP: + if entry[0] == committee: + return entry[1:] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'([^<]+)', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + stream_num, domain = self._get_info_for_comm(committee) + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + formats = [{ + # All parameters in the query string are necessary to prevent a 403 error + 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', + }] + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py new file mode 100644 index 0000000000..9d9652949b --- /dev/null +++ b/youtube_dl/extractor/sendtonews.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, + update_url_query, + int_or_none, + determine_protocol, + unescapeHTML, +) + + +class SendtoNewsIE(InfoExtractor): + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' + + _TEST = { + # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588' + }, + 'playlist_count': 8, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '240385', + 'ext': 'mp4', + 'title': 'Indians introduce Encarnacion', + 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', + 'duration': 137.898, + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20170105', + 'timestamp': 1483649762, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search(r'''(?x)]+src=([\'"]) + (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? + .*\bSC=(?P[0-9a-zA-Z-]+).* + \1>''', webpage) + if mobj: + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) + + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) + + for f in info_dict['formats']: + if f.get('tbr'): + continue + tbr = int_or_none(self._search_regex( + r'/(\d+)k/', f['url'], 'bitrate', default=None)) + if not tbr: + continue + f.update({ + 'format_id': '%s-%d' % (determine_protocol(f), tbr), + 'tbr': tbr, + }) + self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) + + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'].strip(), + 'description': unescapeHTML(video.get('S_fullStory')), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py new file mode 100644 index 0000000000..9401bf2cf7 --- /dev/null +++ b/youtube_dl/extractor/servus.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ServusIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P[aA]{2}-\w+|\d+-\d+) + ''' + _TESTS = [{ + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', + 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'info_dict': { + 'id': 'AA-1T6VBU5PW1W12', + 'ext': 'mp4', + 'title': 'Die Grünen aus Sicht des Volkes', + 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'thumbnail': r're:^https?://.*\.jpg', + } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = self._extract_m3u8_formats( + 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py new file mode 100644 index 0000000000..84568ac69f --- /dev/null +++ b/youtube_dl/extractor/sevenplus.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..compat import compat_str +from ..utils import ( + try_get, + update_url_query, +) + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', + 'info_dict': { + 'id': 'MTYS7-003', + 'ext': 'mp4', + 'title': 'S7 E3 - Wind Surf', + 'description': 'md5:29c6a69f21accda7601278f81b46483d', + 'uploader_id': '5303576322001', + 'upload_date': '20171201', + 'timestamp': 1512106377, + 'series': 'Mighty Ships', + 'season_number': 7, + 'episode_number': 3, + 'episode': 'Wind Surf', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) + + return info diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py new file mode 100644 index 0000000000..3df51520b0 --- /dev/null +++ b/youtube_dl/extractor/sexu.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SexuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)' + _TEST = { + 'url': 'http://sexu.com/961791/', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': '961791', + 'ext': 'mp4', + 'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', + 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + jwvideo = self._parse_json( + self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'), + video_id) + + sources = jwvideo['sources'] + + formats = [{ + 'url': source['file'].replace('\\', ''), + 'format_id': source.get('label'), + 'height': int(self._search_regex( + r'^(\d+)[pP]', source.get('label', ''), 'height', + default=None)), + } for source in sources if source.get('file')] + self._sort_formats(formats) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*Sexu\.Com', webpage, 'title') + + description = self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = jwvideo.get('image') + + categories_str = self._html_search_meta( + 'keywords', webpage, 'categories') + categories = ( + None if categories_str is None + else categories_str.split(',')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'formats': formats, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py new file mode 100644 index 0000000000..7a1c7e38be --- /dev/null +++ b/youtube_dl/extractor/seznamzpravy.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + urljoin, + int_or_none, + parse_codecs, + try_get, +) + + +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'duration': 241, + 'series': 'Svět bez obalu', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with Location key + 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5§ionPrefixPostroll=%2Fzpravy%2Fvyzva®ression=false', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'series': 'Výzva', + }, + 'params': { + 'skip_download': True, + }, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', + webpage)] + + def _extract_sdn_formats(self, sdn_url, video_id): + sdn_data = self._download_json(sdn_url, video_id) + + if sdn_data.get('Location'): + sdn_url = sdn_data['Location'] + sdn_data = self._download_json(sdn_url, video_id) + + formats = [] + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for format_id, format_data in mp4_formats.items(): + relative_url = format_data.get('url') + if not relative_url: + continue + + try: + width, height = format_data.get('resolution') + except (TypeError, ValueError): + width, height = None, None + + f = { + 'url': urljoin(sdn_url, relative_url), + 'format_id': 'http-%s' % format_id, + 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), + 'width': int_or_none(width), + 'height': int_or_none(height), + } + f.update(parse_codecs(format_data.get('codec'))) + formats.append(f) + + pls = sdn_data.get('pls', {}) + + def get_url(format_id): + return try_get(pls, lambda x: x[format_id]['url'], compat_str) + + dash_rel_url = get_url('dash') + if dash_rel_url: + formats.extend(self._extract_mpd_formats( + urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', + fatal=False)) + + hls_rel_url = get_url('hls') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats( + urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + src = params['src'][0] + title = params['title'][0] + video_id = params.get('contentId', [_raw_id(src)])[0] + formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) + + duration = int_or_none(params.get('duration', [None])[0]) + series = params.get('series', [None])[0] + thumbnail = params.get('poster', [None])[0] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'series': series, + 'formats': formats, + } + + +class SeznamZpravyArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P\d+)' + _API_URL = 'https://apizpravy.seznam.cz/' + + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'info_dict': { + 'id': '35990', + 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', + 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', + }, + 'playlist_count': 2, + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '38489', + 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', + 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + info = self._search_json_ld(webpage, article_id, default={}) + + title = info.get('title') or self._og_search_title(webpage, fatal=False) + description = info.get('description') or self._og_search_description(webpage) + + return self.playlist_result([ + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], + article_id, title, description) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py new file mode 100644 index 0000000000..5c2a6206be --- /dev/null +++ b/youtube_dl/extractor/shahid.py @@ -0,0 +1,215 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import math +import re + +from .aws import AWSIE +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + InAdvancePagedList, + int_or_none, + parse_iso8601, + str_or_none, + urlencode_postdata, +) + + +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): + _NETRC_MACHINE = 'shahid' + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'info_dict': { + 'id': '275286', + 'ext': 'mp4', + 'title': 'مجلس الشباب الموسم 1 كليب 1', + 'timestamp': 1506988800, + 'upload_date': '20171003', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', + 'only_matching': True + }, { + # shahid plus subscriber only + 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', + 'only_matching': True + }] + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + self._download_webpage( + 'https://shahid.mbc.net/populateContext', + None, 'Populate Context', data=urlencode_postdata({ + 'firstName': user_data['firstName'], + 'lastName': user_data['lastName'], + 'userName': user_data['email'], + 'csg_user_name': user_data['email'], + 'subscriberId': user_data['id'], + 'sessionId': user_data['sessionId'], + })) + + def _real_extract(self, url): + page_type, video_id = re.match(self._VALID_URL, url).groups() + if page_type == 'clip': + page_type = 'episode' + + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] + + if playout.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + self._sort_formats(formats) + + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( + 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), + video_id, 'Downloading video JSON', query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }) + data = response.get('data', {}) + error = data.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + video = data[page_type] + title = video['title'] + categories = [ + category['name'] + for category in video.get('genres', []) if 'name' in category] + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('referenceDate')), + 'categories': categories, + 'series': video.get('showTitle') or video.get('showName'), + 'season': video.get('seasonTitle'), + 'season_number': int_or_none(video.get('seasonNumber')), + 'season_id': str_or_none(video.get('seasonId')), + 'episode_number': int_or_none(video.get('number')), + 'episode_id': video_id, + 'formats': formats, + } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py new file mode 100644 index 0000000000..02295d1a4c --- /dev/null +++ b/youtube_dl/extractor/shared.py @@ -0,0 +1,138 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote_plus, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + KNOWN_EXTENSIONS, + parse_filesize, + rot47, + url_or_none, + urlencode_postdata, +) + + +class SharedBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + if self._FILE_NOT_FOUND in webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + video_url = self._extract_video_url(webpage, video_id, url) + + title = self._extract_title(webpage) + filesize = int_or_none(self._extract_filesize(webpage)) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'filesize': filesize, + 'title': title, + } + + def _extract_title(self, webpage): + return compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') + + def _extract_filesize(self, webpage): + return self._html_search_meta( + 'full:size', webpage, 'file size', fatal=False) + + +class SharedIE(SharedBaseIE): + IE_DESC = 'shared.sx' + _VALID_URL = r'https?://shared\.sx/(?P[\da-z]{10})' + _FILE_NOT_FOUND = '>File does not exist<' + + _TEST = { + 'url': 'http://shared.sx/0060718775', + 'md5': '106fefed92a8a2adb8c98e6a0652f49b', + 'info_dict': { + 'id': '0060718775', + 'ext': 'mp4', + 'title': 'Bmp4', + 'filesize': 1720110, + }, + } + + def _extract_video_url(self, webpage, video_id, url): + download_form = self._hidden_inputs(webpage) + + video_page = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) + + video_url = self._html_search_regex( + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') + + return video_url + + +class VivoIE(SharedBaseIE): + IE_DESC = 'vivo.sx' + _VALID_URL = r'https?://vivo\.sx/(?P[\da-z]{10})' + _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' + + _TEST = { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 515659, + }, + } + + def _extract_title(self, webpage): + title = self._html_search_regex( + r'data-name\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='title') + if title: + ext = determine_ext(title) + if ext.lower() in KNOWN_EXTENSIONS: + title = title.rpartition('.' + ext)[0] + return title + return self._og_search_title(webpage) + + def _extract_filesize(self, webpage): + return parse_filesize(self._search_regex( + r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', + webpage, 'filesize', fatal=False)) + + def _extract_video_url(self, webpage, video_id, url): + def decode_url_old(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url') + if stream_url: + stream_url = url_or_none(decode_url_old(stream_url)) + if stream_url: + return stream_url + + def decode_url(encoded_url): + return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + + return decode_url(self._parse_json( + self._search_regex( + r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, + 'stream'), + video_id, transform_source=js_to_json)['source']) diff --git a/youtube_dl/extractor/showroomlive.py b/youtube_dl/extractor/showroomlive.py new file mode 100644 index 0000000000..efd9d561ff --- /dev/null +++ b/youtube_dl/extractor/showroomlive.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urljoin, +) + + +class ShowRoomLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.showroom-live.com/48_Nana_Okada', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcaster_id = self._match_id(url) + + webpage = self._download_webpage(url, broadcaster_id) + + room_id = self._search_regex( + (r'SrGlobal\.roomId\s*=\s*(\d+)', + r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id') + + room = self._download_json( + urljoin(url, '/api/room/profile?room_id=%s' % room_id), + broadcaster_id) + + is_live = room.get('is_onlive') + if is_live is not True: + raise ExtractorError('%s is offline' % broadcaster_id, expected=True) + + uploader = room.get('performer_name') or broadcaster_id + title = room.get('room_name') or room.get('main_name') or uploader + + streaming_url_list = self._download_json( + urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id), + broadcaster_id)['streaming_url_list'] + + formats = [] + for stream in streaming_url_list: + stream_url = stream.get('url') + if not stream_url: + continue + stream_type = stream.get('type') + if stream_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + stream_url, broadcaster_id, ext='mp4', m3u8_id='hls', + live=True) + for f in m3u8_formats: + f['quality'] = int_or_none(stream.get('quality', 100)) + formats.extend(m3u8_formats) + elif stream_type == 'rtmp': + stream_name = stream.get('stream_name') + if not stream_name: + continue + formats.append({ + 'url': stream_url, + 'play_path': stream_name, + 'page_url': url, + 'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf', + 'rtmp_live': True, + 'ext': 'flv', + 'format_id': 'rtmp', + 'format_note': stream.get('label'), + 'quality': int_or_none(stream.get('quality', 100)), + }) + self._sort_formats(formats) + + return { + 'id': compat_str(room.get('live_id') or broadcaster_id), + 'title': self._live_title(title), + 'description': room.get('description'), + 'timestamp': int_or_none(room.get('current_live_started_at')), + 'uploader': uploader, + 'uploader_id': broadcaster_id, + 'view_count': int_or_none(room.get('view_num')), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py new file mode 100644 index 0000000000..07b766b4a7 --- /dev/null +++ b/youtube_dl/extractor/sina.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) + + +class SinaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<video_id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| + # This is used by external sites like Weibo + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf + ) + ''' + + _TESTS = [ + { + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', + 'info_dict': { + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', + } + }, + { + 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', + 'info_dict': { + 'id': '101314253', + 'ext': 'flv', + 'title': '军方提高对朝情报监视级别', + }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen('Getting video id') + request = HEADRequest(url) + _, urlh = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + else: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + error = get_element_by_attribute('class', 'errtitle', webpage) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + video_id = self._search_regex( + r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + + video_data = self._download_json( + 'http://s.video.sina.com.cn/video/h5play', + video_id, query={'video_id': video_id}) + if video_data['code'] != 1: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, video_data['message']), expected=True) + else: + video_data = video_data['data'] + title = video_data['title'] + description = video_data.get('description') + if description: + description = description.strip() + + preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) + formats = [] + for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): + file_api = quality.get('file_api') + file_id = quality.get('file_id') + if not file_api or not file_id: + continue + formats.append({ + 'format_id': quality_id, + 'url': update_url_query(file_api, {'vid': file_id}), + 'preference': preference(quality_id), + 'ext': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(video_data.get('length')), + 'timestamp': int_or_none(video_data.get('create_time')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py new file mode 100644 index 0000000000..7ec66ecf39 --- /dev/null +++ b/youtube_dl/extractor/sixplay.py @@ -0,0 +1,129 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + determine_ext, + int_or_none, + try_get, + qualities, +) + + +class SixPlayIE(InfoExtractor): + IE_NAME = '6play' + _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', + 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', + 'info_dict': { + 'id': '12041051', + 'ext': 'mp4', + 'title': 'Le but qui a marqué l\'histoire du football français !', + 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', + }, + }, { + 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', + 'only_matching': True, + }, { + 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', + 'only_matching': True, + }, { + 'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = re.search(self._VALID_URL, url).groups() + service, consumer_name = { + '6play.fr': ('6play', 'm6web'), + 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), + 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), + 'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'), + }.get(domain, ('6play', 'm6web')) + + data = self._download_json( + 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), + video_id, headers={ + 'x-customer-name': consumer_name + }, query={ + 'csa': 5, + 'with': 'clips', + }) + + clip_data = data['clips'][0] + title = clip_data['title'] + + urls = [] + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + subtitles = {} + assets = clip_data.get('assets') or [] + for asset in assets: + asset_url = asset.get('full_physical_path') + protocol = asset.get('protocol') + if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls: + continue + urls.append(asset_url) + container = asset.get('video_container') + ext = determine_ext(asset_url) + if protocol == 'http_subtitle' or ext == 'vtt': + subtitles.setdefault('fr', []).append({'url': asset_url}) + continue + if container == 'm3u8' or ext == 'm3u8': + if protocol == 'usp': + if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: + urlh = self._request_webpage( + asset_url, video_id, fatal=False, + headers=self.geo_verification_headers()) + if not urlh: + continue + asset_url = urlh.geturl() + asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') + for i in range(3, 0, -1): + asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) + m3u8_formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + formats.extend(self._extract_mpd_formats( + asset_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) + if m3u8_formats: + break + else: + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif container == 'mp4' or ext == 'mp4': + quality = asset.get('video_quality') + formats.append({ + 'url': asset_url, + 'format_id': quality, + 'quality': quality_key(quality), + 'ext': ext, + }) + self._sort_formats(formats) + + def get(getter): + for src in (data, clip_data): + v = try_get(src, getter, compat_str) + if v: + return v + + return { + 'id': video_id, + 'title': title, + 'description': get(lambda x: x['description']), + 'duration': int_or_none(clip_data.get('duration')), + 'series': get(lambda x: x['program']['title']), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/sky.py b/youtube_dl/extractor/sky.py new file mode 100644 index 0000000000..ea30d6e62e --- /dev/null +++ b/youtube_dl/extractor/sky.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) + + +class SkyBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)', + webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get( + 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token.strip('"')}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': strip_or_none(self._og_search_description(webpage)), + 'ie_key': 'Ooyala', + } + + +class SkySportsIE(SkyBaseIE): + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', + 'info_dict': { + 'id': 'o3eWJnNDE6l7kfNO8BOoBlRxXRQ4ANNQ', + 'ext': 'mp4', + 'title': 'Bale: It\'s our time to shine', + 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', + }, + 'add_ie': ['Ooyala'], + } + + +class SkyNewsIE(SkyBaseIE): + _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', + 'md5': 'd6327e581473cea9976a3236ded370cd', + 'info_dict': { + 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'ext': 'mp4', + 'title': 'Russian plane inspected after deadly fire', + 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + }, + 'add_ie': ['Ooyala'], + } diff --git a/youtube_dl/extractor/skylinewebcams.py b/youtube_dl/extractor/skylinewebcams.py new file mode 100644 index 0000000000..b7f8ac7368 --- /dev/null +++ b/youtube_dl/extractor/skylinewebcams.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkylineWebcamsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P<id>[^/]+)\.html' + _TEST = { + 'url': 'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html', + 'info_dict': { + 'id': 'scalinata-piazza-di-spagna-barcaccia', + 'ext': 'mp4', + 'title': 're:^Live Webcam Scalinata di Piazza di Spagna - La Barcaccia [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Roma, veduta sulla Scalinata di Piazza di Spagna e sulla Barcaccia', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + stream_url = self._search_regex( + r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, + 'stream url', group='url') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + 'id': video_id, + 'url': stream_url, + 'ext': 'mp4', + 'title': self._live_title(title), + 'description': description, + 'is_live': True, + } diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py new file mode 100644 index 0000000000..fffc9aa227 --- /dev/null +++ b/youtube_dl/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewsArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:article' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary')) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py new file mode 100644 index 0000000000..e89ebebe75 --- /dev/null +++ b/youtube_dl/extractor/slideshare.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) +from ..utils import ( + ExtractorError, + get_element_by_id, +) + + +class SlideshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + + _TEST = { + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + slideshare_obj = self._search_regex( + r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', + webpage, 'slideshare object') + info = json.loads(slideshare_obj) + if info['slideshow']['type'] != 'video': + raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + + doc = info['doc'] + bucket = info['jsplayer']['video_bucket'] + ext = info['jsplayer']['video_extension'] + video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( + r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, + 'description', fatal=False) + + return { + '_type': 'video', + 'id': info['slideshow']['id'], + 'title': info['slideshow']['title'], + 'ext': ext, + 'url': video_url, + 'thumbnail': info['slideshow']['pin_image_url'], + 'description': description.strip() if description else None, + } diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py new file mode 100644 index 0000000000..d9ea76831e --- /dev/null +++ b/youtube_dl/extractor/slideslive.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class SlidesLiveIE(InfoExtractor): + _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' + _TESTS = [{ + # video_service_name = YOUTUBE + 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', + 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', + 'info_dict': { + 'id': 'LMtgR8ba0b0', + 'ext': 'mp4', + 'title': 'GCC IA16 backend', + 'description': 'Watch full version of this video at https://slideslive.com/38902413.', + 'uploader': 'SlidesLive Videos - A', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20170925', + } + }, { + # video_service_name = youtube + 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', + 'only_matching': True, + }, { + # video_service_name = url + 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', + 'only_matching': True, + }, { + # video_service_name = vimeo + 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://ben.slideslive.com/player/' + video_id, video_id) + service_name = video_data['video_service_name'].lower() + assert service_name in ('url', 'vimeo', 'youtube') + service_id = video_data['video_service_id'] + info = { + 'id': video_id, + 'thumbnail': video_data.get('thumbnail'), + 'url': service_id, + } + if service_name == 'url': + info['title'] = video_data['title'] + else: + info.update({ + '_type': 'url_transparent', + 'ie_key': service_name.capitalize(), + 'title': video_data.get('title'), + }) + if service_name == 'vimeo': + info['url'] = smuggle_url( + 'https://player.vimeo.com/video/' + service_id, + {'http_headers': {'Referer': url}}) + return info diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py new file mode 100644 index 0000000000..661f9e59d0 --- /dev/null +++ b/youtube_dl/extractor/slutload.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SlutloadIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', + 'md5': '868309628ba00fd488cf516a113fd717', + 'info_dict': { + 'id': 'TD73btpBqSxc', + 'ext': 'mp4', + 'title': 'virginie baisee en cam', + 'age_limit': 18, + 'thumbnail': r're:https?://.*?\.jpg' + }, + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) + + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') + + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } + + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': title, + 'age_limit': 18, + }) + return info diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py new file mode 100644 index 0000000000..45995f30f3 --- /dev/null +++ b/youtube_dl/extractor/smotri.py @@ -0,0 +1,416 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json +import hashlib +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + sanitized_Request, + unified_strdate, + urlencode_postdata, + xpath_text, +) + + +class SmotriIE(InfoExtractor): + IE_DESC = 'Smotri.com' + IE_NAME = 'smotri' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _NETRC_MACHINE = 'smotri' + + _TESTS = [ + # real video id 2610366 + { + 'url': 'http://smotri.com/video/view/?id=v261036632ab', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', + 'info_dict': { + 'id': 'v261036632ab', + 'ext': 'mp4', + 'title': 'катастрофа с камер видеонаблюдения', + 'uploader': 'rbc2008', + 'uploader_id': 'rbc08', + 'upload_date': '20131118', + 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', + }, + }, + # real video id 57591 + { + 'url': 'http://smotri.com/video/view/?id=v57591cb20', + 'md5': '830266dfc21f077eac5afd1883091bcd', + 'info_dict': { + 'id': 'v57591cb20', + 'ext': 'flv', + 'title': 'test', + 'uploader': 'Support Photofile@photofile', + 'uploader_id': 'support-photofile', + 'upload_date': '20070704', + 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', + }, + }, + # video-password, not approved by moderator + { + 'url': 'http://smotri.com/video/view/?id=v1390466a13c', + 'md5': 'f6331cef33cad65a0815ee482a54440b', + 'info_dict': { + 'id': 'v1390466a13c', + 'ext': 'mp4', + 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + 'uploader': 'timoxa40', + 'uploader_id': 'timoxa40', + 'upload_date': '20100404', + 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', + }, + 'params': { + 'videopassword': 'qwerty', + }, + 'skip': 'Video is not approved by moderator', + }, + # video-password + { + 'url': 'http://smotri.com/video/view/?id=v6984858774#', + 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', + 'info_dict': { + 'id': 'v6984858774', + 'ext': 'mp4', + 'title': 'Дача Солженицина ПАРОЛЬ 223322', + 'uploader': 'psavari1', + 'uploader_id': 'psavari1', + 'upload_date': '20081103', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'videopassword': '223322', + }, + }, + # age limit + video-password, not approved by moderator + { + 'url': 'http://smotri.com/video/view/?id=v15408898bcf', + 'md5': '91e909c9f0521adf5ee86fbe073aad70', + 'info_dict': { + 'id': 'v15408898bcf', + 'ext': 'flv', + 'title': 'этот ролик не покажут по ТВ', + 'uploader': 'zzxxx', + 'uploader_id': 'ueggb', + 'upload_date': '20101001', + 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '333' + }, + 'skip': 'Video is not approved by moderator', + }, + # age limit + video-password + { + 'url': 'http://smotri.com/video/view/?id=v7780025814', + 'md5': 'b4599b068422559374a59300c5337d72', + 'info_dict': { + 'id': 'v7780025814', + 'ext': 'mp4', + 'title': 'Sexy Beach (пароль 123)', + 'uploader': 'вАся', + 'uploader_id': 'asya_prosto', + 'upload_date': '20081218', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '123' + }, + }, + # swf player + { + 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', + 'md5': '31099eeb4bc906712c5f40092045108d', + 'info_dict': { + 'id': 'v9188090500', + 'ext': 'mp4', + 'title': 'Shakira - Don\'t Bother', + 'uploader': 'HannahL', + 'uploader_id': 'lisaha95', + 'upload_date': '20090331', + 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', + }, + }, + ] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', + webpage) + if mobj is not None: + return mobj.group('url') + + mobj = re.search( + r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s* + <div\s+class="video_image">[^<]+</div>\s* + <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage) + if mobj is not None: + return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') + + def _search_meta(self, name, html, display_name=None): + if display_name is None: + display_name = name + return self._html_search_meta(name, html, display_name) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_form = { + 'ticket': video_id, + 'video_url': '1', + 'frame_url': '1', + 'devid': 'LoadupFlashPlayer', + 'getvideoinfo': '1', + } + + video_password = self._downloader.params.get('videopassword') + if video_password: + video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() + + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + video_url = video.get('_vidURL') or video.get('_vidURL_mp4') + + if not video_url: + if video.get('_moderate_no'): + raise ExtractorError( + 'Video %s has not been approved by moderator' % video_id, expected=True) + + if video.get('error'): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + if video.get('_pass_protected') == 1: + msg = ('Invalid video password' if video_password + else 'This video is protected by a password, use the --video-password option') + raise ExtractorError(msg, expected=True) + + title = video['title'] + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) + + # Video JSON does not provide enough meta data + # We will extract some from the video web page instead + webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id + webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') + + # Warning if video is unavailable + warning = self._html_search_regex( + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, + 'warning message', default=None) + if warning is not None: + self._downloader.report_warning( + 'Video %s may not be available; smotri said: %s ' % + (video_id, warning)) + + # Adult content + if 'EroConfirmText">' in webpage: + self.report_age_confirmation() + confirm_string = self._html_search_regex( + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, + webpage, 'confirm string') + confirm_url = webpage_url + '&confirm=%s' % confirm_string + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') + adult_content = True + else: + adult_content = False + + view_count = self._html_search_regex( + r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': int_or_none(view_count), + 'age_limit': 18 if adult_content else 0, + } + + +class SmotriCommunityIE(InfoExtractor): + IE_DESC = 'Smotri.com community videos' + IE_NAME = 'smotri:community' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' + _TEST = { + 'url': 'http://smotri.com/community/video/kommuna', + 'info_dict': { + 'id': 'kommuna', + }, + 'playlist_mincount': 4, + } + + def _real_extract(self, url): + community_id = self._match_id(url) + + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') + + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] + + return self.playlist_result(entries, community_id) + + +class SmotriUserIE(InfoExtractor): + IE_DESC = 'Smotri.com user videos' + IE_NAME = 'smotri:user' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' + _TESTS = [{ + 'url': 'http://smotri.com/user/inspector', + 'info_dict': { + 'id': 'inspector', + 'title': 'Inspector', + }, + 'playlist_mincount': 9, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') + + entries = [self.url_result(video_url.text, 'Smotri') + for video_url in rss.findall('./channel/item/link')] + + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) + + return self.playlist_result(entries, user_id, user_nickname) + + +class SmotriBroadcastIE(InfoExtractor): + IE_DESC = 'Smotri.com broadcasts' + IE_NAME = 'smotri:broadcast' + _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' + _NETRC_MACHINE = 'smotri' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + broadcast_id = mobj.group('id') + + broadcast_url = 'http://' + mobj.group('url') + broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') + + if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: + raise ExtractorError( + 'Broadcast %s does not exist' % broadcast_id, expected=True) + + # Adult content + if re.search('EroConfirmText">', broadcast_page) is not None: + + (username, password) = self._get_login_info() + if username is None: + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') + + login_form = { + 'login-hint53': '1', + 'confirm_erotic': '1', + 'login': username, + 'password': password, + } + + request = sanitized_Request( + broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + broadcast_page = self._download_webpage( + request, broadcast_id, 'Logging in and confirming age') + + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) + + adult_content = True + else: + adult_content = False + + ticket = self._html_search_regex( + (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), + broadcast_page, 'broadcast ticket', group='ticket') + + broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + + broadcast_password = self._downloader.params.get('videopassword') + if broadcast_password: + broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + + broadcast_json_page = self._download_webpage( + broadcast_url, broadcast_id, 'Downloading broadcast JSON') + + try: + broadcast_json = json.loads(broadcast_json_page) + + protected_broadcast = broadcast_json['_pass_protected'] == 1 + if protected_broadcast and not broadcast_password: + raise ExtractorError( + 'This broadcast is protected by a password, use the --video-password option', + expected=True) + + broadcast_offline = broadcast_json['is_play'] == 0 + if broadcast_offline: + raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) + + rtmp_url = broadcast_json['_server'] + mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url) + if not mobj: + raise ExtractorError('Unexpected broadcast rtmp URL') + + broadcast_playpath = broadcast_json['_streamName'] + broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) + broadcast_thumbnail = broadcast_json.get('_imgURL') + broadcast_title = self._live_title(broadcast_json['title']) + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') + rtmp_conn = 'S:%s' % uuid.uuid4().hex + except KeyError: + if protected_broadcast: + raise ExtractorError('Bad broadcast password', expected=True) + raise ExtractorError('Unexpected broadcast JSON') + + return { + 'id': broadcast_id, + 'url': rtmp_url, + 'title': broadcast_title, + 'thumbnail': broadcast_thumbnail, + 'description': broadcast_description, + 'uploader': broadcaster_nick, + 'uploader_id': broadcaster_login, + 'age_limit': 18 if adult_content else 0, + 'ext': 'flv', + 'play_path': broadcast_playpath, + 'player_url': 'http://pics.smotri.com/broadcast_play.swf', + 'app': broadcast_app, + 'rtmp_live': True, + 'rtmp_conn': rtmp_conn, + 'is_live': True, + } diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py new file mode 100644 index 0000000000..f773547483 --- /dev/null +++ b/youtube_dl/extractor/snotr.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_filesize, + str_to_int, +) + + +class SnotrIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' + _TESTS = [{ + 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', + 'info_dict': { + 'id': '13708', + 'ext': 'mp4', + 'title': 'Drone flying through fireworks!', + 'duration': 248, + 'filesize_approx': 40700000, + 'description': 'A drone flying through Fourth of July Fireworks', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], + }, { + 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', + 'info_dict': { + 'id': '530', + 'ext': 'mp4', + 'title': 'David Letteman - George W. Bush Top 10', + 'duration': 126, + 'filesize_approx': 8500000, + 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + + description = self._og_search_description(webpage) + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] + + view_count = str_to_int(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', + webpage, 'view count', fatal=False)) + + duration = parse_duration(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', + webpage, 'duration', fatal=False)) + + filesize_approx = parse_filesize(self._html_search_regex( + r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', + webpage, 'filesize', fatal=False)) + + info_dict.update({ + 'id': video_id, + 'description': description, + 'title': title, + 'view_count': view_count, + 'duration': duration, + 'filesize_approx': filesize_approx, + }) + + return info_dict diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py new file mode 100644 index 0000000000..76b3cc6b6b --- /dev/null +++ b/youtube_dl/extractor/sohu.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + + +class SohuIE(InfoExtractor): + _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' + + # Sohu videos give different MD5 sums on Travis CI and my machine + _TESTS = [{ + 'note': 'This video is available only in Mainland China', + 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', + 'info_dict': { + 'id': '382479172', + 'ext': 'mp4', + 'title': 'MV:Far East Movement《The Illest》', + }, + 'skip': 'On available in China', + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + }, + 'playlist': [{ + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dlc testing video', + }, + 'params': { + 'skip_download': True + } + }] + + def _real_extract(self, url): + + def _fetch_data(vid_id, mytv=False): + if mytv: + base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' + else: + base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + + return self._download_json( + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, + headers=self.geo_verification_headers()) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + mytv = mobj.group('mytv') is not None + + webpage = self._download_webpage(url, video_id) + + title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) + + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) + if vid_data['play'] != 1: + if vid_data.get('status') == 12: + raise ExtractorError( + '%s said: There\'s something wrong in the video.' % self.IE_NAME, + expected=True) + else: + self.raise_geo_restricted( + '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME) + + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) + + part_count = vid_data['data']['totalBlocks'] + + playlist = [] + for i in range(part_count): + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + video_url = 'newflv.sohu.ccgslb.net' + cdnId = None + retries = 0 + + while 'newflv.sohu.ccgslb.net' in video_url: + params = { + 'prot': 9, + 'file': clips_url[i], + 'new': su[i], + 'prod': 'flash', + 'rb': 1, + } + + if cdnId is not None: + params['idc'] = cdnId + + download_note = 'Downloading %s video URL part %d of %d' % ( + format_id, i + 1, part_count) + + if retries > 0: + download_note += ' (retry #%d)' % retries + part_info = self._parse_json(self._download_webpage( + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), + video_id, download_note), video_id) + + video_url = part_info['url'] + cdnId = part_info.get('nid') + + retries += 1 + if retries > 5: + raise ExtractorError('Failed to get video URL') + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': int_or_none( + try_get(data, lambda x: x['clipsBytes'][i])), + 'width': int_or_none(data.get('width')), + 'height': int_or_none(data.get('height')), + 'fps': int_or_none(data.get('fps')), + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) + + if len(playlist) == 1: + info = playlist[0] + info['id'] = video_id + else: + info = { + '_type': 'multi_video', + 'entries': playlist, + 'id': video_id, + 'title': title, + } + + return info diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py new file mode 100644 index 0000000000..58a8c0d4dd --- /dev/null +++ b/youtube_dl/extractor/sonyliv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': 'ref:5024612095001', + 'ext': 'mp4', + 'upload_date': '20170923', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '5182475815001', + 'timestamp': 1506200547, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { + 'geo_countries': ['IN'], + 'referrer': url, + }), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py new file mode 100644 index 0000000000..ae3573680b --- /dev/null +++ b/youtube_dl/extractor/soundcloud.py @@ -0,0 +1,890 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re +import json +import random + +from .common import ( + InfoExtractor, + SearchInfoExtractor +) +from ..compat import ( + compat_HTTPError, + compat_kwargs, + compat_str, + compat_urlparse, +) +from ..utils import ( + error_to_compat_str, + ExtractorError, + float_or_none, + HEADRequest, + int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urlhandle_detect_ext, + sanitized_Request, +) + + +class SoundcloudEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _TEST = { + # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ + 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', + 'only_matching': True, + } + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + + def _real_extract(self, url): + query = compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + api_url = query['url'][0] + secret_token = query.get('secret_token') + if secret_token: + api_url = update_url_query(api_url, {'secret_token': secret_token[0]}) + return self.url_result(api_url) + + +class SoundcloudIE(InfoExtractor): + """Information extractor for soundcloud.com + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ + + _VALID_URL = r'''(?x)^(?:https?://)? + (?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) + (?P<uploader>[\w\d-]+)/ + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?P<title>[\w\d-]+)/? + (?P<token>[^?]+?)?(?:[?].*)?$) + |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) + (?:/?\?secret_token=(?P<secret_token>[^&]+))?) + ) + ''' + IE_NAME = 'soundcloud' + _TESTS = [ + { + 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', + 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', + 'info_dict': { + 'id': '62986583', + 'ext': 'mp3', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', + 'uploader': 'E.T. ExTerrestrial Music', + 'uploader_id': '1571244', + 'timestamp': 1349920598, + 'upload_date': '20121011', + 'duration': 143.216, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + } + }, + # geo-restricted + { + 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '47127627', + 'ext': 'mp3', + 'title': 'Goldrushed', + 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', + 'uploader': 'The Royal Concept', + 'uploader_id': '9615865', + 'timestamp': 1337635207, + 'upload_date': '20120521', + 'duration': 227.155, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link + { + 'url': 'https://soundcloud.com/jaimemf/youtube-dlc-test-video-a-y-baw/s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link (alt format) + { + 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # downloadable song + { + 'url': 'https://soundcloud.com/oddsamples/bus-brakes', + 'md5': '7624f2351f8a3b2e7cd51522496e7631', + 'info_dict': { + 'id': '128590877', + 'ext': 'mp3', + 'title': 'Bus Brakes', + 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', + 'uploader': 'oddsamples', + 'uploader_id': '73680509', + 'timestamp': 1389232924, + 'upload_date': '20140109', + 'duration': 17.346, + 'license': 'cc-by-sa', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'uploader_id': '12563093', + 'timestamp': 1504206263, + 'upload_date': '20170831', + 'duration': 7449.096, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'uploader_id': '2366352', + 'timestamp': 1488152409, + 'upload_date': '20170226', + 'duration': 207.012, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', + 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'info_dict': { + 'id': '583011102', + 'ext': 'mp3', + 'title': 'Mezzo Valzer', + 'description': 'md5:4138d582f81866a530317bae316e8b61', + 'uploader': 'Micronie', + 'uploader_id': '3352531', + 'timestamp': 1551394171, + 'upload_date': '20190228', + 'duration': 180.157, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + # with AAC HQ format available via OAuth token + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, + ] + + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } + + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._store_client_id(client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self._store_client_id(None) + self._update_client_id() + continue + elif non_fatal: + self._downloader.report_warning(error_to_compat_str(e)) + return False + raise + + def _real_initialize(self): + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or "T5R4kgWS2PRf6lzLyIravUMnKlbIxQag" # 'EXLwg5lHTO2dslU5EePe3xkw0m1h86Cd' # 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + self._login() + + _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _access_token = None + _HEADERS = {} + _NETRC_MACHINE = 'soundcloud' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def genDevId(): + def genNumBlock(): + return ''.join([str(random.randrange(10)) for i in range(6)]) + return '-'.join([genNumBlock() for i in range(4)]) + + payload = { + 'client_id': self._CLIENT_ID, + 'recaptcha_pubkey': 'null', + 'recaptcha_response': 'null', + 'credentials': { + 'identifier': username, + 'password': password + }, + 'signature': self.sign(username, password, self._CLIENT_ID), + 'device_id': genDevId(), + 'user_agent': self._USER_AGENT + } + + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(login, None) + self._access_token = response.get('session').get('access_token') + if not self._access_token: + self.report_warning('Unable to get access token, login may has failed') + else: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + + # signature generation + def sign(self, user, pw, clid): + a = 33 + i = 1 + s = 440123 + w = 117 + u = 1800000 + l = 1042 + b = 37 + k = 37 + c = 5 + n = "0763ed7314c69015fd4a0dc16bbf4b90" # _KEY + y = "8" # _REV + r = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" # _USER_AGENT + e = user # _USERNAME + t = clid # _CLIENT_ID + + d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) + p = n + y + d + r + e + t + d + n + h = p + + m = 8011470 + f = 0 + + for f in range(f, len(h)): + m = (m >> 1) + ((1 & m) << 23) + m += ord(h[f]) + m &= 16777215 + + # c is not even needed + out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) + + return out + + @classmethod + def _resolv_url(cls, url): + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + + def _extract_info_dict(self, info, full_title=None, secret_token=None): + track_id = compat_str(info['id']) + title = info['title'] + + format_urls = set() + formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token: + query['secret_token'] = secret_token + + if info.get('downloadable') and info.get('has_downloads_left'): + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) + + def invalid_url(url): + return not url or url in format_urls + + def add_format(f, protocol, is_preview=False): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': protocol, + 'preference': -10 if preview else None, + }) + formats.append(f) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = self._download_json( + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ + 'url': stream_url, + 'ext': ext, + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) + + for f in formats: + f['vcodec'] = 'none' + + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() + self._sort_formats(formats) + + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + track_id = mobj.group('track_id') + + query = {} + if track_id: + info_json_url = self._API_V2_BASE + 'tracks/' + track_id + full_title = track_id + token = mobj.group('secret_token') + if token: + query['secret_token'] = token + else: + full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') + token = mobj.group('token') + if token: + resolve_title += '/%s' % token + info_json_url = self._resolv_url(self._BASE_URL + resolve_title) + + info = self._download_json( + info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) + + return self._extract_info_dict(info, full_title, token) + + +class SoundcloudPlaylistBaseIE(SoundcloudIE): + def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }, headers=self._HEADERS) + entries = [] + for track in tracks: + track_id = str_or_none(track.get('id')) + url = track.get('permalink_url') + if not url: + if not track_id: + continue + url = self._API_V2_BASE + 'tracks/' + track_id + if token: + url += '?secret_token=' + token + entries.append(self.url_result( + url, SoundcloudIE.ie_key(), track_id)) + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) + + +class SoundcloudSetIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' + IE_NAME = 'soundcloud:set' + _TESTS = [{ + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '2284613', + 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') + token = mobj.group('token') + if token: + full_title += '/' + token + + info = self._download_json(self._resolv_url( + self._BASE_URL + full_title), full_title, headers=self._HEADERS) + + if 'errors' in info: + msgs = (compat_str(err['error_message']) for err in info['errors']) + raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + + return self._extract_set(info, token) + + +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): + def _extract_playlist(self, base_url, playlist_id, playlist_title): + COMMON_QUERY = { + 'limit': 80000, + 'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + + entries = [] + for i in itertools.count(): + response = self._download_json( + next_href, playlist_id, + 'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS) + + collection = response['collection'] + + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href + + def resolve_entry(candidates): + for cand in candidates: + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + str_or_none(cand.get('id')), cand.get('title')) + + for e in collection: + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) + + next_href = response.get('next_href') + if not next_href: + break + + next_href = response['next_href'] + parsed_next_href = compat_urlparse.urlparse(next_href) + query = compat_urlparse.parse_qs(parsed_next_href.query) + query.update(COMMON_QUERY) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) + )? + /?(?:[?#].*)?$ + ''' + IE_NAME = 'soundcloud:user' + _TESTS = [{ + 'url': 'https://soundcloud.com/soft-cell-official', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (All)', + }, + 'playlist_mincount': 28, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/tracks', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', + }, + 'playlist_mincount': 27, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/albums', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Sets)', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://soundcloud.com/jcv246/reposts', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/clalberg/likes', + 'info_dict': { + 'id': '11817582', + 'title': 'clalberg (Likes)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', + }, + 'playlist_mincount': 1, + }] + + _BASE_URL_MAP = { + 'all': 'stream/users/%s', + 'tracks': 'users/%s/tracks', + 'albums': 'users/%s/albums', + 'sets': 'users/%s/playlists', + 'reposts': 'stream/users/%s/reposts', + 'likes': 'users/%s/likes', + 'spotlight': 'users/%s/spotlight', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader = mobj.group('user') + + user = self._download_json( + self._resolv_url(self._BASE_URL + uploader), + uploader, 'Downloading user info', headers=self._HEADERS) + + resource = mobj.group('rsrc') or 'all' + + return self._extract_playlist( + self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], + str_or_none(user.get('id')), + '%s (%s)' % (user['username'], resource.capitalize())) + + +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your text', + }, + 'playlist_mincount': 47, + }] + + def _real_extract(self, url): + track_name = self._match_id(url) + + track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', track['id'], 'track id') + + return self._extract_playlist( + self._API_V2_BASE + 'stations/%s/tracks' % track['id'], + track_id, 'Track station: %s' % track['title']) + + +class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' + IE_NAME = 'soundcloud:playlist' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/playlists/4110309', + 'info_dict': { + 'id': '4110309', + 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', + 'description': 're:.*?TILT Brass - Bowery Poetry Club', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + query = {} + token = mobj.group('token') + if token: + query['secret_token'] = token + + data = self._download_json( + self._API_V2_BASE + 'playlists/' + playlist_id, + playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS) + + return self._extract_set(data, token) + + +class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): + IE_NAME = 'soundcloud:search' + IE_DESC = 'Soundcloud search' + _MAX_RESULTS = float('inf') + _TESTS = [{ + 'url': 'scsearch15:post-avant jazzcore', + 'info_dict': { + 'title': 'post-avant jazzcore', + }, + 'playlist_count': 15, + }] + + _SEARCH_KEY = 'scsearch' + _MAX_RESULTS_PER_PAGE = 200 + _DEFAULT_RESULTS_PER_PAGE = 50 + + def _get_collection(self, endpoint, collection_id, **query): + limit = min( + query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), + self._MAX_RESULTS_PER_PAGE) + query.update({ + 'limit': limit, + 'linked_partitioning': 1, + 'offset': 0, + }) + next_url = update_url_query(self._API_V2_BASE + endpoint, query) + + collected_results = 0 + + for i in itertools.count(1): + response = self._download_json( + next_url, collection_id, 'Downloading page {0}'.format(i), + 'Unable to download API page', headers=self._HEADERS) + + collection = response.get('collection', []) + if not collection: + break + + collection = list(filter(bool, collection)) + collected_results += len(collection) + + for item in collection: + yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + + if not collection or collected_results >= limit: + break + + next_url = response.get('next_href') + if not next_url: + break + + def _get_n_results(self, query, n): + tracks = self._get_collection('search/tracks', query, limit=n, q=query) + return self.playlist_result(tracks, playlist_title=query) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py new file mode 100644 index 0000000000..3d78a9d76c --- /dev/null +++ b/youtube_dl/extractor/soundgasm.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', + 'md5': '010082a2c802c5275bb00030743e75ad', + 'info_dict': { + 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', + 'ext': 'm4a', + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + audio_url = self._html_search_regex( + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + + description = self._html_search_regex( + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) + + return { + 'id': audio_id, + 'display_id': display_id, + 'url': audio_url, + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), + } + + +class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 'info_dict': { + 'id': 'ytdl', + }, + 'playlist_count': 1, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] + + return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py new file mode 100644 index 0000000000..da75a43a73 --- /dev/null +++ b/youtube_dl/extractor/southpark.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class SouthParkIE(MTVServicesInfoExtractor): + IE_NAME = 'southpark.cc.com' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + + _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'info_dict': { + 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'South Park|Bat Daded', + 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', + }, + }, { + 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', + 'only_matching': True, + }] + + +class SouthParkEsIE(SouthParkIE): + IE_NAME = 'southpark.cc.com:español' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _LANG = 'es' + + _TESTS = [{ + 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, + 'playlist_count': 4, + 'skip': 'Geo-restricted', + }] + + +class SouthParkDeIE(SouthParkIE): + IE_NAME = 'southpark.de' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', + 'info_dict': { + 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', + 'ext': 'mp4', + 'title': 'South Park|The Government Won\'t Respect My Privacy', + 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', + }, + }, { + # non-ASCII characters in initial URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, + }, { + # non-ASCII characters in redirect URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09', + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1', + 'only_matching': True, + }] + + +class SouthParkNlIE(SouthParkIE): + IE_NAME = 'southpark.nl' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, + }] + + +class SouthParkDkIE(SouthParkIE): + IE_NAME = 'southparkstudios.dk' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' + _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, + }, { + 'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1', + 'only_matching': True, + }, { + 'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py new file mode 100644 index 0000000000..61ca902ce2 --- /dev/null +++ b/youtube_dl/extractor/spankbang.py @@ -0,0 +1,184 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + merge_dicts, + orderedSet, + parse_duration, + parse_resolution, + str_to_int, + url_or_none, + urlencode_postdata, +) + + +class SpankBangIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b' + _TESTS = [{ + 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', + 'md5': '1cc433e1d6aa14bc376535b8679302f7', + 'info_dict': { + 'id': '3vvn', + 'ext': 'mp4', + 'title': 'fantasy solo', + 'description': 'dillion harper masturbates on a bed', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'silly2587', + 'timestamp': 1422571989, + 'upload_date': '20150129', + 'age_limit': 18, + } + }, { + # 480p only + 'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', + 'only_matching': True, + }, { + # no uploader + 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', + 'only_matching': True, + }, { + # mobile page + 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', + 'only_matching': True, + }, { + # 4k + 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', + 'only_matching': True, + }, { + 'url': 'https://m.spankbang.com/3vvn/play', + 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2y3td/embed/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('/%s/embed' % video_id, '/%s/video' % video_id), + video_id, headers={'Cookie': 'country=US'}) + + if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + + formats = [] + + def extract_format(format_id, format_url): + f_url = url_or_none(format_url) + if not f_url: + return + f = parse_resolution(format_id) + ext = determine_ext(f_url) + if format_id.startswith('m3u8') or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id.startswith('mpd') or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp4' or f.get('width') or f.get('height'): + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) + + STREAM_URL_PREFIX = 'stream_url_' + + for mobj in re.finditer( + r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' + % STREAM_URL_PREFIX, webpage): + extract_format(mobj.group('id', 'url')) + + if not formats: + stream_key = self._search_regex( + r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'stream key', group='value') + + stream = self._download_json( + 'https://spankbang.com/api/videos/stream', video_id, + 'Downloading stream JSON', data=urlencode_postdata({ + 'id': stream_key, + 'data': 0, + }), headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + for format_id, format_url in stream.items(): + if format_url and isinstance(format_url, list): + format_url = format_url[0] + extract_format(format_id, format_url) + + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) + + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._html_search_regex( + r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None) + description = self._search_regex( + r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', + webpage, 'description', default=None) + thumbnail = self._og_search_thumbnail(webpage, default=None) + uploader = self._html_search_regex( + (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>', + r'class="user"[^>]*><img[^>]+>([^<]+)'), + webpage, 'uploader', default=None) + duration = parse_duration(self._search_regex( + r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', + webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + r'([\d,.]+)\s+plays', webpage, 'view count', default=None)) + + age_limit = self._rta_search(webpage) + + return merge_dicts({ + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': age_limit, + }, info + ) + + +class SpankBangPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+' + _TEST = { + 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', + 'info_dict': { + 'id': 'ug0k', + 'title': 'Big Ass Titties', + }, + 'playlist_mincount': 50, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage( + url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) + + entries = [self.url_result( + 'https://spankbang.com/%s/video' % video_id, + ie=SpankBangIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))] + + title = self._html_search_regex( + r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title', + fatal=False) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py new file mode 100644 index 0000000000..35ab9ec375 --- /dev/null +++ b/youtube_dl/extractor/spankwire.py @@ -0,0 +1,182 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + merge_dicts, + str_or_none, + str_to_int, + url_or_none, +) + + +class SpankwireIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 + 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', + 'info_dict': { + 'id': '103545', + 'ext': 'mp4', + 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', + 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, + 'uploader': 'oreusz', + 'uploader_id': '124697', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + # download URL pattern: */mp4_<format_id>_<video_id>.mp4 + 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', + 'md5': '09b3c20833308b736ae8902db2f8d7e6', + 'info_dict': { + 'id': '1921551', + 'ext': 'mp4', + 'title': 'Titcums Compiloation I', + 'description': 'cum on tits', + 'uploader': 'dannyh78999', + 'uploader_id': '3056053', + 'upload_date': '20150822', + 'age_limit': 18, + }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) + + title = video['title'] + + formats = [] + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, + }) + + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries + + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), + 'view_count': view_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, + 'formats': formats, + }, info) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py new file mode 100644 index 0000000000..4df7f4ddce --- /dev/null +++ b/youtube_dl/extractor/spiegel.py @@ -0,0 +1,159 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + strip_or_none, + unified_timestamp, +) + + +class SpiegelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _TESTS = [{ + 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', + 'md5': 'b57399839d055fccfeb9a0455c439868', + 'info_dict': { + 'id': '563747', + 'ext': 'mp4', + 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'description': 'md5:8029d8310232196eb235d27575a8b9f4', + 'duration': 49, + 'upload_date': '20130311', + 'timestamp': 1362994320, + }, + }, { + 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', + 'info_dict': { + 'id': '580988', + 'ext': 'mp4', + 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', + 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', + 'duration': 983, + 'upload_date': '20131115', + 'timestamp': 1384546642, + }, + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', + 'md5': '97b91083a672d72976faa8433430afb9', + 'info_dict': { + 'id': '601883', + 'ext': 'mp4', + 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', + 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', + 'timestamp': 1409834160, + } + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'only_matching': True, + }, { + # nexx video + 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) + + # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html + if SpiegeltvIE.suitable(handle.geturl()): + return self.url_result(handle.geturl(), 'Spiegeltv') + + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), + 'title': title, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), + } + + +class SpiegelArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' + IE_NAME = 'Spiegel:Article' + IE_DESC = 'Articles on spiegel.de' + _TESTS = [{ + 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', + 'info_dict': { + 'id': '1516455', + 'ext': 'mp4', + 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', + 'description': 're:^Patrick Kämnitz gehört.{100,}', + 'upload_date': '20140825', + }, + }, { + 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', + 'info_dict': { + + }, + 'playlist_count': 6, + }, { + # Nexx iFrame embed + 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', + 'info_dict': { + 'id': '161464', + 'ext': 'mp4', + 'title': 'Nervenkitzel Achterbahn', + 'alt_title': 'Karussellbauer in Deutschland', + 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', + 'release_year': 2005, + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2761, + 'timestamp': 1394021479, + 'upload_date': '20140305', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Single video on top of the page + video_link = self._search_regex( + r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, + 'video page URL', default=None) + if video_link: + video_url = compat_urlparse.urljoin( + self.http_scheme() + '//spiegel.de/', video_link) + return self.url_result(video_url) + + # Multiple embedded videos + embeds = re.findall( + r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', + webpage) + entries = [ + self.url_result(compat_urlparse.urljoin( + self.http_scheme() + '//spiegel.de/', embed_path)) + for embed_path in embeds] + if embeds: + return self.playlist_result(entries) + + return self.playlist_from_matches( + NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py new file mode 100644 index 0000000000..6ccf4c3423 --- /dev/null +++ b/youtube_dl/extractor/spiegeltv.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .nexx import NexxIE + + +class SpiegeltvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', + 'only_matching': True, + } + + def _real_extract(self, url): + return self.url_result( + 'https://api.nexx.cloud/v3/748/videos/byid/%s' + % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py new file mode 100644 index 0000000000..aabff7a3ce --- /dev/null +++ b/youtube_dl/extractor/spike.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class BellatorIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', + 'info_dict': { + 'title': 'Michael Page vs. Evangelista Cyborg', + 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] + + _FEED_URL = 'http://www.bellator.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage): + return self._extract_triforce_mgid(webpage) + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', + 'info_dict': { + 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', + 'ext': 'mp4', + 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1', + 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage): + root_data = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+})', + webpage, 'data'), None) + + def find_sub_data(data, data_type): + return next(c for c in data['children'] if c.get('type') == data_type) + + c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') + return c['props']['media']['video']['config']['uri'] diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py new file mode 100644 index 0000000000..a417b5a4ef --- /dev/null +++ b/youtube_dl/extractor/sport5.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') + + webpage = self._download_webpage(url, media_id) + + video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') + + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) + + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) + + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) + + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'duration': duration, + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py new file mode 100644 index 0000000000..b9017fd2ab --- /dev/null +++ b/youtube_dl/extractor/sportbox.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + merge_dicts, +) + + +class SportBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', + 'info_dict': { + 'id': '109158', + 'ext': 'mp4', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 292, + 'view_count': int, + 'timestamp': 1426237001, + 'upload_date': '20150313', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', + 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/193095', + 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/109158', + 'only_matching': True, + }, { + 'url': 'https://matchtv.ru/vdl/player/media/109158', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + sources = self._parse_json( + self._search_regex( + r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n', + webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + src = source.get('src') + if not src: + continue + if determine_ext(src) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + player = self._parse_json( + self._search_regex( + r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage, + 'player options', default='{}'), + video_id, transform_source=js_to_json) + media_id = player['mediaId'] + + info = self._search_json_ld(webpage, media_id, default={}) + + view_count = int_or_none(self._search_regex( + r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) + + return merge_dicts(info, { + 'id': media_id, + 'title': self._og_search_title(webpage, default=None) or media_id, + 'thumbnail': player.get('poster'), + 'duration': int_or_none(player.get('duration')), + 'view_count': view_count, + 'formats': formats, + }) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py new file mode 100644 index 0000000000..378fc75686 --- /dev/null +++ b/youtube_dl/extractor/sportdeutschland.py @@ -0,0 +1,82 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + sanitized_Request, +) + + +class SportDeutschlandIE(InfoExtractor): + _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])' + _TESTS = [{ + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'ext': 'mp4', + 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', + 'categories': ['Badminton-Deutschland'], + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'timestamp': int, + 'upload_date': '20200201', + 'description': 're:.*', # meaningless description for THIS video + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + sport_id = mobj.group('sport') + + api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( + sport_id, video_id) + req = sanitized_Request(api_url, headers={ + 'Accept': 'application/vnd.vidibus.v2.html+json', + 'Referer': url, + }) + data = self._download_json(req, video_id) + + asset = data['asset'] + categories = [data['section']['title']] + + formats = [] + smil_url = asset['video'] + if '.smil' in smil_url: + m3u8_url = smil_url.replace('.smil', '.m3u8') + formats.extend( + self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) + + smil_doc = self._download_xml( + smil_url, video_id, note='Downloading SMIL metadata') + base_url_el = smil_doc.find('./head/meta') + if base_url_el: + base_url = base_url_el.attrib['base'] + formats.extend([{ + 'format_id': 'rmtp', + 'url': base_url if base_url_el else n.attrib['src'], + 'play_path': n.attrib['src'], + 'ext': 'flv', + 'preference': -100, + 'format_note': 'Seems to fail at example stream', + } for n in smil_doc.findall('./body/video')]) + else: + formats.append({'url': smil_url}) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': asset['title'], + 'thumbnail': asset.get('image'), + 'description': asset.get('teaser'), + 'duration': asset.get('duration'), + 'categories': categories, + 'view_count': asset.get('views'), + 'rtmp_live': asset.get('live'), + 'timestamp': parse_iso8601(asset.get('date')), + } diff --git a/youtube_dl/extractor/springboardplatform.py b/youtube_dl/extractor/springboardplatform.py new file mode 100644 index 0000000000..07d99b5791 --- /dev/null +++ b/youtube_dl/extractor/springboardplatform.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, + xpath_element, + unescapeHTML, + unified_timestamp, +) + + +class SpringboardPlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + cms\.springboardplatform\.com/ + (?: + (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)| + xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) + ) + ''' + _TESTS = [{ + 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', + 'md5': '5c3cb7b5c55740d482561099e920f192', + 'info_dict': { + 'id': '981017', + 'ext': 'mp4', + 'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1409132328, + 'upload_date': '20140827', + 'duration': 193, + }, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10', + 'only_matching': True, + }, { + 'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + index = mobj.group('index') or mobj.group('index_2') + + video = self._download_xml( + 'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s' + % (index, video_id), video_id) + + item = xpath_element(video, './/item', 'item', fatal=True) + + content = xpath_element( + item, './{http://search.yahoo.com/mrss/}content', 'content', + fatal=True) + title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True)) + + video_url = content.attrib['url'] + + if 'error_video.mp4' in video_url: + raise ExtractorError( + 'Video %s no longer exists' % video_id, expected=True) + + duration = int_or_none(content.get('duration')) + tbr = int_or_none(content.get('bitrate')) + filesize = int_or_none(content.get('fileSize')) + width = int_or_none(content.get('width')) + height = int_or_none(content.get('height')) + + description = unescapeHTML(xpath_text( + item, './description', 'description')) + thumbnail = xpath_attr( + item, './{http://search.yahoo.com/mrss/}thumbnail', 'url', + 'thumbnail') + + timestamp = unified_timestamp(xpath_text( + item, './{http://cms.springboardplatform.com/namespaces.html}created', + 'timestamp')) + + formats = [{ + 'url': video_url, + 'format_id': 'http', + 'tbr': tbr, + 'filesize': filesize, + 'width': width, + 'height': height, + }] + + m3u8_format = formats[0].copy() + m3u8_format.update({ + 'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8', + 'ext': 'mp4', + 'format_id': 'hls', + 'protocol': 'm3u8_native', + }) + formats.append(m3u8_format) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/sprout.py b/youtube_dl/extractor/sprout.py new file mode 100644 index 0000000000..8467bf49df --- /dev/null +++ b/youtube_dl/extractor/sprout.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + update_url_query, + smuggle_url, +) + + +class SproutIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'md5': '74bf14128578d1e040c3ebc82088f45f', + 'info_dict': { + 'id': '9dexnwtmh8_X', + 'ext': 'mp4', + 'title': 'A Cowboy Adventure', + 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', + 'timestamp': 1437758640, + 'upload_date': '20150724', + 'uploader': 'NBCU-SPROUT-NEW', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_component = self._search_regex( + r'(?s)(<div[^>]+data-component="video"[^>]*?>)', + webpage, 'video component', default=None) + if video_component: + options = self._parse_json(extract_attributes( + video_component)['data-options'], video_id) + theplatform_url = options['video'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if options.get('protected'): + query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') + theplatform_url = smuggle_url(update_url_query( + theplatform_url, query), {'force_smil_url': True}) + else: + iframe = self._search_regex( + r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)', + webpage, 'iframe') + theplatform_url = extract_attributes(iframe)['src'] + + return self.url_result(theplatform_url, 'ThePlatform') diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py new file mode 100644 index 0000000000..170dce87f1 --- /dev/null +++ b/youtube_dl/extractor/srgssr.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + ExtractorError, + parse_iso8601, + qualities, +) + + +class SRGSSRIE(InfoExtractor): + _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] + + _ERRORS = { + 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', + 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', + # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', + 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', + 'LEGAL': 'The video cannot be transmitted for legal reasons.', + 'STARTDATE': 'This video is not yet available. Please try again later.', + } + + def _get_tokenized_src(self, url, video_id, format_id): + sp = compat_urllib_parse_urlparse(url).path.split('/') + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = token.get('token', {}).get('authparams') + if auth_params: + url += '?' + auth_params + return url + + def get_media_data(self, bu, media_type, media_id): + media_data = self._download_json( + 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), + media_id)[media_type.capitalize()] + + if media_data.get('block') and media_data['block'] in self._ERRORS: + message = self._ERRORS[media_data['block']] + if media_data['block'] == 'GEOBLOCK': + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + + return media_data + + def _real_extract(self, url): + bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + + media_data = self.get_media_data(bu, media_type, media_id) + + metadata = media_data['AssetMetadatas']['AssetMetadata'][0] + title = metadata['title'] + description = metadata.get('description') + created_date = media_data.get('createdDate') or metadata.get('createdDate') + timestamp = parse_iso8601(created_date) + + thumbnails = [{ + 'id': image.get('id'), + 'url': image['url'], + } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] + + preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) + formats = [] + for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): + protocol = source.get('@protocol') + for asset in source['url']: + asset_url = asset['text'] + quality = asset['@quality'] + format_id = '%s-%s' % (protocol, quality) + if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): + asset_url = self._get_tokenized_src(asset_url, media_id, format_id) + if protocol.startswith('HTTP-HDS'): + formats.extend(self._extract_f4m_formats( + asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + elif protocol.startswith('HTTP-HLS'): + formats.extend(self._extract_m3u8_formats( + asset_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': asset_url, + 'preference': preference(quality), + 'ext': 'flv' if protocol == 'RTMP' else None, + }) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class SRGSSRPlayIE(InfoExtractor): + IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|play)\.)? + (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/ + (?: + [^/]+/(?P<type>video|audio)/[^?]+| + popup(?P<type_2>video|audio)player + ) + \?id=(?P<id>[0-9a-f\-]{36}|\d+) + ''' + + _TESTS = [{ + 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'info_dict': { + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'ext': 'mp4', + 'upload_date': '20130701', + 'title': 'Snowden beantragt Asyl in Russland', + 'timestamp': 1372713995, + } + }, { + # No Speichern (Save) button + 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', + 'md5': '0a274ce38fda48c53c01890651985bc6', + 'info_dict': { + 'id': '677f5829-e473-4823-ac83-a1087fe97faa', + 'ext': 'flv', + 'upload_date': '20130710', + 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', + 'description': 'md5:88604432b60d5a38787f152dec89cd56', + 'timestamp': 1373493600, + }, + }, { + 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'upload_date': '20151013', + 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', + 'timestamp': 1444750398, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', + 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', + 'info_dict': { + 'id': '6348260', + 'display_id': '6348260', + 'ext': 'mp4', + 'duration': 1796, + 'title': 'Le 19h30', + 'description': '', + 'uploader': '19h30', + 'upload_date': '20141201', + 'timestamp': 1417458600, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + bu = mobj.group('bu') + media_type = mobj.group('type') or mobj.group('type_2') + media_id = mobj.group('id') + # other info can be extracted from url + '&layout=json' + return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py new file mode 100644 index 0000000000..359dadaa3c --- /dev/null +++ b/youtube_dl/extractor/srmediathek.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .ard import ARDMediathekBaseIE +from ..utils import ( + ExtractorError, + get_element_by_attribute, +) + + +class SRMediathekIE(ARDMediathekBaseIE): + IE_NAME = 'sr:mediathek' + IE_DESC = 'Saarländischer Rundfunk' + _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', + 'info_dict': { + 'id': '28455', + 'ext': 'mp4', + 'title': 'sportarena (26.10.2014)', + 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'skip': 'no longer available', + }, { + 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', + 'info_dict': { + 'id': '37682', + 'ext': 'mp4', + 'title': 'Love, Cakes and Rock\'n\'Roll', + 'description': 'md5:18bf9763631c7d326c22603681e1123d', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: + raise ExtractorError('Video %s is no longer available' % video_id, expected=True) + + media_collection_url = self._search_regex( + r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') + info = self._extract_media_info(media_collection_url, webpage, video_id) + info.update({ + 'id': video_id, + 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + }) + return info diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py new file mode 100644 index 0000000000..ae3dd13807 --- /dev/null +++ b/youtube_dl/extractor/stanfordoc.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, +) + + +class StanfordOpenClassroomIE(InfoExtractor): + IE_NAME = 'stanfordoc' + IE_DESC = 'Stanford Open ClassRoom' + _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _TEST = { + 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', + 'md5': '544a9468546059d4e80d76265b0443b8', + 'info_dict': { + 'id': 'PracticalUnix_intro-environment', + 'ext': 'mp4', + 'title': 'Intro Environment', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + if mobj.group('course') and mobj.group('video'): # A specific video + course = mobj.group('course') + video = mobj.group('video') + info = { + 'id': course + '_' + video, + 'uploader': None, + 'upload_date': None, + } + + baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' + xmlUrl = baseUrl + video + '.xml' + mdoc = self._download_xml(xmlUrl, info['id']) + try: + info['title'] = mdoc.findall('./title')[0].text + info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text + except IndexError: + raise ExtractorError('Invalid metadata XML file') + return info + elif mobj.group('course'): # A course page + course = mobj.group('course') + info = { + 'id': course, + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + + coursepage = self._download_webpage( + url, info['id'], + note='Downloading course info page', + errnote='Unable to download course info page') + + info['title'] = self._html_search_regex( + r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) + + info['description'] = self._html_search_regex( + r'(?s)<description>([^<]+)</description>', + coursepage, 'description', fatal=False) + + links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info + else: # Root page + info = { + 'id': 'Stanford OpenClassroom', + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + info['title'] = info['id'] + + rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' + rootpage = self._download_webpage(rootURL, info['id'], + errnote='Unable to download course info page') + + links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py new file mode 100644 index 0000000000..a6a191cebb --- /dev/null +++ b/youtube_dl/extractor/steam.py @@ -0,0 +1,149 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + ExtractorError, + get_element_by_class, + js_to_json, +) + + +class SteamIE(InfoExtractor): + _VALID_URL = r"""(?x) + https?://store\.steampowered\.com/ + (agecheck/)? + (?P<urltype>video|app)/ #If the page is only for videos or for a game + (?P<gameID>\d+)/? + (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID + | + https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+) + """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' + _TESTS = [{ + 'url': 'http://store.steampowered.com/video/105600/', + 'playlist': [ + { + 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', + 'info_dict': { + 'id': '2040428', + 'ext': 'mp4', + 'title': 'Terraria 1.3 Trailer', + 'playlist_index': 1, + } + }, + { + 'md5': '911672b20064ca3263fa89650ba5a7aa', + 'info_dict': { + 'id': '2029566', + 'ext': 'mp4', + 'title': 'Terraria 1.2 Trailer', + 'playlist_index': 2, + } + } + ], + 'info_dict': { + 'id': '105600', + 'title': 'Terraria', + }, + 'params': { + 'playlistend': 2, + } + }, { + 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', + 'info_dict': { + 'id': 'X8kpJBlzD2E', + 'ext': 'mp4', + 'upload_date': '20140617', + 'title': 'FRONTIERS - Trapping', + 'description': 'md5:bf6f7f773def614054089e5769c12a6e', + 'uploader': 'AAD Productions', + 'uploader_id': 'AtomicAgeDogGames', + } + }] + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + fileID = m.group('fileID') + if fileID: + videourl = url + playlist_id = fileID + else: + gameID = m.group('gameID') + playlist_id = gameID + videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + + self._set_cookie('steampowered.com', 'mature_content', '1') + + webpage = self._download_webpage(videourl, playlist_id) + + if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % playlist_id + self.report_age_confirmation() + webpage = self._download_webpage(videourl, playlist_id) + + flash_vars = self._parse_json(self._search_regex( + r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, + 'flash vars'), playlist_id, js_to_json) + + playlist_title = None + entries = [] + if fileID: + playlist_title = get_element_by_class('workshopItemTitle', webpage) + for movie in flash_vars.values(): + if not movie: + continue + youtube_id = movie.get('YOUTUBE_VIDEO_ID') + if not youtube_id: + continue + entries.append({ + '_type': 'url', + 'url': youtube_id, + 'ie_key': 'Youtube', + }) + else: + playlist_title = get_element_by_class('apphub_AppName', webpage) + for movie_id, movie in flash_vars.items(): + if not movie: + continue + video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) + title = movie.get('MOVIE_NAME') + if not title or not video_id: + continue + entry = { + 'id': video_id, + 'title': title.replace('+', ' '), + } + formats = [] + flv_url = movie.get('FILENAME') + if flv_url: + formats.append({ + 'format_id': 'flv', + 'url': flv_url, + }) + highlight_element = self._search_regex( + r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, + webpage, 'highlight element', fatal=False) + if highlight_element: + highlight_attribs = extract_attributes(highlight_element) + if highlight_attribs: + entry['thumbnail'] = highlight_attribs.get('data-poster') + for quality in ('', '-hd'): + for ext in ('webm', 'mp4'): + video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) + if video_url: + formats.append({ + 'format_id': ext + quality, + 'url': video_url, + }) + if not formats: + continue + entry['formats'] = formats + entries.append(entry) + if not entries: + raise ExtractorError('Could not find any videos') + + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py new file mode 100644 index 0000000000..97d1ff6811 --- /dev/null +++ b/youtube_dl/extractor/stitcher.py @@ -0,0 +1,81 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + unescapeHTML, +) + + +class StitcherIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', + 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'info_dict': { + 'id': '40789481', + 'ext': 'mp3', + 'title': 'Machine Learning Mastery and Cancer Clusters', + 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'duration': 1604, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', + 'info_dict': { + 'id': '40846275', + 'display_id': 'the-rare-hourlong-comedy-plus', + 'ext': 'mp3', + 'title': "The CW's 'Crazy Ex-Girlfriend'", + 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', + 'duration': 2235, + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # escaped title + 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', + 'only_matching': True, + }, { + 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + audio_id = mobj.group('id') + display_id = mobj.group('display_id') or audio_id + + webpage = self._download_webpage(url, display_id) + + episode = self._parse_json( + js_to_json(self._search_regex( + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), + display_id)['config']['episode'] + + title = unescapeHTML(episode['title']) + formats = [{ + 'url': episode[episode_key], + 'ext': determine_ext(episode[episode_key]) or 'mp3', + 'vcodec': 'none', + } for episode_key in ('episodeURL',) if episode.get(episode_key)] + description = self._search_regex( + r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) + duration = int_or_none(episode.get('duration')) + thumbnail = episode.get('episodeImage') + + return { + 'id': audio_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py new file mode 100644 index 0000000000..67457cc941 --- /dev/null +++ b/youtube_dl/extractor/storyfire.py @@ -0,0 +1,255 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +from .common import InfoExtractor + + +class StoryFireIE(InfoExtractor): + _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': '560953bfca81a69003cfa5e53ac8a920', + 'info_dict': { + 'id': '5df1d132b6378700117f9181', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies', + 'uploader': 'whang!', + 'upload_date': '20191212', + }, + 'params': {'format': 'bestvideo'} # There are no merged formats in the playlist. + }, { + 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID + 'md5': '7a2dc6d60c4889edfed459c620fe690d', + 'info_dict': { + 'id': '5f1e11ecd78a57b6c702001d', + 'ext': 'm4a', + 'title': 'Weird Nintendo Prototype Leaks', + 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis', + 'timestamp': 1595808576, + 'upload_date': '20200727', + 'uploader': 'whang!', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + }, + 'params': {'format': 'bestaudio'} # Verifying audio extraction + + }] + + _aformats = { + 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10}, + 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1}, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Extracting the json blob is mandatory to proceed with extraction. + jsontext = self._html_search_regex( + r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', + webpage, 'json_data') + + json = self._parse_json(jsontext, video_id) + + # The currentVideo field in the json is mandatory + # because it contains the only link to the m3u playlist + video = json['props']['initialState']['video']['currentVideo'] + videourl = video['vimeoVideoURL'] # Video URL is mandatory + + # Extract other fields from the json in an error tolerant fashion + # ID may be incorrect (on short URL format), correct it. + parsed_id = video.get('_id') + if parsed_id: + video_id = parsed_id + + title = video.get('title') + description = video.get('description') + + thumbnail = video.get('storyImage') + views = video.get('views') + likes = video.get('likesCount') + comments = video.get('commentsCount') + duration = video.get('videoDuration') + publishdate = video.get('publishDate') # Apparently epoch time, day only + + uploader = video.get('username') + uploader_id = video.get('hostID') + # Construct an uploader URL + uploader_url = None + if uploader_id: + uploader_url = "https://storyfire.com/user/%s/video" % uploader_id + + # Collect root playlist to determine formats + formats = self._extract_m3u8_formats( + videourl, video_id, 'mp4', 'm3u8_native') + + # Modify formats to fill in missing information about audio codecs + for format in formats: + aformat = self._aformats.get(format['format_id']) + if aformat: + format['acodec'] = aformat['acodec'] + format['abr'] = aformat['abr'] + format['preference'] = aformat['preference'] + format['ext'] = 'm4a' + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'ext': "mp4", + 'url': videourl, + 'formats': formats, + + 'thumbnail': thumbnail, + 'view_count': views, + 'like_count': likes, + 'comment_count': comments, + 'duration': duration, + 'timestamp': publishdate, + + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + + } + + +class StoryFireUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video' + _TESTS = [{ + 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video', + 'info_dict': { + 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'title': 'whang!', + }, + 'playlist_mincount': 18 + }, { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + 'title': 'McJuggerNuggets', + }, + 'playlist_mincount': 143 + + }] + + # Generator for fetching playlist items + def _enum_videos(self, baseurl, user_id, firstjson): + totalVideos = int(firstjson['videosCount']) + haveVideos = 0 + json = firstjson + + for page in itertools.count(1): + for video in json['videos']: + id = video['_id'] + url = "https://storyfire.com/video-details/%s" % id + haveVideos += 1 + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + # Are there more pages we could fetch? + if haveVideos < totalVideos: + pageurl = baseurl + ("%i" % haveVideos) + json = self._download_json(pageurl, user_id, + note='Downloading page %s' % page) + + # Are there any videos in the new json? + videos = json.get('videos') + if not videos or len(videos) == 0: + break # no videos + + else: + break # We have fetched all the videos, stop + + def _real_extract(self, url): + user_id = self._match_id(url) + + baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id + + # Download first page to ensure it can be downloaded, and get user information if available. + firstpage = baseurl + "0" + firstjson = self._download_json(firstpage, user_id) + + title = None + videos = firstjson.get('videos') + if videos and len(videos): + title = videos[1].get('username') + + return { + '_type': 'playlist', + 'entries': self._enum_videos(baseurl, user_id, firstjson), + 'id': user_id, + 'title': title, + } + + +class StoryFireSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13 + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0 # This playlist has entries, but no videos. + }, { + 'url': 'https://storyfire.com/write/series/stories/story_time', + 'info_dict': { + 'id': 'story_time', + }, + 'playlist_mincount': 10 + }] + + # Generator for returning playlist items + # This object is substantially different than the one in the user videos page above + def _enum_videos(self, jsonlist): + for video in jsonlist: + id = video['_id'] + if video.get('hasVideo'): # Boolean element + url = "https://storyfire.com/video-details/%s" % id + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'likes_count': video.get('likesCount'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id + json = self._download_json(listurl, list_id) + + return { + '_type': 'playlist', + 'entries': self._enum_videos(json), + 'id': list_id + } diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py new file mode 100644 index 0000000000..34725274e4 --- /dev/null +++ b/youtube_dl/extractor/streamable.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, +) + + +class StreamableIE(InfoExtractor): + _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://streamable.com/dnd1', + 'md5': '3e3bc5ca088b48c2d436529b64397fef', + 'info_dict': { + 'id': 'dnd1', + 'ext': 'mp4', + 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', + 'thumbnail': r're:https?://.*\.jpg$', + 'uploader': 'teabaker', + 'timestamp': 1454964157.35115, + 'upload_date': '20160208', + 'duration': 61.516, + 'view_count': int, + } + }, + # older video without bitrate, width/height, etc. info + { + 'url': 'https://streamable.com/moo', + 'md5': '2cf6923639b87fba3279ad0df3a64e73', + 'info_dict': { + 'id': 'moo', + 'ext': 'mp4', + 'title': '"Please don\'t eat me!"', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1426115495, + 'upload_date': '20150311', + 'duration': 12, + 'view_count': int, + } + }, + { + 'url': 'https://streamable.com/e/dnd1', + 'only_matching': True, + }, + { + 'url': 'https://streamable.com/s/okkqk/drxjds', + 'only_matching': True, + } + ] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', + webpage) + if mobj: + return mobj.group('src') + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Note: Using the ajax API, as the public Streamable API doesn't seem + # to return video info like the title properly sometimes, and doesn't + # include info like the video duration + video = self._download_json( + 'https://ajax.streamable.com/videos/%s' % video_id, video_id) + + # Format IDs: + # 0 The video is being uploaded + # 1 The video is being processed + # 2 The video has at least one file ready + # 3 The video is unavailable due to an error + status = video.get('status') + if status != 2: + raise ExtractorError( + 'This video is currently unavailable. It may still be uploading or processing.', + expected=True) + + title = video.get('reddit_title') or video['title'] + + formats = [] + for key, info in video['files'].items(): + if not info.get('url'): + continue + formats.append({ + 'format_id': key, + 'url': self._proto_relative_url(info['url']), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'filesize': int_or_none(info.get('size')), + 'fps': int_or_none(info.get('framerate')), + 'vbr': float_or_none(info.get('bitrate'), 1000) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), + 'uploader': video.get('owner', {}).get('user_name'), + 'timestamp': float_or_none(video.get('date_added')), + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('plays')), + 'formats': formats + } diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py new file mode 100644 index 0000000000..32eb2b92d3 --- /dev/null +++ b/youtube_dl/extractor/streamcloud.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + urlencode_postdata, +) + + +class StreamcloudIE(InfoExtractor): + IE_NAME = 'streamcloud.eu' + _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?' + + _TESTS = [{ + 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dlc_test_video_____________-BaW_jenozKc.mp4.html', + 'md5': '6bea4c7fa5daaacc2a946b7146286686', + 'info_dict': { + 'id': 'skp9j99s4bpz', + 'ext': 'mp4', + 'title': 'youtube-dlc test video \'/\\ ä ↭', + }, + 'skip': 'Only available from the EU' + }, { + 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'http://streamcloud.eu/%s' % video_id + + orig_webpage = self._download_webpage(url, video_id) + + if '>File Not Found<' in orig_webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + fields = re.findall(r'''(?x)<input\s+ + type="(?:hidden|submit)"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? + value="([^"]*)" + ''', orig_webpage) + + self._sleep(6, video_id) + + webpage = self._download_webpage( + url, video_id, data=urlencode_postdata(fields), headers={ + b'Content-Type': b'application/x-www-form-urlencoded', + }) + + try: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)<', webpage, 'title') + video_url = self._search_regex( + r'file:\s*"([^"]+)"', webpage, 'video URL') + except ExtractorError: + message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', + webpage, 'message', default=None, group='message') + if message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + raise + thumbnail = self._search_regex( + r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py new file mode 100644 index 0000000000..58e0b4c802 --- /dev/null +++ b/youtube_dl/extractor/streamcz.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import time + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + sanitized_Request, +) + + +def _get_api_key(api_path): + if api_path.endswith('?'): + api_path = api_path[:-1] + + api_key = 'fb5f58a820353bd7095de526253c14fd' + a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600))) + return hashlib.md5(a.encode('ascii')).hexdigest() + + +class StreamCZIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)' + _API_URL = 'http://www.stream.cz/API' + + _TESTS = [{ + 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', + 'md5': '934bb6a6d220d99c010783c9719960d5', + 'info_dict': { + 'id': '765767', + 'ext': 'mp4', + 'title': 'Peklo na talíři: Éčka pro děti', + 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', + 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', + 'duration': 256, + }, + }, { + 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', + 'md5': '849a88c1e1ca47d41403c2ba5e59e261', + 'info_dict': { + 'id': '10002447', + 'ext': 'mp4', + 'title': 'Kancelář Blaník: Tři roky pro Mazánka', + 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', + 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', + 'duration': 368, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + api_path = '/episode/%s' % video_id + + req = sanitized_Request(self._API_URL + api_path) + req.add_header('Api-Password', _get_api_key(api_path)) + data = self._download_json(req, video_id) + + formats = [] + for quality, video in enumerate(data['video_qualities']): + for f in video['formats']: + typ = f['type'].partition('/')[2] + qlabel = video.get('quality_label') + formats.append({ + 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, + 'format_id': '%s-%s' % (typ, f['quality']), + 'url': f['source'], + 'height': int_or_none(f['quality'].rstrip('p')), + 'quality': quality, + }) + self._sort_formats(formats) + + image = data.get('image') + if image: + thumbnail = self._proto_relative_url( + image.replace('{width}', '1240').replace('{height}', '697'), + scheme='http:', + ) + else: + thumbnail = None + + stream = data.get('_embedded', {}).get('stream:show', {}).get('name') + if stream: + title = '%s: %s' % (stream, data['name']) + else: + title = data['name'] + + subtitles = {} + srt_url = data.get('subtitles_srt') + if srt_url: + subtitles['cs'] = [{ + 'ext': 'srt', + 'url': srt_url, + }] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + 'description': data.get('web_site_text'), + 'duration': int_or_none(data.get('duration')), + 'view_count': int_or_none(data.get('views')), + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py new file mode 100644 index 0000000000..91612c7f22 --- /dev/null +++ b/youtube_dl/extractor/streetvoice.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import unified_strdate + + +class StreetVoiceIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://streetvoice.com/skippylu/songs/94440/', + 'md5': '15974627fc01a29e492c98593c2fd472', + 'info_dict': { + 'id': '94440', + 'ext': 'mp3', + 'title': '輸', + 'description': 'Crispy脆樂團 - 輸', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 260, + 'upload_date': '20091018', + 'uploader': 'Crispy脆樂團', + 'uploader_id': '627810', + } + }, { + 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', + 'only_matching': True, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + + song = self._download_json( + 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'') + + title = song['name'] + author = song['user']['nickname'] + + return { + 'id': song_id, + 'url': song['file'], + 'title': title, + 'description': '%s - %s' % (author, title), + 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), + 'duration': song.get('length'), + 'upload_date': unified_strdate(song.get('created_at')), + 'uploader': author, + 'uploader_id': compat_str(song['user']['id']), + } diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py new file mode 100644 index 0000000000..4dbead2ba4 --- /dev/null +++ b/youtube_dl/extractor/stretchinternet.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class StretchInternetIE(InfoExtractor): + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P<id>\d+)' + _TEST = { + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', + 'info_dict': { + 'id': '573272', + 'ext': 'mp4', + 'title': 'University of Mary Wrestling vs. Upper Iowa', + 'timestamp': 1575668361, + 'upload_date': '20191206', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + event = self._download_json( + 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, + video_id)[0] + + return { + 'id': video_id, + 'title': event['title'], + 'timestamp': int_or_none(event.get('dateCreated'), 1000), + 'url': 'https://' + event['media'][0]['url'], + } diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py new file mode 100644 index 0000000000..bae8b71f4b --- /dev/null +++ b/youtube_dl/extractor/stv.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_str, + float_or_none, + int_or_none, +) + + +class STVPlayerIE(InfoExtractor): + IE_NAME = 'stv:player' + _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' + _TEST = { + 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', + 'md5': '5adf9439c31d554f8be0707c7abe7e0a', + 'info_dict': { + 'id': '5333973339001', + 'ext': 'mp4', + 'upload_date': '20170301', + 'title': '60 seconds on set with Laura Norton', + 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!", + 'timestamp': 1488388054, + 'uploader_id': '1486976045', + }, + 'skip': 'this resource is unavailable outside of the UK', + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' + _PTYPE_MAP = { + 'episode': 'episodes', + 'video': 'shortform', + } + + def _real_extract(self, url): + ptype, video_id = re.match(self._VALID_URL, url).groups() + resp = self._download_json( + 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), + video_id) + + result = resp['results'] + video = result['video'] + video_id = compat_str(video['id']) + + subtitles = {} + _subtitles = result.get('_subtitles') or {} + for ext, sub_url in _subtitles.items(): + subtitles.setdefault('en', []).append({ + 'ext': 'vtt' if ext == 'webvtt' else ext, + 'url': sub_url, + }) + + programme = result.get('programme') or {} + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'description': result.get('summary'), + 'duration': float_or_none(video.get('length'), 1000), + 'subtitles': subtitles, + 'view_count': int_or_none(result.get('views')), + 'series': programme.get('name') or programme.get('shortName'), + 'ie_key': 'BrightcoveNew', + } diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py new file mode 100644 index 0000000000..68051169b9 --- /dev/null +++ b/youtube_dl/extractor/sunporno.py @@ -0,0 +1,79 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, + qualities, + determine_ext, +) + + +class SunPornoIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.sunporno.com/videos/807778/', + 'md5': '507887e29033502f29dba69affeebfc9', + 'info_dict': { + 'id': '807778', + 'ext': 'mp4', + 'title': 'md5:0a400058e8105d39e35c35e7c5184164', + 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 302, + 'age_limit': 18, + } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) + + title = self._html_search_regex( + r'<title>([^<]+)', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._html_search_regex( + r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + + duration = parse_duration(self._search_regex( + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._html_search_regex( + r'class="views">(?: