([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
description = self._html_search_meta(
'description', webpage, fatal=False)
- page_count = self._int(self._search_regex(
- r'(\d+)</(?:a|span)><(?:a|span)[^>]+rel="next">',
- webpage, 'page_count', default=0), 'page_count')
+ page_count = str_to_int(self._search_regex(
+ r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
+ webpage, 'page_count', default=0))
if not page_count:
message = self._search_regex(
- r'class="error-page"[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*',
+ r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
webpage, 'error_msg', default=None) or 'This group has no videos.'
self.report_warning(message, group_id)
+ page_count = 1
PAGE_SIZE = 80
def _get_page(idx):
- if not page_count:
- return
- webpage = self._download_webpage(
- page_url, group_id, query={'page': idx + 1},
- note='Downloading page %d/%d' % (idx + 1, page_count)
- )
+ if idx > 0:
+ webpage = self._download_webpage(
+ page_url, group_id, query={'page': idx + 1},
+ note='Downloading page %d/%d' % (idx + 1, page_count)
+ )
for entry in self._extract_entries(webpage, url):
yield entry
diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py
index f9a67876ab..44fa60ce9b 100644
--- a/yt_dlp/extractor/neteasemusic.py
+++ b/yt_dlp/extractor/neteasemusic.py
@@ -1,12 +1,25 @@
-import itertools
+import json
import re
+import time
from base64 import b64encode
+from binascii import hexlify
from datetime import datetime
from hashlib import md5
+from random import randint
from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_urlencode
-from ..utils import float_or_none, sanitized_Request
+from ..aes import aes_ecb_encrypt, pkcs7_padding
+from ..compat import compat_urllib_parse_urlencode
+from ..utils import (
+ ExtractorError,
+ bytes_to_intlist,
+ error_to_compat_str,
+ float_or_none,
+ int_or_none,
+ intlist_to_bytes,
+ sanitized_Request,
+ try_get,
+)
class NetEaseMusicBaseIE(InfoExtractor):
@@ -17,7 +30,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
@classmethod
def _encrypt(cls, dfsid):
salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
- string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
+ string_bytes = bytearray(str(dfsid).encode('ascii'))
salt_len = len(salt_bytes)
for i in range(len(string_bytes)):
string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len]
@@ -26,32 +39,106 @@ class NetEaseMusicBaseIE(InfoExtractor):
result = b64encode(m.digest()).decode('ascii')
return result.replace('/', '_').replace('+', '-')
+ @classmethod
+ def make_player_api_request_data_and_headers(cls, song_id, bitrate):
+ KEY = b'e82ckenh8dichen8'
+ URL = '/api/song/enhance/player/url'
+ now = int(time.time() * 1000)
+ rand = randint(0, 1000)
+ cookie = {
+ 'osver': None,
+ 'deviceId': None,
+ 'appver': '8.0.0',
+ 'versioncode': '140',
+ 'mobilename': None,
+ 'buildver': '1623435496',
+ 'resolution': '1920x1080',
+ '__csrf': '',
+ 'os': 'pc',
+ 'channel': None,
+ 'requestId': '{0}_{1:04}'.format(now, rand),
+ }
+ request_text = json.dumps(
+ {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie},
+ separators=(',', ':'))
+ message = 'nobody{0}use{1}md5forencrypt'.format(
+ URL, request_text).encode('latin1')
+ msg_digest = md5(message).hexdigest()
+
+ data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format(
+ URL, request_text, msg_digest)
+ data = pkcs7_padding(bytes_to_intlist(data))
+ encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY)))
+ encrypted_params = hexlify(encrypted).decode('ascii').upper()
+
+ cookie = '; '.join(
+ ['{0}={1}'.format(k, v if v is not None else 'undefined')
+ for [k, v] in cookie.items()])
+
+ headers = {
+ 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'],
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': 'https://music.163.com',
+ 'Cookie': cookie,
+ }
+ return ('params={0}'.format(encrypted_params), headers)
+
+ def _call_player_api(self, song_id, bitrate):
+ url = 'https://interface3.music.163.com/eapi/song/enhance/player/url'
+ data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate)
+ try:
+ msg = 'empty result'
+ result = self._download_json(
+ url, song_id, data=data.encode('ascii'), headers=headers)
+ if result:
+ return result
+ except ExtractorError as e:
+ if type(e.cause) in (ValueError, TypeError):
+ # JSON load failure
+ raise
+ except Exception as e:
+ msg = error_to_compat_str(e)
+ self.report_warning('%s API call (%s) failed: %s' % (
+ song_id, bitrate, msg))
+ return {}
+
def extract_formats(self, info):
+ err = 0
formats = []
+ song_id = info['id']
for song_format in self._FORMATS:
details = info.get(song_format)
if not details:
continue
- song_file_path = '/%s/%s.%s' % (
- self._encrypt(details['dfsId']), details['dfsId'], details['extension'])
- # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse proxy-like feature
- # from NetEase's CDN provider that can be used if m5.music.126.net does not
- # work, especially for users outside of Mainland China
- # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880
- for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net',
- 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'):
- song_url = host + song_file_path
+ bitrate = int_or_none(details.get('bitrate')) or 999000
+ data = self._call_player_api(song_id, bitrate)
+ for song in try_get(data, lambda x: x['data'], list) or []:
+ song_url = try_get(song, lambda x: x['url'])
+ if not song_url:
+ continue
if self._is_valid_url(song_url, info['id'], 'song'):
formats.append({
'url': song_url,
'ext': details.get('extension'),
- 'abr': float_or_none(details.get('bitrate'), scale=1000),
+ 'abr': float_or_none(song.get('br'), scale=1000),
'format_id': song_format,
- 'filesize': details.get('size'),
- 'asr': details.get('sr')
+ 'filesize': int_or_none(song.get('size')),
+ 'asr': int_or_none(details.get('sr')),
})
- break
+ elif err == 0:
+ err = try_get(song, lambda x: x['code'], int)
+
+ if not formats:
+ msg = 'No media links found'
+ if err != 0 and (err < 200 or err >= 400):
+ raise ExtractorError(
+ '%s (site code %d)' % (msg, err, ), expected=True)
+ else:
+ self.raise_geo_restricted(
+ msg + ': probably this video is not available from your location due to geo restriction.',
+ countries=['CN'])
+
return formats
@classmethod
@@ -67,33 +154,19 @@ class NetEaseMusicBaseIE(InfoExtractor):
class NetEaseMusicIE(NetEaseMusicBaseIE):
IE_NAME = 'netease:song'
IE_DESC = '网易云音乐'
- _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://music.163.com/#/song?id=32102397',
- 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
+ 'md5': '3e909614ce09b1ccef4a3eb205441190',
'info_dict': {
'id': '32102397',
'ext': 'mp3',
- 'title': 'Bad Blood (feat. Kendrick Lamar)',
+ 'title': 'Bad Blood',
'creator': 'Taylor Swift / Kendrick Lamar',
- 'upload_date': '20150517',
- 'timestamp': 1431878400,
- 'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
+ 'upload_date': '20150516',
+ 'timestamp': 1431792000,
+ 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c',
},
- 'skip': 'Blocked outside Mainland China',
- }, {
- 'note': 'No lyrics translation.',
- 'url': 'http://music.163.com/#/song?id=29822014',
- 'info_dict': {
- 'id': '29822014',
- 'ext': 'mp3',
- 'title': '听见下雨的声音',
- 'creator': '周杰伦',
- 'upload_date': '20141225',
- 'timestamp': 1419523200,
- 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
- },
- 'skip': 'Blocked outside Mainland China',
}, {
'note': 'No lyrics.',
'url': 'http://music.163.com/song?id=17241424',
@@ -103,9 +176,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'title': 'Opus 28',
'creator': 'Dustin O\'Halloran',
'upload_date': '20080211',
+ 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4',
'timestamp': 1202745600,
},
- 'skip': 'Blocked outside Mainland China',
}, {
'note': 'Has translated name.',
'url': 'http://music.163.com/#/song?id=22735043',
@@ -119,7 +192,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'timestamp': 1264608000,
'alt_title': '说出愿望吧(Genie)',
},
- 'skip': 'Blocked outside Mainland China',
+ }, {
+ 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846',
+ 'md5': '95826c73ea50b1c288b22180ec9e754d',
+ 'info_dict': {
+ 'id': '95670',
+ 'ext': 'mp3',
+ 'title': '国际歌',
+ 'creator': '马备',
+ 'upload_date': '19911130',
+ 'timestamp': 691516800,
+ 'description': 'md5:1ba2f911a2b0aa398479f595224f2141',
+ },
}]
def _process_lyrics(self, lyrics_info):
diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py
index fcbafe4188..7eb5b21cb4 100644
--- a/yt_dlp/extractor/nrk.py
+++ b/yt_dlp/extractor/nrk.py
@@ -58,8 +58,7 @@ class NRKBaseIE(InfoExtractor):
return self._download_json(
urljoin('https://psapi.nrk.no/', path),
video_id, note or 'Downloading %s JSON' % item,
- fatal=fatal, query=query,
- headers={'Accept-Encoding': 'gzip, deflate, br'})
+ fatal=fatal, query=query)
class NRKIE(NRKBaseIE):
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 25d2f200f2..2e36b8861a 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -870,7 +870,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
if '://player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex(
- r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+ r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)
diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py
index 3a7f01f7a8..1eab384b9c 100644
--- a/yt_dlp/extractor/zdf.py
+++ b/yt_dlp/extractor/zdf.py
@@ -3,13 +3,14 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ NO_DEFAULT,
+ ExtractorError,
determine_ext,
+ extract_attributes,
float_or_none,
int_or_none,
join_nonempty,
merge_dicts,
- NO_DEFAULT,
- orderedSet,
parse_codecs,
qualities,
traverse_obj,
@@ -188,7 +189,7 @@ class ZDFIE(ZDFBaseIE):
},
}, {
'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
- 'md5': '57af4423db0455a3975d2dc4578536bc',
+ 'md5': '1b93bdec7d02fc0b703c5e7687461628',
'info_dict': {
'ext': 'mp4',
'id': 'video_funk_1770473',
@@ -250,17 +251,15 @@ class ZDFIE(ZDFBaseIE):
title = content.get('title') or content['teaserHeadline']
t = content['mainVideoContent']['http://zdf.de/rels/target']
-
- ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
-
+ ptmd_path = traverse_obj(t, (
+ (('streams', 'default'), None),
+ ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template')
+ ), get_all=False)
if not ptmd_path:
- ptmd_path = traverse_obj(
- t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'),
- 'http://zdf.de/rels/streams/ptmd-template').replace(
- '{playerId}', 'ngplayer_2_4')
+ raise ExtractorError('Could not extract ptmd_path')
info = self._extract_ptmd(
- urljoin(url, ptmd_path), video_id, player['apiToken'], url)
+ urljoin(url, ptmd_path.replace('{playerId}', 'ngplayer_2_4')), video_id, player['apiToken'], url)
thumbnails = []
layouts = try_get(
@@ -309,15 +308,16 @@ class ZDFIE(ZDFBaseIE):
'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
video_id)
- document = video['document']
-
- title = document['titel']
- content_id = document['basename']
-
formats = []
- format_urls = set()
- for f in document['formitaeten']:
- self._extract_format(content_id, formats, format_urls, f)
+ formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
+ document = formitaeten and video['document']
+ if formitaeten:
+ title = document['titel']
+ content_id = document['basename']
+
+ format_urls = set()
+ for f in formitaeten or []:
+ self._extract_format(content_id, formats, format_urls, f)
self._sort_formats(formats)
thumbnails = []
@@ -364,9 +364,9 @@ class ZDFChannelIE(ZDFBaseIE):
'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
'info_dict': {
'id': 'das-aktuelle-sportstudio',
- 'title': 'das aktuelle sportstudio | ZDF',
+ 'title': 'das aktuelle sportstudio',
},
- 'playlist_mincount': 23,
+ 'playlist_mincount': 18,
}, {
'url': 'https://www.zdf.de/dokumentation/planet-e',
'info_dict': {
@@ -374,6 +374,14 @@ class ZDFChannelIE(ZDFBaseIE):
'title': 'planet e.',
},
'playlist_mincount': 50,
+ }, {
+ 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest',
+ 'info_dict': {
+ 'id': 'aktenzeichen-xy-ungeloest',
+ 'title': 'Aktenzeichen XY... ungelöst',
+ 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)",
+ },
+ 'playlist_mincount': 2,
}, {
'url': 'https://www.zdf.de/filme/taunuskrimi/',
'only_matching': True,
@@ -383,60 +391,36 @@ class ZDFChannelIE(ZDFBaseIE):
def suitable(cls, url):
return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url)
+ def _og_search_title(self, webpage, fatal=False):
+ title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal)
+ return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None
+
def _real_extract(self, url):
channel_id = self._match_id(url)
webpage = self._download_webpage(url, channel_id)
- entries = [
- self.url_result(item_url, ie=ZDFIE.ie_key())
- for item_url in orderedSet(re.findall(
- r'data-plusbar-url=["\'](http.+?\.html)', webpage))]
+ matches = re.finditer(
+ r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL,
+ webpage)
- return self.playlist_result(
- entries, channel_id, self._og_search_title(webpage, fatal=False))
+ if self._downloader.params.get('noplaylist', False):
+ entry = next(
+ (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches),
+ None)
+ self.to_screen('Downloading just the main video because of --no-playlist')
+ if entry:
+ return entry
+ else:
+ self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, ))
- r"""
- player = self._extract_player(webpage, channel_id)
+ def check_video(m):
+ v_ref = self._search_regex(
+ r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ),
+ webpage, 'check id', default='')
+ v_ref = extract_attributes(v_ref)
+ return v_ref.get('data-target-video-type') != 'novideo'
- channel_id = self._search_regex(
- r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage,
- 'channel id', group='id')
-
- channel = self._call_api(
- 'https://api.zdf.de/content/documents/%s.json' % channel_id,
- player, url, channel_id)
-
- items = []
- for module in channel['module']:
- for teaser in try_get(module, lambda x: x['teaser'], list) or []:
- t = try_get(
- teaser, lambda x: x['http://zdf.de/rels/target'], dict)
- if not t:
- continue
- items.extend(try_get(
- t,
- lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'],
- list) or [])
- items.extend(try_get(
- module,
- lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'],
- list) or [])
-
- entries = []
- entry_urls = set()
- for item in items:
- t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict)
- if not t:
- continue
- sharing_url = t.get('http://zdf.de/rels/sharing-url')
- if not sharing_url or not isinstance(sharing_url, compat_str):
- continue
- if sharing_url in entry_urls:
- continue
- entry_urls.add(sharing_url)
- entries.append(self.url_result(
- sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id')))
-
- return self.playlist_result(entries, channel_id, channel.get('title'))
- """
+ return self.playlist_from_matches(
+ (m.group('url') for m in matches if check_video(m)),
+ channel_id, self._og_search_title(webpage, fatal=False))
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 1532d22ac0..4d1247eea3 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -685,7 +685,8 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
return '\0_'
return char
- if restricted and is_id is NO_DEFAULT:
+ # Replace look-alike Unicode glyphs
+ if restricted and (is_id is NO_DEFAULT or not is_id):
s = unicodedata.normalize('NFKC', s)
s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
result = ''.join(map(replace_insane, s))