yt-dlp/yt_dlp/extractor/nytimes.py
2024-04-21 13:53:54 +01:00

641 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
extract_attributes,
float_or_none,
get_elements_html_by_class,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
parse_iso8601,
remove_end,
remove_start,
str_or_none,
traverse_obj,
url_or_none,
)
class NYTimesBaseIE(InfoExtractor):
_DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
_TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
_GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
_GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
video(id: $id) {
... on Video {
bylines {
renderedRepresentation
}
duration
firstPublished
promotionalHeadline
promotionalMedia {
... on Image {
crops {
name
renditions {
name
width
height
url
}
}
}
}
renditions {
type
width
height
url
bitrate
}
summary
}
}
}'''
def _call_api(self, media_id):
# reference: `id-to-uri.js`
video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
media_uuid = uuid.uuid5(video_uuid, media_id)
return traverse_obj(self._download_json(
self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({
'query': self._GRAPHQL_QUERY,
'variables': {'id': f'nyt://video/{media_uuid}'},
}, separators=(',', ':')).encode(), headers={
'Content-Type': 'application/json',
'Nyt-App-Type': 'vhs',
'Nyt-App-Version': 'v3.52.21',
'Nyt-Token': self._TOKEN,
'Origin': 'https://nytimes.com',
}, fatal=False), ('data', 'video', {dict})) or {}
def _extract_thumbnails(self, thumbs):
return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
'url': 'url',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}), default=None)
def _extract_formats_and_subtitles(self, video_id, content_media_json):
urls = []
formats = []
subtitles = {}
for video in traverse_obj(content_media_json, ('renditions', ..., {dict})):
video_url = video.get('url')
format_id = video.get('type')
if not video_url or format_id == 'thumbs' or video_url in urls:
continue
urls.append(video_url)
ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
if ext == 'm3u8':
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id or 'hls', fatal=False)
formats.extend(m3u8_fmts)
self._merge_subtitles(m3u8_subs, target=subtitles)
elif ext == 'mpd':
continue # all mpd urls give 404 errors
else:
formats.append({
'url': video_url,
'format_id': format_id,
'vcodec': video.get('videoencoding') or video.get('video_codec'),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
'filesize': traverse_obj(video, (
('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False),
'tbr': int_or_none(video.get('bitrate'), 1000) or None,
'ext': ext,
})
return formats, subtitles
def _extract_video(self, media_id):
data = self._call_api(media_id)
formats, subtitles = self._extract_formats_and_subtitles(media_id, data)
return {
'id': media_id,
'title': data.get('promotionalHeadline'),
'description': data.get('summary'),
'timestamp': parse_iso8601(data.get('firstPublished')),
'duration': float_or_none(data.get('duration'), scale=1000),
'creator': ', '.join(traverse_obj(data, ( # TODO: change to 'creators'
'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))),
'formats': formats,
'subtitles': subtitles,
'thumbnails': self._extract_thumbnails(
traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
}
class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
_TESTS = [{
'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
'md5': 'a553aa344014e3723d33893d89d4defc',
'info_dict': {
'id': '100000002847155',
'ext': 'mp4',
'title': 'Verbatim: What Is a Photocopier?',
'description': 'md5:93603dada88ddbda9395632fdc5da260',
'timestamp': 1398646132,
'upload_date': '20140428',
'creator': 'Brett Weiner',
'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg',
'duration': 419,
},
}, {
'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000003628438',
'ext': 'mp4',
'title': 'One Companys New Minimum Wage: $70,000 a Year',
'description': 'md5:89ba9ab67ca767bb92bf823d1f138433',
'timestamp': 1429047468,
'upload_date': '20150414',
'uploader': 'Matthew Williams',
'creator': 'Patricia Cohen',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 119.0,
},
}, {
# article with audio and no video
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
'md5': '2365b3555c8aa7f4dd34ca735ad02e6a',
'info_dict': {
'id': '100000009110381',
'ext': 'mp3',
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
'timestamp': 1695960700,
'upload_date': '20230929',
'creator': 'Stephanie Nolen, Natalija Gormalova',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 1322,
},
}, {
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000009202270',
'ext': 'mp4',
'title': 'Kamala Harris Defends Biden Policies, but Says More Work Needed to Reach Voters',
'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f',
'timestamp': 1701290997,
'upload_date': '20231129',
'uploader': 'By The New York Times',
'creator': 'Katie Rogers',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 97.631,
},
'params': {
'skip_download': 'm3u8',
},
}, {
# multiple videos in the same article
'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html',
'info_dict': {
'id': 'air-traffic-controllers-safety',
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
'upload_date': '20231202',
'creator': 'Emily Steel, Sydney Ember',
'timestamp': 1701511264,
},
'playlist_count': 3,
}, {
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
'only_matching': True,
}]
def _extract_content_from_block(self, block):
details = traverse_obj(block, {
'id': ('sourceId', {str}),
'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
'timestamp': ('firstPublished', {parse_iso8601}),
'series': ('podcastSeries', {str}),
}, get_all=False)
formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block)
# audio articles will have an url and no formats
url = traverse_obj(block, ('fileUrl', {url_or_none}))
if not formats and url:
formats.append({'url': url, 'vcodec': 'none'})
return {
**details,
'thumbnails': self._extract_thumbnails(traverse_obj(
block, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
'formats': formats,
'subtitles': subtitles
}
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
art_json = self._search_json(
r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
blocks = traverse_obj(art_json, (
'sprinkledBody', 'content', ..., ('ledeMedia', None),
lambda _, v: v['__typename'] in ('Video', 'Audio')))
if not blocks:
raise ExtractorError('Unable to extract any media blocks from webpage')
common_info = {
'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
'description': traverse_obj(art_json, (
'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
'creator': ', '.join(
traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
'thumbnails': self._extract_thumbnails(traverse_obj(
art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
}
entries = []
for block in blocks:
entries.append(merge_dicts(self._extract_content_from_block(block), common_info))
if len(entries) > 1:
return self.playlist_result(entries, page_id, **common_info)
return {
'id': page_id,
**entries[0],
}
class NYTimesCookingIE(NYTimesBaseIE):
IE_NAME = 'NYTimesCookingGuide'
_VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
'info_dict': {
'id': '13-how-to-cook-a-turkey',
'title': 'How to Cook a Turkey',
'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0',
},
'playlist_count': 2,
}, {
# single video example
'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '100000005835845',
'ext': 'mp4',
'title': 'How to Make Mac and Cheese',
'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1',
'timestamp': 1522950315,
'upload_date': '20180405',
'duration': 9.51,
'creator': 'Alison Roman',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '20-how-to-frost-a-cake',
'title': 'How to Frost a Cake',
'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd',
},
'playlist_count': 8,
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
description = self._html_search_meta(['og:description', 'twitter:description'], webpage)
lead_video_id = self._search_regex(
r'data-video-player-id="(\d+)"></div>', webpage, 'lead video')
media_ids = traverse_obj(
get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id'))
if media_ids:
media_ids.append(lead_video_id)
return self.playlist_result(
[self._extract_video(media_id) for media_id in media_ids], page_id, title, description)
return {
**self._extract_video(lead_video_id),
'title': title,
'description': description,
'creator': self._search_regex( # TODO: change to 'creators'
r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None),
}
class NYTimesCookingRecipeIE(InfoExtractor):
_VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
'md5': '579e83bbe8e61e9de67f80edba8a78a8',
'info_dict': {
'id': '1017817',
'ext': 'mp4',
'title': 'Cranberry Curd Tart',
'description': 'md5:ad77a3fc321db636256d4343c5742152',
'timestamp': 1447804800,
'upload_date': '20151118',
'creator': 'David Tanis',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
'md5': '58df35998241dcf0620e99e646331b42',
'info_dict': {
'id': '1024781',
'ext': 'mp4',
'title': 'Neapolitan Checkerboard Cookies',
'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
'timestamp': 1701302400,
'upload_date': '20231130',
'creator': 'Sue Li',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
'md5': '2fe7965a3adc899913b8e25ada360823',
'info_dict': {
'id': '1019516',
'ext': 'mp4',
'timestamp': 1546387200,
'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
'upload_date': '20190102',
'title': 'Overnight Oats',
'creator': 'Genevieve Ko',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls')
return {
**traverse_obj(recipe_data, {
'id': ('id', {str_or_none}),
'title': ('title', {str}),
'description': ('topnote', {clean_html}),
'timestamp': ('publishedAt', {int_or_none}),
'creator': ('contentAttribution', 'cardByline', {str}),
}),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj(
recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))],
}
class NYTimesAudioIE(NYTimesBaseIE):
_VALID_URL = r"https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?:podcasts|books)/(?:[\w-]+/)?(?P<id>[^./?#]+)(?:\.html)?"
_TESTS = [
{
"url": "http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html",
"md5": "cd402e44a059c8caf3b5f514c9264d0f",
"info_dict": {
"id": "100000004709062",
"title": "Revelations From the Final Weeks",
"ext": "mp3",
"description": "md5:fb5c6b93b12efc51649b4847fe066ee4",
"timestamp": 1476448332,
"upload_date": "20161014",
"creators": [''],
"series": "The Run-Up",
"episode": "He Was Like an Octopus",
"episode_number": 20,
"duration": 2130,
"thumbnail": r"re:https?://\w+\.nyt.com/images/.*\.jpg",
},
},
{
"url": "https://www.nytimes.com/2023/11/25/podcasts/poultry-slam.html",
"info_dict": {
"id": "100000009191248",
"title": "Poultry Slam",
"ext": "mp3",
"description": "md5:1e6f16b21bb9287b8a1fe563145a72fe",
"timestamp": 1700911084,
"upload_date": "20231125",
"creators": [],
"series": "This American Life",
"episode": "Poultry Slam",
"duration": 3523,
"thumbnail": r"re:https?://\w+\.nyt.com/images/.*\.png",
},
"params": {
"skip_download": True,
},
},
{
"url": "http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html",
"info_dict": {
"id": "100000004709479",
"title": "Inside The New York Times Book Review: The Rise of Hitler",
"ext": "mp3",
"description": "md5:288161c98c098a0c24f07a94af7108c3",
"timestamp": 1476461513,
"upload_date": "20161014",
"creators": ['Pamela Paul'],
"series": "",
"episode": "The Rise of Hitler",
"duration": 3475,
"thumbnail": r"re:https?://\w+\.nyt.com/images/.*\.jpg",
},
"params": {
"skip_download": True,
},
},
{
"url": "https://www.nytimes.com/2023/12/07/podcasts/the-daily/nikki-haley.html",
"info_dict": {
"id": "100000009214128",
"title": "Nikki Haleys Moment",
"ext": "mp3",
"description": "md5:bf9f532fe689967ef1c458bcb057f3e5",
"timestamp": 1701946819,
"upload_date": "20231207",
"creators": [],
"series": "The Daily",
"episode": "Listen to The Daily: Nikki Haleys Moment",
"duration": 1908,
},
"params": {
"skip_download": True,
},
},
{
"url": "https://www.nytimes.com/2023/12/18/podcasts/israel-putin.html",
"md5": "708b4fd393ca103280fe9e56d91b08b5",
"info_dict": {
"id": "100000009227362",
"title": "Pressure Mounts on Israel, and Putin Profits Off Boycott",
"ext": "mp3",
"description": "Hear the news in five minutes.",
"timestamp": 1702897212,
"upload_date": "20231218",
"creators": [],
"series": "The Headlines",
"episode": "The Headlines",
"duration": 298,
"thumbnail": r"re:https?://\w+\.nyt.com/images/.*\.jpg",
},
},
]
def _extract_content_from_block(self, block):
return traverse_obj(
block,
{
"creators": ("data", "track", "credit", all),
"duration": (
("data", "media"),
("track", "length"),
("duration", None),
{int_or_none},
),
"series": (
("data", "media"),
("podcast", "podcastSeries"),
("title", None),
{str_or_none},
),
"episode": (
("data", "media"),
("track", "headline"),
("title", "default"), {str}),
"episode_number": (
"data",
"podcast",
"episode",
{lambda v: v.split()[1]},
{int_or_none},
),
"url": (
("data", "media"),
("track", "fileUrl"),
("source", None),
{url_or_none},
),
"vcodec": "none",
},
get_all=False,
)
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
art_json = self._search_json(
r"window\.__preloadedData\s*=",
webpage,
"media details",
page_id,
transform_source=js_to_json,
)["initialData"]["data"]["article"]
blocks = traverse_obj(
art_json,
(
"sprinkledBody",
"content",
lambda _, v: v["__typename"]
in ("InteractiveBlock", "HeaderMultimediaBlock"),
"media",
),
)
if not blocks:
raise ExtractorError("Unable to extract any media blocks from webpage")
common_info = {
"title": remove_end(
self._html_extract_title(webpage), " - The New York Times"
),
"description": self._html_search_meta(
["og:description", "twitter:description"], webpage
),
"id": traverse_obj(
art_json, ("sourceId")
), # poltry slam is under art_json > 'sourceId'
**traverse_obj(
art_json,
{
"id": (
"sprinkledBody",
"content",
...,
"media",
"sourceId",
any,
{str},
),
"title": ("headline", "default"),
"description": ("summary"),
"timestamp": ("firstPublished", {parse_iso8601}),
"thumbnails": (
"promotionalMedia",
"assetCrops",
...,
"renditions",
...,
all,
{self._extract_thumbnails},
),
},
),
}
entries = []
for block in blocks:
if block.get("html"):
block = self._search_json(
r"function\s+getFlexData\(\)\s*\{\s*return",
block.get("html"),
"Retrieve the inner JSON",
page_id,
)
entries.append(
merge_dicts(self._extract_content_from_block(block), common_info)
)
if len(entries) > 1:
return self.playlist_result(entries, page_id, **common_info)
return {
"id": page_id,
**entries[0],
}