import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
extract_attributes,
float_or_none,
get_elements_html_by_class,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
parse_iso8601,
remove_end,
remove_start,
str_or_none,
traverse_obj,
url_or_none,
)
class NYTimesBaseIE(InfoExtractor):
    """Shared plumbing for NYTimes extractors: GraphQL API access plus
    thumbnail/format/metadata extraction helpers."""

    # UUIDv5 namespace used by NYT's client-side `id-to-uri.js` to derive media URIs
    _DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
    # Public API token sent with every GraphQL request
    _TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
    _GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
    _GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
  video(id: $id) {
    ... on Video {
      bylines {
        renderedRepresentation
      }
      duration
      firstPublished
      promotionalHeadline
      promotionalMedia {
        ... on Image {
          crops {
            name
            renditions {
              name
              width
              height
              url
            }
          }
        }
      }
      renditions {
        type
        width
        height
        url
        bitrate
      }
      summary
    }
  }
}'''

    def _call_api(self, media_id):
        """Query the GraphQL API for *media_id* and return the video dict (or {})."""
        # reference: `id-to-uri.js`
        video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
        media_uuid = uuid.uuid5(video_uuid, media_id)
        payload = json.dumps({
            'query': self._GRAPHQL_QUERY,
            'variables': {'id': f'nyt://video/{media_uuid}'},
        }, separators=(',', ':')).encode()
        response = self._download_json(
            self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API',
            data=payload, headers={
                'Content-Type': 'application/json',
                'Nyt-App-Type': 'vhs',
                'Nyt-App-Version': 'v3.52.21',
                'Nyt-Token': self._TOKEN,
                'Origin': 'https://nytimes.com',
            }, fatal=False)
        return traverse_obj(response, ('data', 'video', {dict})) or {}

    def _extract_thumbnails(self, thumbs):
        """Map a list of rendition dicts to thumbnail dicts, keeping only
        entries with a valid URL; returns None when nothing matches."""
        return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
            'url': 'url',
            'width': ('width', {int_or_none}),
            'height': ('height', {int_or_none}),
        }), default=None)

    def _extract_formats_and_subtitles(self, video_id, content_media_json):
        """Turn the API's `renditions` list into (formats, subtitles)."""
        seen_urls = []
        formats = []
        subtitles = {}
        for rendition in traverse_obj(content_media_json, ('renditions', ..., {dict})):
            rendition_url = rendition.get('url')
            format_id = rendition.get('type')
            # skip thumbnail tracks and duplicate URLs
            if not rendition_url or format_id == 'thumbs' or rendition_url in seen_urls:
                continue
            seen_urls.append(rendition_url)
            ext = mimetype2ext(rendition.get('mimetype')) or determine_ext(rendition_url)
            if ext == 'm3u8':
                hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles(
                    rendition_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id or 'hls', fatal=False)
                formats.extend(hls_fmts)
                self._merge_subtitles(hls_subs, target=subtitles)
            elif ext == 'mpd':
                continue  # all mpd urls give 404 errors
            else:
                formats.append({
                    'url': rendition_url,
                    'format_id': format_id,
                    'vcodec': rendition.get('videoencoding') or rendition.get('video_codec'),
                    'width': int_or_none(rendition.get('width')),
                    'height': int_or_none(rendition.get('height')),
                    # size may live under file_size/fileSize, possibly nested in 'value'
                    'filesize': traverse_obj(rendition, (
                        ('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False),
                    'tbr': int_or_none(rendition.get('bitrate'), 1000) or None,
                    'ext': ext,
                })
        return formats, subtitles

    def _extract_video(self, media_id):
        """Build a complete info dict for the video with the given numeric id."""
        data = self._call_api(media_id)
        formats, subtitles = self._extract_formats_and_subtitles(media_id, data)

        bylines = traverse_obj(data, (
            'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))
        return {
            'id': media_id,
            'title': data.get('promotionalHeadline'),
            'description': data.get('summary'),
            'timestamp': parse_iso8601(data.get('firstPublished')),
            'duration': float_or_none(data.get('duration'), scale=1000),
            'creator': ', '.join(bylines),  # TODO: change to 'creators'
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': self._extract_thumbnails(
                traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
        }
class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P\d+)'
_EMBED_REGEX = [r'
', webpage, 'author', default=None),
}
class NYTimesCookingRecipeIE(InfoExtractor):
    """Extracts the embedded how-to video from a NYT Cooking recipe page."""

    # NOTE: the named group <id> is required by _match_id()
    _VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
        'md5': '579e83bbe8e61e9de67f80edba8a78a8',
        'info_dict': {
            'id': '1017817',
            'ext': 'mp4',
            'title': 'Cranberry Curd Tart',
            'description': 'md5:ad77a3fc321db636256d4343c5742152',
            'timestamp': 1447804800,
            'upload_date': '20151118',
            'creator': 'David Tanis',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
        'md5': '58df35998241dcf0620e99e646331b42',
        'info_dict': {
            'id': '1024781',
            'ext': 'mp4',
            'title': 'Neapolitan Checkerboard Cookies',
            'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
            'timestamp': 1701302400,
            'upload_date': '20231130',
            'creator': 'Sue Li',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
        'md5': '2fe7965a3adc899913b8e25ada360823',
        'info_dict': {
            'id': '1019516',
            'ext': 'mp4',
            'timestamp': 1546387200,
            'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
            'upload_date': '20190102',
            'title': 'Overnight Oats',
            'creator': 'Genevieve Ko',
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }]

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        # recipe metadata (incl. the HLS video URL) lives in the Next.js props
        recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe']

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls')

        return {
            **traverse_obj(recipe_data, {
                'id': ('id', {str_or_none}),
                'title': ('title', {str}),
                'description': ('topnote', {clean_html}),
                'timestamp': ('publishedAt', {int_or_none}),
                'creator': ('contentAttribution', 'cardByline', {str}),
            }),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj(
                recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))],
        }
class NYTimesAudioIE(NYTimesBaseIE):
    """Extracts podcast/audiobook audio embedded in NYT article pages."""

    # NOTE: the named group <id> is required by _match_id()
    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?:podcasts|books)/(?:[\w-]+/)?(?P<id>[^./?#]+)(?:\.html)?'
    _TESTS = [{
        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
        'md5': 'cd402e44a059c8caf3b5f514c9264d0f',
        'info_dict': {
            'id': '100000004709062',
            'title': 'Revelations From the Final Weeks',
            'ext': 'mp3',
            'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
            'timestamp': 1476448332,
            'upload_date': '20161014',
            'creators': [''],
            'series': 'The Run-Up',
            'episode': '‘He Was Like an Octopus’',
            'episode_number': 20,
            'duration': 2130,
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }, {
        'url': 'https://www.nytimes.com/2023/11/25/podcasts/poultry-slam.html',
        'info_dict': {
            'id': '100000009191248',
            'title': 'Poultry Slam',
            'ext': 'mp3',
            'description': 'md5:1e6f16b21bb9287b8a1fe563145a72fe',
            'timestamp': 1700911084,
            'upload_date': '20231125',
            'creators': [],
            'series': 'This American Life',
            'episode': 'Poultry Slam',
            'duration': 3523,
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.png',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
        'info_dict': {
            'id': '100000004709479',
            'title': 'Inside The New York Times Book Review: The Rise of Hitler',
            'ext': 'mp3',
            'description': 'md5:288161c98c098a0c24f07a94af7108c3',
            'timestamp': 1476461513,
            'upload_date': '20161014',
            'creators': ['Pamela Paul'],
            'series': '',
            'episode': 'The Rise of Hitler',
            'duration': 3475,
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.nytimes.com/2023/12/07/podcasts/the-daily/nikki-haley.html',
        'info_dict': {
            'id': '100000009214128',
            'title': 'Nikki Haley’s Moment',
            'ext': 'mp3',
            'description': 'md5:bf9f532fe689967ef1c458bcb057f3e5',
            'timestamp': 1701946819,
            'upload_date': '20231207',
            'creators': [],
            'series': 'The Daily',
            'episode': 'Listen to ‘The Daily’: Nikki Haley’s Moment',
            'duration': 1908,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.nytimes.com/2023/12/18/podcasts/israel-putin.html',
        'md5': '708b4fd393ca103280fe9e56d91b08b5',
        'info_dict': {
            'id': '100000009227362',
            'title': 'Pressure Mounts on Israel, and Putin Profits Off Boycott',
            'ext': 'mp3',
            'description': 'Hear the news in five minutes.',
            'timestamp': 1702897212,
            'upload_date': '20231218',
            'creators': [],
            'series': 'The Headlines',
            'episode': 'The Headlines',
            'duration': 298,
            'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
        },
    }]

    def _extract_content_from_block(self, block):
        """Extract per-track audio fields from one media block.

        Each field tries the alternative key paths in order (get_all=False),
        since the data shape differs between podcast and audiobook blocks.
        """
        return traverse_obj(block, {
            'creators': ('data', 'track', 'credit', all),
            'duration': (
                ('data', 'media'), ('track', 'length'), ('duration', None), {int_or_none}),
            'series': (
                ('data', 'media'), ('podcast', 'podcastSeries'), ('title', None), {str_or_none}),
            'episode': (
                ('data', 'media'), ('track', 'headline'), ('title', 'default'), {str}),
            # e.g. "Episode 20" -> 20
            'episode_number': (
                'data', 'podcast', 'episode', {lambda v: v.split()[1]}, {int_or_none}),
            'url': (
                ('data', 'media'), ('track', 'fileUrl'), ('source', None), {url_or_none}),
            'vcodec': 'none',
        }, get_all=False)

    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        art_json = self._search_json(
            r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
            transform_source=js_to_json)['initialData']['data']['article']

        blocks = traverse_obj(art_json, (
            'sprinkledBody', 'content',
            lambda _, v: v['__typename'] in ('InteractiveBlock', 'HeaderMultimediaBlock'),
            'media'))
        if not blocks:
            raise ExtractorError('Unable to extract any media blocks from webpage')

        common_info = {
            'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
            'description': self._html_search_meta(
                ['og:description', 'twitter:description'], webpage),
            # some pages (e.g. "Poultry Slam") keep the id at the article top level
            'id': traverse_obj(art_json, 'sourceId'),
            **traverse_obj(art_json, {
                'id': ('sprinkledBody', 'content', ..., 'media', 'sourceId', any, {str}),
                'title': ('headline', 'default'),
                'description': 'summary',
                'timestamp': ('firstPublished', {parse_iso8601}),
                'thumbnails': (
                    'promotionalMedia', 'assetCrops', ..., 'renditions', ...,
                    all, {self._extract_thumbnails}),
            }),
        }

        entries = []
        for block in blocks:
            # InteractiveBlock embeds its payload as JSON inside an HTML snippet
            if block.get('html'):
                block = self._search_json(
                    r'function\s+getFlexData\(\)\s*\{\s*return', block.get('html'),
                    'Retrieve the inner JSON', page_id)
            entries.append(merge_dicts(self._extract_content_from_block(block), common_info))

        if len(entries) > 1:
            return self.playlist_result(entries, page_id, **common_info)
        return {
            'id': page_id,
            **entries[0],
        }