[extractor] Handle json_ld with multiple @types

Closes: #4022
This commit is contained in:
pukkandan 2022-06-13 18:39:58 +05:30
parent 79e591b59b
commit f3c0c77304
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
3 changed files with 34 additions and 18 deletions

View file

@ -1419,6 +1419,10 @@ class InfoExtractor:
'ViewAction': 'view', 'ViewAction': 'view',
} }
def is_type(e, *expected_types):
    """Tell whether JSON-LD node `e` declares any of `expected_types`.

    The node's '@type' value is looked up with traverse_obj() and passed
    through variadic() before the membership test, so that a node carrying
    several types is matched on any one of them.
    Returns True on the first match, False otherwise.
    """
    declared_types = variadic(traverse_obj(e, '@type'))
    for wanted in expected_types:
        if wanted in declared_types:
            return True
    return False
def extract_interaction_type(e): def extract_interaction_type(e):
interaction_type = e.get('interactionType') interaction_type = e.get('interactionType')
if isinstance(interaction_type, dict): if isinstance(interaction_type, dict):
@ -1432,9 +1436,7 @@ class InfoExtractor:
if not isinstance(interaction_statistic, list): if not isinstance(interaction_statistic, list):
return return
for is_e in interaction_statistic: for is_e in interaction_statistic:
if not isinstance(is_e, dict): if not is_type(is_e, 'InteractionCounter'):
continue
if is_e.get('@type') != 'InteractionCounter':
continue continue
interaction_type = extract_interaction_type(is_e) interaction_type = extract_interaction_type(is_e)
if not interaction_type: if not interaction_type:
@ -1471,7 +1473,7 @@ class InfoExtractor:
info['chapters'] = chapters info['chapters'] = chapters
def extract_video_object(e): def extract_video_object(e):
assert e['@type'] == 'VideoObject' assert is_type(e, 'VideoObject')
author = e.get('author') author = e.get('author')
info.update({ info.update({
'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none), 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
@ -1503,13 +1505,12 @@ class InfoExtractor:
if at_top_level and set(e.keys()) == {'@context', '@graph'}: if at_top_level and set(e.keys()) == {'@context', '@graph'}:
traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
break break
item_type = e.get('@type') if expected_type is not None and not is_type(e, expected_type):
if expected_type is not None and expected_type != item_type:
continue continue
rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
if rating is not None: if rating is not None:
info['average_rating'] = rating info['average_rating'] = rating
if item_type in ('TVEpisode', 'Episode'): if is_type(e, 'TVEpisode', 'Episode'):
episode_name = unescapeHTML(e.get('name')) episode_name = unescapeHTML(e.get('name'))
info.update({ info.update({
'episode': episode_name, 'episode': episode_name,
@ -1519,39 +1520,39 @@ class InfoExtractor:
if not info.get('title') and episode_name: if not info.get('title') and episode_name:
info['title'] = episode_name info['title'] = episode_name
part_of_season = e.get('partOfSeason') part_of_season = e.get('partOfSeason')
if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
info.update({ info.update({
'season': unescapeHTML(part_of_season.get('name')), 'season': unescapeHTML(part_of_season.get('name')),
'season_number': int_or_none(part_of_season.get('seasonNumber')), 'season_number': int_or_none(part_of_season.get('seasonNumber')),
}) })
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name')) info['series'] = unescapeHTML(part_of_series.get('name'))
elif item_type == 'Movie': elif is_type(e, 'Movie'):
info.update({ info.update({
'title': unescapeHTML(e.get('name')), 'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')), 'description': unescapeHTML(e.get('description')),
'duration': parse_duration(e.get('duration')), 'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('dateCreated')), 'timestamp': unified_timestamp(e.get('dateCreated')),
}) })
elif item_type in ('Article', 'NewsArticle'): elif is_type(e, 'Article', 'NewsArticle'):
info.update({ info.update({
'timestamp': parse_iso8601(e.get('datePublished')), 'timestamp': parse_iso8601(e.get('datePublished')),
'title': unescapeHTML(e.get('headline')), 'title': unescapeHTML(e.get('headline')),
'description': unescapeHTML(e.get('articleBody') or e.get('description')), 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
}) })
if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
extract_video_object(e['video'][0]) extract_video_object(e['video'][0])
elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject': elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
extract_video_object(e['subjectOf'][0]) extract_video_object(e['subjectOf'][0])
elif item_type == 'VideoObject': elif is_type(e, 'VideoObject'):
extract_video_object(e) extract_video_object(e)
if expected_type is None: if expected_type is None:
continue continue
else: else:
break break
video = e.get('video') video = e.get('video')
if isinstance(video, dict) and video.get('@type') == 'VideoObject': if is_type(video, 'VideoObject'):
extract_video_object(video) extract_video_object(video)
if expected_type is None: if expected_type is None:
continue continue

View file

@ -2596,8 +2596,23 @@ class GenericIE(InfoExtractor):
'uploader': 'Mr Producer Media', 'uploader': 'Mr Producer Media',
'upload_date': '20220610', 'upload_date': '20220610',
} }
},
{
'note': 'JSON LD with multiple @type',
'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
'md5': 'c7949f34f57273013fb7ccb1156393db',
'info_dict': {
'id': 'ipy2AcGL',
'ext': 'mp4',
'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg',
'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
'timestamp': 1586577474,
'upload_date': '20200411',
'age_limit': 0,
'duration': 111.0,
} }
},
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):

View file

@ -5,7 +5,7 @@ from ..utils import unsmuggle_url
class JWPlatformIE(InfoExtractor): class JWPlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
_TESTS = [{ _TESTS = [{
'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
'md5': 'fa8899fa601eb7c83a64e9d568bdf325', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',