[ie/theplatform] Extract more metadata (#8635)

Authored by: trainman261
This commit is contained in:
trainman261 2023-12-12 01:00:35 +01:00 committed by GitHub
parent e370f9ec36
commit 7e09c147fd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 73 additions and 11 deletions

View file

@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
'info_dict': {
'id': '22253814',
'ext': 'mp4',
'title': 'Winter is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
'title': 'Winter Is Coming',
'description': 'md5:a40e370925074260b1c8a633c632c63a',
'timestamp': 1338306241,
'upload_date': '20120529',
'uploader': 'AENE-NEW',
'duration': 2592.0,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:5',
'tags': 'count:14',
'categories': ['Mountain Men'],
'episode_number': 1,
'episode': 'Episode 1',
'season': 'Season 1',
'season_number': 1,
'series': 'Mountain Men',
},
'params': {
# m3u8 download
@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
'timestamp': 1452634428,
'upload_date': '20160112',
'uploader': 'AENE-NEW',
'duration': 1277.695,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'tags': 'count:23',
'episode': 'Episode 1',
'episode_number': 1,
'season': 'Season 9',
'season_number': 9,
'series': 'Duck Dynasty',
},
'params': {
# m3u8 download

View file

@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'chapters': [],
'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
},
}, {
'url': 'http://www.cbc.ca/player/play/2164402062',
@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'chapters': [],
'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creator': 'Allison Johnson',
'media_type': 'Excerpt',
},
}, {
# Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
'url': 'http://www.cbc.ca/player/play/2249992771553',
'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
'url': 'http://www.cbc.ca/player/play/2284799043667',
'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
'info_dict': {
'id': '2249992771553',
'id': '2284799043667',
'ext': 'mp4',
'title': 'The National | Womens soccer pay, Florida seawater, Swift quake',
'description': 'md5:adba28011a56cfa47a080ff198dad27a',
'timestamp': 1690596000,
'duration': 2716.333,
'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
'timestamp': 1700272800,
'duration': 2718.833,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
'uploader': 'CBCC-NEW',
'chapters': 'count:5',
'upload_date': '20230729',
'upload_date': '20231118',
'categories': 'count:4',
'series': 'The National - Full Show',
'tags': 'count:1',
'creator': 'News',
'location': 'Canada',
'media_type': 'Full Program',
},
}]

View file

@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
'timestamp': 1444107300,
'age_limit': 14,
'uploader': 'CWTV',
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'episode': 'Episode 20',
'season': 'Season 11',
},
'params': {
# m3u8 download

View file

@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 5,
'episode_number': 5,
'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
'categories': ['Informazione'],
},
}, {
# DRM
@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 12,
'episode': 'Episode 8',
'episode_number': 8,
'categories': ['Intrattenimento'],
},
'params': {
'skip_download': True,

View file

@ -53,6 +53,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'chapters': 'count:1',
'tags': 'count:4',
'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
'media_type': 'Full Episode',
},
'params': {
'skip_download': 'm3u8',
@ -131,6 +133,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'tags': 'count:10',
'age_limit': 0,
'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/Quantum Leap 2022'],
'media_type': 'Highlight',
},
'params': {
'skip_download': 'm3u8',

View file

@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
'timestamp': 1475678834,
'upload_date': '20161005',
'uploader': 'SCNI-SCND',
'tags': 'count:10',
'creator': 'Cooking Channel',
'duration': 29.995,
'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}],
'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',

View file

@ -104,6 +104,10 @@ class ThePlatformBaseIE(OnceIE):
_add_chapter(chapter.get('startTime'), chapter.get('endTime'))
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
def extract_site_specific_field(field):
# A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
return {
'title': info['title'],
'subtitles': subtitles,
@ -113,6 +117,14 @@ class ThePlatformBaseIE(OnceIE):
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
'chapters': chapters,
'creator': traverse_obj(info, ('author', {str})) or None,
'categories': traverse_obj(info, (
'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
'location': extract_site_specific_field('region'),
'series': extract_site_specific_field('show'),
'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
}
def _extract_theplatform_metadata(self, path, video_id):