[TikTok] Extract captions (#2185)

Closes #2184 Authored by: MinePlayersPE
2024-12-28 22:24:34 +01:00 · 2022-01-20 05:35:27 +07:00 · 2022-01-20 05:35:27 +07:00 · e0585e6562
commit e0585e6562
parent 426764371f
1 changed files with 27 additions and 0 deletions
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@ -17,6 +17,7 @@ from ..utils import (
    int_or_none,
    join_nonempty,
    LazyList,
    srt_subtitles_timecode,
    str_or_none,
    traverse_obj,
    try_get,
@ -83,6 +84,27 @@ class TikTokBaseIE(InfoExtractor):
                'Accept': 'application/json',
            }, query=real_query)
    def _get_subtitles(self, aweme_detail, aweme_id):
        # TODO: Extract text positioning info
        subtitles = {}
        captions_info = traverse_obj(
            aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[])
        for caption in captions_info:
            caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
            if not caption_url:
                continue
            caption_json = self._download_json(
                caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
            if not caption_json:
                continue
            subtitles.setdefault(caption.get('language', 'en'), []).append({
                'ext': 'srt',
                'data': '\n\n'.join(
                    f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
                    for i, line in enumerate(caption_json['utterances']) if line.get('text'))
            })
        return subtitles
    def _parse_aweme_video_app(self, aweme_detail):
        aweme_id = aweme_detail['aweme_id']
        video_info = aweme_detail['video']
@ -218,6 +240,7 @@ class TikTokBaseIE(InfoExtractor):
            'artist': music_author,
            'timestamp': int_or_none(aweme_detail.get('create_time')),
            'formats': formats,
            'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
            'thumbnails': thumbnails,
            'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
            'availability': self._availability(
@ -396,6 +419,10 @@ class TikTokIE(TikTokBaseIE):
            'comment_count': int,
        },
        'expected_warnings': ['Video not available']
    }, {
        # Auto-captions available
        'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
        'only_matching': True
    }]
    def _extract_aweme_app(self, aweme_id):