[ie/patreon] Fix comments extraction (#11530)

Closes #11483 Authored by: jshumphrey, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-12-25 21:59:04 +01:00 · 2024-11-16 14:02:14 -06:00 · 2024-11-16 14:02:14 -06:00 · 1d253b0a27
commit 1d253b0a27
parent 720b3dc453
1 changed files with 35 additions and 16 deletions
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@ -16,10 +16,10 @@ from ..utils import (
    parse_iso8601,
    smuggle_url,
    str_or_none,
-    traverse_obj,
    url_or_none,
    urljoin,
 )
+from ..utils.traversal import traverse_obj, value


 class PatreonBaseIE(InfoExtractor):
@ -252,6 +252,27 @@ class PatreonIE(PatreonBaseIE):
            'thumbnail': r're:^https?://.+',
        },
        'skip': 'Patron-only content',
+    }, {
+        # Contains a comment reply in the 'included' section
+        'url': 'https://www.patreon.com/posts/114721679',
+        'info_dict': {
+            'id': '114721679',
+            'ext': 'mp4',
+            'upload_date': '20241025',
+            'uploader': 'Japanalysis',
+            'like_count': int,
+            'thumbnail': r're:^https?://.+',
+            'comment_count': int,
+            'title': 'Karasawa Part 2',
+            'description': 'Part 2 of this video https://www.youtube.com/watch?v=Azms2-VTASk',
+            'uploader_url': 'https://www.patreon.com/japanalysis',
+            'uploader_id': '80504268',
+            'channel_url': 'https://www.patreon.com/japanalysis',
+            'channel_follower_count': int,
+            'timestamp': 1729897015,
+            'channel_id': '9346307',
+        },
+        'params': {'getcomments': True},
    }]
    _RETURN_TYPE = 'video'

@ -404,26 +425,24 @@ class PatreonIE(PatreonBaseIE):
                f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}')

            cursor = None
-            for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)):
+            for comment in traverse_obj(response, (('data', 'included'), lambda _, v: v['type'] == 'comment' and v['id'])):
                count += 1
-                comment_id = comment.get('id')
-                attributes = comment.get('attributes') or {}
-                if comment_id is None:
-                    continue
                author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id'))
-                author_info = traverse_obj(
-                    response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'),
-                    get_all=False, expected_type=dict, default={})

                yield {
-                    'id': comment_id,
-                    'text': attributes.get('body'),
-                    'timestamp': parse_iso8601(attributes.get('created')),
-                    'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'),
-                    'author_is_uploader': attributes.get('is_by_creator'),
+                    **traverse_obj(comment, {
+                        'id': ('id', {str_or_none}),
+                        'text': ('attributes', 'body', {str}),
+                        'timestamp': ('attributes', 'created', {parse_iso8601}),
+                        'parent': ('relationships', 'parent', 'data', ('id', {value('root')}), {str}, any),
+                        'author_is_uploader': ('attributes', 'is_by_creator', {bool}),
+                    }),
+                    **traverse_obj(response, (
+                        'included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes', {
+                            'author': ('full_name', {str}),
+                            'author_thumbnail': ('image_url', {url_or_none}),
+                        }), get_all=False),
                    'author_id': author_id,
-                    'author': author_info.get('full_name'),
-                    'author_thumbnail': author_info.get('image_url'),
                }

            if count < traverse_obj(response, ('meta', 'count')):