From b532556d0a85e7d76f8f0880861232fb706ddbc5 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Tue, 19 Sep 2023 21:52:44 +0200 Subject: [PATCH] [ie/pr0gramm] Rewrite extractor (#8151) Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/pr0gramm.py | 218 ++++++++++++++++++++------------ 2 files changed, 139 insertions(+), 81 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index dd670d59c2..490b010b8d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1524,7 +1524,7 @@ from .puhutv import ( PuhuTVIE, PuhuTVSerieIE, ) -from .pr0gramm import Pr0grammStaticIE, Pr0grammIE +from .pr0gramm import Pr0grammIE from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 2eb327fba1..c8e0bb493b 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,97 +1,155 @@ -import re +import json +from datetime import date +from urllib.parse import unquote from .common import InfoExtractor -from ..utils import merge_dicts +from ..compat import functools +from ..utils import ExtractorError, make_archive_id, urljoin +from ..utils.traversal import traverse_obj -class Pr0grammStaticIE(InfoExtractor): - # Possible urls: - # https://pr0gramm.com/static/5466437 - _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://pr0gramm.com/static/5466437', - 'md5': '52fa540d70d3edc286846f8ca85938aa', - 'info_dict': { - 'id': '5466437', - 'ext': 'mp4', - 'title': 'pr0gramm-5466437 by g11st', - 'uploader': 'g11st', - 'upload_date': '20221221', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # Fetch media sources - entries = self._parse_html5_media_entries(url, webpage, video_id) - media_info = entries[0] - - # Fetch author - 
uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') - - # Fetch approx upload timestamp from filename - # Have None-defaults in case the extraction fails - uploadDay = None - uploadMon = None - uploadYear = None - uploadTimestr = None - # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) - m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage) - - if (m): - # Up to a day of accuracy should suffice... - uploadDay = m.groupdict().get('day') - uploadMon = m.groupdict().get('mon') - uploadYear = m.groupdict().get('year') - uploadTimestr = uploadYear + uploadMon + uploadDay - - return merge_dicts({ - 'id': video_id, - 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), - 'uploader': uploader, - 'upload_date': uploadTimestr - }, media_info) - - -# This extractor is for the primary url (used for sharing, and appears in the -# location bar) Since this page loads the DOM via JS, yt-dl can't find any -# video information here. So let's redirect to a compatibility version of -# the site, which does contain the