diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 3b0ef323d7..20e5e944fc 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -63,14 +63,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index c8702c3569..4aeff7dc64 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -75,14 +75,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 5a6d2b0fbd..2f516ebb71 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -71,14 +71,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index a17770f614..201586e9dc 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -56,14 +56,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index c600a9dcb6..765de86a29 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -52,14 +52,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 57bc9daf51..198e21bec2 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -58,14 +58,15 @@ body: placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 495d3c6306..a211ae1652 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ on: default: true type: boolean windows: - description: yt-dlp.exe, yt-dlp_min.exe, yt-dlp_win.zip + description: yt-dlp.exe, yt-dlp_win.zip default: true type: boolean windows32: @@ -199,22 +199,24 @@ jobs: GITHUB_WORKFLOW: build githubToken: ${{ github.token }} # To cache image arch: ${{ matrix.architecture }} - distro: ubuntu18.04 # Standalone executable should be built on minimum supported OS + distro: ubuntu20.04 # Standalone executable should be built on minimum supported OS dockerRunArgs: --volume "${PWD}/repo:/repo" install: | # Installing Python 3.10 from the Deadsnakes repo raises errors apt update - apt -y install zlib1g-dev libffi-dev python3.8 python3.8-dev python3.8-distutils python3-pip - python3.8 -m pip install -U pip setuptools wheel - # Cannot access any files from the repo directory at this stage - python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage cffi + apt -y install zlib1g-dev libffi-dev python3.9 python3.9-dev python3.9-distutils python3-pip \ + python3-secretstorage # Cannot build cryptography wheel in virtual armv7 environment + python3.9 -m pip install -U pip wheel 'setuptools>=71.0.2' + # XXX: Keep this in sync with pyproject.toml (it can't be accessed at this stage) and exclude secretstorage + python3.9 -m pip install -U Pyinstaller mutagen pycryptodomex brotli certifi cffi \ + 'requests>=2.32.2,<3' 'urllib3>=1.26.17,<3' 'websockets>=13.0' run: | cd repo - python3.8 devscripts/install_deps.py -o --include build - python3.8 devscripts/install_deps.py --include pyinstaller --include secretstorage # Cached version may be out of date - python3.8 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" - python3.8 devscripts/make_lazy_extractors.py - python3.8 -m bundle.pyinstaller + python3.9 devscripts/install_deps.py -o --include build + python3.9 devscripts/install_deps.py --include pyinstaller # Cached versions may be out of date + python3.9 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" + python3.9 devscripts/make_lazy_extractors.py + python3.9 -m bundle.pyinstaller if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}" @@ -403,13 +405,13 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - with: # 3.8 is used for Win7 support - python-version: "3.8" + with: + python-version: "3.10" - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.11.1-py3-none-any.whl" - name: Prepare run: | @@ -419,22 +421,12 @@ jobs: run: | python -m bundle.pyinstaller python -m bundle.pyinstaller --onedir - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_real.exe Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath 
./dist/yt-dlp_win.zip - - name: Install Requirements (py2exe) - run: | - python devscripts/install_deps.py --include py2exe - - name: Build (py2exe) - run: | - python -m bundle.py2exe - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe - Move-Item ./dist/yt-dlp_real.exe ./dist/yt-dlp.exe - - name: Verify --update-to if: vars.UPDATE_TO_VERIFICATION run: | - foreach ($name in @("yt-dlp","yt-dlp_min")) { + foreach ($name in @("yt-dlp")) { Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" $version = & "./dist/${name}.exe" --version & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 @@ -450,7 +442,6 @@ jobs: name: build-bin-${{ github.job }} path: | dist/yt-dlp.exe - dist/yt-dlp_min.exe dist/yt-dlp_win.zip compression-level: 0 @@ -463,13 +454,13 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" architecture: "x86" - name: Install Requirements run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.11.1-py3-none-any.whl" - name: Prepare run: | @@ -513,7 +504,8 @@ jobs: - windows32 runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - name: Download artifacts + uses: actions/download-artifact@v4 with: path: artifact pattern: build-bin-* @@ -537,13 +529,29 @@ jobs: lock 2022.08.18.36 .+ Python 3\.6 lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lock 2024.10.22 py2exe .+ + lock 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lock 2024.10.22 (?!\w+_exe).+ Python 3\.8 + lock 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp 2024.10.22 py2exe .+ + lockV2 yt-dlp/yt-dlp 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp 2024.10.22 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 py2exe .+ + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.045052 py2exe .+ + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 (?!\w+_exe).+ Python 3\.8 + lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) EOF - name: Sign checksum files diff --git 
a/.github/workflows/core.yml b/.github/workflows/core.yml index a5cb6c9707..9a4342a585 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -36,16 +36,20 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.8 is in quick-test - python-version: ['3.9', '3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] + # CPython 3.9 is in quick-test + python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.8' + python-version: '3.9' + - os: windows-latest + python-version: '3.10' - os: windows-latest python-version: '3.12' - os: windows-latest - python-version: pypy-3.9 + python-version: '3.13' + - os: windows-latest + python-version: pypy-3.10 steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 7256804d93..6849fba9b6 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -28,13 +28,13 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] + python-version: ['3.10', '3.11', '3.12', '3.13', pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.8' + python-version: '3.9' - os: windows-latest - python-version: pypy-3.9 + python-version: pypy-3.10 steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index cce7cbac1e..1a32bbfe31 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -10,10 +10,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install test requirements run: python3 ./devscripts/install_deps.py -o --include test - name: Run tests @@ -29,7 +29,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Install dev dependencies run: python3 ./devscripts/install_deps.py -o --include static-analysis - name: Make lazy extractors diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml index c49319b171..78445e417e 100644 --- a/.github/workflows/release-master.yml +++ b/.github/workflows/release-master.yml @@ -28,3 +28,20 @@ jobs: actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit + + publish_pypi: + needs: [release] + if: vars.MASTER_PYPI_PROJECT != '' + runs-on: ubuntu-latest + permissions: + id-token: write # mandatory for trusted publishing + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: dist + name: build-pypi + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + verbose: true diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index b536c50669..8f72844058 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -41,3 +41,20 @@ jobs: actions: write # For cleaning up cache id-token: write # mandatory for trusted publishing secrets: inherit + + publish_pypi: + needs: [release] + if: vars.NIGHTLY_PYPI_PROJECT != '' + runs-on: ubuntu-latest + permissions: + id-token: 
write # mandatory for trusted publishing + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: dist + name: build-pypi + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + verbose: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8d0bc4026a..26b93e429c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,10 +2,6 @@ name: Release on: workflow_call: inputs: - prerelease: - required: false - default: true - type: boolean source: required: false default: '' @@ -18,6 +14,10 @@ on: required: false default: '' type: string + prerelease: + required: false + default: true + type: boolean workflow_dispatch: inputs: source: @@ -278,7 +278,17 @@ jobs: make clean-cache python -m build --no-isolation . + - name: Upload artifacts + if: github.event_name != 'workflow_dispatch' + uses: actions/upload-artifact@v4 + with: + name: build-pypi + path: | + dist/* + compression-level: 0 + - name: Publish to PyPI + if: github.event_name == 'workflow_dispatch' uses: pypa/gh-action-pypi-publish@release/v1 with: verbose: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dbae6476f6..fd7b0f1210 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -37,14 +37,18 @@ Bugs and suggestions should be reported at: [yt-dlp/yt-dlp/issues](https://githu **Please include the full output of yt-dlp when run with `-vU`**, i.e. **add** `-vU` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` $ yt-dlp -vU -[debug] Command-line config: ['-v', 'demo.com'] -[debug] Encodings: locale UTF-8, fs utf-8, out utf-8, pref UTF-8 -[debug] yt-dlp version 2021.09.25 (zip) -[debug] Python version 3.8.10 (CPython 64bit) - Linux-5.4.0-74-generic-x86_64-with-glibc2.29 -[debug] exe versions: ffmpeg 4.2.4, ffprobe 4.2.4 +[debug] Command-line config: ['-vU', 'https://www.example.com/'] +[debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 +[debug] yt-dlp version nightly@... from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) +[debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) +[debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 +[debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} -Current Build Hash 25cc412d1d3c0725a1f2f5b7e4682f6fb40e6d15f7024e96f7afd572e9919535 -yt-dlp is up to date (2021.09.25) +[debug] Request Handlers: urllib, requests, websockets, curl_cffi +[debug] Loaded 1838 extractors +[debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest +Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds +yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) ... ``` **Do not post screenshots of verbose logs; only plain text is acceptable.** @@ -233,7 +237,7 @@ After you have ensured this site is distributing its content legally, you can fo # * MD5 checksum; start the string with 'md5:', e.g. # 'description': 'md5:098f6bcd4621d373cade4e832627b4f6', # * A regular expression; start the string with 're:', e.g. - # 'thumbnail': r're:^https?://.*\.jpg$', + # 'thumbnail': r're:https?://.*\.jpg$', # * A count of elements in a list; start the string with 'count:', e.g. 
# 'tags': 'count:10', + # * Any Python type, e.g. @@ -268,7 +272,7 @@ After you have ensured this site is distributing its content legally, you can fo You can use `hatch fmt` to automatically fix problems. Rules that the linter/formatter enforces should not be disabled with `# noqa` unless a maintainer requests it. The only exception allowed is for old/printf-style string formatting in GraphQL query templates (use `# noqa: UP031`). -1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.8 and above. Backward compatibility is not required for even older versions of Python. +1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython >=3.9 and PyPy >=3.10. Backward compatibility is not required for even older versions of Python. 1. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: ```shell @@ -302,10 +306,9 @@ Extractors are very fragile by nature since they depend on the layout of the sou For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L119-L440) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: - `id` (media identifier) - - `title` (media title) - `url` (media download URL) or `formats` -The aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. While all extractors must return a `title`, they must also allow it's extraction to be non-fatal. +The aforementioned metadata fields are the critical data without which extraction does not make any sense. If any of them fail to be extracted, then the extractor is considered broken. All other metadata extraction should be completely non-fatal. For pornographic sites, appropriate `age_limit` must also be returned. diff --git a/CONTRIBUTORS b/CONTRIBUTORS index bcdf6a0c24..9b8207b28b 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -678,3 +678,32 @@ coreywright eric321 poyhen tetra-fox +444995 +63427083 +allendema +DarkZeros +DTrombett +imranh2 +KarboniteKream +mikkovedru +pktiuk +rubyevadestaxes +avagordon01 +CounterPillow +JoseAngelB +KBelmin +kesor +MellowKyler +Wesley107772 +a13ssandr0 +ChocoLZS +doe1080 +hugovdev +jshumphrey +julionc +manavchaudhary1 +powergold1 +Sakura286 +SamDecrock +stratus-ss +subrat-lima diff --git a/Changelog.md b/Changelog.md index 10fd437fa1..41a2da744d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,170 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.11.18 + +#### Important changes +- **Login with OAuth is no longer supported for YouTube** +Due to a change made by the site, yt-dlp is no longer able to support OAuth login for YouTube. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/11462#issuecomment-2471703090) + +#### Core changes +- [Catch broken Cryptodome installations](https://github.com/yt-dlp/yt-dlp/commit/b83ca24eb72e1e558b0185bd73975586c0bc0546) ([#11486](https://github.com/yt-dlp/yt-dlp/issues/11486)) by [seproDev](https://github.com/seproDev) +- **utils** + - [Fix `join_nonempty`, add `**kwargs` to `unpack`](https://github.com/yt-dlp/yt-dlp/commit/39d79c9b9cf23411d935910685c40aa1a2fdb409) ([#11559](https://github.com/yt-dlp/yt-dlp/issues/11559)) by [Grub4K](https://github.com/Grub4K) + - `subs_list_to_dict`: [Add `lang` default parameter](https://github.com/yt-dlp/yt-dlp/commit/c014fbcddcb4c8f79d914ac5bb526758b540ea33) ([#11508](https://github.com/yt-dlp/yt-dlp/issues/11508)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Allow `ext` override for thumbnails](https://github.com/yt-dlp/yt-dlp/commit/eb64ae7d5def6df2aba74fb703e7f168fb299865) ([#11545](https://github.com/yt-dlp/yt-dlp/issues/11545)) by [bashonly](https://github.com/bashonly) +- **adobepass**: [Fix provider requests](https://github.com/yt-dlp/yt-dlp/commit/85fdc66b6e01d19a94b4f39b58e3c0cf23600902) ([#11472](https://github.com/yt-dlp/yt-dlp/issues/11472)) by [bashonly](https://github.com/bashonly) +- **archive.org**: [Fix comments extraction](https://github.com/yt-dlp/yt-dlp/commit/f2a4983df7a64c4e93b56f79dbd16a781bd90206) ([#11527](https://github.com/yt-dlp/yt-dlp/issues/11527)) by [jshumphrey](https://github.com/jshumphrey) +- **bandlab**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/6365e92589e4bc17b8fffb0125a716d144ad2137) ([#11535](https://github.com/yt-dlp/yt-dlp/issues/11535)) by [seproDev](https://github.com/seproDev) +- **chaturbate** + - [Extract from API and support impersonation](https://github.com/yt-dlp/yt-dlp/commit/720b3dc453c342bc2e8df7dbc0acaab4479de46c) ([#11555](https://github.com/yt-dlp/yt-dlp/issues/11555)) by [powergold1](https://github.com/powergold1) (With fixes in [7cecd29](https://github.com/yt-dlp/yt-dlp/commit/7cecd299e4a5ef1f0f044b2fedc26f17e41f15e3) by [seproDev](https://github.com/seproDev)) + - [Support alternate domains](https://github.com/yt-dlp/yt-dlp/commit/a9f85670d03ab993dc589f21a9ffffcad61392d5) ([#10595](https://github.com/yt-dlp/yt-dlp/issues/10595)) by [manavchaudhary1](https://github.com/manavchaudhary1) +- **cloudflarestream**: [Avoid extraction via videodelivery.net](https://github.com/yt-dlp/yt-dlp/commit/2db8c2e7d57a1784b06057c48e3e91023720d195) ([#11478](https://github.com/yt-dlp/yt-dlp/issues/11478)) by [hugovdev](https://github.com/hugovdev) +- **ctvnews** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f351440f1dc5b3dfbfc5737b037a869d946056fe) ([#11534](https://github.com/yt-dlp/yt-dlp/issues/11534)) by [bashonly](https://github.com/bashonly), [jshumphrey](https://github.com/jshumphrey) + - [Fix playlist ID extraction](https://github.com/yt-dlp/yt-dlp/commit/f9d98509a898737c12977b2e2117277bada2c196) ([#8892](https://github.com/yt-dlp/yt-dlp/issues/8892)) by [qbnu](https://github.com/qbnu) +- **digitalconcerthall**: [Support login with access/refresh tokens](https://github.com/yt-dlp/yt-dlp/commit/f7257588bdff5f0b0452635a66b253a783c97357) ([#11571](https://github.com/yt-dlp/yt-dlp/issues/11571)) by [bashonly](https://github.com/bashonly) +- **facebook**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/bacc31b05a04181b63100c481565256b14813a5e) ([#11513](https://github.com/yt-dlp/yt-dlp/issues/11513)) by 
[bashonly](https://github.com/bashonly) +- **gamedevtv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/be3579aaf0c3b71a0a3195e1955415d5e4d6b3d8) ([#11368](https://github.com/yt-dlp/yt-dlp/issues/11368)) by [bashonly](https://github.com/bashonly), [stratus-ss](https://github.com/stratus-ss) +- **goplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6b43a8d84b881d769b480ba6e20ec691e9d1b92d) ([#11466](https://github.com/yt-dlp/yt-dlp/issues/11466)) by [bashonly](https://github.com/bashonly), [SamDecrock](https://github.com/SamDecrock) +- **kenh14**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/eb15fd5a32d8b35ef515f7a3d1158c03025648ff) ([#3996](https://github.com/yt-dlp/yt-dlp/issues/3996)) by [krichbanana](https://github.com/krichbanana), [pzhlkj6612](https://github.com/pzhlkj6612) +- **litv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/e079ffbda66de150c0a9ebef05e89f61bb4d5f76) ([#11071](https://github.com/yt-dlp/yt-dlp/issues/11071)) by [jiru](https://github.com/jiru) +- **mixchmovie**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0ec9bfed4d4a52bfb4f8733da1acf0aeeae21e6b) ([#10897](https://github.com/yt-dlp/yt-dlp/issues/10897)) by [Sakura286](https://github.com/Sakura286) +- **patreon**: [Fix comments extraction](https://github.com/yt-dlp/yt-dlp/commit/1d253b0a27110d174c40faf8fb1c999d099e0cde) ([#11530](https://github.com/yt-dlp/yt-dlp/issues/11530)) by [bashonly](https://github.com/bashonly), [jshumphrey](https://github.com/jshumphrey) +- **pialive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/d867f99622ef7fba690b08da56c39d739b822bb7) ([#10811](https://github.com/yt-dlp/yt-dlp/issues/10811)) by [ChocoLZS](https://github.com/ChocoLZS) +- **radioradicale**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/70c55cb08f780eab687e881ef42bb5c6007d290b) ([#5607](https://github.com/yt-dlp/yt-dlp/issues/5607)) by [a13ssandr0](https://github.com/a13ssandr0), [pzhlkj6612](https://github.com/pzhlkj6612) +- **reddit**: [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/7ea2787920cccc6b8ea30791993d114fbd564434) ([#11573](https://github.com/yt-dlp/yt-dlp/issues/11573)) by [bashonly](https://github.com/bashonly) +- **redgifsuser**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/d215fba7edb69d4fa665f43663756fd260b1489f) ([#11531](https://github.com/yt-dlp/yt-dlp/issues/11531)) by [jshumphrey](https://github.com/jshumphrey) +- **rutube**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/e398217aae19bb25f91797bfbe8a3243698d7f45) ([#11480](https://github.com/yt-dlp/yt-dlp/issues/11480)) by [seproDev](https://github.com/seproDev) +- **sonylivseries**: [Add `sort_order` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/2009cb27e17014787bf63eaa2ada51293d54f22a) ([#11569](https://github.com/yt-dlp/yt-dlp/issues/11569)) by [bashonly](https://github.com/bashonly) +- **soop**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/c699bafc5038b59c9afe8c2e69175fb66424c832) ([#11545](https://github.com/yt-dlp/yt-dlp/issues/11545)) by [bashonly](https://github.com/bashonly) +- **spankbang**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/8388ec256f7753b02488788e3cfa771f6e1db247) ([#11542](https://github.com/yt-dlp/yt-dlp/issues/11542)) by [jshumphrey](https://github.com/jshumphrey) +- **spreaker** + - [Support episode pages and access keys](https://github.com/yt-dlp/yt-dlp/commit/c39016f66df76d14284c705736ca73db8055d8de) 
([#11489](https://github.com/yt-dlp/yt-dlp/issues/11489)) by [julionc](https://github.com/julionc) + - [Support podcast and feed pages](https://github.com/yt-dlp/yt-dlp/commit/c6737310619022248f5d0fd13872073cac168453) ([#10968](https://github.com/yt-dlp/yt-dlp/issues/10968)) by [subrat-lima](https://github.com/subrat-lima) +- **youtube** + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/637d62a3a9fc723d68632c1af25c30acdadeeb85) ([#11528](https://github.com/yt-dlp/yt-dlp/issues/11528)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + - [Remove broken OAuth support](https://github.com/yt-dlp/yt-dlp/commit/52c0ffe40ad6e8404d93296f575007b05b04c686) ([#11558](https://github.com/yt-dlp/yt-dlp/issues/11558)) by [bashonly](https://github.com/bashonly) + - tab: [Fix podcasts tab extraction](https://github.com/yt-dlp/yt-dlp/commit/37cd7660eaff397c551ee18d80507702342b0c2b) ([#11567](https://github.com/yt-dlp/yt-dlp/issues/11567)) by [seproDev](https://github.com/seproDev) + +#### Misc. changes +- **build** + - [Bump PyInstaller version pin to `>=6.11.1`](https://github.com/yt-dlp/yt-dlp/commit/f9c8deb4e5887ff5150e911ac0452e645f988044) ([#11507](https://github.com/yt-dlp/yt-dlp/issues/11507)) by [bashonly](https://github.com/bashonly) + - [Enable attestations for trusted publishing](https://github.com/yt-dlp/yt-dlp/commit/f13df591d4d7ca8e2f31b35c9c91e69ba9e9b013) ([#11420](https://github.com/yt-dlp/yt-dlp/issues/11420)) by [bashonly](https://github.com/bashonly) + - [Pin `websockets` version to >=13.0,<14](https://github.com/yt-dlp/yt-dlp/commit/240a7d43c8a67ffb86d44dc276805aa43c358dcc) ([#11488](https://github.com/yt-dlp/yt-dlp/issues/11488)) by [bashonly](https://github.com/bashonly) +- **cleanup** + - [Deprecate more compat functions](https://github.com/yt-dlp/yt-dlp/commit/f95a92b3d0169a784ee15a138fbe09d82b2754a1) ([#11439](https://github.com/yt-dlp/yt-dlp/issues/11439)) by [seproDev](https://github.com/seproDev) + - [Remove dead extractors](https://github.com/yt-dlp/yt-dlp/commit/10fc719bc7f1eef469389c5219102266ef411f29) ([#11566](https://github.com/yt-dlp/yt-dlp/issues/11566)) by [doe1080](https://github.com/doe1080) + - Miscellaneous: [da252d9](https://github.com/yt-dlp/yt-dlp/commit/da252d9d322af3e2178ac5eae324809502a0a862) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) + +### 2024.11.04 + +#### Important changes +- **Beginning with this release, yt-dlp's Python dependencies *must* be installed using the `default` group** +If you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255) +- **The minimum *required* Python version has been raised to 3.9** +Python 3.8 reached its end-of-life on 2024.10.07, and yt-dlp has now removed support for it. As an unfortunate side effect, the official `yt-dlp.exe` and `yt-dlp_x86.exe` binaries are no longer supported on Windows 7. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10086) + +#### Core changes +- [Allow thumbnails with `.jpe` extension](https://github.com/yt-dlp/yt-dlp/commit/5bc5fb2835ea59bdf326bd12176d74d2c7348a95) ([#11408](https://github.com/yt-dlp/yt-dlp/issues/11408)) by [bashonly](https://github.com/bashonly) +- [Expand paths in `--plugin-dirs`](https://github.com/yt-dlp/yt-dlp/commit/914af9a0cf51c9a3f74aa88d952bee8334c67511) ([#11334](https://github.com/yt-dlp/yt-dlp/issues/11334)) by [bashonly](https://github.com/bashonly) +- [Fix `--netrc` empty string parsing for Python <=3.10](https://github.com/yt-dlp/yt-dlp/commit/88402b714ec124633933737bc156b172a3dec3d6) ([#11414](https://github.com/yt-dlp/yt-dlp/issues/11414)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- [Populate format sorting fields before dependent fields](https://github.com/yt-dlp/yt-dlp/commit/5c880ef42e9c2b2fc412f6d69dad37d34fb75a62) ([#11353](https://github.com/yt-dlp/yt-dlp/issues/11353)) by [Grub4K](https://github.com/Grub4K) +- [Prioritize AV1](https://github.com/yt-dlp/yt-dlp/commit/3945677a75e94a1fecc085432d791e1c21220cd3) ([#11153](https://github.com/yt-dlp/yt-dlp/issues/11153)) by [seproDev](https://github.com/seproDev) +- [Remove Python 3.8 support](https://github.com/yt-dlp/yt-dlp/commit/d784464399b600ba9516bbcec6286f11d68974dd) ([#11321](https://github.com/yt-dlp/yt-dlp/issues/11321)) by [bashonly](https://github.com/bashonly) +- **aes**: [Fix GCM pad length calculation](https://github.com/yt-dlp/yt-dlp/commit/beae2db127d3b5017cbcf685da9de7a9ef496541) ([#11438](https://github.com/yt-dlp/yt-dlp/issues/11438)) by [seproDev](https://github.com/seproDev) +- **cookies**: [Support chrome table version 24](https://github.com/yt-dlp/yt-dlp/commit/4613096f2e6eab9dcbac0e98b6cec760bbc99375) ([#11425](https://github.com/yt-dlp/yt-dlp/issues/11425)) by [kesor](https://github.com/kesor), [seproDev](https://github.com/seproDev) +- **utils** + - [Allow partial application for more functions](https://github.com/yt-dlp/yt-dlp/commit/b6dc2c49e8793c6dfa21275e61caf49ec1148b81) ([#11391](https://github.com/yt-dlp/yt-dlp/issues/11391)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) (With fixes in [422195e](https://github.com/yt-dlp/yt-dlp/commit/422195ec70a00b0d2002b238cacbae7790c57fdf) by [Grub4K](https://github.com/Grub4K)) + - [Fix `find_element` by class](https://github.com/yt-dlp/yt-dlp/commit/f93c16395cea1fe9ffc3c594d3e019c3b214544c) ([#11402](https://github.com/yt-dlp/yt-dlp/issues/11402)) by [bashonly](https://github.com/bashonly) + - [Fix and improve `find_element` and `find_elements`](https://github.com/yt-dlp/yt-dlp/commit/b103aca24d35b72b405c340357dc01a0ed534281) ([#11443](https://github.com/yt-dlp/yt-dlp/issues/11443)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Resolve `language` to ISO639-2 for ISM formats](https://github.com/yt-dlp/yt-dlp/commit/21cdcf03a237a0c4979c941d5a5385cae44c7906) ([#11359](https://github.com/yt-dlp/yt-dlp/issues/11359)) by [bashonly](https://github.com/bashonly) +- **ardmediathek**: [Extract chapters](https://github.com/yt-dlp/yt-dlp/commit/59f8dd8239c31f00b708da53b39b1e2e9409b6e6) ([#11442](https://github.com/yt-dlp/yt-dlp/issues/11442)) by [iw0nderhow](https://github.com/iw0nderhow) +- **bfmtv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/754940e9a558565d6bd3c0c529802569b1d0ae4e) ([#11444](https://github.com/yt-dlp/yt-dlp/issues/11444)) by 
[seproDev](https://github.com/seproDev) +- **bluesky**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5c7a5aaab27e9c3cb367b663a6136ca58866e547) ([#11055](https://github.com/yt-dlp/yt-dlp/issues/11055)) by [MellowKyler](https://github.com/MellowKyler), [seproDev](https://github.com/seproDev) +- **ccma**: [Support new 3cat.cat domain](https://github.com/yt-dlp/yt-dlp/commit/330335386d4f7603d92d6796798375336005275e) ([#11222](https://github.com/yt-dlp/yt-dlp/issues/11222)) by [JoseAngelB](https://github.com/JoseAngelB) +- **chzzk**: video: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/9c6534da81e485b2325b3489ee4128943e6d3e4b) ([#11228](https://github.com/yt-dlp/yt-dlp/issues/11228)) by [hui1601](https://github.com/hui1601) +- **cnn**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9acf79c91a8c6c55ca972747c6858e784e2da351) ([#10185](https://github.com/yt-dlp/yt-dlp/issues/10185)) by [kylegustavo](https://github.com/kylegustavo), [seproDev](https://github.com/seproDev) +- **dailymotion** + - [Improve embed extraction](https://github.com/yt-dlp/yt-dlp/commit/a403dcf9be20b49cbb3017328f4aaa352fb6d685) ([#10843](https://github.com/yt-dlp/yt-dlp/issues/10843)) by [bashonly](https://github.com/bashonly), [pzhlkj6612](https://github.com/pzhlkj6612) + - [Support shortened URLs](https://github.com/yt-dlp/yt-dlp/commit/d1358231371f20fa23020fa9176be3b56119873e) ([#11374](https://github.com/yt-dlp/yt-dlp/issues/11374)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **facebook**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/ec9b25043f399de6a591d8370d32bf0e66c117f2) ([#11343](https://github.com/yt-dlp/yt-dlp/issues/11343)) by [kclauhk](https://github.com/kclauhk) +- **generic**: [Do not impersonate by default](https://github.com/yt-dlp/yt-dlp/commit/c29f5a7fae93a08f3cfbb6127b2faa75145b06a0) ([#11336](https://github.com/yt-dlp/yt-dlp/issues/11336)) by [bashonly](https://github.com/bashonly) +- **nfl**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/838f4385de8300a4dd4e7ffbbf0e5b7b85fb52c2) ([#11409](https://github.com/yt-dlp/yt-dlp/issues/11409)) by [bashonly](https://github.com/bashonly) +- **niconicouser**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6abef74232c0fc695cd803c18ae446cacb129389) ([#11324](https://github.com/yt-dlp/yt-dlp/issues/11324)) by [Wesley107772](https://github.com/Wesley107772) +- **soundcloud**: [Extract artists](https://github.com/yt-dlp/yt-dlp/commit/f101e5d34c97c608156ad5396714c2a2edca966a) ([#11377](https://github.com/yt-dlp/yt-dlp/issues/11377)) by [seproDev](https://github.com/seproDev) +- **tumblr**: [Support more URLs](https://github.com/yt-dlp/yt-dlp/commit/b03267bf0675eeb8df5baf1daac7cf67840c91a5) ([#6057](https://github.com/yt-dlp/yt-dlp/issues/6057)) by [selfisekai](https://github.com/selfisekai), [seproDev](https://github.com/seproDev) +- **twitter**: [Remove cookies migration workaround](https://github.com/yt-dlp/yt-dlp/commit/76802f461332d444e596437c42374fa237fa5174) ([#11392](https://github.com/yt-dlp/yt-dlp/issues/11392)) by [bashonly](https://github.com/bashonly) +- **vimeo**: [Fix API retries](https://github.com/yt-dlp/yt-dlp/commit/57212a5f97ce367590aaa5c3e9a135eead8f81f7) ([#11351](https://github.com/yt-dlp/yt-dlp/issues/11351)) by [bashonly](https://github.com/bashonly) +- **yle_areena**: [Support live events](https://github.com/yt-dlp/yt-dlp/commit/a6783a3b9905e547f6c1d4df9d7c7999feda8afa) ([#11358](https://github.com/yt-dlp/yt-dlp/issues/11358)) by 
[bashonly](https://github.com/bashonly), [CounterPillow](https://github.com/CounterPillow) +- **youtube**: [Adjust OAuth refresh token handling](https://github.com/yt-dlp/yt-dlp/commit/d569a8845254d90ce13ad74ae76695e8d6441068) ([#11414](https://github.com/yt-dlp/yt-dlp/issues/11414)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **build** + - [Disable attestations for trusted publishing](https://github.com/yt-dlp/yt-dlp/commit/428ffb75aa3534b275cf54de42693a4d261519da) ([#11418](https://github.com/yt-dlp/yt-dlp/issues/11418)) by [bashonly](https://github.com/bashonly) + - [Move optional dependencies to the `default` group](https://github.com/yt-dlp/yt-dlp/commit/87884f15580910e4e0fe0e1db73508debc657471) ([#11255](https://github.com/yt-dlp/yt-dlp/issues/11255)) by [bashonly](https://github.com/bashonly) + - [Use Ubuntu 20.04 and Python 3.9 for Linux ARM builds](https://github.com/yt-dlp/yt-dlp/commit/dd2e24446954246a2ec4d4a7e95531f52a14b351) ([#8638](https://github.com/yt-dlp/yt-dlp/issues/8638)) by [bashonly](https://github.com/bashonly) +- **cleanup** + - Miscellaneous + - [ea9e35d](https://github.com/yt-dlp/yt-dlp/commit/ea9e35d85fba5eab341cdcaf1eaed69b57f7e465) by [bashonly](https://github.com/bashonly) + - [c998238](https://github.com/yt-dlp/yt-dlp/commit/c998238c2e76c62d1d29962c6e8ebe916cc7913b) by [bashonly](https://github.com/bashonly), [KBelmin](https://github.com/KBelmin) + - [197d0b0](https://github.com/yt-dlp/yt-dlp/commit/197d0b03b6a3c8fe4fa5ace630eeffec629bf72c) by [avagordon01](https://github.com/avagordon01), [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **devscripts**: `make_changelog`: [Parse full commit message for fixes](https://github.com/yt-dlp/yt-dlp/commit/0a3991edae0e10f2ea41ece9fdea5e48f789f1de) ([#11366](https://github.com/yt-dlp/yt-dlp/issues/11366)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +### 2024.10.22 + +#### Important changes +- **Following this release, yt-dlp's Python dependencies *must* be installed using the `default` group** +If you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255) +- **py2exe is no longer supported** +This release's `yt-dlp_min.exe` will be the last, and it's actually a PyInstaller-bundled executable so that yt-dlp users updating their py2exe build with `-U` will be automatically migrated. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10087) + +#### Core changes +- [Add extractor helpers](https://github.com/yt-dlp/yt-dlp/commit/d710a6ca7c622705c0c8c8a3615916f531137d5d) ([#10653](https://github.com/yt-dlp/yt-dlp/issues/10653)) by [Grub4K](https://github.com/Grub4K) +- [Add option `--plugin-dirs`](https://github.com/yt-dlp/yt-dlp/commit/0f593dca9fa995d88eb763170a932da61c8f24dc) ([#11277](https://github.com/yt-dlp/yt-dlp/issues/11277)) by [coletdjnz](https://github.com/coletdjnz), [imranh2](https://github.com/imranh2) +- **cookies**: [Fix compatibility for Python <=3.9 in traceback](https://github.com/yt-dlp/yt-dlp/commit/c5f0f58efd8c3930de8202c15a5c53b1b635bd51) by [Grub4K](https://github.com/Grub4K) +- **utils** + - `Popen`: [Reset PyInstaller environment](https://github.com/yt-dlp/yt-dlp/commit/fbc66e3ab35743cc847a21223c67d88bb463cd9c) ([#11258](https://github.com/yt-dlp/yt-dlp/issues/11258)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - `sanitize_path`: [Reimplement function](https://github.com/yt-dlp/yt-dlp/commit/85b87c991af25dcb35630fa94580fd418e78ee33) ([#11198](https://github.com/yt-dlp/yt-dlp/issues/11198)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **adobepass**: [Use newer user-agent for provider redirect request](https://github.com/yt-dlp/yt-dlp/commit/dcfeea4dd5e5686821350baa6c7767a011944867) ([#11250](https://github.com/yt-dlp/yt-dlp/issues/11250)) by [bashonly](https://github.com/bashonly) +- **afreecatv**: [Adapt extractors to new sooplive.co.kr domain](https://github.com/yt-dlp/yt-dlp/commit/46fe60ff19395698a87113b2944453779e04ab9d) ([#11266](https://github.com/yt-dlp/yt-dlp/issues/11266)) by [63427083](https://github.com/63427083), [bashonly](https://github.com/bashonly) +- **cda**: [Support folders](https://github.com/yt-dlp/yt-dlp/commit/c4d95f67ddc522297bb1fea875255cf94b34d595) ([#10786](https://github.com/yt-dlp/yt-dlp/issues/10786)) by [pktiuk](https://github.com/pktiuk) +- **cwtv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/9d43dcb2c5c38f443f84dfc126cd32720e1a1ad6) ([#11230](https://github.com/yt-dlp/yt-dlp/issues/11230)) by [bashonly](https://github.com/bashonly) +- **drtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f4338714241b11d9d43768ae71a25f5e952f677d) ([#11141](https://github.com/yt-dlp/yt-dlp/issues/11141)) by [444995](https://github.com/444995) +- **funk**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/8de431ec97a4b62b73df8f686b6e21e462775336) ([#11269](https://github.com/yt-dlp/yt-dlp/issues/11269)) by [seproDev](https://github.com/seproDev) +- **gem.cbc.ca**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/40054cb4a7ebbea30d335d444e6f58b298a3baa0) ([#11196](https://github.com/yt-dlp/yt-dlp/issues/11196)) by [DavidSkrundz](https://github.com/DavidSkrundz) +- **generic**: [Impersonate browser by default](https://github.com/yt-dlp/yt-dlp/commit/edfd095b1917701c5046bd51f9542897c17d41a7) ([#11206](https://github.com/yt-dlp/yt-dlp/issues/11206)) by [Grub4K](https://github.com/Grub4K) +- **imgur** + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/87408ccfd772ddf31a8323d8151c24f9577cbc9f) ([#11298](https://github.com/yt-dlp/yt-dlp/issues/11298)) by [seproDev](https://github.com/seproDev) + - [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/5af774d7a36c00bea618c7047c9326532cd3f616) ([#11075](https://github.com/yt-dlp/yt-dlp/issues/11075)) by 
[Deer-Spangle](https://github.com/Deer-Spangle) +- **patreon**: campaign: [Stricter URL matching](https://github.com/yt-dlp/yt-dlp/commit/babb70960595e2146f06f81affc29c7e713e34e2) ([#11235](https://github.com/yt-dlp/yt-dlp/issues/11235)) by [bashonly](https://github.com/bashonly) +- **reddit**: [Detect and raise when login is required](https://github.com/yt-dlp/yt-dlp/commit/cba7868502f04175fecf9ab3e363296aee7ebec2) ([#11202](https://github.com/yt-dlp/yt-dlp/issues/11202)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **substack**: [Resolve podcast file extensions](https://github.com/yt-dlp/yt-dlp/commit/3148c1822f66533998278f0a1cf842b9bea1526a) ([#11275](https://github.com/yt-dlp/yt-dlp/issues/11275)) by [bashonly](https://github.com/bashonly) +- **telecinco**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/0b7ec08816fb196cd41d392f8331b4eb8366c4f8) ([#11142](https://github.com/yt-dlp/yt-dlp/issues/11142)) by [bashonly](https://github.com/bashonly), [DarkZeros](https://github.com/DarkZeros) +- **tubitv**: [Strip extra whitespace from titles](https://github.com/yt-dlp/yt-dlp/commit/e68b4c19af122876561a41f2dd8093fae7b417c7) ([#10795](https://github.com/yt-dlp/yt-dlp/issues/10795)) by [allendema](https://github.com/allendema) +- **tver**: [Support series URLs](https://github.com/yt-dlp/yt-dlp/commit/ceaea731b6e314dbbdfb2e358d7677785ed0b4fc) ([#9507](https://github.com/yt-dlp/yt-dlp/issues/9507)) by [pzhlkj6612](https://github.com/pzhlkj6612), [vvto33](https://github.com/vvto33) +- **twitter**: spaces: [Allow extraction when not logged in](https://github.com/yt-dlp/yt-dlp/commit/679c68240a26481ea7c07cc0c014745631ea8481) ([#11289](https://github.com/yt-dlp/yt-dlp/issues/11289)) by [rubyevadestaxes](https://github.com/rubyevadestaxes) +- **weverse**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5310fa87f6cb7f66bf42e2520878952fbf6b1652) ([#11215](https://github.com/yt-dlp/yt-dlp/issues/11215)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Fix `comment_count` extraction](https://github.com/yt-dlp/yt-dlp/commit/7af1ddaaf2a6a0a750373a9ab53c7770af4f9fe4) ([#11274](https://github.com/yt-dlp/yt-dlp/issues/11274)) by [bashonly](https://github.com/bashonly) + - [Remove broken `android_producer` client](https://github.com/yt-dlp/yt-dlp/commit/fed53d70bdb7d3e37ef63dd7fcf0ef74356167fd) ([#11297](https://github.com/yt-dlp/yt-dlp/issues/11297)) by [bashonly](https://github.com/bashonly) + - [Remove broken age-restriction workaround](https://github.com/yt-dlp/yt-dlp/commit/ec2f4bf0823a13043f98f5bd0bf6677837bf09dc) ([#11297](https://github.com/yt-dlp/yt-dlp/issues/11297)) by [bashonly](https://github.com/bashonly) + - [Support logging in with OAuth](https://github.com/yt-dlp/yt-dlp/commit/b8635c1d4779da195e71aa281f73aaad702c935e) ([#11001](https://github.com/yt-dlp/yt-dlp/issues/11001)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **build** + - [Migrate `py2exe` builds to `win_exe`](https://github.com/yt-dlp/yt-dlp/commit/a886cf3e900f4a2ec00af705f883539269545609) ([#11256](https://github.com/yt-dlp/yt-dlp/issues/11256)) by [bashonly](https://github.com/bashonly) + - [Use `macos-13` image for macOS builds](https://github.com/yt-dlp/yt-dlp/commit/64d84d75ca8c19ec06558cc7c511f5f4f7a822bc) ([#11236](https://github.com/yt-dlp/yt-dlp/issues/11236)) by [bashonly](https://github.com/bashonly) + - `make_lazy_extractors`: [Force running without plugins](https://github.com/yt-dlp/yt-dlp/commit/1a830394a21a81a3e9918f9e175abc9fbb21f089) ([#11205](https://github.com/yt-dlp/yt-dlp/issues/11205)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [67adeb7](https://github.com/yt-dlp/yt-dlp/commit/67adeb7bab00662ba55d473e405b301abb42fe61) by [bashonly](https://github.com/bashonly), [DTrombett](https://github.com/DTrombett), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [KarboniteKream](https://github.com/KarboniteKream), [mikkovedru](https://github.com/mikkovedru), [seproDev](https://github.com/seproDev) +- **test**: [Allow running tests explicitly](https://github.com/yt-dlp/yt-dlp/commit/16eb28026a2ddf5608d0a628ef15949b8d3805a9) ([#11203](https://github.com/yt-dlp/yt-dlp/issues/11203)) by [Grub4K](https://github.com/Grub4K) + ### 2024.10.07 #### Core changes diff --git a/README.md b/README.md index 1cafe51d51..dd3a3189ba 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![YT-DLP](https://raw.githubusercontent.com/yt-dlp/yt-dlp/master/.github/banner.svg)](#readme) [![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#installation "Installation") -[![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPi") +[![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPI") [![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") [![Matrix](https://img.shields.io/matrix/yt-dlp:matrix.org?color=brightgreen&labelColor=555555&label=&logo=element&style=for-the-badge)](https://matrix.to/#/#yt-dlp:matrix.org "Matrix") [![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)](https://discord.gg/H5MNcFW63r "Discord") @@ -81,7 +81,7 @@ yt-dlp is a feature-rich command-line audio/video downloader with support for [t [![Windows](https://img.shields.io/badge/-Windows_x64-blue.svg?style=for-the-badge&logo=windows)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe) [![Unix](https://img.shields.io/badge/-Linux/BSD-red.svg?style=for-the-badge&logo=linux)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp) [![MacOS](https://img.shields.io/badge/-MacOS-lightblue.svg?style=for-the-badge&logo=apple)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos) -[![PyPi](https://img.shields.io/badge/-PyPi-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp) +[![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp) [![Source 
Tarball](https://img.shields.io/badge/-Source_tar-green.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz) [![Other variants](https://img.shields.io/badge/-Other-grey.svg?style=for-the-badge)](#release-files) [![All versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases) @@ -98,15 +98,14 @@ You can install yt-dlp using [the binaries](#release-files), [pip](https://pypi. File|Description :---|:--- [yt-dlp](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)|Platform-independent [zipimport](https://docs.python.org/3/library/zipimport.html) binary. Needs Python (recommended for **Linux/BSD**) -[yt-dlp.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)|Windows (Win7 SP1+) standalone x64 binary (recommended for **Windows**) +[yt-dlp.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)|Windows (Win8+) standalone x64 binary (recommended for **Windows**) [yt-dlp_macos](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos)|Universal MacOS (10.15+) standalone executable (recommended for **MacOS**) #### Alternatives File|Description :---|:--- -[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Win7 SP1+) standalone x86 (32-bit) binary -[yt-dlp_min.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_min.exe)|Windows (Win7 SP1+) standalone x64 binary built with `py2exe`
([Not recommended](#standalone-py2exe-builds-windows)) +[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Win8+) standalone x86 (32-bit) binary [yt-dlp_linux](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux)|Linux standalone x64 binary [yt-dlp_linux_armv7l](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_armv7l)|Linux standalone armv7l (32-bit) binary [yt-dlp_linux_aarch64](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_aarch64)|Linux standalone aarch64 (64-bit) binary @@ -173,11 +172,11 @@ python3 -m pip install -U --pre "yt-dlp[default]" ``` ## DEPENDENCIES -Python versions 3.8+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly. +Python versions 3.9+ (CPython) and 3.10+ (PyPy) are supported. Other versions and implementations may or may not work correctly. While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly recommended @@ -254,24 +253,12 @@ On some systems, you may need to use `py` or `python` instead of `python3`. **Important**: Running `pyinstaller` directly **instead of** using `python -m bundle.pyinstaller` is **not** officially supported. This may or may not work correctly. ### Platform-independent Binary (UNIX) -You will need the build tools `python` (3.8+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. +You will need the build tools `python` (3.9+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. After installing these, simply run `make`. You can also run `make yt-dlp` instead to compile only the binary without updating any of the additional files. (The build tools marked with **\*** are not needed for this) -### Standalone Py2Exe Builds (Windows) - -While we provide the option to build with [py2exe](https://www.py2exe.org), it is recommended to build [using PyInstaller](#standalone-pyinstaller-builds) instead since the py2exe builds **cannot contain `pycryptodomex`/`certifi`/`requests` and need VC++14** on the target computer to run. - -If you wish to build it anyway, install Python (if it is not already installed) and you can run the following commands: - -``` -py devscripts/install_deps.py --include py2exe -py devscripts/make_lazy_extractors.py -py -m bundle.py2exe -``` - ### Related scripts * **`devscripts/install_deps.py`** - Install dependencies for yt-dlp. @@ -348,8 +335,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git containing directory ("-" for stdin). Can be used multiple times and inside other configuration files - --flat-playlist Do not extract the videos of a playlist, - only list them + --plugin-dirs PATH Path to an additional directory to search + for plugins. This option can be used + multiple times to add multiple directories. + Note that this currently only works for + extractor plugins; postprocessor plugins can + only be loaded from the default plugin + directories + --flat-playlist Do not extract a playlist's URL result + entries; some entry metadata may be missing + and downloading may be bypassed --no-flat-playlist Fully extract the videos of a playlist (default) --live-from-start Download livestreams from the start. @@ -444,10 +439,10 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git E.g. "--date today-2weeks" downloads only videos uploaded on the same day two weeks ago --datebefore DATE Download only videos uploaded on or before - this date. The date formats accepted is the + this date. 
The date formats accepted are the same as --date --dateafter DATE Download only videos uploaded on or after - this date. The date formats accepted is the + this date. The date formats accepted are the same as --date --match-filters FILTER Generic video filter. Any "OUTPUT TEMPLATE" field can be compared with a number or a @@ -485,7 +480,8 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-download-archive Do not use archive file (default) --max-downloads NUMBER Abort after downloading NUMBER files --break-on-existing Stop the download process when encountering - a file that is in the archive + a file that is in the archive supplied with + the --download-archive option --no-break-on-existing Do not stop the download process when encountering a file that is in the archive (default) @@ -732,16 +728,16 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git used. This option can be used multiple times --print-to-file [WHEN:]TEMPLATE FILE Append given template to the file. The - values of WHEN and TEMPLATE are same as that - of --print. FILE uses the same syntax as the - output template. This option can be used - multiple times + values of WHEN and TEMPLATE are the same as + that of --print. FILE uses the same syntax + as the output template. This option can be + used multiple times -j, --dump-json Quiet, but print JSON information for each video. Simulate unless --no-simulate is used. See "OUTPUT TEMPLATE" for a description of available keys -J, --dump-single-json Quiet, but print JSON information for each - url or infojson passed. Simulate unless + URL or infojson passed. Simulate unless --no-simulate is used. If the URL refers to a playlist, the whole playlist information is dumped in a single line @@ -816,9 +812,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-audio-multistreams Only one audio stream is downloaded for each output file (default) --prefer-free-formats Prefer video formats with free containers - over non-free ones of same quality. Use with - "-S ext" to strictly prefer free containers - irrespective of quality + over non-free ones of the same quality. Use + with "-S ext" to strictly prefer free + containers irrespective of quality --no-prefer-free-formats Don't give any special preference to free containers (default) --check-formats Make sure formats are selected only from @@ -843,15 +839,17 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git (default) (Alias: --no-write-automatic-subs) --list-subs List available subtitles of each video. Simulate unless --no-simulate is used - --sub-format FORMAT Subtitle format; accepts formats preference, - e.g. "srt" or "ass/srt/best" + --sub-format FORMAT Subtitle format; accepts formats preference + separated by "/", e.g. "srt" or "ass/srt/best" --sub-langs LANGS Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. - --sub-langs "en.*,ja". You can prefix the - language code with a "-" to exclude it from - the requested languages, e.g. --sub-langs - all,-live_chat. Use --list-subs for a list - of available language tags + --sub-langs "en.*,ja" (where "en.*" is a + regex pattern that matches "en" followed by + 0 or more of any character). You can prefix + the language code with a "-" to exclude it + from the requested languages, e.g. --sub- + langs all,-live_chat. 
Use --list-subs for a + list of available language tags ## Authentication Options: -u, --username USERNAME Login with this account ID @@ -899,9 +897,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git necessary (currently supported: avi, flv, gif, mkv, mov, mp4, webm, aac, aiff, alac, flac, m4a, mka, mp3, ogg, opus, vorbis, - wav). If target container does not support - the video/audio codec, remuxing will fail. - You can specify multiple rules; e.g. + wav). If the target container does not + support the video/audio codec, remuxing will + fail. You can specify multiple rules; e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv --recode-video FORMAT Re-encode the video into another format if @@ -969,29 +967,29 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git are the same as that of --use-postprocessor (default: pre_process) --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) + (using Dublin Core and XDG standards) --concat-playlist POLICY Concatenate videos in a playlist. One of "never", "always", or "multi_video" (default; only when the videos form a single - show). All the video files must have same - codecs and number of streams to be - concatable. The "pl_video:" prefix can be + show). All the video files must have the + same codecs and number of streams to be + concatenable. The "pl_video:" prefix can be used with "--paths" and "--output" to set the output filename for the concatenated files. See "OUTPUT TEMPLATE" for details --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise), force (try fixing even if file - already exists) + default; fix the file if we can, warn + otherwise), force (try fixing even if the + file already exists) --ffmpeg-location PATH Location of the ffmpeg binary; either the path to the binary or its containing directory --exec [WHEN:]CMD Execute a command, optionally prefixed with when to execute it, separated by a ":". Supported values of "WHEN" are the same as that of --use-postprocessor (default: - after_move). Same syntax as the output + after_move). The same syntax as the output template can be used to pass any field as arguments to the command. If no fields are passed, %(filepath,_filename|)q is appended @@ -1029,7 +1027,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --no-force-keyframes-at-cuts Do not force keyframes around the chapters when cutting/splitting (default) --use-postprocessor NAME[:ARGS] - The (case sensitive) name of plugin + The (case-sensitive) name of plugin postprocessors to be enabled, and (optionally) arguments to be passed to it, separated by a colon ":". ARGS are a @@ -1042,8 +1040,8 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --print/--output), "before_dl" (before each video download), "post_process" (after each video download; default), "after_move" - (after moving video file to its final - locations), "after_video" (after downloading + (after moving the video file to its final + location), "after_video" (after downloading and processing all formats of a video), or "playlist" (at end of playlist). 
This option can be used multiple times to add different @@ -1061,7 +1059,7 @@ Make chapter entries for, or remove various segments (sponsor, music_offtopic, poi_highlight, chapter, all and default (=all). You can prefix the category with a "-" to exclude it. See [1] - for description of the categories. E.g. + for descriptions of the categories. E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories --sponsorblock-remove CATS SponsorBlock categories to be removed from @@ -1093,7 +1091,7 @@ Make chapter entries for, or remove various segments (sponsor, (Alias: --no-allow-dynamic-mpd) --hls-split-discontinuity Split HLS playlists to different formats at discontinuities such as ad breaks - --no-hls-split-discontinuity Do not split HLS playlists to different + --no-hls-split-discontinuity Do not split HLS playlists into different formats at discontinuities such as ad breaks (default) --extractor-args IE_KEY:ARGS Pass ARGS arguments to the IE_KEY extractor. @@ -1103,7 +1101,7 @@ Make chapter entries for, or remove various segments (sponsor, # CONFIGURATION -You can configure yt-dlp by placing any supported command line option to a configuration file. The configuration is loaded from the following locations: +You can configure yt-dlp by placing any supported command line option in a configuration file. The configuration is loaded from the following locations: 1. **Main Configuration**: * The file given to `--config-location` @@ -1148,7 +1146,7 @@ E.g. with the following configuration file, yt-dlp will always extract the audio -o ~/YouTube/%(title)s.%(ext)s ``` -**Note**: Options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary, as if it were a UNIX shell. +**Note**: Options in a configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. They must also be quoted when necessary, as if it were a UNIX shell. You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. @@ -1182,13 +1180,13 @@ As an alternative to using the `.netrc` file, which has the disadvantage of keep E.g. To use an encrypted `.netrc` file stored as `.authinfo.gpg` ``` -yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' https://www.youtube.com/watch?v=BaW_jenozKc +yt-dlp --netrc-cmd 'gpg --decrypt ~/.authinfo.gpg' 'https://www.youtube.com/watch?v=BaW_jenozKc' ``` ### Notes about environment variables * Environment variables are normally specified as `${VARIABLE}`/`$VARIABLE` on UNIX and `%VARIABLE%` on Windows; but is always shown as `${VARIABLE}` in this documentation -* yt-dlp also allow using UNIX-style variables on Windows for path-like options; e.g. `--output`, `--config-location` +* yt-dlp also allows using UNIX-style variables on Windows for path-like options; e.g. 
`--output`, `--config-location` * If unset, `${XDG_CONFIG_HOME}` defaults to `~/.config` and `${XDG_CACHE_HOME}` to `~/.cache` * On Windows, `~` points to `${HOME}` if present; or, `${USERPROFILE}` or `${HOMEDRIVE}${HOMEPATH}` otherwise * On Windows, `${USERPROFILE}` generally points to `C:\Users\` and `${APPDATA}` to `${USERPROFILE}\AppData\Roaming` @@ -1269,7 +1267,7 @@ The available fields are: - `like_count` (numeric): Number of positive ratings of the video - `dislike_count` (numeric): Number of negative ratings of the video - `repost_count` (numeric): Number of reposts of the video - - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage + - `average_rating` (numeric): Average rating given by users, the scale used depends on the webpage - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used) - `age_limit` (numeric): Age restriction for the video (years) - `live_status` (string): One of "not_live", "is_live", "is_upcoming", "was_live", "post_live" (was live, but VOD is not yet processed) @@ -1299,7 +1297,7 @@ The available fields are: - `webpage_url` (string): A URL to the video webpage which, if given to yt-dlp, should yield the same result again - `webpage_url_basename` (string): The basename of the webpage URL - `webpage_url_domain` (string): The domain of the webpage URL - - `original_url` (string): The URL given by the user (or same as `webpage_url` for playlist entries) + - `original_url` (string): The URL given by the user (or the same as `webpage_url` for playlist entries) - `categories` (list): List of categories the video belongs to - `tags` (list): List of tags assigned to the video - `cast` (list): List of cast members @@ -1376,7 +1374,7 @@ Each aforementioned sequence when referenced in an output template will be repla **Tip**: Look at the `-j` output to identify which fields are available for the particular URL -For numeric sequences you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting); e.g. `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. +For numeric sequences, you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting); e.g. `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. Output templates can also contain arbitrary hierarchical path, e.g. `-o "%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s"` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. 
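As a quick sketch of how the numeric padding and hierarchical path templates above compose, here is a minimal embedding example (assuming yt-dlp is installed via `pip install "yt-dlp[default]"`; the URL is the test video used throughout this document, and the template is illustrative):

```python
# Minimal sketch: hierarchical output template with a zero-padded view count,
# producing e.g. "uploader/Some Title [00042].mp4"; missing directories are
# created automatically, as described above.
import yt_dlp

ydl_opts = {
    'outtmpl': '%(uploader)s/%(title)s [%(view_count)05d].%(ext)s',
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```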
@@ -1418,7 +1416,7 @@ $ yt-dlp -P "C:/MyVideos" -o "%(series)s/%(season_number)s - %(season)s/%(episod # Download video as "C:\MyVideos\uploader\title.ext", subtitles as "C:\MyVideos\subs\uploader\title.ext" # and put all temporary files in "C:\MyVideos\tmp" -$ yt-dlp -P "C:/MyVideos" -P "temp:tmp" -P "subtitle:subs" -o "%(uploader)s/%(title)s.%(ext)s" BaW_jenoz --write-subs +$ yt-dlp -P "C:/MyVideos" -P "temp:tmp" -P "subtitle:subs" -o "%(uploader)s/%(title)s.%(ext)s" BaW_jenozKc --write-subs # Download video as "C:\MyVideos\uploader\title.ext" and subtitles as "C:\MyVideos\uploader\subs\title.ext" $ yt-dlp -P "C:/MyVideos" -o "%(uploader)s/%(title)s.%(ext)s" -o "subtitle:%(uploader)s/subs/%(title)s.%(ext)s" BaW_jenozKc --write-subs @@ -1557,9 +1555,9 @@ The available fields are: All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. E.g. `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. E.g. `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. E.g. `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. E.g. `filesize~1G` prefers the format with filesize closest to 1 GiB. -The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behavior can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. +The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behavior can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. -Note that the default has `vcodec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. Dolby Vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. +Note that the default for hdr is `hdr:12`; i.e. Dolby Vision is not preferred. This choice was made since DV formats are not yet fully compatible with most devices. This may be changed in the future. If your format selector is `worst`, the last item is selected after sorting. This means it will select the format that is worst in all respects. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps`. 
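As a rough illustration of that recommendation, the equivalent through the embedding API might look like the sketch below (`format_sort` mirrors the command-line `-S` fields; option names come from the yt_dlp Python API rather than from this patch, so treat this as an assumption-laden sketch):

```python
# Sketch: prefer the smallest usable file instead of `-f worst`,
# mirroring `-f best -S +size,+br,+res,+fps` from the paragraph above.
import yt_dlp

ydl_opts = {
    'format': 'best',
    # A `+` prefix reverses the default descending sort for a field,
    # so this prefers the smallest filesize, then bitrate, res and fps
    'format_sort': ['+size', '+br', '+res', '+fps'],
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```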
@@ -1636,11 +1634,11 @@ $ yt-dlp -S "res:480" # or the worst video (that also has audio) if there is no video under 50 MB $ yt-dlp -f "b[filesize<50M] / w" -# Download largest video (that also has audio) but no bigger than 50 MB, +# Download the largest video (that also has audio) but no bigger than 50 MB, # or the smallest video (that also has audio) if there is no video under 50 MB $ yt-dlp -f "b" -S "filesize:50M" -# Download best video (that also has audio) that is closest in size to 50 MB +# Download the best video (that also has audio) that is closest in size to 50 MB $ yt-dlp -f "b" -S "filesize~50M" @@ -1696,7 +1694,7 @@ The metadata obtained by the extractors can be modified by using `--parse-metada The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or an [output template](#output-template) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [Python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups, a single field name, or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. -Note that these options preserve their relative order, allowing replacements to be made in parsed fields and viceversa. Also, any field thus created can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--embed-metadata`. +Note that these options preserve their relative order, allowing replacements to be made in parsed fields and vice versa. Also, any field thus created can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--embed-metadata`. This option also has a few special uses: @@ -1771,7 +1769,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. 
`ios_creator`); and `mweb`, `mediaconnect`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `web_creator` is added as needed for age-gated videos when account age verification is required. Similarly, the `_music` variants are added for `music.youtube.com` URLs. Some clients, such as `web` and `android`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1795,7 +1793,7 @@ The following extractors use this feature: * `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` -* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `safari,chrome-110`. By default any available target will be used. Use `false` to disable impersonation +* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `generic:impersonate=safari,chrome-110`. Use `generic:impersonate` to impersonate any available target, and use `generic:impersonate=false` to disable impersonation (default) #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` @@ -1869,8 +1867,8 @@ The following extractors use this feature: #### bilibili * `prefer_multi_flv`: Prefer extracting flv formats over mp4 for older videos that still provide legacy formats -#### digitalconcerthall -* `prefer_combined_hls`: Prefer extracting combined/pre-merged video and audio HLS formats. This will exclude 4K/HEVC video and lossless/FLAC audio formats, which are only available as split video/audio HLS formats +#### sonylivseries +* `sort_order`: Episode sort order for series extraction - one of `asc` (ascending, oldest first) or `desc` (descending, newest first). Default is `asc` **Note**: These options may be changed/removed in the future without concern for backward compatibility @@ -1926,7 +1924,7 @@ Plugins can be installed using various methods and locations. * Plugin packages can be installed and managed using `pip`. 
See [yt-dlp-sample-plugins](https://github.com/yt-dlp/yt-dlp-sample-plugins) for an example. * Note: plugin files between plugin packages installed with pip must have unique filenames. * Any path in `PYTHONPATH` is searched in for the `yt_dlp_plugins` namespace folder. - * Note: This does not apply for Pyinstaller/py2exe builds. + * Note: This does not apply for Pyinstaller builds. `.zip`, `.egg` and `.whl` archives containing a `yt_dlp_plugins` namespace folder in their root are also supported as plugin packages. @@ -2160,9 +2158,9 @@ with yt_dlp.YoutubeDL(ydl_opts) as ydl: * **YouTube improvements**: * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** - * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) * Channel URLs download all uploads of the channel, including shorts and live + * Support for [logging in with OAuth](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#logging-in-with-oauth) * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` @@ -2204,12 +2202,12 @@ Features marked with a **\*** have been back-ported to youtube-dl Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: -* yt-dlp supports only [Python 3.8+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) +* yt-dlp supports only [Python 3.9+](## "Windows 8"), and will remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) * The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details * `avconv` is not supported as an alternative to `ffmpeg` * yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations * The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` -* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order +* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order. 
Older versions of yt-dlp preferred VP9 due to its broader compatibility; you can use `--compat-options prefer-vp9-sort` to revert to that format sorting preference. These two compat options cannot be used together * The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this * Unlike youtube-dlc, yt-dlp does not allow merging multiple audio/video streams into one file by default (since this conflicts with the use of `-f bv*+ba`). If needed, this feature must be enabled using `--audio-multistreams` and `--video-multistreams`. You can also use `--compat-options multistreams` to enable both * `--no-abort-on-error` is enabled by default. Use `--abort-on-error` or `--compat-options abort-on-error` to abort on errors instead @@ -2238,11 +2236,11 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options (**Do NOT use this!**) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` -* `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options +* `--compat-options 2023`: Same as `--compat-options prefer-vp9-sort`. Use this to enable all future compat options The following compat options restore vulnerable behavior from before security patches: @@ -2280,8 +2278,8 @@ While these options are redundant, they are still expected to be used due to the --min-views COUNT --match-filters "view_count >=? COUNT" --max-views COUNT --match-filters "view_count <=? COUNT" --break-on-reject Use --break-match-filters - --user-agent UA --add-header "User-Agent:UA" - --referer URL --add-header "Referer:URL" + --user-agent UA --add-headers "User-Agent:UA" + --referer URL --add-headers "Referer:URL" --playlist-start NUMBER -I NUMBER: --playlist-end NUMBER -I :NUMBER --playlist-reverse -I ::-1 diff --git a/bundle/py2exe.py b/bundle/py2exe.py deleted file mode 100755 index 5b7f4883bc..0000000000 --- a/bundle/py2exe.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -# Allow execution from anywhere -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import warnings - -from py2exe import freeze - -from devscripts.utils import read_version - -VERSION = read_version() - - -def main(): - warnings.warn( - 'py2exe builds do not support pycryptodomex and needs VC++14 to run. 
' - 'It is recommended to run "pyinst.py" to build using pyinstaller instead') - - freeze( - console=[{ - 'script': './yt_dlp/__main__.py', - 'dest_base': 'yt-dlp', - 'icon_resources': [(1, 'devscripts/logo.ico')], - }], - version_info={ - 'version': VERSION, - 'description': 'A feature-rich command-line audio/video downloader', - 'comments': 'Official repository: ', - 'product_name': 'yt-dlp', - 'product_version': VERSION, - }, - options={ - 'bundle_files': 0, - 'compressed': 1, - 'optimize': 2, - 'dist_dir': './dist', - 'excludes': [ - # py2exe cannot import Crypto - 'Crypto', - 'Cryptodome', - # requests >=2.32.0 breaks py2exe builds due to certifi dependency - 'requests', - 'urllib3', - ], - 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], - # Modules that are only imported dynamically must be added here - 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', - 'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'], - }, - zipfile=None, - ) - - -if __name__ == '__main__': - main() diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e7f553a5f2..906e5cf728 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -196,5 +196,48 @@ "when": "b31b81d85f00601710d4fac590c3e4efb4133283", "short": "[ci] Rerun failed tests (#11143)", "authors": ["Grub4K"] + }, + { + "action": "add", + "when": "a886cf3e900f4a2ec00af705f883539269545609", + "short": "[priority] **py2exe is no longer supported**\nThis release's `yt-dlp_min.exe` will be the last, and it's actually a PyInstaller-bundled executable so that yt-dlp users updating their py2exe build with `-U` will be automatically migrated. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10087)" + }, + { + "action": "add", + "when": "a886cf3e900f4a2ec00af705f883539269545609", + "short": "[priority] **Following this release, yt-dlp's Python dependencies *must* be installed using the `default` group**\nIf you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255)" + }, + { + "action": "add", + "when": "87884f15580910e4e0fe0e1db73508debc657471", + "short": "[priority] **Beginning with this release, yt-dlp's Python dependencies *must* be installed using the `default` group**\nIf you're installing yt-dlp with pip/pipx or requiring yt-dlp in your own Python project, you'll need to specify `yt-dlp[default]` if you want to also install yt-dlp's optional dependencies (which were previously included by default). [Read more](https://github.com/yt-dlp/yt-dlp/pull/11255)" + }, + { + "action": "add", + "when": "d784464399b600ba9516bbcec6286f11d68974dd", + "short": "[priority] **The minimum *required* Python version has been raised to 3.9**\nPython 3.8 reached its end-of-life on 2024.10.07, and yt-dlp has now removed support for it. As an unfortunate side effect, the official `yt-dlp.exe` and `yt-dlp_x86.exe` binaries are no longer supported on Windows 7. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "914af9a0cf51c9a3f74aa88d952bee8334c67511", + "short": "Expand paths in `--plugin-dirs` (#11334)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "c29f5a7fae93a08f3cfbb6127b2faa75145b06a0", + "short": "[ie/generic] Do not impersonate by default (#11336)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "57212a5f97ce367590aaa5c3e9a135eead8f81f7", + "short": "[ie/vimeo] Fix API retries (#11351)", + "authors": ["bashonly"] + }, + { + "action": "add", + "when": "52c0ffe40ad6e8404d93296f575007b05b04c686", + "short": "[priority] **Login with OAuth is no longer supported for YouTube**\nDue to a change made by the site, yt-dlp is no longer able to support OAuth login for YouTube. [Read more](https://github.com/yt-dlp/yt-dlp/issues/11462#issuecomment-2471703090)" } ] diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py index 7f3c88bcfb..73cf803b8f 100644 --- a/devscripts/generate_aes_testdata.py +++ b/devscripts/generate_aes_testdata.py @@ -11,13 +11,12 @@ import codecs import subprocess from yt_dlp.aes import aes_encrypt, key_expansion -from yt_dlp.utils import intlist_to_bytes secret_msg = b'Secret message goes here' def hex_str(int_list): - return codecs.encode(intlist_to_bytes(int_list), 'hex') + return codecs.encode(bytes(int_list), 'hex') def openssl_encode(algo, key, iv): diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 00634fb911..7c876101b4 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -71,14 +71,13 @@ class CommitGroup(enum.Enum): def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: group, _, subgroup = (group.strip().lower() for group in value.partition('/')) - result = cls.group_lookup().get(group) - if not result: - if subgroup: - return None, value - subgroup = group - result = cls.subgroup_lookup().get(subgroup) + if result := cls.group_lookup().get(group): + return result, subgroup or None - return result, subgroup or None + if subgroup: + return None, value + + return cls.subgroup_lookup().get(group), group or None @dataclass @@ -136,8 +135,7 @@ class Changelog: first = False yield '\n

Changelog

\n' - group = groups[item] - if group: + if group := groups[item]: yield self.format_module(item.value, group) if self._collapsible: @@ -253,7 +251,7 @@ class CommitRange: ''', re.VERBOSE | re.DOTALL) EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') - FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert|Improve)\s+([\da-f]{40})') + FIXES_RE = re.compile(r'(?i:(?:bug\s*)?fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Improve)\s+([\da-f]{40})') UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') def __init__(self, start, end, default_author=None): @@ -287,11 +285,16 @@ class CommitRange: short = next(lines) skip = short.startswith('Release ') or short == '[version] update' + fix_commitish = None + if match := self.FIXES_RE.search(short): + fix_commitish = match.group(1) + authors = [default_author] if default_author else [] for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR): - match = self.AUTHOR_INDICATOR_RE.match(line) - if match: + if match := self.AUTHOR_INDICATOR_RE.match(line): authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold) + if not fix_commitish and (match := self.FIXES_RE.fullmatch(line)): + fix_commitish = match.group(1) commit = Commit(commit_hash, short, authors) if skip and (self._start or not i): @@ -301,21 +304,17 @@ class CommitRange: logger.debug(f'Reached Release commit, breaking: {commit}') break - revert_match = self.REVERT_RE.fullmatch(commit.short) - if revert_match: - reverts[revert_match.group(1)] = commit + if match := self.REVERT_RE.fullmatch(commit.short): + reverts[match.group(1)] = commit continue - fix_match = self.FIXES_RE.search(commit.short) - if fix_match: - commitish = fix_match.group(1) - fixes[commitish].append(commit) + if fix_commitish: + fixes[fix_commitish].append(commit) commits[commit.hash] = commit for commitish, revert_commit in reverts.items(): - reverted = commits.pop(commitish, None) - if reverted: + if reverted := commits.pop(commitish, None): logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -461,8 +460,7 @@ def create_changelog(args): logger.info(f'Loaded {len(commits)} commits') - new_contributors = get_new_contributors(args.contributors_path, commits) - if new_contributors: + if new_contributors := get_new_contributors(args.contributors_path, commits): if args.contributors: write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 8135689c7e..2a418ddbf7 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -32,14 +32,15 @@ VERBOSE_TMPL = ''' placeholder: | [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) - [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 - [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp-nightly-builds [1a176d874] (win_exe) + [debug] Python 3.10.11 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0 (OpenSSL 1.1.1t 7 Feb 2023) + [debug] exe versions: ffmpeg 7.0.2 (setts), ffprobe 7.0.2 + [debug] Optional libraries: Cryptodome-3.21.0, brotli-1.1.0, certifi-2024.08.30, curl_cffi-0.5.10, mutagen-1.47.0, requests-2.32.3, sqlite3-3.40.1, urllib3-2.2.3, websockets-13.1 [debug] Proxy map: {} - [debug] Request Handlers: urllib, requests - [debug] Loaded 1893 extractors - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + [debug] Request Handlers: urllib, requests, websockets, curl_cffi + [debug] Loaded 1838 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest + Latest version: nightly@... from yt-dlp/yt-dlp-nightly-builds yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc diff --git a/pyproject.toml b/pyproject.toml index 200a9c99ae..92d399e319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ maintainers = [ ] description = "A feature-rich command-line audio/video downloader" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = [ "youtube-dl", "video-downloader", @@ -29,11 +29,11 @@ classifiers = [ "Environment :: Console", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", @@ -41,7 +41,10 @@ classifiers = [ "Operating System :: OS Independent", ] dynamic = ["version"] -dependencies = [ +dependencies = [] + +[project.optional-dependencies] +default = [ "brotli; implementation_name=='cpython'", "brotlicffi; implementation_name!='cpython'", "certifi", @@ -49,11 +52,8 @@ dependencies = [ "pycryptodomex", "requests>=2.32.2,<3", "urllib3>=1.26.17,<3", - "websockets>=13.0", + "websockets>=13.0,<14", ] - -[project.optional-dependencies] -default = [] curl-cffi = [ "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", @@ -76,17 +76,14 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.6.0", + "ruff~=0.7.0", ] test = [ "pytest~=8.1", "pytest-rerunfailures~=14.0", ] pyinstaller = [ - "pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0 -] -py2exe = [ - "py2exe>=0.12", + "pyinstaller>=6.11.1", # Windows temp cleanup fixed in 6.11.1 ] [project.urls] @@ -172,13 +169,11 @@ run-cov = "echo Code coverage not implemented && exit 1" [[tool.hatch.envs.hatch-test.matrix]] python = [ - "3.8", "3.9", "3.10", "3.11", "3.12", - "pypy3.8", - "pypy3.9", + "3.13", "pypy3.10", ] @@ -318,6 +313,16 @@ banned-from = [ "yt_dlp.compat.compat_urllib_parse_urlparse".msg = "Use `urllib.parse.urlparse` instead." "yt_dlp.compat.compat_shlex_quote".msg = "Use `yt_dlp.utils.shell_quote` instead." "yt_dlp.utils.error_to_compat_str".msg = "Use `str` instead." +"yt_dlp.utils.bytes_to_intlist".msg = "Use `list` instead." +"yt_dlp.utils.intlist_to_bytes".msg = "Use `bytes` instead." 
+"yt_dlp.utils.decodeArgument".msg = "Do not use" +"yt_dlp.utils.decodeFilename".msg = "Do not use" +"yt_dlp.utils.encodeFilename".msg = "Do not use" +"yt_dlp.compat.compat_os_name".msg = "Use `os.name` instead." +"yt_dlp.compat.compat_realpath".msg = "Use `os.path.realpath` instead." +"yt_dlp.compat.functools".msg = "Use `functools` instead." +"yt_dlp.utils.decodeOption".msg = "Do not use" +"yt_dlp.utils.compiled_regex_type".msg = "Use `re.Pattern` instead." [tool.autopep8] max_line_length = 120 diff --git a/setup.cfg b/setup.cfg index 340cc3b4d9..20d40cd303 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ remove-unused-variables = true [tox:tox] skipsdist = true -envlist = py{38,39,310,311,312},pypy{38,39,310} +envlist = py{39,310,311,312,313},pypy310 skip_missing_interpreters = true [testenv] # tox @@ -29,7 +29,7 @@ setenv = [isort] -py_version = 38 +py_version = 39 multi_line_output = VERTICAL_HANGING_INDENT line_length = 80 reverse_relative = true diff --git a/supportedsites.md b/supportedsites.md index e23d395fde..916735e08b 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -45,10 +45,6 @@ - **aenetworks:collection** - **aenetworks:show** - **AeonCo** - - **afreecatv**: [*afreecatv*](## "netrc machine") afreecatv.com - - **afreecatv:catchstory**: [*afreecatv*](## "netrc machine") afreecatv.com catch story - - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com livestreams - - **afreecatv:user** - **AirTV** - **AitubeKZVideo** - **AliExpressLive** @@ -133,6 +129,8 @@ - **Bandcamp:album** - **Bandcamp:user** - **Bandcamp:weekly** + - **Bandlab** + - **BandlabPlaylist** - **BannedVideo** - **bbc**: [*bbc*](## "netrc machine") BBC - **bbc.co.uk**: [*bbc*](## "netrc machine") BBC iPlayer @@ -194,6 +192,7 @@ - **blerp** - **blogger.com** - **Bloomberg** + - **Bluesky** - **BokeCC** - **BongaCams** - **Boosty** @@ -251,9 +250,10 @@ - **cbsnews:livevideo**: CBS News Live Videos - **cbssports**: (**Currently broken**) - **cbssports:embed**: (**Currently broken**) - - **CCMA** + - **CCMA**: 3Cat, TV3 and Catalunya Ràdio - **CCTV**: 央视网 - **CDA**: [*cdapl*](## "netrc machine") + - **CDAFolder** - **Cellebrite** - **CeskaTelevize** - **CGTN** @@ -283,8 +283,6 @@ - **cmt.com**: (**Currently broken**) - **CNBCVideo** - **CNN** - - **CNNArticle** - - **CNNBlogs** - **CNNIndonesia** - **ComedyCentral** - **ComedyCentralTV** @@ -488,6 +486,7 @@ - **Gab** - **GabTV** - **Gaia**: [*gaia*](## "netrc machine") + - **GameDevTVDashboard**: [*gamedevtv*](## "netrc machine") - **GameJolt** - **GameJoltCommunity** - **GameJoltGame** @@ -655,6 +654,8 @@ - **Karaoketv** - **Katsomo**: (**Currently broken**) - **KelbyOne**: (**Currently broken**) + - **Kenh14Playlist** + - **Kenh14Video** - **Ketnet** - **khanacademy** - **khanacademy:unit** @@ -688,9 +689,9 @@ - **LastFMPlaylist** - **LastFMUser** - **LaXarxaMes**: [*laxarxames*](## "netrc machine") - - **lbry** - - **lbry:channel** - - **lbry:playlist** + - **lbry**: odysee.com + - **lbry:channel**: odysee.com channels + - **lbry:playlist**: odysee.com playlists - **LCI** - **Lcp** - **LcpPlay** @@ -788,10 +789,6 @@ - **MicrosoftLearnSession** - **MicrosoftMedius** - **microsoftstream**: Microsoft Stream - - **mildom**: Record ongoing live by specific user in Mildom - - **mildom:clip**: Clip in Mildom - - **mildom:​user:vod**: Download all VODs from specific user in Mildom - - **mildom:vod**: VOD in Mildom - **minds** - **minds:channel** - **minds:group** @@ -802,6 +799,7 @@ - **MiTele**: mitele.es - **mixch** - 
**mixch:archive** + - **mixch:movie** - **mixcloud** - **mixcloud:playlist** - **mixcloud:user** @@ -1046,8 +1044,8 @@ - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview**: (**Currently broken**) - - **Patreon** - - **PatreonCampaign** + - **patreon** + - **patreon:campaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **PBSKids** - **PearVideo** @@ -1064,8 
+1062,8 @@ - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **PiaLive** - **Piapro**: [*piapro*](## "netrc machine") - - **PIAULIZAPortal**: ulizaportal.jp - PIA LIVE STREAM - **Picarto** - **PicartoVod** - **Piksel** @@ -1092,8 +1090,6 @@ - **PodbayFMChannel** - **Podchaser** - **podomatic**: (**Currently broken**) - - **Pokemon** - - **PokemonWatch** - **PokerGo**: [*pokergo*](## "netrc machine") - **PokerGoCollection**: [*pokergo*](## "netrc machine") - **PolsatGo** @@ -1164,6 +1160,7 @@ - **RadioJavan**: (**Currently broken**) - **radiokapital** - **radiokapital:show** + - **RadioRadicale** - **RadioZetPodcast** - **radlive** - **radlive:channel** @@ -1339,6 +1336,10 @@ - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") - **SonyLIVSeries** + - **soop**: [*afreecatv*](## "netrc machine") sooplive.co.kr + - **soop:catchstory**: [*afreecatv*](## "netrc machine") sooplive.co.kr catch story + - **soop:live**: [*afreecatv*](## "netrc machine") sooplive.co.kr livestreams + - **soop:user**: [*afreecatv*](## "netrc machine") - **soundcloud**: [*soundcloud*](## "netrc machine") - **soundcloud:playlist**: [*soundcloud*](## "netrc machine") - **soundcloud:related**: [*soundcloud*](## "netrc machine") @@ -1367,9 +1368,7 @@ - **spotify**: Spotify episodes (**Currently broken**) - **spotify:show**: Spotify shows (**Currently broken**) - **Spreaker** - - **SpreakerPage** - **SpreakerShow** - - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **SproutVideo** @@ -1445,7 +1444,7 @@ - **TeleQuebecSquat** - **TeleQuebecVideo** - **TeleTask**: (**Currently broken**) - - **Telewebion** + - **Telewebion**: (**Currently broken**) - **Tempo** - **TennisTV**: [*tennistv*](## "netrc machine") - **TenPlay**: [*10play*](## "netrc machine") @@ -1570,6 +1569,8 @@ - **UFCTV**: [*ufctv*](## "netrc machine") - **ukcolumn**: (**Currently broken**) - **UKTVPlay** + - **UlizaPlayer** + - **UlizaPortal**: ulizaportal.jp - **umg:de**: Universal Music Deutschland (**Currently broken**) - **Unistra** - **Unity**: (**Currently broken**) @@ -1587,8 +1588,6 @@ - **Varzesh3**: (**Currently broken**) - **Vbox7** - **Veo** - - **Veoh** - - **veoh:user** - **Vesti**: Вести.Ru (**Currently broken**) - **Vevo** - **VevoPlaylist** @@ -1778,24 +1777,24 @@ - **YouPornStar**: YouPorn Pornstar, with description, sorting and pagination - **YouPornTag**: YouPorn tag (porntags), with sorting, filtering and pagination - **YouPornVideos**: YouPorn video (browse) playlists, with sorting, filtering and pagination - - **youtube**: YouTube - - **youtube:clip** - - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies) - - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies) - - **youtube:​music:search_url**: YouTube music search URLs with selectable sections, e.g. 
#songs - - **youtube:notif**: YouTube notifications; ":ytnotif" keyword (requires cookies) - - **youtube:playlist**: YouTube playlists - - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword - - **youtube:search**: YouTube search; "ytsearch:" prefix - - **youtube:​search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix - - **youtube:search_url**: YouTube search URLs with sorting and filter support - - **youtube:​shorts:pivot:audio**: YouTube Shorts audio pivot (Shorts using audio of a given video) - - **youtube:subscriptions**: YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) - - **youtube:tab**: YouTube Tabs - - **youtube:user**: YouTube user videos; "ytuser:" prefix - - **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies) - - **YoutubeLivestreamEmbed**: YouTube livestream embeds - - **YoutubeYtBe**: youtu.be + - **youtube**: [*youtube*](## "netrc machine") YouTube + - **youtube:clip**: [*youtube*](## "netrc machine") + - **youtube:favorites**: [*youtube*](## "netrc machine") YouTube liked videos; ":ytfav" keyword (requires cookies) + - **youtube:history**: [*youtube*](## "netrc machine") Youtube watch history; ":ythis" keyword (requires cookies) + - **youtube:​music:search_url**: [*youtube*](## "netrc machine") YouTube music search URLs with selectable sections, e.g. #songs + - **youtube:notif**: [*youtube*](## "netrc machine") YouTube notifications; ":ytnotif" keyword (requires cookies) + - **youtube:playlist**: [*youtube*](## "netrc machine") YouTube playlists + - **youtube:recommended**: [*youtube*](## "netrc machine") YouTube recommended videos; ":ytrec" keyword + - **youtube:search**: [*youtube*](## "netrc machine") YouTube search; "ytsearch:" prefix + - **youtube:​search:date**: [*youtube*](## "netrc machine") YouTube search, newest videos first; "ytsearchdate:" prefix + - **youtube:search_url**: [*youtube*](## "netrc machine") YouTube search URLs with sorting and filter support + - **youtube:​shorts:pivot:audio**: [*youtube*](## "netrc machine") YouTube Shorts audio pivot (Shorts using audio of a given video) + - **youtube:subscriptions**: [*youtube*](## "netrc machine") YouTube subscriptions feed; ":ytsubs" keyword (requires cookies) + - **youtube:tab**: [*youtube*](## "netrc machine") YouTube Tabs + - **youtube:user**: [*youtube*](## "netrc machine") YouTube user videos; "ytuser:" prefix + - **youtube:watchlater**: [*youtube*](## "netrc machine") Youtube watch later list; ":ytwatchlater" keyword (requires cookies) + - **YoutubeLivestreamEmbed**: [*youtube*](## "netrc machine") YouTube livestream embeds + - **YoutubeYtBe**: [*youtube*](## "netrc machine") youtu.be - **Zaiko** - **ZaikoETicket** - **Zapiks** diff --git a/test/helper.py b/test/helper.py index 3b550d1927..c776e70b73 100644 --- a/test/helper.py +++ b/test/helper.py @@ -9,7 +9,6 @@ import types import yt_dlp.extractor from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_os_name from yt_dlp.utils import preferredencoding, try_call, write_string, find_available_port if 'pytest' in sys.modules: @@ -49,7 +48,7 @@ def report_warning(message, *args, **kwargs): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored """ - if sys.stderr.isatty() and compat_os_name != 'nt': + if sys.stderr.isatty() and os.name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py 
index 31e8f82448..54f35ef552 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -53,6 +53,18 @@ class TestInfoExtractor(unittest.TestCase): def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) + def test_get_netrc_login_info(self): + for params in [ + {'usenetrc': True, 'netrc_location': './test/testdata/netrc/netrc'}, + {'netrc_cmd': f'{sys.executable} ./test/testdata/netrc/print_netrc.py'}, + ]: + ie = DummyIE(FakeYDL(params)) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='normal_use'), ('user', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_user'), ('', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_pass'), ('user', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='both_empty'), ('', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='nonexistent'), (None, None)) + def test_html_search_regex(self): html = '
<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>
' search = lambda re, *args: self.ie._html_search_regex(re, html, *args) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a99e624080..966d27a498 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -15,7 +15,6 @@ import json from test.helper import FakeYDL, assertRegexpMatches, try_rm from yt_dlp import YoutubeDL -from yt_dlp.compat import compat_os_name from yt_dlp.extractor import YoutubeIE from yt_dlp.extractor.common import InfoExtractor from yt_dlp.postprocessor.common import PostProcessor @@ -839,8 +838,8 @@ class TestYoutubeDL(unittest.TestCase): test('%(filesize)#D', '1Ki') test('%(height)5.2D', ' 1.08k') test('%(title4)#S', 'foo_bar_test') - test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if compat_os_name == 'nt' else ' '))) - if compat_os_name == 'nt': + test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if os.name == 'nt' else ' '))) + if os.name == 'nt': test('%(title4)q', ('"foo ""bar"" test"', None)) test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', None)) test('%(formats.0.id)#q', ('"id 1"', None)) @@ -903,9 +902,9 @@ class TestYoutubeDL(unittest.TestCase): # Environment variable expansion for prepare_filename os.environ['__yt_dlp_var'] = 'expanded' - envvar = '%__yt_dlp_var%' if compat_os_name == 'nt' else '$__yt_dlp_var' + envvar = '%__yt_dlp_var%' if os.name == 'nt' else '$__yt_dlp_var' test(envvar, (envvar, 'expanded')) - if compat_os_name == 'nt': + if os.name == 'nt': test('%s%', ('%s%', '%s%')) os.environ['s'] = 'expanded' test('%s%', ('%s%', 'expanded')) # %s% should be expanded before escaping %s diff --git a/test/test_aes.py b/test/test_aes.py index 5f975efecf..9cd9189bcc 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -27,7 +27,6 @@ from yt_dlp.aes import ( pad_block, ) from yt_dlp.dependencies import Cryptodome -from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes # the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' @@ -40,33 +39,33 @@ class TestAES(unittest.TestCase): def test_encrypt(self): msg = b'message' key = list(range(16)) - encrypted = aes_encrypt(bytes_to_intlist(msg), key) - decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + encrypted = aes_encrypt(list(msg), key) + decrypted = bytes(aes_decrypt(encrypted, key)) self.assertEqual(decrypted, msg) def test_cbc_decrypt(self): data = b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\x27\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd' - decrypted = intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(data), self.key, self.iv)) + decrypted = bytes(aes_cbc_decrypt(list(data), self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) if Cryptodome.AES: - decrypted = aes_cbc_decrypt_bytes(data, intlist_to_bytes(self.key), intlist_to_bytes(self.iv)) + decrypted = aes_cbc_decrypt_bytes(data, bytes(self.key), bytes(self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_cbc_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_cbc_encrypt(data, self.key, self.iv)) + data = list(self.secret_msg) + encrypted = bytes(aes_cbc_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd') def test_ctr_decrypt(self): - data = bytes_to_intlist(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') - decrypted = intlist_to_bytes(aes_ctr_decrypt(data, self.key, self.iv)) + data = 
list(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + decrypted = bytes(aes_ctr_decrypt(data, self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_ctr_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_ctr_encrypt(data, self.key, self.iv)) + data = list(self.secret_msg) + encrypted = bytes(aes_ctr_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') @@ -75,47 +74,59 @@ class TestAES(unittest.TestCase): data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f.\x08\xb4T\xe4/\x17\xbd' authentication_tag = b'\xe8&I\x80rI\x07\x9d}YWuU@:e' - decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify( - bytes_to_intlist(data), self.key, bytes_to_intlist(authentication_tag), self.iv[:12])) + decrypted = bytes(aes_gcm_decrypt_and_verify( + list(data), self.key, list(authentication_tag), self.iv[:12])) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) if Cryptodome.AES: decrypted = aes_gcm_decrypt_and_verify_bytes( - data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12])) + data, bytes(self.key), authentication_tag, bytes(self.iv[:12])) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + def test_gcm_aligned_decrypt(self): + data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f' + authentication_tag = b'\x08\xb1\x9d!&\x98\xd0\xeaRq\x90\xe6;\xb5]\xd8' + + decrypted = bytes(aes_gcm_decrypt_and_verify( + list(data), self.key, list(authentication_tag), self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16]) + if Cryptodome.AES: + decrypted = aes_gcm_decrypt_and_verify_bytes( + data, bytes(self.key), authentication_tag, bytes(self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16]) + def test_decrypt_text(self): - password = intlist_to_bytes(self.key).decode() + password = bytes(self.key).decode() encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae', ).decode() decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) - password = intlist_to_bytes(self.key).decode() + password = bytes(self.key).decode() encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83', ).decode() decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) def test_ecb_encrypt(self): - data = bytes_to_intlist(self.secret_msg) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key)) + data = list(self.secret_msg) + encrypted = bytes(aes_ecb_encrypt(data, self.key)) self.assertEqual( encrypted, b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') def test_ecb_decrypt(self): - data = bytes_to_intlist(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') - decrypted = intlist_to_bytes(aes_ecb_decrypt(data, self.key, self.iv)) + data = list(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + decrypted = bytes(aes_ecb_decrypt(data, self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_key_expansion(self): key = '4f6bdaa39e2f8cb07f5e722d9edef314' - 
self.assertEqual(key_expansion(bytes_to_intlist(bytearray.fromhex(key))), [ + self.assertEqual(key_expansion(list(bytearray.fromhex(key))), [ 0x4F, 0x6B, 0xDA, 0xA3, 0x9E, 0x2F, 0x8C, 0xB0, 0x7F, 0x5E, 0x72, 0x2D, 0x9E, 0xDE, 0xF3, 0x14, 0x53, 0x66, 0x20, 0xA8, 0xCD, 0x49, 0xAC, 0x18, 0xB2, 0x17, 0xDE, 0x35, 0x2C, 0xC9, 0x2D, 0x21, 0x8C, 0xBE, 0xDD, 0xD9, 0x41, 0xF7, 0x71, 0xC1, 0xF3, 0xE0, 0xAF, 0xF4, 0xDF, 0x29, 0x82, 0xD5, diff --git a/test/test_compat.py b/test/test_compat.py index e7d97e3e93..b1cc2a8187 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -12,12 +12,7 @@ import struct from yt_dlp import compat from yt_dlp.compat import urllib # isort: split -from yt_dlp.compat import ( - compat_etree_fromstring, - compat_expanduser, - compat_urllib_parse_unquote, # noqa: TID251 - compat_urllib_parse_urlencode, # noqa: TID251 -) +from yt_dlp.compat import compat_etree_fromstring, compat_expanduser from yt_dlp.compat.urllib.request import getproxies @@ -43,39 +38,6 @@ class TestCompat(unittest.TestCase): finally: os.environ['HOME'] = old_home or '' - def test_compat_urllib_parse_unquote(self): - self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') - self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') - self.assertEqual(compat_urllib_parse_unquote(''), '') - self.assertEqual(compat_urllib_parse_unquote('%'), '%') - self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') - self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') - self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') - self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') - self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') - self.assertEqual( - compat_urllib_parse_unquote(''' -%%a'''), - ''' -%%a''') - self.assertEqual( - compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), - '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') - - def test_compat_urllib_parse_unquote_plus(self): - self.assertEqual(urllib.parse.unquote_plus('abc%20def'), 'abc def') - self.assertEqual(urllib.parse.unquote_plus('%7e/abc+def'), '~/abc def') - - def test_compat_urllib_parse_urlencode(self): - self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({b'abc': 'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode({b'abc': b'def'}), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([('abc', 'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([('abc', b'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([(b'abc', 'def')]), 'abc=def') - self.assertEqual(compat_urllib_parse_urlencode([(b'abc', b'def')]), 'abc=def') - def test_compat_etree_fromstring(self): xml = ''' diff --git a/test/test_cookies.py b/test/test_cookies.py index e1271f67eb..4b9b9b5a91 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -105,6 +105,13 @@ class TestCookies(unittest.TestCase): decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) self.assertEqual(decryptor.decrypt(encrypted_value), value) + def test_chrome_cookie_decryptor_linux_v10_meta24(self): + with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}): + encrypted_value = b'v10\x1f\xe4\x0e[\x83\x0c\xcc*kPi 
\xce\x8d\x1d\xbb\x80\r\x11\t\xbb\x9e^Hy\x94\xf4\x963\x9f\x82\xba\xfe\xa1\xed\xb9\xf1)\x00710\x92\xc8/<\x96B' + value = 'DE' + decryptor = LinuxChromeCookieDecryptor('Chrome', Logger(), meta_version=24) + self.assertEqual(decryptor.decrypt(encrypted_value), value) + def test_chrome_cookie_decryptor_windows_v10(self): with MonkeyPatch(cookies, { '_get_windows_v10_key': lambda *args, **kwargs: b'Y\xef\xad\xad\xeerp\xf0Y\xe6\x9b\x12\xc2 +
+_TEST_HTML = '''<html><body>
+    <div class="a">1</div>
+    <div class="a" id="x" custom="z">2</div>
+    <div class="b" data-id="y" custom="z">3</div>
+    <p class="a">
+        4
+    </p>
+    <p custom="e">
+        5
+    </p>
+</body></html>
+''' + class TestTraversal: def test_traversal_base(self): @@ -468,7 +481,7 @@ class TestTraversalHelpers: 'id': 'name', 'data': 'content', 'url': 'url', - }, all, {subs_list_to_dict}]) == { + }, all, {subs_list_to_dict(lang=None)}]) == { 'de': [{'url': 'https://example.com/subs/de.ass'}], 'en': [{'data': 'content'}], }, 'subs with mandatory items missing should be filtered' @@ -477,7 +490,7 @@ class TestTraversalHelpers: {'url': 'https://example.com/subs/en', 'name': 'en'}, ], [..., { 'id': 'name', - 'ext': ['url', {lambda x: determine_ext(x, default_ext=None)}], + 'ext': ['url', {determine_ext(default_ext=None)}], 'url': 'url', }, all, {subs_list_to_dict(ext='ext')}]) == { 'de': [{'url': 'https://example.com/subs/de.ass', 'ext': 'ass'}], @@ -494,6 +507,121 @@ class TestTraversalHelpers: {'url': 'https://example.com/subs/en1', 'ext': 'ext'}, {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, ]}, '`quality` key should sort subtitle list accordingly' + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.ass'}, + {'name': 'de'}, + {'name': 'en', 'content': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'data': 'content', + }, all, {subs_list_to_dict(lang='en')}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass'}], + 'en': [ + {'data': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], + }, 'optionally provided lang should be used if no id available' + assert traverse_obj([ + {'name': 1, 'url': 'https://example.com/subs/de1'}, + {'name': {}, 'url': 'https://example.com/subs/de2'}, + {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'}, + {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'ext': 'ext', + }, all, {subs_list_to_dict(lang=None)}]) == { + 'de': [ + {'url': 'https://example.com/subs/de3'}, + {'url': 'https://example.com/subs/de4'}, + ], + }, 'non str types should be ignored for id and ext' + assert traverse_obj([ + {'name': 1, 'url': 'https://example.com/subs/de1'}, + {'name': {}, 'url': 'https://example.com/subs/de2'}, + {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'}, + {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'}, + ], [..., { + 'id': 'name', + 'url': 'url', + 'ext': 'ext', + }, all, {subs_list_to_dict(lang='de')}]) == { + 'de': [ + {'url': 'https://example.com/subs/de1'}, + {'url': 'https://example.com/subs/de2'}, + {'url': 'https://example.com/subs/de3'}, + {'url': 'https://example.com/subs/de4'}, + ], + }, 'non str types should be replaced by default id' + + def test_trim_str(self): + with pytest.raises(TypeError): + trim_str('positional') + + assert callable(trim_str(start='a')) + assert trim_str(start='ab')('abc') == 'c' + assert trim_str(end='bc')('abc') == 'a' + assert trim_str(start='a', end='c')('abc') == 'b' + assert trim_str(start='ab', end='c')('abc') == '' + assert trim_str(start='a', end='bc')('abc') == '' + assert trim_str(start='ab', end='bc')('abc') == '' + assert trim_str(start='abc', end='abc')('abc') == '' + assert trim_str(start='', end='')('abc') == 'abc' + + def test_unpack(self): + assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123' + assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3' + assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3' + with pytest.raises(TypeError): + unpack(join_nonempty)() + with pytest.raises(TypeError): + unpack() + + def test_find_element(self): + for improper_kwargs in [ + dict(attr='data-id'), + 
dict(value='y'), + dict(attr='data-id', value='y', cls='a'), + dict(attr='data-id', value='y', id='x'), + dict(cls='a', id='x'), + dict(cls='a', tag='p'), + dict(cls='[ab]', regex=True), + ]: + with pytest.raises(AssertionError): + find_element(**improper_kwargs)(_TEST_HTML) + + assert find_element(cls='a')(_TEST_HTML) == '1' + assert find_element(cls='a', html=True)(_TEST_HTML) == '
<div class="a">1</div>
' + assert find_element(id='x')(_TEST_HTML) == '2' + assert find_element(id='[ex]')(_TEST_HTML) is None + assert find_element(id='[ex]', regex=True)(_TEST_HTML) == '2' + assert find_element(id='x', html=True)(_TEST_HTML) == '
<div class="a" id="x" custom="z">2</div>
' + assert find_element(attr='data-id', value='y')(_TEST_HTML) == '3' + assert find_element(attr='data-id', value='y(?:es)?')(_TEST_HTML) is None + assert find_element(attr='data-id', value='y(?:es)?', regex=True)(_TEST_HTML) == '3' + assert find_element( + attr='data-id', value='y', html=True)(_TEST_HTML) == '
<div class="b" data-id="y" custom="z">3</div>
' + + def test_find_elements(self): + for improper_kwargs in [ + dict(tag='p'), + dict(attr='data-id'), + dict(value='y'), + dict(attr='data-id', value='y', cls='a'), + dict(cls='a', tag='div'), + dict(cls='[ab]', regex=True), + ]: + with pytest.raises(AssertionError): + find_elements(**improper_kwargs)(_TEST_HTML) + + assert find_elements(cls='a')(_TEST_HTML) == ['1', '2', '4'] + assert find_elements(cls='a', html=True)(_TEST_HTML) == [ + '
<div class="a">1</div>', '<div class="a" id="x" custom="z">2</div>', '<p class="a">
        4
    </p>
'] + assert find_elements(attr='custom', value='z')(_TEST_HTML) == ['2', '3'] + assert find_elements(attr='custom', value='[ez]')(_TEST_HTML) == [] + assert find_elements(attr='custom', value='[ez]', regex=True)(_TEST_HTML) == ['2', '3', '5'] class TestDictGet: diff --git a/test/test_update.py b/test/test_update.py index 63a21e445f..23c12d38c1 100644 --- a/test/test_update.py +++ b/test/test_update.py @@ -82,16 +82,32 @@ TEST_LOCKFILE_V1 = rf'''{TEST_LOCKFILE_COMMENT} lock 2022.08.18.36 .+ Python 3\.6 lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) +lock 2024.10.22 py2exe .+ +lock 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lock 2024.10.22 (?!\w+_exe).+ Python 3\.8 +lock 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) ''' TEST_LOCKFILE_V2_TMPL = r'''%s lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp 2024.10.22 py2exe .+ +lockV2 yt-dlp/yt-dlp 2024.10.22 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp 2024.10.22 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 py2exe .+ +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.045052 py2exe .+ +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 linux_(?:armv7l|aarch64)_exe .+-glibc2\.(?:[12]?\d|30)\b +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 (?!\w+_exe).+ Python 3\.8 +lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) ''' TEST_LOCKFILE_V2 = TEST_LOCKFILE_V2_TMPL % TEST_LOCKFILE_COMMENT @@ -145,43 +161,76 @@ class TestUpdate(unittest.TestCase): for lockfile in (TEST_LOCKFILE_V1, TEST_LOCKFILE_V2, TEST_LOCKFILE_ACTUAL, TEST_LOCKFILE_FORK): # Normal operation test(lockfile, 'zip Python 3.12.0', '2023.12.31', '2023.12.31') - test(lockfile, 'zip stable Python 3.12.0', '2023.12.31', '2023.12.31', exact=True) - # Python 3.6 --update should update only to its lock + test(lockfile, 'zip Python 3.12.0', '2023.12.31', '2023.12.31', exact=True) + # py2exe should never update beyond 2024.10.22 + test(lockfile, 'py2exe Python 3.8', '2025.01.01', '2024.10.22') + test(lockfile, 'py2exe Python 3.8', '2025.01.01', None, exact=True) + # Python 3.6 --update should update only to the py3.6 lock test(lockfile, 'zip Python 3.6.0', '2023.11.16', '2022.08.18.36') - # --update-to an exact version later than the lock should return None - test(lockfile, 'zip stable Python 3.6.0', '2023.11.16', None, exact=True) - # Python 3.7 should be able to update to its lock + # Python 3.6 --update-to an exact version later than the 
py3.6 lock should return None + test(lockfile, 'zip Python 3.6.0', '2023.11.16', None, exact=True) + # Python 3.7 should be able to update to the py3.7 lock test(lockfile, 'zip Python 3.7.0', '2023.11.16', '2023.11.16') - test(lockfile, 'zip stable Python 3.7.1', '2023.11.16', '2023.11.16', exact=True) - # Non-win_x86_exe builds on py3.7 must be locked + test(lockfile, 'zip Python 3.7.1', '2023.11.16', '2023.11.16', exact=True) + # Non-win_x86_exe builds on py3.7 must be locked at py3.7 lock test(lockfile, 'zip Python 3.7.1', '2023.12.31', '2023.11.16') - test(lockfile, 'zip stable Python 3.7.1', '2023.12.31', None, exact=True) - test( # Windows Vista w/ win_x86_exe must be locked - lockfile, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', + test(lockfile, 'zip Python 3.7.1', '2023.12.31', None, exact=True) + # Python 3.8 should only update to the py3.8 lock + test(lockfile, 'zip Python 3.8.10', '2025.01.01', '2024.10.22') + test(lockfile, 'zip Python 3.8.110', '2025.01.01', None, exact=True) + test( # Windows Vista w/ win_x86_exe must be locked at Vista lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', '2023.12.31', '2023.11.16') - test( # Windows 2008Server w/ win_x86_exe must be locked + test( # Windows 2008Server w/ win_x86_exe must be locked at Vista lock lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-2008Server', '2023.12.31', None, exact=True) - test( # Windows 7 w/ win_x86_exe py3.7 build should be able to update beyond lock - lockfile, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', - '2023.12.31', '2023.12.31') - test( # Windows 8.1 w/ '2008Server' in platform string should be able to update beyond lock + test( # Windows 7 w/ win_x86_exe py3.7 build should be able to update beyond py3.7 lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + '2023.12.31', '2023.12.31', exact=True) + test( # Windows 7 win_x86_exe should only update to Win7 lock + lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + '2025.01.01', '2024.10.22') + test( # Windows 2008ServerR2 win_exe should only update to Win7 lock + lockfile, 'win_exe Python 3.8.10 (CPython x86 32bit) - Windows-2008ServerR2', + '2025.12.31', '2024.10.22') + test( # Windows 8.1 w/ '2008Server' in platform string should be able to update beyond py3.7 lock lockfile, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-post2008Server-6.2.9200', '2023.12.31', '2023.12.31', exact=True) + test( # win_exe built w/Python 3.8 on Windows>=8 should be able to update beyond py3.8 lock + lockfile, 'win_exe Python 3.8.10 (CPython AMD64 64bit) - Windows-10-10.0.20348-SP0', + '2025.01.01', '2025.01.01', exact=True) + test( # linux_armv7l_exe w/glibc2.7 should only update to glibc<2.31 lock + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 32bit) - Linux-6.5.0-1025-azure-armv7l-with-glibc2.7', + '2025.01.01', '2024.10.22') + test( # linux_armv7l_exe w/Python 3.8 and glibc>=2.31 should be able to update beyond py3.8 and glibc<2.31 locks + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 32bit) - Linux-6.5.0-1025-azure-armv7l-with-glibc2.31', + '2025.01.01', '2025.01.01') + test( # linux_armv7l_exe w/glibc2.30 should only update to glibc<2.31 lock + lockfile, 'linux_armv7l_exe Python 3.8.0 (CPython armv7l 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.30 (OpenSSL', + '2025.01.01', '2024.10.22') + test( # linux_aarch64_exe 
w/glibc2.17 should only update to glibc<2.31 lock + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.17', + '2025.01.01', '2024.10.22') + test( # linux_aarch64_exe w/glibc2.40 and glibc>=2.31 should be able to update beyond py3.8 and glibc<2.31 locks + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.40', + '2025.01.01', '2025.01.01') + test( # linux_aarch64_exe w/glibc2.3 should only update to glibc<2.31 lock + lockfile, 'linux_aarch64_exe Python 3.8.0 (CPython aarch64 64bit) - Linux-6.5.0-1025-azure-aarch64-with-glibc2.3 (OpenSSL', + '2025.01.01', '2024.10.22') # Forks can block updates to non-numeric tags rather than lock test(TEST_LOCKFILE_FORK, 'zip Python 3.6.3', 'pr0000', None, repo='fork/yt-dlp') - test(TEST_LOCKFILE_FORK, 'zip stable Python 3.7.4', 'pr0000', 'pr0000', repo='fork/yt-dlp') - test(TEST_LOCKFILE_FORK, 'zip stable Python 3.7.4', 'pr1234', None, repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.7.4', 'pr0000', 'pr0000', repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.7.4', 'pr1234', None, repo='fork/yt-dlp') test(TEST_LOCKFILE_FORK, 'zip Python 3.8.1', 'pr1234', 'pr1234', repo='fork/yt-dlp', exact=True) test( - TEST_LOCKFILE_FORK, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', + TEST_LOCKFILE_FORK, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-Vista-6.0.6003-SP2', 'pr1234', None, repo='fork/yt-dlp') test( - TEST_LOCKFILE_FORK, 'win_x86_exe stable Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', + TEST_LOCKFILE_FORK, 'win_x86_exe Python 3.7.9 (CPython x86 32bit) - Windows-7-6.1.7601-SP1', '2023.12.31', '2023.12.31', repo='fork/yt-dlp') test(TEST_LOCKFILE_FORK, 'zip Python 3.11.2', 'pr9999', None, repo='fork/yt-dlp', exact=True) - test(TEST_LOCKFILE_FORK, 'zip stable Python 3.12.0', 'pr9999', 'pr9999', repo='fork/yt-dlp') + test(TEST_LOCKFILE_FORK, 'zip Python 3.12.0', 'pr9999', 'pr9999', repo='fork/yt-dlp') def test_query_update(self): ydl = FakeYDL() diff --git a/test/test_utils.py b/test/test_utils.py index d4b846f56f..b3de14198e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import os import sys import unittest +import unittest.mock import warnings import datetime as dt @@ -20,7 +21,6 @@ import xml.etree.ElementTree from yt_dlp.compat import ( compat_etree_fromstring, compat_HTMLParseError, - compat_os_name, ) from yt_dlp.utils import ( Config, @@ -48,7 +48,6 @@ from yt_dlp.utils import ( dfxp2srt, encode_base_n, encode_compat_str, - encodeFilename, expand_path, extract_attributes, extract_basic_auth, @@ -68,7 +67,6 @@ from yt_dlp.utils import ( get_elements_html_by_class, get_elements_text_and_html_by_attribute, int_or_none, - intlist_to_bytes, iri_to_uri, is_html, js_to_json, @@ -343,11 +341,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') self.assertEqual(remove_start('B - A', 'A - '), 'B - A') + self.assertEqual(remove_start('non-empty', ''), 'non-empty') def test_remove_end(self): self.assertEqual(remove_end(None, ' - B'), None) self.assertEqual(remove_end('A - B', ' - B'), 'A') self.assertEqual(remove_end('B - A', ' - B'), 'B - A') + self.assertEqual(remove_end('non-empty', ''), 'non-empty') def test_remove_quotes(self): self.assertEqual(remove_quotes(None), None) @@ -563,10 +563,10 @@ class TestUtil(unittest.TestCase): 
self.assertEqual(res_data, {'a': 'b', 'c': 'd'}) def test_shell_quote(self): - args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] + args = ['ffmpeg', '-i', 'ñ€ß\'.mp4'] self.assertEqual( shell_quote(args), - """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') + """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if os.name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') def test_float_or_none(self): self.assertEqual(float_or_none('42.42'), 42.42) @@ -1306,15 +1306,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') self.assertEqual(clean_html('a
\xa0b'), 'a\nb') - def test_intlist_to_bytes(self): - self.assertEqual( - intlist_to_bytes([0, 1, 127, 128, 255]), - b'\x00\x01\x7f\x80\xff') - def test_args_to_str(self): self.assertEqual( args_to_str(['foo', 'ba/r', '-baz', '2 be', '']), - 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""', + 'foo ba/r -baz \'2 be\' \'\'' if os.name != 'nt' else 'foo ba/r -baz "2 be" ""', ) def test_parse_filesize(self): @@ -2114,7 +2109,7 @@ Line 1 assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=') assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') - @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows') + @unittest.skipUnless(os.name == 'nt', 'Only relevant on Windows') def test_windows_escaping(self): tests = [ 'test"&', @@ -2148,6 +2143,12 @@ Line 1 assert run_shell(args) == expected assert run_shell(shell_quote(args, shell=True)) == expected + def test_partial_application(self): + assert callable(int_or_none(scale=10)), 'missing positional parameter should apply partially' + assert int_or_none(10, scale=0.1) == 100, 'positionally passed argument should call function' + assert int_or_none(v=10) == 10, 'keyword passed positional should call function' + assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function' + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/netrc/netrc b/test/testdata/netrc/netrc new file mode 100644 index 0000000000..bafe92fe6a --- /dev/null +++ b/test/testdata/netrc/netrc @@ -0,0 +1,4 @@ +machine normal_use login user password pass +machine empty_user login "" password pass +machine empty_pass login user password "" +machine both_empty login "" password "" diff --git a/test/testdata/netrc/print_netrc.py b/test/testdata/netrc/print_netrc.py new file mode 100644 index 0000000000..5c25814f84 --- /dev/null +++ b/test/testdata/netrc/print_netrc.py @@ -0,0 +1,2 @@ +with open('./test/testdata/netrc/netrc', encoding='utf-8') as fp: + print(fp.read()) diff --git a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py new file mode 100644 index 0000000000..b860300d8d --- /dev/null +++ b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class PackagePluginIE(InfoExtractor): + pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index eea1065036..749de5d4e3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -26,7 +26,7 @@ import unicodedata from .cache import Cache from .compat import urllib # isort: split -from .compat import compat_os_name, urllib_req_to_req +from .compat import urllib_req_to_req from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version @@ -109,7 +109,6 @@ from .utils import ( determine_ext, determine_protocol, encode_compat_str, - encodeFilename, escapeHTML, expand_path, extract_basic_auth, @@ -154,7 +153,6 @@ from .utils import ( try_get, url_basename, variadic, - version_tuple, windows_enable_vt_mode, write_json_file, write_string, @@ -168,7 +166,7 @@ from .utils.networking import ( ) from .version import CHANNEL, ORIGIN, RELEASE_GIT_HEAD, VARIANT, __version__ -if compat_os_name == 'nt': +if os.name == 'nt': import 
ctypes @@ -251,7 +249,7 @@ class YoutubeDL: format_sort_force: Force the given format_sort. see "Sorting Formats" for more details. prefer_free_formats: Whether to prefer video formats with free containers - over non-free ones of same quality. + over non-free ones of the same quality. allow_multiple_video_streams: Allow multiple video streams to be merged into a single file allow_multiple_audio_streams: Allow multiple audio streams to be merged @@ -285,7 +283,7 @@ class YoutubeDL: rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. logtostderr: Print everything to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. + consoletitle: Display progress in the console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file clean_infojson: Remove internal metadata from the infojson @@ -471,7 +469,7 @@ class YoutubeDL: The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort, no-clean-infojson, no-playlist-metafiles, - no-keep-subs, no-attach-info-json, allow-unsafe-ext. + no-keep-subs, no-attach-info-json, allow-unsafe-ext, prefer-vp9-sort. Refer __init__.py for their implementation progress_template: Dictionary of templates for progress outputs. Allowed keys are 'download', 'postprocess', @@ -513,7 +511,7 @@ class YoutubeDL: The following options are used by the extractors: extractor_retries: Number of times to retry for known errors (default: 3) dynamic_mpd: Whether to process dynamic DASH manifests (default: True) - hls_split_discontinuity: Split HLS playlists to different formats at + hls_split_discontinuity: Split HLS playlists into different formats at discontinuities such as ad breaks (default: False) extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. @@ -553,7 +551,7 @@ class YoutubeDL: include_ads: - Doesn't work Download ads as well call_home: - Not implemented - Boolean, true iff we are allowed to contact the + Boolean, true if we are allowed to contact the yt-dlp servers for debugging. 
post_hooks: - Register a custom postprocessor A list of functions that get called as the final step @@ -644,7 +642,7 @@ class YoutubeDL: out=stdout, error=sys.stderr, screen=sys.stderr if self.params.get('quiet') else stdout, - console=None if compat_os_name == 'nt' else next( + console=None if os.name == 'nt' else next( filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), ) @@ -953,7 +951,7 @@ class YoutubeDL: self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if compat_os_name == 'nt' or not self._out_files.console: + if os.name == 'nt' or not self._out_files.console: return self._write_string(code, self._out_files.console) @@ -961,7 +959,7 @@ class YoutubeDL: if not self.params.get('consoletitle', False): return message = remove_terminal_sequences(message) - if compat_os_name == 'nt': + if os.name == 'nt': if ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() @@ -2850,13 +2848,10 @@ class YoutubeDL: sanitize_string_field(fmt, 'format_id') sanitize_numeric_fields(fmt) fmt['url'] = sanitize_url(fmt['url']) - if fmt.get('ext') is None: - fmt['ext'] = determine_ext(fmt['url']).lower() + FormatSorter._fill_sorting_fields(fmt) if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): if fmt.get('acodec') is None: fmt['acodec'] = fmt['ext'] - if fmt.get('protocol') is None: - fmt['protocol'] = determine_protocol(fmt) if fmt.get('resolution') is None: fmt['resolution'] = self.format_resolution(fmt, default=None) if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none': @@ -3259,9 +3254,9 @@ class YoutubeDL: if full_filename is None: return - if not self._ensure_dir_exists(encodeFilename(full_filename)): + if not self._ensure_dir_exists(full_filename): return - if not self._ensure_dir_exists(encodeFilename(temp_filename)): + if not self._ensure_dir_exists(temp_filename): return if self._write_description('video', info_dict, @@ -3293,16 +3288,16 @@ class YoutubeDL: if self.params.get('writeannotations', False): annofn = self.prepare_filename(info_dict, 'annotation') if annofn: - if not self._ensure_dir_exists(encodeFilename(annofn)): + if not self._ensure_dir_exists(annofn): return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)): + if not self.params.get('overwrites', True) and os.path.exists(annofn): self.to_screen('[info] Video annotations are already present') elif not info_dict.get('annotations'): self.report_warning('There are no annotations to write.') else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) - with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + with open(annofn, 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') @@ -3318,14 +3313,14 @@ class YoutubeDL: f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown') return True linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) - if not self._ensure_dir_exists(encodeFilename(linkfn)): + if not self._ensure_dir_exists(linkfn): return False - if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): + if self.params.get('overwrites', True) and os.path.exists(linkfn): self.to_screen(f'[info] Internet shortcut (.{link_type}) is already 
present') return True try: self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') - with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + with open(to_high_limit_path(linkfn), 'w', encoding='utf-8', newline='\r\n' if link_type == 'url' else '\n') as linkfile: template_vars = {'url': url} if link_type == 'desktop': @@ -3356,7 +3351,7 @@ class YoutubeDL: if self.params.get('skip_download'): info_dict['filepath'] = temp_filename - info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename)) info_dict['__files_to_move'] = files_to_move replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)) info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') @@ -3486,7 +3481,7 @@ class YoutubeDL: self.report_file_already_downloaded(dl_filename) dl_filename = dl_filename or temp_filename - info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) + info_dict['__finaldir'] = os.path.dirname(os.path.abspath(full_filename)) except network_exceptions as err: self.report_error(f'unable to download video data: {err}') @@ -4089,17 +4084,6 @@ class YoutubeDL: if plugin_dirs: write_debug(f'Plugin directories: {plugin_dirs}') - # Not implemented - if False and self.params.get('call_home'): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() - write_debug(f'Public IP address: {ipaddr}') - latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode() - if version_tuple(latest_version) > version_tuple(__version__): - self.report_warning( - f'You are using an outdated version (newest version: {latest_version})! ' - 'See https://yt-dl.org/update if you need help updating.') - @functools.cached_property def proxies(self): """Global proxy configuration""" @@ -4312,7 +4296,7 @@ class YoutubeDL: else: try: self.to_screen(f'[info] Writing {label} description to: {descfn}') - with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + with open(descfn, 'w', encoding='utf-8') as descfile: descfile.write(ie_result['description']) except OSError: self.report_error(f'Cannot write {label} description file {descfn}') @@ -4396,7 +4380,9 @@ class YoutubeDL: return None for idx, t in list(enumerate(thumbnails))[::-1]: - thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg') + if multiple: + thumb_ext = f'{t["id"]}.{thumb_ext}' thumb_display_id = f'{label} thumbnail {t["id"]}' thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) @@ -4412,7 +4398,7 @@ class YoutubeDL: try: uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') - with open(encodeFilename(thumb_filename), 'wb') as thumbf: + with open(thumb_filename, 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f598b6c2fe..a1880bf7dc 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1,8 +1,8 @@ import sys -if sys.version_info < (3, 8): +if sys.version_info < (3, 9): raise ImportError( - f'You are using an unsupported version of Python. 
Only Python versions 3.8 and above are supported by yt-dlp') # noqa: F541 + f'You are using an unsupported version of Python. Only Python versions 3.9 and above are supported by yt-dlp') # noqa: F541 __license__ = 'The Unlicense' @@ -14,7 +14,6 @@ import os import re import traceback -from .compat import compat_os_name from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError from .downloader.external import get_external_downloader from .extractor import list_extractor_classes @@ -34,6 +33,7 @@ from .postprocessor import ( ) from .update import Updater from .utils import ( + Config, NO_DEFAULT, POSTPROCESS_WHEN, DateRange, @@ -43,7 +43,6 @@ from .utils import ( GeoUtils, PlaylistEntries, SameFileError, - decodeOption, download_range_func, expand_path, float_or_none, @@ -158,6 +157,9 @@ def set_compat_opts(opts): opts.embed_infojson = False if 'format-sort' in opts.compat_opts: opts.format_sort.extend(FormatSorter.ytdl_default) + elif 'prefer-vp9-sort' in opts.compat_opts: + opts.format_sort.extend(FormatSorter._prefer_vp9_sort) + _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False) _audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False) if _video_multistreams_set is False and _audio_multistreams_set is False: @@ -879,8 +881,8 @@ def parse_options(argv=None): 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, 'subtitleslangs': opts.subtitleslangs, - 'matchtitle': decodeOption(opts.matchtitle), - 'rejecttitle': decodeOption(opts.rejecttitle), + 'matchtitle': opts.matchtitle, + 'rejecttitle': opts.rejecttitle, 'max_downloads': opts.max_downloads, 'prefer_free_formats': opts.prefer_free_formats, 'trim_file_name': opts.trim_file_name, @@ -967,6 +969,11 @@ def _real_main(argv=None): parser, opts, all_urls, ydl_opts = parse_options(argv) + # HACK: Set the plugin dirs early on + # TODO(coletdjnz): remove when plugin globals system is implemented + if opts.plugin_dirs is not None: + Config._plugin_dirs = list(map(expand_path, opts.plugin_dirs)) + # Dump user agent if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) @@ -1044,7 +1051,7 @@ def _real_main(argv=None): ydl.warn_if_short_id(args) # Show a useful error message and wait for keypress if not launched from shell on Windows - if not args and compat_os_name == 'nt' and getattr(sys, 'frozen', False): + if not args and os.name == 'nt' and getattr(sys, 'frozen', False): import ctypes.wintypes import msvcrt diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index abf54a998e..0930d36df9 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -3,7 +3,6 @@ from math import ceil from .compat import compat_ord from .dependencies import Cryptodome -from .utils import bytes_to_intlist, intlist_to_bytes if Cryptodome.AES: def aes_cbc_decrypt_bytes(data, key, iv): @@ -17,15 +16,15 @@ if Cryptodome.AES: else: def aes_cbc_decrypt_bytes(data, key, iv): """ Decrypt bytes with AES-CBC using native implementation since pycryptodome is unavailable """ - return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv)))) + return bytes(aes_cbc_decrypt(*map(list, (data, key, iv)))) def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): """ Decrypt bytes with AES-GCM using native implementation since pycryptodome is unavailable """ - return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, 
key, tag, nonce)))) + return bytes(aes_gcm_decrypt_and_verify(*map(list, (data, key, tag, nonce)))) def aes_cbc_encrypt_bytes(data, key, iv, **kwargs): - return intlist_to_bytes(aes_cbc_encrypt(*map(bytes_to_intlist, (data, key, iv)), **kwargs)) + return bytes(aes_cbc_encrypt(*map(list, (data, key, iv)), **kwargs)) BLOCK_SIZE_BYTES = 16 @@ -221,7 +220,7 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce): j0 = [*nonce, 0, 0, 0, 1] else: fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES + 8 - ghash_in = nonce + [0] * fill + bytes_to_intlist((8 * len(nonce)).to_bytes(8, 'big')) + ghash_in = nonce + [0] * fill + list((8 * len(nonce)).to_bytes(8, 'big')) j0 = ghash(hash_subkey, ghash_in) # TODO: add nonce support to aes_ctr_decrypt @@ -230,13 +229,13 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce): iv_ctr = inc(j0) decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr))) - pad_len = len(data) // 16 * 16 + pad_len = (BLOCK_SIZE_BYTES - (len(data) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES s_tag = ghash( hash_subkey, data - + [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len) # pad - + bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data - + ((len(data) * 8).to_bytes(8, 'big'))), # length of data + + [0] * pad_len # pad + + list((0 * 8).to_bytes(8, 'big') # length of associated data + + ((len(data) * 8).to_bytes(8, 'big'))), # length of data ) if tag != aes_ctr_encrypt(s_tag, key, j0): @@ -300,8 +299,8 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) - password = bytes_to_intlist(password.encode()) + data = list(base64.b64decode(data)) + password = list(password.encode()) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) @@ -310,7 +309,7 @@ def aes_decrypt_text(data, password, key_size_bytes): cipher = data[NONCE_LENGTH_BYTES:] decrypted_data = aes_ctr_decrypt(cipher, key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)) - return intlist_to_bytes(decrypted_data) + return bytes(decrypted_data) RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index d820adaf1e..d779620688 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -1,5 +1,4 @@ import os -import sys import xml.etree.ElementTree as etree from .compat_utils import passthrough_module @@ -24,33 +23,14 @@ def compat_etree_fromstring(text): return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) -compat_os_name = os._name if os.name == 'java' else os.name - - -def compat_shlex_quote(s): - from ..utils import shell_quote - return shell_quote(s) - - def compat_ord(c): return c if isinstance(c, int) else ord(c) -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return os.path.realpath(path) -else: - compat_realpath = os.path.realpath - - # Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl # See https://github.com/yt-dlp/yt-dlp/issues/792 # https://docs.python.org/3/library/os.path.html#os.path.expanduser -if compat_os_name in ('nt', 'ce'): +if 
os.name in ('nt', 'ce'): def compat_expanduser(path): HOME = os.environ.get('HOME') if not HOME: diff --git a/yt_dlp/compat/_deprecated.py b/yt_dlp/compat/_deprecated.py index 607bae9999..445acc1a06 100644 --- a/yt_dlp/compat/_deprecated.py +++ b/yt_dlp/compat/_deprecated.py @@ -8,16 +8,14 @@ passthrough_module(__name__, '.._legacy', callback=lambda attr: warnings.warn( DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=6)) del passthrough_module -import base64 -import urllib.error -import urllib.parse +import functools # noqa: F401 +import os -compat_str = str -compat_b64decode = base64.b64decode +compat_os_name = os.name +compat_realpath = os.path.realpath -compat_urlparse = urllib.parse -compat_parse_qs = urllib.parse.parse_qs -compat_urllib_parse_unquote = urllib.parse.unquote -compat_urllib_parse_urlencode = urllib.parse.urlencode -compat_urllib_parse_urlparse = urllib.parse.urlparse + +def compat_shlex_quote(s): + from ..utils import shell_quote + return shell_quote(s) diff --git a/yt_dlp/compat/_legacy.py b/yt_dlp/compat/_legacy.py index dfc792eae4..dae2c14592 100644 --- a/yt_dlp/compat/_legacy.py +++ b/yt_dlp/compat/_legacy.py @@ -30,7 +30,7 @@ from asyncio import run as compat_asyncio_run # noqa: F401 from re import Pattern as compat_Pattern # noqa: F401 from re import match as compat_Match # noqa: F401 -from . import compat_expanduser, compat_HTMLParseError, compat_realpath +from . import compat_expanduser, compat_HTMLParseError from .compat_utils import passthrough_module from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import websockets as compat_websockets # noqa: F401 @@ -78,7 +78,7 @@ compat_kwargs = lambda kwargs: kwargs compat_map = map compat_numeric_types = (int, float, complex) compat_os_path_expanduser = compat_expanduser -compat_os_path_realpath = compat_realpath +compat_os_path_realpath = os.path.realpath compat_print = print compat_shlex_split = shlex.split compat_socket_create_connection = socket.create_connection @@ -104,5 +104,12 @@ compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseEr compat_xpath = lambda xpath: xpath compat_zip = zip workaround_optparse_bug9161 = lambda: None +compat_str = str +compat_b64decode = base64.b64decode +compat_urlparse = urllib.parse +compat_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse legacy = [] diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py index d62b7d0488..d8b3c45cd3 100644 --- a/yt_dlp/compat/compat_utils.py +++ b/yt_dlp/compat/compat_utils.py @@ -57,7 +57,7 @@ def passthrough_module(parent, child, allowed_attributes=(..., ), *, callback=la callback(attr) return ret - @functools.lru_cache(maxsize=None) + @functools.cache def from_child(attr): nonlocal child if attr not in allowed_attributes: diff --git a/yt_dlp/compat/functools.py b/yt_dlp/compat/functools.py deleted file mode 100644 index 96689575f6..0000000000 --- a/yt_dlp/compat/functools.py +++ /dev/null @@ -1,12 +0,0 @@ -# flake8: noqa: F405 -from functools import * # noqa: F403 - -from .compat_utils import passthrough_module - -passthrough_module(__name__, 'functools') -del passthrough_module - -try: - _ = cache # >= 3.9 -except NameError: - cache = lru_cache(maxsize=None) diff --git a/yt_dlp/compat/urllib/request.py b/yt_dlp/compat/urllib/request.py index ad9fa83c87..dfc7f4a2dc 100644 --- 
a/yt_dlp/compat/urllib/request.py +++ b/yt_dlp/compat/urllib/request.py @@ -7,9 +7,9 @@ passthrough_module(__name__, 'urllib.request') del passthrough_module -from .. import compat_os_name +import os -if compat_os_name == 'nt': +if os.name == 'nt': # On older Python versions, proxies are extracted from Windows registry erroneously. [1] # If the https proxy in the registry does not have a scheme, urllib will incorrectly add https:// to it. [2] # It is unlikely that the user has actually set it to be https, so we should be fine to safely downgrade @@ -37,4 +37,4 @@ if compat_os_name == 'nt': def getproxies(): return getproxies_environment() or getproxies_registry_patched() -del compat_os_name +del os diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 4a69c576be..d5b0d3991b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -25,7 +25,6 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import compat_os_name from .dependencies import ( _SECRETSTORAGE_UNAVAILABLE_REASON, secretstorage, @@ -302,12 +301,18 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"') logger.debug(f'Extracting cookies from: "{cookie_database_path}"') - decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) - with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) + + # meta_version is necessary to determine if we need to trim the hash prefix from the cookies + # Ref: https://chromium.googlesource.com/chromium/src/+/b02dcebd7cafab92770734dc2bc317bd07f1d891/net/extras/sqlite/sqlite_persistent_cookie_store.cc#223 + meta_version = int(cursor.execute('SELECT value FROM meta WHERE key = "version"').fetchone()[0]) + decryptor = get_cookie_decryptor( + config['browser_dir'], config['keyring_name'], logger, + keyring=keyring, meta_version=meta_version) + cursor.connection.text_factory = bytes column_names = _get_column_names(cursor, 'cookies') secure_column = 'is_secure' if 'is_secure' in column_names else 'secure' @@ -337,7 +342,7 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.debug(f'cookie version breakdown: {counts}') return jar except PermissionError as error: - if compat_os_name == 'nt' and error.errno == 13: + if os.name == 'nt' and error.errno == 13: message = 'Could not copy Chrome cookie database. 
See https://github.com/yt-dlp/yt-dlp/issues/7271 for more info' logger.error(message) raise DownloadError(message) # force exit @@ -405,22 +410,23 @@ class ChromeCookieDecryptor: raise NotImplementedError('Must be implemented by sub classes') -def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): +def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None, meta_version=None): if sys.platform == 'darwin': - return MacChromeCookieDecryptor(browser_keyring_name, logger) + return MacChromeCookieDecryptor(browser_keyring_name, logger, meta_version=meta_version) elif sys.platform in ('win32', 'cygwin'): - return WindowsChromeCookieDecryptor(browser_root, logger) - return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) + return WindowsChromeCookieDecryptor(browser_root, logger, meta_version=meta_version) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring, meta_version=meta_version) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, logger, *, keyring=None): + def __init__(self, browser_keyring_name, logger, *, keyring=None, meta_version=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') self._empty_key = self.derive_key(b'') self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} self._browser_keyring_name = browser_keyring_name self._keyring = keyring + self._meta_version = meta_version or 0 @functools.cached_property def _v11_key(self): @@ -449,14 +455,18 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): if version == b'v10': self._cookie_counts['v10'] += 1 - return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key, self._empty_key), self._logger) + return _decrypt_aes_cbc_multi( + ciphertext, (self._v10_key, self._empty_key), self._logger, + hash_prefix=self._meta_version >= 24) elif version == b'v11': self._cookie_counts['v11'] += 1 if self._v11_key is None: self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc_multi(ciphertext, (self._v11_key, self._empty_key), self._logger) + return _decrypt_aes_cbc_multi( + ciphertext, (self._v11_key, self._empty_key), self._logger, + hash_prefix=self._meta_version >= 24) else: self._logger.warning(f'unknown cookie version: "{version}"', only_once=True) @@ -465,11 +475,12 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): class MacChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, logger): + def __init__(self, browser_keyring_name, logger, meta_version=None): self._logger = logger password = _get_mac_keyring_password(browser_keyring_name, logger) self._v10_key = None if password is None else self.derive_key(password) self._cookie_counts = {'v10': 0, 'other': 0} + self._meta_version = meta_version or 0 @staticmethod def derive_key(password): @@ -487,7 +498,8 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - return _decrypt_aes_cbc_multi(ciphertext, (self._v10_key,), self._logger) + return _decrypt_aes_cbc_multi( + ciphertext, (self._v10_key,), self._logger, hash_prefix=self._meta_version >= 24) else: self._cookie_counts['other'] += 1 @@ -497,10 +509,11 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_root, logger): + def __init__(self, browser_root, logger, 
meta_version=None): self._logger = logger self._v10_key = _get_windows_v10_key(browser_root, logger) self._cookie_counts = {'v10': 0, 'other': 0} + self._meta_version = meta_version or 0 def decrypt(self, encrypted_value): version = encrypted_value[:3] @@ -524,7 +537,9 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): ciphertext = raw_ciphertext[nonce_length:-authentication_tag_length] authentication_tag = raw_ciphertext[-authentication_tag_length:] - return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) + return _decrypt_aes_gcm( + ciphertext, self._v10_key, nonce, authentication_tag, self._logger, + hash_prefix=self._meta_version >= 24) else: self._cookie_counts['other'] += 1 @@ -1010,10 +1025,12 @@ def pbkdf2_sha1(password, salt, iterations, key_length): return hashlib.pbkdf2_hmac('sha1', password, salt, iterations, key_length) -def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16): +def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' * 16, hash_prefix=False): for key in keys: plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) try: + if hash_prefix: + return plaintext[32:].decode() return plaintext.decode() except UnicodeDecodeError: pass @@ -1021,7 +1038,7 @@ def _decrypt_aes_cbc_multi(ciphertext, keys, logger, initialization_vector=b' ' return None -def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): +def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger, hash_prefix=False): try: plaintext = aes_gcm_decrypt_and_verify_bytes(ciphertext, key, authentication_tag, nonce) except ValueError: @@ -1029,6 +1046,8 @@ def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): return None try: + if hash_prefix: + return plaintext[32:].decode() return plaintext.decode() except UnicodeDecodeError: logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. 
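Possibly the key is wrong?', only_once=True)

A note on the `hash_prefix` plumbing above: per the Chromium reference in the patch, cookie-store meta version 24 started prepending a 32-byte SHA-256 digest of the cookie's host key to the value before encryption, which is why `_decrypt_aes_cbc_multi` and `_decrypt_aes_gcm` now drop the first 32 bytes when `meta_version >= 24`. A standalone sketch of that post-decryption step (the helper name and the digest check are illustrative; the patch itself only trims):

```python
import hashlib

def strip_v24_prefix(plaintext: bytes, host_key: str, meta_version: int) -> str:
    # Chromium >= meta version 24 stores sha256(host_key) + value;
    # older meta versions store the cookie value directly.
    if meta_version >= 24:
        if plaintext[:32] != hashlib.sha256(host_key.encode()).digest():
            raise ValueError('prefix mismatch: wrong key or corrupted database')
        plaintext = plaintext[32:]
    return plaintext.decode()
```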
diff --git a/yt_dlp/dependencies/Cryptodome.py b/yt_dlp/dependencies/Cryptodome.py index 2cfa4c9522..0e4404d49e 100644 --- a/yt_dlp/dependencies/Cryptodome.py +++ b/yt_dlp/dependencies/Cryptodome.py @@ -24,7 +24,7 @@ try: from Crypto.Cipher import AES, PKCS1_OAEP, Blowfish, PKCS1_v1_5 # noqa: F401 from Crypto.Hash import CMAC, SHA1 # noqa: F401 from Crypto.PublicKey import RSA # noqa: F401 -except ImportError: +except (ImportError, OSError): __version__ = f'broken {__version__}'.strip() diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 2e3ea2fc4e..e8dcb37cc3 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -20,9 +20,7 @@ from ..utils import ( Namespace, RetryManager, classproperty, - decodeArgument, deprecation_warning, - encodeFilename, format_bytes, join_nonempty, parse_bytes, @@ -219,7 +217,7 @@ class FileDownloader: def temp_name(self, filename): """Returns a temporary filename for the given filename.""" if self.params.get('nopart', False) or filename == '-' or \ - (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): + (os.path.exists(filename) and not os.path.isfile(filename)): return filename return filename + '.part' @@ -273,7 +271,7 @@ class FileDownloader: """Try to set the last-modified time of the given file.""" if last_modified_hdr is None: return - if not os.path.isfile(encodeFilename(filename)): + if not os.path.isfile(filename): return timestr = last_modified_hdr if timestr is None: @@ -432,13 +430,13 @@ class FileDownloader: """ nooverwrites_and_exists = ( not self.params.get('overwrites', True) - and os.path.exists(encodeFilename(filename)) + and os.path.exists(filename) ) if not hasattr(filename, 'write'): continuedl_and_exists = ( self.params.get('continuedl', True) - and os.path.isfile(encodeFilename(filename)) + and os.path.isfile(filename) and not self.params.get('nopart', False) ) @@ -448,7 +446,7 @@ class FileDownloader: self._hook_progress({ 'filename': filename, 'status': 'finished', - 'total_bytes': os.path.getsize(encodeFilename(filename)), + 'total_bytes': os.path.getsize(filename), }, info_dict) self._finish_multiline_status() return True, False @@ -489,9 +487,7 @@ class FileDownloader: if not self.params.get('verbose', False): return - str_args = [decodeArgument(a) for a in args] - if exe is None: - exe = os.path.basename(str_args[0]) + exe = os.path.basename(args[0]) - self.write_debug(f'{exe} command line: {shell_quote(str_args)}') + self.write_debug(f'{exe} command line: {shell_quote(args)}') diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 6c1ec403c8..7f6b5b45cc 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -23,7 +23,6 @@ from ..utils import ( cli_valueless_option, determine_ext, encodeArgument, - encodeFilename, find_available_port, remove_end, traverse_obj, @@ -67,7 +66,7 @@ class ExternalFD(FragmentFD): 'elapsed': time.time() - started, } if filename != '-': - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -184,9 +183,9 @@ class ExternalFD(FragmentFD): dest.write(decrypt_fragment(fragment, src.read())) src.close() if not self.params.get('keep_fragments', False): - self.try_remove(encodeFilename(fragment_filename)) + self.try_remove(fragment_filename) dest.close() - 
self.try_remove(encodeFilename(f'{tmpfilename}.frag.urls')) + self.try_remove(f'{tmpfilename}.frag.urls') return 0 def _call_process(self, cmd, info_dict): @@ -620,7 +619,7 @@ class FFmpegFD(ExternalFD): args += self._configuration_args(('_o1', '_o', '')) args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + args.append(ffpp._ffmpeg_filename_argument(tmpfilename)) self._debug_cmd(args) piped = any(fmt['url'] in ('-', 'pipe:') for fmt in selected_formats) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 0d00196e2e..98784e7039 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -9,10 +9,9 @@ import time from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import compat_os_name from ..networking import Request from ..networking.exceptions import HTTPError, IncompleteRead -from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj +from ..utils import DownloadError, RetryManager, traverse_obj from ..utils.networking import HTTPHeaderDict from ..utils.progress import ProgressCalculator @@ -152,7 +151,7 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): self._write_ytdl_file(ctx) if not self.params.get('keep_fragments', False): - self.try_remove(encodeFilename(ctx['fragment_filename_sanitized'])) + self.try_remove(ctx['fragment_filename_sanitized']) del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): @@ -188,7 +187,7 @@ class FragmentFD(FileDownloader): }) if self.__do_ytdl_file(ctx): - ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + ytdl_file_exists = os.path.isfile(self.ytdl_filename(ctx['filename'])) continuedl = self.params.get('continuedl', True) if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) @@ -390,7 +389,7 @@ class FragmentFD(FileDownloader): def __exit__(self, exc_type, exc_val, exc_tb): pass - if compat_os_name == 'nt': + if os.name == 'nt': def future_result(future): while True: try: diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index c0165790d1..9c6dd8b799 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -15,7 +15,6 @@ from ..utils import ( ThrottledDownload, XAttrMetadataError, XAttrUnavailableError, - encodeFilename, int_or_none, parse_http_range, try_call, @@ -58,9 +57,8 @@ class HttpFD(FileDownloader): if self.params.get('continuedl', True): # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) + if os.path.isfile(ctx.tmpfilename): + ctx.resume_len = os.path.getsize(ctx.tmpfilename) ctx.is_resume = ctx.resume_len > 0 @@ -241,7 +239,7 @@ class HttpFD(FileDownloader): ctx.resume_len = byte_counter else: try: - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = os.path.getsize(ctx.tmpfilename) except FileNotFoundError: ctx.resume_len = 0 raise RetryDownload(e) diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py index d7ffb3b34d..1b831e5f30 100644 --- a/yt_dlp/downloader/rtmp.py +++ b/yt_dlp/downloader/rtmp.py @@ -8,7 +8,6 @@ from ..utils import ( Popen, check_executable, encodeArgument, - encodeFilename, get_exe_version, ) @@ -179,7 +178,7 @@ class RtmpFD(FileDownloader): return False while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: - prevsize 
= os.path.getsize(encodeFilename(tmpfilename)) + prevsize = os.path.getsize(tmpfilename) self.to_screen(f'[rtmpdump] Downloaded {prevsize} bytes') time.sleep(5.0) # This seems to be needed args = [*basic_args, '--resume'] @@ -187,7 +186,7 @@ class RtmpFD(FileDownloader): args += ['--skip', '1'] args = [encodeArgument(a) for a in args] retval = run_rtmpdump(args) - cursize = os.path.getsize(encodeFilename(tmpfilename)) + cursize = os.path.getsize(tmpfilename) if prevsize == cursize and retval == RD_FAILED: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those @@ -196,7 +195,7 @@ class RtmpFD(FileDownloader): retval = RD_SUCCESS break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.to_screen(f'[rtmpdump] Downloaded {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ diff --git a/yt_dlp/downloader/rtsp.py b/yt_dlp/downloader/rtsp.py index e89269fed9..b4b0be7e6e 100644 --- a/yt_dlp/downloader/rtsp.py +++ b/yt_dlp/downloader/rtsp.py @@ -2,7 +2,7 @@ import os import subprocess from .common import FileDownloader -from ..utils import check_executable, encodeFilename +from ..utils import check_executable class RtspFD(FileDownloader): @@ -26,7 +26,7 @@ class RtspFD(FileDownloader): retval = subprocess.call(args) if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) + fsize = os.path.getsize(tmpfilename) self.to_screen(f'\r[{args[0]}] {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1f8dfb4ec8..82106c3636 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -208,6 +208,10 @@ from .bandcamp import ( BandcampUserIE, BandcampWeeklyIE, ) +from .bandlab import ( + BandlabIE, + BandlabPlaylistIE, +) from .bannedvideo import BannedVideoIE from .bbc import ( BBCIE, @@ -278,6 +282,7 @@ from .bleacherreport import ( from .blerp import BlerpIE from .blogger import BloggerIE from .bloomberg import BloombergIE +from .bluesky import BlueskyIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .boosty import BoostyIE @@ -363,7 +368,10 @@ from .ccc import ( ) from .ccma import CCMAIE from .cctv import CCTVIE -from .cda import CDAIE +from .cda import ( + CDAIE, + CDAFolderIE, +) from .cellebrite import CellebriteIE from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE @@ -398,8 +406,6 @@ from .cmt import CMTIE from .cnbc import CNBCVideoIE from .cnn import ( CNNIE, - CNNArticleIE, - CNNBlogsIE, CNNIndonesiaIE, ) from .comedycentral import ( @@ -707,6 +713,7 @@ from .gab import ( GabTVIE, ) from .gaia import GaiaIE +from .gamedevtv import GameDevTVDashboardIE from .gamejolt import ( GameJoltCommunityIE, GameJoltGameIE, @@ -940,6 +947,10 @@ from .kaltura import KalturaIE from .kankanews import KankaNewsIE from .karaoketv import KaraoketvIE from .kelbyone import KelbyOneIE +from .kenh14 import ( + Kenh14PlaylistIE, + Kenh14VideoIE, +) from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, @@ -1129,12 +1140,6 @@ from .microsoftembed import ( MicrosoftMediusIE, ) from .microsoftstream import MicrosoftStreamIE -from .mildom import ( - MildomClipIE, - MildomIE, - MildomUserVodIE, - MildomVodIE, -) from .minds import ( MindsChannelIE, MindsGroupIE, @@ -1154,6 +1159,7 @@ from .mitele import MiTeleIE from .mixch import ( MixchArchiveIE, MixchIE, + MixchMovieIE, ) from 
.mixcloud import ( MixcloudIE, @@ -1515,8 +1521,8 @@ from .pgatour import PGATourIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pialive import PiaLiveIE from .piapro import PiaproIE -from .piaulizaportal import PIAULIZAPortalIE from .picarto import ( PicartoIE, PicartoVodIE, @@ -1552,10 +1558,6 @@ from .podbayfm import ( ) from .podchaser import PodchaserIE from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, -) from .pokergo import ( PokerGoCollectionIE, PokerGoIE, @@ -1646,6 +1648,7 @@ from .radiokapital import ( RadioKapitalIE, RadioKapitalShowIE, ) +from .radioradicale import RadioRadicaleIE from .radiozet import RadioZetPodcastIE from .radlive import ( RadLiveChannelIE, @@ -1937,9 +1940,7 @@ from .spotify import ( ) from .spreaker import ( SpreakerIE, - SpreakerPageIE, SpreakerShowIE, - SpreakerShowPageIE, ) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE @@ -2250,6 +2251,10 @@ from .ufctv import ( ) from .ukcolumn import UkColumnIE from .uktvplay import UKTVPlayIE +from .uliza import ( + UlizaPlayerIE, + UlizaPortalIE, +) from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE @@ -2278,10 +2283,6 @@ from .utreon import UtreonIE from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veo import VeoIE -from .veoh import ( - VeohIE, - VeohUserIE, -) from .vesti import VestiIE from .vevo import ( VevoIE, diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 66ab083fe0..b1343eed39 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -6,7 +6,6 @@ import hmac import io import json import re -import struct import time import urllib.parse import uuid @@ -18,10 +17,8 @@ from ..networking.exceptions import TransportError from ..utils import ( ExtractorError, OnDemandPagedList, - bytes_to_intlist, decode_base_n, int_or_none, - intlist_to_bytes, time_seconds, traverse_obj, update_url_query, @@ -72,15 +69,15 @@ class AbemaLicenseRH(RequestHandler): }) res = decode_base_n(license_response['k'], table=self._STRTABLE) - encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) + encvideokey = list(res.to_bytes(16, 'big')) h = hmac.new( binascii.unhexlify(self._HKEY), (license_response['cid'] + self.ie._DEVICE_ID).encode(), digestmod=hashlib.sha256) - enckey = bytes_to_intlist(h.digest()) + enckey = list(h.digest()) - return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) + return bytes(aes_ecb_decrypt(encvideokey, enckey)) class AbemaTVBaseIE(InfoExtractor): diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index c8a2613754..919e1d6af5 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -11,11 +11,9 @@ from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, ass_subtitles_timecode, - bytes_to_intlist, bytes_to_long, float_or_none, int_or_none, - intlist_to_bytes, join_nonempty, long_to_bytes, parse_iso8601, @@ -198,16 +196,16 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') self._K = ''.join(random.choices('0123456789abcdef', k=16)) - message = bytes_to_intlist(json.dumps({ + message = list(json.dumps({ 'k': self._K, 't': token, - })) + }).encode()) # Sometimes authentication fails for no good reason, retry with # a different random padding links_data 
= None for _ in range(3): - padded_message = intlist_to_bytes(pkcs1pad(message, 128)) + padded_message = bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 7cc15ec7b6..f1b8779271 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1362,7 +1362,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should en def _download_webpage_handle(self, *args, **kwargs): headers = self.geo_verification_headers() - headers.update(kwargs.get('headers', {})) + headers.update(kwargs.get('headers') or {}) kwargs['headers'] = headers return super()._download_webpage_handle( *args, **kwargs) diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 815d20537f..572d1a3893 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -33,21 +33,21 @@ class AfreecaTVBaseIE(InfoExtractor): } response = self._download_json( - 'https://login.afreecatv.com/app/LoginAction.php', None, + 'https://login.sooplive.co.kr/app/LoginAction.php', None, 'Logging in', data=urlencode_postdata(login_form)) _ERRORS = { -4: 'Your account has been suspended due to a violation of our terms and policies.', - -5: 'https://member.afreecatv.com/app/user_delete_progress.php', - -6: 'https://login.afreecatv.com/membership/changeMember.php', - -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", - -9: 'https://member.afreecatv.com/app/pop_login_block.php', - -11: 'https://login.afreecatv.com/afreeca/second_login.php', - -12: 'https://member.afreecatv.com/app/user_security.php', + -5: 'https://member.sooplive.co.kr/app/user_delete_progress.php', + -6: 'https://login.sooplive.co.kr/membership/changeMember.php', + -8: "Hello! Soop here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.sooplive.co.kr/app/pop_login_block.php', + -11: 'https://login.sooplive.co.kr/afreeca/second_login.php', + -12: 'https://member.sooplive.co.kr/app/user_security.php', 0: 'The username does not exist or you have entered the wrong password.', -1: 'The username does not exist or you have entered the wrong password.', -3: 'You have entered your username/password incorrectly.', - -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -7: 'You cannot use your Global Soop account to access Korean Soop.', -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', -32008: 'You have failed to log in. 
Please contact our Help Center.', } @@ -61,76 +61,48 @@ class AfreecaTVBaseIE(InfoExtractor): def _call_api(self, endpoint, display_id, data=None, headers=None, query=None): return self._download_json(Request( - f'https://api.m.afreecatv.com/{endpoint}', + f'https://api.m.sooplive.co.kr/{endpoint}', data=data, headers=headers, query=query, extensions={'legacy_ssl': True}), display_id, 'Downloading API JSON', 'Unable to download API JSON') + @staticmethod + def _fixup_thumb(thumb_url): + if not url_or_none(thumb_url): + return None + # Core would determine_ext as 'php' from the url, so we need to provide the real ext + # See: https://github.com/yt-dlp/yt-dlp/issues/11537 + return [{'url': thumb_url, 'ext': 'jpg'}] + class AfreecaTVIE(AfreecaTVBaseIE): - IE_NAME = 'afreecatv' - IE_DESC = 'afreecatv.com' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)? - (?: - /app/(?:index|read_ucc_bbs)\.cgi| - /player/[Pp]layer\.(?:swf|html) - )\?.*?\bnTitleNo=| - vod\.afreecatv\.com/(PLAYER/STATION|player)/ - ) - (?P<id>\d+)/?(?:$|[?#&]) - ''' + IE_NAME = 'soop' + IE_DESC = 'sooplive.co.kr' + _VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/(?:PLAYER/STATION|player)/(?P<id>\d+)/?(?:$|[?#&])' _TESTS = [{ - 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', - 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', + 'url': 'https://vod.sooplive.co.kr/player/96753363', 'info_dict': { - 'id': '36164052', + 'id': '20230108_9FF5BEE1_244432674_1', 'ext': 'mp4', - 'title': '데일리 에이프릴 요정들의 시상식!', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', - 'upload_date': '20160503', + 'uploader_id': 'rlantnghks', + 'uploader': '페이즈으', + 'duration': 10840, + 'thumbnail': r're:https?://videoimg\.sooplive\.co\.kr/.+', + 'upload_date': '20230108', + 'timestamp': 1673218805, + 'title': '젠지 페이즈', }, - 'skip': 'Video is gone', - }, { - 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', - 'info_dict': { - 'id': '36153164', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - 'uploader': 'dailyapril', - 'uploader_id': 'dailyapril', + 'params': { + 'skip_download': True, }, - 'playlist_count': 2, - 'playlist': [{ - 'md5': 'd8b7c174568da61d774ef0203159bf97', - 'info_dict': { - 'id': '36153164_1', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }, { - 'md5': '58f2ce7f6044e34439ab2d50612ab02b', - 'info_dict': { - 'id': '36153164_2', - 'ext': 'mp4', - 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", - 'upload_date': '20160502', - }, - }], - 'skip': 'Video is gone', }, { # non standard key - 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', + 'url': 'http://vod.sooplive.co.kr/PLAYER/STATION/20515605', 'info_dict': { 'id': '20170411_BE689A0E_190960999_1_2_h', 'ext': 'mp4', 'title': '혼자사는여자집', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', 'uploader': '♥이슬이', 'uploader_id': 'dasl8121', 'upload_date': '20170411', @@ -142,12 +114,12 @@ class AfreecaTVIE(AfreecaTVBaseIE): }, }, {
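The `_fixup_thumb` helper added above exists because Soop serves thumbnails from a `.php` path, so yt-dlp's `determine_ext` would guess `php` as the image extension; returning a `thumbnails` list with an explicit `ext` overrides that guess (see the linked issue 11537). A quick illustration — the URL below is made up, only the mechanism matters:

```python
from yt_dlp.utils import determine_ext

thumb = 'https://videoimg.sooplive.co.kr/thumb.php?key=abc'  # hypothetical URL
assert determine_ext(thumb) == 'php'         # the wrong guess being avoided
thumbnails = [{'url': thumb, 'ext': 'jpg'}]  # an explicit ext wins over the guess
```

# adult content - 'url': 'https://vod.afreecatv.com/player/97267690', + 'url': 'https://vod.sooplive.co.kr/player/97267690', 'info_dict': { 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', 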
'title': '[생]빨개요♥ (part 1)', - 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', 'uploader': '[SA]서아', 'uploader_id': 'bjdyrksu', 'upload_date': '20180327', @@ -157,36 +129,17 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'skip_download': True, }, 'skip': 'The VOD does not exist', - }, { - 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', - 'only_matching': True, - }, { - 'url': 'https://vod.afreecatv.com/player/96753363', - 'info_dict': { - 'id': '20230108_9FF5BEE1_244432674_1', - 'ext': 'mp4', - 'uploader_id': 'rlantnghks', - 'uploader': '페이즈으', - 'duration': 10840, - 'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+', - 'upload_date': '20230108', - 'timestamp': 1673218805, - 'title': '젠지 페이즈', - }, - 'params': { - 'skip_download': True, - }, }, { # adult content - 'url': 'https://vod.afreecatv.com/player/70395877', + 'url': 'https://vod.sooplive.co.kr/player/70395877', 'only_matching': True, }, { # subscribers only - 'url': 'https://vod.afreecatv.com/player/104647403', + 'url': 'https://vod.sooplive.co.kr/player/104647403', 'only_matching': True, }, { # private - 'url': 'https://vod.afreecatv.com/player/81669846', + 'url': 'https://vod.sooplive.co.kr/player/81669846', 'only_matching': True, }] @@ -209,8 +162,8 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'title': ('title', {str}), 'uploader': ('writer_nick', {str}), 'uploader_id': ('bj_id', {str}), - 'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}), - 'thumbnail': ('thumb', {url_or_none}), + 'duration': ('total_file_duration', {int_or_none(scale=1000)}), + 'thumbnails': ('thumb', {self._fixup_thumb}), }) entries = [] @@ -233,7 +186,7 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'title': f'{common_info.get("title") or "Untitled"} (part {file_num})', 'formats': formats, **traverse_obj(file_element, { - 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), + 'duration': ('duration', {int_or_none(scale=1000)}), 'timestamp': ('file_start', {unified_timestamp}), }), }) @@ -262,11 +215,11 @@ class AfreecaTVIE(AfreecaTVBaseIE): class AfreecaTVCatchStoryIE(AfreecaTVBaseIE): - IE_NAME = 'afreecatv:catchstory' - IE_DESC = 'afreecatv.com catch story' - _VALID_URL = r'https?://vod\.afreecatv\.com/player/(?P<id>\d+)/catchstory' + IE_NAME = 'soop:catchstory' + IE_DESC = 'sooplive.co.kr catch story' + _VALID_URL = r'https?://vod\.(?:sooplive\.co\.kr|afreecatv\.com)/player/(?P<id>\d+)/catchstory' _TESTS = [{ - 'url': 'https://vod.afreecatv.com/player/103247/catchstory', + 'url': 'https://vod.sooplive.co.kr/player/103247/catchstory', 'info_dict': { 'id': '103247', },
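A mechanical change repeated throughout this patch replaces `{functools.partial(int_or_none, scale=1000)}` with `{int_or_none(scale=1000)}` inside `traverse_obj` templates: as of the yt-dlp version this patch targets, the numeric helpers in `yt_dlp.utils` support partial application, so calling one with only keyword arguments returns a callable that still expects the value. Roughly:

```python
from yt_dlp.utils import int_or_none

to_seconds = int_or_none(scale=1000)  # no value given yet -> partially applied
assert to_seconds(10840000) == 10840  # same as int_or_none(10840000, scale=1000)
```

@@ -281,29 +234,28 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE): return self.playlist_result(self._entries(data), video_id) - @staticmethod - def _entries(data): + def _entries(self, data): # 'files' is always a list with 1 element yield from traverse_obj(data, ( 'data', lambda _, v: v['story_type'] == 'catch', 'catch_list', lambda _, v: v['files'][0]['file'], { 'id': ('files', 0, 'file_info_key', {str}), 'url': ('files', 0, 'file', {url_or_none}), - 'duration': ('files', 0, 'duration', {functools.partial(int_or_none, scale=1000)}), + 'duration': ('files', 0, 'duration', {int_or_none(scale=1000)}), 'title': ('title', {str}), 'uploader': ('writer_nick', {str}), 'uploader_id': ('writer_id', {str}), - 'thumbnail': ('thumb', {url_or_none}), + 'thumbnails': ('thumb', {self._fixup_thumb}), 'timestamp': 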
('write_timestamp', {int_or_none}), })) class AfreecaTVLiveIE(AfreecaTVBaseIE): - IE_NAME = 'afreecatv:live' - IE_DESC = 'afreecatv.com livestreams' - _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?' + IE_NAME = 'soop:live' + IE_DESC = 'sooplive.co.kr livestreams' + _VALID_URL = r'https?://play\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)(?:/(?P<bno>\d+))?' _TESTS = [{ - 'url': 'https://play.afreecatv.com/pyh3646/237852185', + 'url': 'https://play.sooplive.co.kr/pyh3646/237852185', 'info_dict': { 'id': '237852185', 'ext': 'mp4', @@ -315,30 +267,30 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): }, 'skip': 'Livestream has ended', }, { - 'url': 'https://play.afreecatv.com/pyh3646/237852185', + 'url': 'https://play.sooplive.co.kr/pyh3646/237852185', 'only_matching': True, }, { - 'url': 'https://play.afreecatv.com/pyh3646', + 'url': 'https://play.sooplive.co.kr/pyh3646', 'only_matching': True, }] - _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php' + _LIVE_API_URL = 'https://live.sooplive.co.kr/afreeca/player_live_api.php' _WORKING_CDNS = [ - 'gcp_cdn', # live-global-cdn-v02.afreecatv.com - 'gs_cdn_pc_app', # pc-app.stream.afreecatv.com - 'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com - 'gs_cdn_pc_web', # pc-web.stream.afreecatv.com + 'gcp_cdn', # live-global-cdn-v02.sooplive.co.kr + 'gs_cdn_pc_app', # pc-app.stream.sooplive.co.kr + 'gs_cdn_mobile_web', # mobile-web.stream.sooplive.co.kr + 'gs_cdn_pc_web', # pc-web.stream.sooplive.co.kr ] _BAD_CDNS = [ 'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve) - 'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400) - 'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve) - 'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve) - 'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400) + 'gs_cdn_chromecast', # chromecast.stream.sooplive.co.kr (HTTP Error 400) + 'azure_cdn', # live-global-cdn-v01.sooplive.co.kr (cannot resolve) + 'aws_cf', # live-global-cdn-v03.sooplive.co.kr (cannot resolve) + 'kt_cdn', # kt.stream.sooplive.co.kr (HTTP Error 400) ] def _extract_formats(self, channel_info, broadcast_no, aid): - stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' + stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.sooplive.co.kr' # If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs default_cdn_ids = orderedSet([ @@ -358,7 +310,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): try: return self._extract_m3u8_formats( m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid}, - headers={'Referer': 'https://play.afreecatv.com/'}) + headers={'Referer': 'https://play.sooplive.co.kr/'}) except ExtractorError as e: if attempt == len(cdn_ids): raise @@ -374,7 +326,13 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): broadcaster_id = channel_info.get('BJID') or broadcaster_id broadcast_no = channel_info.get('BNO') or broadcast_no if not broadcast_no: - raise UserNotLive(video_id=broadcaster_id) + result = channel_info.get('RESULT') + if result == 0: + raise UserNotLive(video_id=broadcaster_id) + elif result == -6: + self.raise_login_required( + 'This channel is streaming for subscribers only', method='password') + raise ExtractorError('Unable to extract broadcast number') password = self.get_param('videopassword') if channel_info.get('BPWD') == 'Y' and password is None:
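The CDN handling above tries each candidate CDN in turn and only propagates the error once the last one fails. A minimal sketch of the pattern (names are illustrative, not from the patch):

```python
def extract_with_fallback(cdn_ids, extract_formats):
    # Try CDNs in priority order; re-raise only on the final attempt.
    for attempt, cdn_id in enumerate(cdn_ids, start=1):
        try:
            return extract_formats(cdn_id)
        except Exception:
            if attempt == len(cdn_ids):
                raise
```

@@ -403,7 +361,7 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): formats = self._extract_formats(channel_info, 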
broadcast_no, aid) station_info = traverse_obj(self._download_json( - 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, + 'https://st.sooplive.co.kr/api/get_station_status.php', broadcast_no, 'Downloading channel metadata', 'Unable to download channel metadata', query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {} @@ -419,11 +377,11 @@ class AfreecaTVLiveIE(AfreecaTVBaseIE): } -class AfreecaTVUserIE(InfoExtractor): - IE_NAME = 'afreecatv:user' - _VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?' +class AfreecaTVUserIE(AfreecaTVBaseIE): + IE_NAME = 'soop:user' + _VALID_URL = r'https?://ch\.(?:sooplive\.co\.kr|afreecatv\.com)/(?P<id>[^/?#]+)/vods/?(?P<slug_type>[^/?#]+)?' _TESTS = [{ - 'url': 'https://bj.afreecatv.com/ryuryu24/vods/review', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/review', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -431,7 +389,7 @@ }, 'playlist_count': 218, }, { - 'url': 'https://bj.afreecatv.com/parang1995/vods/highlight', + 'url': 'https://ch.sooplive.co.kr/parang1995/vods/highlight', 'info_dict': { '_type': 'playlist', 'id': 'parang1995', @@ -439,7 +397,7 @@ }, 'playlist_count': 997, }, { - 'url': 'https://bj.afreecatv.com/ryuryu24/vods', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -447,7 +405,7 @@ }, 'playlist_count': 221, }, { - 'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip', + 'url': 'https://ch.sooplive.co.kr/ryuryu24/vods/balloonclip', 'info_dict': { '_type': 'playlist', 'id': 'ryuryu24', @@ -459,12 +417,12 @@ def _fetch_page(self, user_id, user_type, page): page += 1 - info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id, + info = self._download_json(f'https://chapi.sooplive.co.kr/api/{user_id}/vods/{user_type}', user_id, query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'}, note=f'Downloading {user_type} video page {page}') for item in info['data']: yield self.url_result( - f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) + f'https://vod.sooplive.co.kr/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) def _real_extract(self, url): user_id, user_type = self._match_valid_url(url).group('id', 'slug_type') diff --git a/yt_dlp/extractor/allstar.py b/yt_dlp/extractor/allstar.py index 5ea1c30e3d..697d83c1e5 100644 --- a/yt_dlp/extractor/allstar.py +++ b/yt_dlp/extractor/allstar.py @@ -71,7 +71,7 @@ class AllstarBaseIE(InfoExtractor): 'thumbnails': (('clipImageThumb', 'clipImageSource'), {'url': {media_url_or_none}}), 'duration': ('clipLength', {int_or_none}), 'filesize': ('clipSizeBytes', {int_or_none}), - 'timestamp': ('createdDate', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('createdDate', {int_or_none(scale=1000)}), 'uploader': ('username', {str}), 'uploader_id': ('user', '_id', {str}), 'view_count': ('views', {int_or_none}), diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index bf3d60b5ee..bd3b19b133 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -8,10 +8,8 @@ import time from .common import InfoExtractor from ..aes import aes_encrypt from ..utils import ( - bytes_to_intlist, determine_ext, int_or_none, - intlist_to_bytes, join_nonempty, smuggle_url, strip_jsonp,
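The removal of `bytes_to_intlist`/`intlist_to_bytes` here (and in abematv.py/adn.py earlier in this diff) leans on those helpers having been thin wrappers over builtins:

```python
import struct

data = b'yt-dlp'
assert list(data) == [121, 116, 45, 100, 108, 112]  # replaces bytes_to_intlist(data)
assert bytes(list(data)) == data                    # replaces intlist_to_bytes(...)
# and for abematv's 128-bit key, int.to_bytes replaces the struct.pack dance:
res = 0x0123456789ABCDEF0123456789ABCDEF
assert res.to_bytes(16, 'big') == struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)
```

@@ -33,24 +31,6 @@ class 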
AnvatoIE(InfoExtractor): _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js _TESTS = [{ - # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14 - 'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', - 'md5': '921919dab3cd0b849ff3d624831ae3e2', - 'info_dict': { - 'id': '899441', - 'ext': 'mp4', - 'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14', - 'description': 'md5:85e05a3cc163f8c344340f220521136d', - 'upload_date': '20201215', - 'timestamp': 1608009755, - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'NFL', - 'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights', - 'Player Highlights', 'Cleveland Browns', 'league'], - 'duration': 157, - 'categories': ['Entertainment', 'Game', 'Highlights'], - }, - }, { # from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/ 'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455', 'md5': '837718bcfb3a7778d022f857f7a9b19e', @@ -241,31 +221,6 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582', } - def _generate_nfl_token(self, anvack, mcp_id): - reroute = self._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}, note='Fetching token info') - token_type = reroute.get('token_type') or 'Bearer' - auth_token = f'{token_type} {reroute["access_token"]}' - response = self._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), # noqa: UP031 - }).encode(), headers={ - 'Authorization': auth_token, - 'Content-Type': 'application/json', - }, note='Fetching NFL API token') - return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token')) - - _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token, - } - def _server_time(self, access_key, video_id): return int_or_none(traverse_obj(self._download_json( f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key}, @@ -277,8 +232,8 @@ class AnvatoIE(InfoExtractor): server_time = self._server_time(access_key, video_id) input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' - auth_secret = intlist_to_bytes(aes_encrypt( - bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + auth_secret = bytes(aes_encrypt( + list(input_data[:64].encode()), list(self._AUTH_KEY))) query = { 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), 'rtyp': 'fp', @@ -290,8 +245,6 @@ class AnvatoIE(InfoExtractor): } if extracted_token is not None: api['anvstk2'] = extracted_token - elif self._TOKEN_GENERATORS.get(access_key) is not None: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id) elif self._ANVACK_TABLE.get(access_key) is not None: api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}') else: diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index f5a55efc4f..2849d9fd5b 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor): }, }, ], + }, { + # The reviewbody is None for one of the reviews; just need to extract data without crashing + 'url': 
'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn', + 'info_dict': { + 'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn', + 'ext': 'mp3', + 'title': 'Stuck Inside of Mobile with the Memphis Blues Again', + 'creators': ['Grateful Dead'], + 'duration': 338.31, + 'track': 'Stuck Inside of Mobile with the Memphis Blues Again', + 'description': 'md5:764348a470b986f1217ffd38d6ac7b72', + 'display_id': 'gd95-04-02d1t04.shn', + 'location': 'Pyramid Arena', + 'uploader': 'jon@archive.org', + 'album': '1995-04-02 - Pyramid Arena', + 'upload_date': '20040519', + 'track_number': 4, + 'release_date': '19950402', + 'timestamp': 1084927901, + }, }] @staticmethod @@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor): info['comments'].append({ 'id': review.get('review_id'), 'author': review.get('reviewer'), - 'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), + 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'), 'timestamp': unified_timestamp(review.get('createdate')), 'parent': 'root'}) diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index efc79dd141..89d3299213 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -299,7 +299,7 @@ class ARDBetaMediathekIE(InfoExtractor): 'info_dict': { 'id': '94834686', 'ext': 'mp4', - 'duration': 2700, + 'duration': 2670, 'episode': '7 Tage ... unter harten Jungs', 'description': 'md5:0f215470dcd2b02f59f4bd10c963f072', 'upload_date': '20231005', @@ -307,10 +307,28 @@ class ARDBetaMediathekIE(InfoExtractor): 'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3', 'series': '7 Tage ...', 'channel': 'HR', - 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:430c86d233afa42d?w=960&ch=fa32ba69bc87989a', 'title': '7 Tage ... 
unter harten Jungs', '_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'], }, + }, { + 'url': 'https://www.ardmediathek.de/video/lokalzeit-aus-duesseldorf/lokalzeit-aus-duesseldorf-oder-31-10-2024/wdr-duesseldorf/Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz', + 'info_dict': { + 'id': '13847165', + 'chapters': 'count:8', + 'ext': 'mp4', + 'channel': 'WDR', + 'display_id': 'Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz', + 'episode': 'Lokalzeit aus Düsseldorf | 31.10.2024', + 'series': 'Lokalzeit aus Düsseldorf', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f02ec9bd9b7bd5f6?w=960&ch=612491dcd5e09b0c', + 'title': 'Lokalzeit aus Düsseldorf | 31.10.2024', + 'upload_date': '20241031', + 'timestamp': 1730399400, + 'description': 'md5:12db30b3b706314efe3778b8df1a7058', + 'duration': 1759, + '_old_archive_ids': ['ardbetamediathek Y3JpZDovL3dkci5kZS9CZWl0cmFnLXNvcGhvcmEtOWFkMTc0ZWMtMDA5ZS00ZDEwLWFjYjctMGNmNTdhNzVmNzUz'], + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -455,6 +473,12 @@ class ARDBetaMediathekIE(InfoExtractor): 'subtitles': subtitles, 'is_live': is_live, 'age_limit': age_limit, + **traverse_obj(media_data, { + 'chapters': ('pluginData', 'jumpmarks@all', 'chapterArray', lambda _, v: int_or_none(v['chapterTime']), { + 'start_time': ('chapterTime', {int_or_none}), + 'title': ('chapterTitle', {str}), + }), + }), **traverse_obj(media_data, ('meta', { 'title': 'title', 'description': 'synopsis', diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 0abe059829..939c2800e6 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -1,4 +1,3 @@ -import functools import json import random import re @@ -10,7 +9,6 @@ from ..utils import ( ExtractorError, extract_attributes, float_or_none, - get_element_html_by_id, int_or_none, parse_filesize, str_or_none, @@ -21,7 +19,7 @@ from ..utils import ( url_or_none, urljoin, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import find_element, traverse_obj class BandcampIE(InfoExtractor): @@ -45,6 +43,8 @@ class BandcampIE(InfoExtractor): 'uploader_url': 'https://youtube-dl.bandcamp.com', 'uploader_id': 'youtube-dl', 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg', + 'artists': ['youtube-dl "\'/\\ä↭'], + 'album_artists': ['youtube-dl "\'/\\ä↭'], }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, { @@ -271,6 +271,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'timestamp': 1311756226, 'upload_date': '20110727', 'uploader': 'Blazo', + 'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg', + 'album_artists': ['Blazo'], + 'uploader_url': 'https://blazo.bandcamp.com', + 'release_date': '20110727', + 'release_timestamp': 1311724800.0, + 'track': 'Intro', + 'uploader_id': 'blazo', + 'track_number': 1, + 'album': 'Jazz Format Mixtape vol.1', + 'artists': ['Blazo'], + 'duration': 19.335, + 'track_id': '1353101989', }, }, { @@ -282,6 +294,18 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'timestamp': 1311757238, 'upload_date': '20110727', 'uploader': 'Blazo', + 'track': 'Kero One - Keep It Alive (Blazo remix)', + 'release_date': '20110727', + 'track_id': '38097443', + 'track_number': 2, + 'duration': 181.467, + 
'uploader_url': 'https://blazo.bandcamp.com', + 'album': 'Jazz Format Mixtape vol.1', + 'uploader_id': 'blazo', + 'album_artists': ['Blazo'], + 'artists': ['Blazo'], + 'thumbnail': 'https://f4.bcbits.com/img/a1721150828_5.jpg', + 'release_timestamp': 1311724800.0, }, }, ], @@ -289,6 +313,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'title': 'Jazz Format Mixtape vol.1', 'id': 'jazz-format-mixtape-vol-1', 'uploader_id': 'blazo', + 'description': 'md5:38052a93217f3ffdc033cd5dbbce2989', }, 'params': { 'playlistend': 2, @@ -363,10 +388,10 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ 'url': 'https://bandcamp.com/?show=224', - 'md5': 'b00df799c733cf7e0c567ed187dea0fd', + 'md5': '61acc9a002bed93986b91168aa3ab433', 'info_dict': { 'id': '224', - 'ext': 'opus', + 'ext': 'mp3', 'title': 'BC Weekly April 4th 2017 - Magic Moments', 'description': 'md5:5d48150916e8e02d030623a48512c874', 'duration': 5829.77, @@ -376,7 +401,7 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE 'episode_id': '224', }, 'params': { - 'format': 'opus-lo', + 'format': 'mp3-128', }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', @@ -484,7 +509,7 @@ class BandcampUserIE(InfoExtractor): or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) yield from traverse_obj(webpage, ( - {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes}, + {find_element(id='music-grid', html=True)}, {extract_attributes}, 'data-client-items', {json.loads}, ..., 'page_url', {str})) def _real_extract(self, url): @@ -493,4 +518,4 @@ class BandcampUserIE(InfoExtractor): return self.playlist_from_matches( self._yield_items(webpage), uploader, f'Discography of {uploader}', - getter=functools.partial(urljoin, url)) + getter=urljoin(url)) diff --git a/yt_dlp/extractor/bandlab.py b/yt_dlp/extractor/bandlab.py new file mode 100644 index 0000000000..64aa2ba70d --- /dev/null +++ b/yt_dlp/extractor/bandlab.py @@ -0,0 +1,437 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + format_field, + int_or_none, + parse_iso8601, + parse_qs, + truncate_string, + url_or_none, +) +from ..utils.traversal import traverse_obj, value + + +class BandlabBaseIE(InfoExtractor): + def _call_api(self, endpoint, asset_id, **kwargs): + headers = kwargs.pop('headers', None) or {} + return self._download_json( + f'https://www.bandlab.com/api/v1.3/{endpoint}/{asset_id}', + asset_id, headers={ + 'accept': 'application/json', + 'referer': 'https://www.bandlab.com/', + 'x-client-id': 'BandLab-Web', + 'x-client-version': '10.1.124', + **headers, + }, **kwargs)
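The `webpage_url` path in `_parse_revision` below compresses several traversal features into one expression: `value(url)` injects a constant branch, `format_field(template=...)` is a partially applied formatter that builds a URL from the id, `filter` drops falsy branch results, and `any` keeps the first survivor. A toy run of the same shape (assuming yt-dlp's utils; `None` stands in for a missing original URL):

```python
from yt_dlp.utils import format_field
from yt_dlp.utils.traversal import traverse_obj, value

post = {'id': 'abc-123'}
webpage_url = traverse_obj(post, (
    'id', ({value(None)}, {format_field(template='https://www.bandlab.com/revision/%s')}),
    filter, any))
# The constant branch is empty, so the URL built from the id wins
assert webpage_url == 'https://www.bandlab.com/revision/abc-123'
```

+ + def _parse_revision(self, revision_data, url=None): + return { + 'vcodec': 'none', + 'media_type': 'revision', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(revision_data, { + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/revision/%s')}), filter, any), + 'id': (('revisionId', 'id'), {str}, any), + 'title': ('song', 'name', {str}), + 'track': ('song', 'name', {str}), + 'url': ('mixdown', 'file', {url_or_none}), + 'thumbnail': ('song', 'picture', 'url', {url_or_none}), + 'description': ('description', {str}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + 'duration': ('mixdown', 'duration', 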
{float_or_none}), + 'view_count': ('counters', 'plays', {int_or_none}), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'genres': ('genres', ..., 'name', {str}), + }), + } + + def _parse_track(self, track_data, url=None): + return { + 'vcodec': 'none', + 'media_type': 'track', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(track_data, { + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any), + 'id': (('revisionId', 'id'), {str}, any), + 'url': ('track', 'sample', 'audioUrl', {url_or_none}), + 'title': ('track', 'name', {str}), + 'track': ('track', 'name', {str}), + 'description': ('caption', {str}), + 'thumbnail': ('track', 'picture', ('original', 'url'), {url_or_none}, any), + 'view_count': ('counters', 'plays', {int_or_none}), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'duration': ('track', 'sample', 'duration', {float_or_none}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + }), + } + + def _parse_video(self, video_data, url=None): + return { + 'media_type': 'video', + 'extractor_key': BandlabIE.ie_key(), + 'extractor': BandlabIE.IE_NAME, + **traverse_obj(video_data, { + 'id': ('id', {str}), + 'webpage_url': ( + 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any), + 'url': ('video', 'url', {url_or_none}), + 'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}), + 'description': ('caption', {str}), + 'thumbnail': ('video', 'picture', 'url', {url_or_none}), + 'view_count': ('video', 'counters', 'plays', {int_or_none}), + 'like_count': ('video', 'counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'duration': ('video', 'duration', {float_or_none}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + }), + } + + +class BandlabIE(BandlabBaseIE): + _VALID_URL = [ + r'https?://(?:www\.)?bandlab.com/(?P<url_type>track|post|revision)/(?P<id>[\da-f_-]+)', + r'https?://(?:www\.)?bandlab.com/(?P<url_type>embed)/\?(?:[^#]*&)?id=(?P<id>[\da-f-]+)', + ] + _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL[1]})[\'"]'] + _TESTS = [{ + 'url': 'https://www.bandlab.com/track/04b37e88dba24967b9dac8eb8567ff39_07d7f906fc96ee11b75e000d3a428fff', + 'md5': '46f7b43367dd268bbcf0bbe466753b2c', + 'info_dict': { + 'id': '02d7f906-fc96-ee11-b75e-000d3a428fff', + 'ext': 'm4a', + 'uploader_id': 'ender_milze', + 'track': 'sweet black', + 'description': 'composed by juanjn3737', + 'timestamp': 1702171963, + 'view_count': int, + 'like_count': int, + 'duration': 54.629999999999995, + 'title': 'sweet black', + 'upload_date': '20231210', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'genres': ['Lofi'], + 'uploader': 'ender milze', + 'comment_count': int, + 'media_type': 'revision', + }, + }, { + # Same track as above but post URL + 'url': 'https://www.bandlab.com/post/07d7f906-fc96-ee11-b75e-000d3a428fff', + 'md5': '46f7b43367dd268bbcf0bbe466753b2c', + 'info_dict': { + 'id': '02d7f906-fc96-ee11-b75e-000d3a428fff', + 'ext': 'm4a', + 'uploader_id': 'ender_milze', + 'track': 'sweet black', + 'description': 'composed by juanjn3737', + 'timestamp': 1702171973, + 'view_count': int, + 'like_count': int, + 'duration': 
54.629999999999995, + 'title': 'sweet black', + 'upload_date': '20231210', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'genres': ['Lofi'], + 'uploader': 'ender milze', + 'comment_count': int, + 'media_type': 'revision', + }, + }, { + # SharedKey Example + 'url': 'https://www.bandlab.com/track/048916c2-c6da-ee11-85f9-6045bd2e11f9?sharedKey=0NNWX8qYAEmI38lWAzCNDA', + 'md5': '15174b57c44440e2a2008be9cae00250', + 'info_dict': { + 'id': '038916c2-c6da-ee11-85f9-6045bd2e11f9', + 'ext': 'm4a', + 'comment_count': int, + 'genres': ['Other'], + 'uploader_id': 'user8353034818103753', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/', + 'timestamp': 1709625771, + 'track': 'PodcastMaerchen4b', + 'duration': 468.14, + 'view_count': int, + 'description': 'Podcast: Neues aus der Märchenwelt', + 'like_count': int, + 'upload_date': '20240305', + 'uploader': 'Erna Wageneder', + 'title': 'PodcastMaerchen4b', + 'media_type': 'revision', + }, + }, { + # Different Revision selected + 'url': 'https://www.bandlab.com/track/130343fc-148b-ea11-96d2-0003ffd1fc09?revId=110343fc-148b-ea11-96d2-0003ffd1fc09', + 'md5': '74e055ef9325d63f37088772fbfe4454', + 'info_dict': { + 'id': '110343fc-148b-ea11-96d2-0003ffd1fc09', + 'ext': 'm4a', + 'timestamp': 1588273294, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/', + 'description': 'Final Revision.', + 'title': 'Replay ( Instrumental)', + 'uploader': 'David R Sparks', + 'uploader_id': 'davesnothome69', + 'view_count': int, + 'comment_count': int, + 'track': 'Replay ( Instrumental)', + 'genres': ['Rock'], + 'upload_date': '20200430', + 'like_count': int, + 'duration': 279.43, + 'media_type': 'revision', + }, + }, { + # Video + 'url': 'https://www.bandlab.com/post/5cdf9036-3857-ef11-991a-6045bd36e0d9', + 'md5': '8caa2ef28e86c1dacf167293cfdbeba9', + 'info_dict': { + 'id': '5cdf9036-3857-ef11-991a-6045bd36e0d9', + 'ext': 'mp4', + 'duration': 44.705, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', + 'comment_count': int, + 'title': 'backing vocals', + 'uploader_id': 'marliashya', + 'uploader': 'auraa', + 'like_count': int, + 'description': 'backing vocals', + 'media_type': 'video', + }, + }, { + # Embed Example + 'url': 'https://www.bandlab.com/embed/?blur=false&id=014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'md5': 'a4ad05cb68c54faaed9b0a8453a8cf4a', + 'info_dict': { + 'id': '014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'ext': 'm4a', + 'comment_count': int, + 'genres': ['Electronic'], + 'uploader': 'Charlie Henson', + 'timestamp': 1587328674, + 'upload_date': '20200419', + 'view_count': int, + 'track': 'Positronic Meltdown', + 'duration': 318.55, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', + 'description': 'Checkout my tracks at AOMX http://aomxsounds.com/', + 'uploader_id': 'microfreaks', + 'title': 'Positronic Meltdown', + 'like_count': int, + 'media_type': 'revision', + }, + }, { + # Track without revisions available + 'url': 'https://www.bandlab.com/track/55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5', + 'md5': 'f05d68a3769952c2d9257c473e14c15f', + 'info_dict': { + 'id': '55767ac51789ea11a94c0003ffd1fc09_2f007b0a37b94ec7a69bc25ae15108a5', + 'ext': 'm4a', + 'track': 'insame', + 'like_count': int, + 'duration': 84.03, + 'title': 'insame', + 'view_count': int, + 'comment_count': int, + 
'uploader': 'Sorakime', + 'uploader_id': 'sorakime', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', + 'timestamp': 1691162128, + 'upload_date': '20230804', + 'media_type': 'track', + }, + }, { + 'url': 'https://www.bandlab.com/revision/014de0a4-7d82-ea11-a94c-0003ffd19c0f', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://phantomluigi.github.io/', + 'info_dict': { + 'id': 'e14223c3-7871-ef11-bdfd-000d3a980db3', + 'ext': 'm4a', + 'view_count': int, + 'upload_date': '20240913', + 'uploader_id': 'phantommusicofficial', + 'timestamp': 1726194897, + 'uploader': 'Phantom', + 'comment_count': int, + 'genres': ['Progresive Rock'], + 'description': 'md5:a38cd668f7a2843295ef284114f18429', + 'duration': 225.23, + 'like_count': int, + 'title': 'Vermilion Pt. 2 (Cover)', + 'track': 'Vermilion Pt. 2 (Cover)', + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/62b10750-7aef-4f42-ad08-1af52f577e97/', + 'media_type': 'revision', + }, + }] + + def _real_extract(self, url): + display_id, url_type = self._match_valid_url(url).group('id', 'url_type') + + qs = parse_qs(url) + revision_id = traverse_obj(qs, (('revId', 'id'), 0, any)) + if url_type == 'revision': + revision_id = display_id + + revision_data = None + if not revision_id: + post_data = self._call_api( + 'posts', display_id, note='Downloading post data', + query=traverse_obj(qs, {'sharedKey': ('sharedKey', 0)})) + + revision_id = traverse_obj(post_data, (('revisionId', ('revision', 'id')), {str}, any)) + revision_data = traverse_obj(post_data, ('revision', {dict})) + + if not revision_data and not revision_id: + post_type = post_data.get('type') + if post_type == 'Video': + return self._parse_video(post_data, url=url) + if post_type == 'Track': + return self._parse_track(post_data, url=url) + raise ExtractorError(f'Could not extract data for post type {post_type!r}') + + if not revision_data: + revision_data = self._call_api( + 'revisions', revision_id, note='Downloading revision data', query={'edit': 'false'}) + + return self._parse_revision(revision_data, url=url) + + +class BandlabPlaylistIE(BandlabBaseIE): + _VALID_URL = [ + r'https?://(?:www\.)?bandlab.com/(?:[\w]+/)?(?P<type>albums|collections)/(?P<id>[\da-f-]+)', + r'https?://(?:www\.)?bandlab.com/(?P<type>embed)/collection/\?(?:[^#]*&)?id=(?P<id>[\da-f-]+)', + ] + _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL[1]})[\'"]'] + _TESTS = [{ + 'url': 'https://www.bandlab.com/davesnothome69/albums/89b79ea6-de42-ed11-b495-00224845aac7', + 'info_dict': { + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/69507ff3-579a-45be-afca-9e87eddec944/', + 'release_date': '20221003', + 'title': 'Remnants', + 'album': 'Remnants', + 'like_count': int, + 'album_type': 'LP', + 'description': 'A collection of some feel good, rock hits.', + 'comment_count': int, + 'view_count': int, + 'id': '89b79ea6-de42-ed11-b495-00224845aac7', + 'uploader': 'David R Sparks', + 'uploader_id': 'davesnothome69', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.bandlab.com/slytheband/collections/955102d4-1040-ef11-86c3-000d3a42581b', + 'info_dict': { + 'id': '955102d4-1040-ef11-86c3-000d3a42581b', + 'timestamp': 1720762659, + 'view_count': int, + 'title': 'My Shit 🖤', + 'uploader_id': 'slytheband', + 'uploader': '𝓢𝓛𝓨', + 'upload_date': '20240712', + 'like_count': int, + 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/collections/2c64ca12-b180-4b76-8587-7a8da76bddc8/', + }, + 'playlist_count': 15, + }, {
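Since embed URLs don't reveal whether an id is an album or a collection, `_real_extract` below probes both endpoints and keeps the first response without an `errorCode` — the comment in the next test documents exactly this ambiguity. The pattern, reduced to a sketch (function names are illustrative, not from the patch):

```python
def resolve_playlist(call_api, playlist_id, playlist_type):
    endpoints = {
        'albums': ['albums'],
        'collections': ['collections'],
        'embed': ['collections', 'albums'],  # ambiguous: probe both
    }[playlist_type]
    for endpoint in endpoints:
        data = call_api(endpoint, playlist_id, fatal=False, expected_status=404)
        if not data.get('errorCode'):
            return endpoint, data
    raise LookupError(f'Could not find playlist data: {data["errorCode"]!r}')
```

+ # Embeds can contain both albums 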
+ # Embeds can contain both albums and collections with the same URL pattern. This is an album + 'url': 'https://www.bandlab.com/embed/collection/?id=12cc6f7f-951b-ee11-907c-00224844f303', + 'info_dict': { + 'id': '12cc6f7f-951b-ee11-907c-00224844f303', + 'release_date': '20230706', + 'description': 'This is a collection of songs I created when I had an Amiga computer.', + 'view_count': int, + 'title': 'Mark Salud The Amiga Collection', + 'uploader_id': 'mssirmooth1962', + 'comment_count': int, + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/d618bd7b-0537-40d5-bdd8-61b066e77d59/', + 'like_count': int, + 'uploader': 'Mark Salud', + 'album': 'Mark Salud The Amiga Collection', + 'album_type': 'LP', + }, + 'playlist_count': 24, + }, { + # Tracks without revision id + 'url': 'https://www.bandlab.com/embed/collection/?id=e98aafb5-d932-ee11-b8f0-00224844c719', + 'info_dict': { + 'like_count': int, + 'uploader_id': 'sorakime', + 'comment_count': int, + 'uploader': 'Sorakime', + 'view_count': int, + 'description': 'md5:4ec31c568a5f5a5a2b17572ea64c3825', + 'release_date': '20230812', + 'title': 'Art', + 'album': 'Art', + 'album_type': 'Album', + 'id': 'e98aafb5-d932-ee11-b8f0-00224844c719', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.3/albums/20c890de-e94a-4422-828a-2da6377a13c8/', + }, + 'playlist_count': 13, + }, { + 'url': 'https://www.bandlab.com/albums/89b79ea6-de42-ed11-b495-00224845aac7', + 'only_matching': True, + }] + + def _entries(self, album_data): + for post in traverse_obj(album_data, ('posts', lambda _, v: v['type'])): + post_type = post['type'] + if post_type == 'Revision': + yield self._parse_revision(post.get('revision')) + elif post_type == 'Track': + yield self._parse_track(post) + elif post_type == 'Video': + yield self._parse_video(post) + else: + self.report_warning(f'Skipping unknown post type: "{post_type}"') + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + endpoints = { + 'albums': ['albums'], + 'collections': ['collections'], + 'embed': ['collections', 'albums'], + }.get(playlist_type) + for endpoint in endpoints: + playlist_data = self._call_api( + endpoint, playlist_id, note=f'Downloading {endpoint[:-1]} data', + fatal=False, expected_status=404) + if not playlist_data.get('errorCode'): + playlist_type = endpoint + break + if error_code := playlist_data.get('errorCode'): + raise ExtractorError(f'Could not find playlist data. Error code: "{error_code}"') + + return self.playlist_result( + self._entries(playlist_data), playlist_id, + **traverse_obj(playlist_data, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'uploader': ('creator', 'name', {str}), + 'uploader_id': ('creator', 'username', {str}), + 'timestamp': ('createdOn', {parse_iso8601}), + 'release_date': ('releaseDate', {lambda x: x.replace('-', '')}, filter), + 'thumbnail': ('picture', ('original', 'url'), {url_or_none}, any), + 'like_count': ('counters', 'likes', {int_or_none}), + 'comment_count': ('counters', 'comments', {int_or_none}), + 'view_count': ('counters', 'plays', {int_or_none}), + }), + **(traverse_obj(playlist_data, { + 'album': ('name', {str}), + 'album_type': ('type', {str}), + }) if playlist_type == 'albums' else {}))
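Note (illustrative sketch, not part of the patch): BandlabPlaylistIE above probes the 'collections' endpoint and then 'albums', because /embed/ URLs do not encode which kind of id they carry. The same probe-and-fallback shape, standalone, with a hypothetical call_api() helper:

    def resolve_playlist(call_api, playlist_id, candidates=('collections', 'albums')):
        # Try each candidate endpoint until one answers without an error code
        for endpoint in candidates:
            data = call_api(endpoint, playlist_id)  # hypothetical fetcher returning a dict
            if not data.get('errorCode'):
                return endpoint, data
        raise LookupError(f'no endpoint matched for {playlist_id}')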
diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 3af923f958..89fcf4425d 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1284,9 +1284,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE **traverse_obj(model, { 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, filter, any), 'duration': ('versions', 0, 'duration', {int}), - 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('versions', 0, 'availableFrom', {int_or_none(scale=1000)}), }), } @@ -1386,7 +1386,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { 'url': ('url', {url_or_none}), 'ext': ('format', {str}), - 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'tbr': ('bitrate', {int_or_none(scale=1000)}), })) if formats: entry = { @@ -1398,7 +1398,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), - 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('firstPublished', {int_or_none(scale=1000)}), }), } done = True @@ -1428,7 +1428,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE if not entry.get('timestamp'): entry['timestamp'] = traverse_obj(next_data, ( ..., 'contents', is_type('timestamp'), 'model', - 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + 'timestamp', {int_or_none(scale=1000)}, any)) entries.append(entry) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description)
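Note (illustrative sketch, not part of the patch): the recurring rewrite of {functools.partial(int_or_none, scale=1000)} into {int_or_none(scale=1000)} works because these yt-dlp helpers support partial application: called with keyword arguments only, they return a callable instead of a value. Roughly, assuming a decorator along these lines:

    import functools

    def partial_application(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            if not args:  # keyword-only call: defer, binding the keywords
                return functools.partial(func, **kwargs)
            return func(*args, **kwargs)
        return wrapped

    @partial_application
    def int_or_none(v, scale=1):  # simplified stand-in for the real helper
        try:
            return int(v) // scale
        except (TypeError, ValueError):
            return None

    int_or_none(scale=1000)('1717148586000')  # -> 1717148586, same as int_or_none('1717148586000', scale=1000)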
diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index 87f011783b..49d4819a3d 100644 --- a/yt_dlp/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py @@ -1,18 +1,33 @@ import re from .common import InfoExtractor -from ..utils import extract_attributes +from ..utils import ExtractorError, extract_attributes class BFMTVBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/' _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' - _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)' + _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>.*?</div>)' + _VIDEO_ELEMENT_REGEX = r'(<video-js[^>]+>)' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' - def _brightcove_url_result(self, video_id, video_block): - account_id = video_block.get('accountid') or '876450612001' - player_id = video_block.get('playerid') or 'I2qBTln4u' + def _extract_video(self, video_block): + video_element = self._search_regex( + self._VIDEO_ELEMENT_REGEX, video_block, 'video element', default=None) + if video_element: + video_element_attrs = extract_attributes(video_element) + video_id = video_element_attrs.get('data-video-id') + if not video_id: + return + account_id = video_element_attrs.get('data-account') or '876450610001' + player_id = video_element_attrs.get('adjustplayer') or '19dszYXgm' + else: + video_block_attrs = extract_attributes(video_block) + video_id = video_block_attrs.get('videoid') + if not video_id: + return + account_id = video_block_attrs.get('accountid') or '876630703001' + player_id = video_block_attrs.get('playerid') or 'KbPwEbuHx' return self.url_result( self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), 'BrightcoveNew', video_id) @@ -40,23 +55,25 @@ class BFMTVIE(BFMTVBaseIE): def _real_extract(self, url): bfmtv_id = self._match_id(url) webpage = self._download_webpage(url, bfmtv_id) - video_block = extract_attributes(self._search_regex( + video = self._extract_video(self._search_regex( self._VIDEO_BLOCK_REGEX, webpage, 'video block')) - return self._brightcove_url_result(video_block['videoid'], video_block) + if not video: + raise ExtractorError('Failed to extract video') + return video -class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE +class BFMTVLiveIE(BFMTVBaseIE): IE_NAME = 'bfmtv:live' _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)' _TESTS = [{ 'url': 'https://www.bfmtv.com/en-direct/', 'info_dict': { - 'id': '5615950982001', + 'id': '6346069778112', 'ext': 'mp4', - 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^Le Live BFM TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'uploader_id': '876450610001', - 'upload_date': '20220926', - 'timestamp': 1664207191, + 'upload_date': '20240202', + 'timestamp': 1706887572, 'live_status': 'is_live', 'thumbnail': r're:https://.+/image\.jpg', 'tags': [], @@ -69,6 +86,15 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE 'only_matching': True, }] + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + video = self._extract_video(self._search_regex( + self._VIDEO_BLOCK_REGEX, webpage, 'video block')) + if not video: + raise ExtractorError('Failed to extract video') + return video + class BFMTVArticleIE(BFMTVBaseIE): IE_NAME = 'bfmtv:article' @@ -102,18 +128,16 @@ class BFMTVArticleIE(BFMTVBaseIE): }, }] + def _entries(self, webpage): + for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): + video = self._extract_video(video_block_el) + if video: + yield video + def _real_extract(self, url): bfmtv_id = self._match_id(url) webpage = self._download_webpage(url, bfmtv_id) - entries = [] - for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): - video_block = extract_attributes(video_block_el) - video_id = video_block.get('videoid') - if not video_id: - continue - entries.append(self._brightcove_url_result(video_id, video_block)) - return self.playlist_result( - entries, bfmtv_id, self._og_search_title(webpage, fatal=False), + self._entries(webpage), bfmtv_id, self._og_search_title(webpage, fatal=False), self._html_search_meta(['og:description', 'description'], webpage))
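Note (illustrative sketch, not part of the patch): _extract_video above prefers the attributes of an inner <video-js> element and falls back to the wrapping video_block <div>. yt-dlp's extract_attributes turns such a tag into a plain dict; with dummy values mirroring the attribute names used above:

    from yt_dlp.utils import extract_attributes

    block = '<div class="video_block" videoid="123456789012" accountid="876630703001"></div>'
    attrs = extract_attributes(block)  # {'class': 'video_block', 'videoid': '123456789012', ...}
    video_id = attrs.get('videoid')    # '123456789012'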
diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index 666b51c56a..ad00245def 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,4 +1,3 @@ -import functools from .common import InfoExtractor from ..utils import ( @@ -50,7 +49,7 @@ class BibelTVBaseIE(InfoExtractor): **traverse_obj(data, { 'title': 'title', 'description': 'description', - 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), + 'duration': ('duration', {int_or_none(scale=1000)}), 'timestamp': ('schedulingStart', {parse_iso8601}), 'season_number': 'seasonNumber', 'episode_number': 'episodeNumber', diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 62f68fbc6d..02ea67707f 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -109,7 +109,7 @@ class BilibiliBaseIE(InfoExtractor): fragments = traverse_obj(play_info, ('durl', lambda _, v: url_or_none(v['url']), { 'url': ('url', {url_or_none}), - 'duration': ('length', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('length', {float_or_none(scale=1000)}), 'filesize': ('size', {int_or_none}), })) if fragments: @@ -124,7 +124,7 @@ class BilibiliBaseIE(InfoExtractor): 'quality': ('quality', {int_or_none}), 'format_id': ('quality', {str_or_none}), 'format_note': ('quality', {lambda x: format_names.get(x)}), - 'duration': ('timelength', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('timelength', {float_or_none(scale=1000)}), }), **parse_resolution(format_names.get(play_info.get('quality'))), }) @@ -1585,7 +1585,7 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE): 'title': ('title', {str}), 'uploader': ('upper', 'name', {str}), 'uploader_id': ('upper', 'mid', {str_or_none}), - 'timestamp': ('ctime', {int_or_none}, {lambda x: x or None}), + 'timestamp': ('ctime', {int_or_none}, filter), 'thumbnail': ('cover', {url_or_none}), })), } diff --git a/yt_dlp/extractor/bluesky.py b/yt_dlp/extractor/bluesky.py new file mode 100644 index 0000000000..0e58a0932d --- /dev/null +++ b/yt_dlp/extractor/bluesky.py @@ -0,0 +1,388 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + format_field, + int_or_none, + mimetype2ext, + orderedSet, + parse_iso8601, + truncate_string, + update_url_query, + url_basename, + url_or_none, + variadic, +) +from ..utils.traversal import traverse_obj + + +class BlueskyIE(InfoExtractor): + _VALID_URL = [ + r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P<handle>[\w.:%-]+)/post/(?P<id>\w+)', + r'at://(?P<handle>[\w.:%-]+)/app\.bsky\.feed\.post/(?P<id>\w+)', + ] + _TESTS = [{ + 'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', + 'md5': '375539c1930ab05d15585ed772ab54fd', + 'info_dict': { + 'id': '3l4omssdl632g', + 'ext': 'mp4', + 'uploader': 'Blu3Blu3Lilith', + 'uploader_id': 'blu3blue.bsky.social', + 'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social', + 'channel_id': 'did:plc:pzdr5ylumf7vmvwasrpr5bf2', + 'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'OMG WE HAVE VIDEOS NOW', + 'description': 'OMG WE HAVE VIDEOS NOW', + 'upload_date': '20240921', + 'timestamp': 1726940605, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', + 'md5': 'b9e344fdbce9f2852c668a97efefb105', + 'info_dict': { + 'id': '3l3vgf77uco2g', + 'ext': 'mp4', +
'uploader': 'Bluesky', + 'uploader_id': 'bsky.app', + 'uploader_url': 'https://bsky.app/profile/bsky.app', + 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', + 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky now has video! Update your app to versi...', + 'alt_title': 'Bluesky video feature announcement', + 'description': r're:(?s)Bluesky now has video! .{239}', + 'upload_date': '20240911', + 'timestamp': 1726074716, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + 'subtitles': { + 'en': 'mincount:1', + }, + }, + }, { + 'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', + 'md5': '5f2df8c200b5633eb7fb2c984d29772f', + 'info_dict': { + 'id': '3l4qhp7bcs52c', + 'ext': 'mp4', + 'uploader': 'souris', + 'uploader_id': 'souris.moe', + 'uploader_url': 'https://bsky.app/profile/souris.moe', + 'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp', + 'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l4qhp7bcs52c', + 'upload_date': '20240922', + 'timestamp': 1727003838, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', + 'md5': '1af9c7fda061cf7593bbffca89e43d1c', + 'info_dict': { + 'id': '3l3w4tnezek2e', + 'ext': 'mp4', + 'uploader': 'clean', + 'uploader_id': 'de1.pds.tentacle.expert', + 'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert', + 'channel_id': 'did:web:de1.tentacle.expert', + 'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l3w4tnezek2e', + 'upload_date': '20240911', + 'timestamp': 1726098823, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o', + 'info_dict': { + 'id': 'XxK3t_5V3ao', + 'ext': 'mp4', + 'uploader': 'yunayu', + 'uploader_id': '@yunayuispink', + 'uploader_url': 'https://www.youtube.com/@yunayuispink', + 'channel': 'yunayu', + 'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w', + 'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w', + 'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp', + 'description': r're:Have a good goodx10000day', + 'title': '5min vs 5hours drawing', + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'upload_date': '20241026', + 'timestamp': 1729967784, + 'duration': 321, + 'age_limit': 0, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'tags': [], + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m', + 'info_dict': { + 'id': '222792849', + 'ext': 'mp3', + 'uploader': 'LASERBAT', + 'uploader_id': 'laserbatx', + 'uploader_url': 'https://laserbatx.bandcamp.com', + 'artists': ['LASERBAT'], + 'album_artists': ['LASERBAT'], + 'album': 'Hari Nezumi [EP]', + 'track': 'Forward to the End', + 'title': 'LASERBAT - Forward to the End', + 'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg', + 'duration': 228.571, + 'track_id': '222792849', + 'release_date': '20230423', + 'upload_date': '20230423', + 'timestamp': 
1682276040.0, + 'release_timestamp': 1682276040.0, + 'track_number': 1, + }, + 'add_ie': ['Bandcamp'], + }, { + 'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j', + 'md5': 'b9e344fdbce9f2852c668a97efefb105', + 'info_dict': { + 'id': '3l3vgf77uco2g', + 'ext': 'mp4', + 'uploader': 'Bluesky', + 'uploader_id': 'bsky.app', + 'uploader_url': 'https://bsky.app/profile/bsky.app', + 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', + 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky now has video! Update your app to versi...', + 'alt_title': 'Bluesky video feature announcement', + 'description': r're:(?s)Bluesky now has video! .{239}', + 'upload_date': '20240911', + 'timestamp': 1726074716, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + 'subtitles': { + 'en': 'mincount:1', + }, + }, + }, { + 'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f', + 'md5': '8775118b235cf9fa6b5ad30f95cda75c', + 'info_dict': { + 'id': '3l7rdfxhyds2f', + 'ext': 'mp4', + 'uploader': 'cinnamon', + 'uploader_id': 'alt.bun.how', + 'uploader_url': 'https://bsky.app/profile/alt.bun.how', + 'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'crazy that i look like this tbh', + 'description': 'crazy that i look like this tbh', + 'upload_date': '20241030', + 'timestamp': 1730332128, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': ['sexual'], + 'age_limit': 18, + }, + }, { + 'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr', + 'md5': '71b0eb6d85d03145e6af6642c7fc6d78', + 'info_dict': { + 'id': '3l6zrz6zyl2dr', + 'ext': 'mp4', + 'uploader': 'mary🐇', + 'uploader_id': 'mary.my.id', + 'uploader_url': 'https://bsky.app/profile/mary.my.id', + 'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem', + 'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l6zrz6zyl2dr', + 'upload_date': '20241021', + 'timestamp': 1729523172, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w', + 'info_dict': { + 'id': '3l7gv55dc2o2w', + }, + 'playlist': [{ + 'info_dict': { + 'id': '3l7gv55dc2o2w', + 'ext': 'mp4', + 'upload_date': '20241026', + 'description': 'One of my favorite videos', + 'comment_count': int, + 'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social', + 'uploader': 'Purple.Ice.Tea', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx', + 'like_count': int, + 'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx', + 'repost_count': int, + 'timestamp': 1729973202, + 'tags': [], + 'uploader_id': 'purpleicetea.bsky.social', + 'title': 'One of my favorite videos', + }, + }, { + 'info_dict': { + 'id': '3l77u64l7le2e', + 'ext': 'mp4', + 'title': 'hearing people on twitter say that bluesky isn\'...', + 'like_count': int, + 'uploader_id': 'thafnine.net', + 'uploader_url': 'https://bsky.app/profile/thafnine.net', + 'upload_date': '20241024', + 'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj', + 'description': r're:(?s)hearing people 
on twitter say that bluesky .{93}', + 'tags': [], + 'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e', + 'uploader': 'T9', + 'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'timestamp': 1729731642, + 'comment_count': int, + 'repost_count': int, + }, + }], + }] + _BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob' + + def _get_service_endpoint(self, did, video_id): + if did.startswith('did:web:'): + url = f'https://{did[8:]}/.well-known/did.json' + else: + url = f'https://plc.directory/{did}' + services = self._download_json( + url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False) + return traverse_obj( + services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer', + 'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social' + + def _real_extract(self, url): + handle, video_id = self._match_valid_url(url).group('handle', 'id') + + post = self._download_json( + 'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread', + video_id, query={ + 'uri': f'at://{handle}/app.bsky.feed.post/{video_id}', + 'depth': 0, + 'parentHeight': 0, + })['thread']['post'] + + entries = [] + # app.bsky.embed.video.view/app.bsky.embed.external.view + entries.extend(self._extract_videos(post, video_id)) + # app.bsky.embed.recordWithMedia.view + entries.extend(self._extract_videos( + post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media'))) + # app.bsky.embed.record.view + if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)): + entries.extend(self._extract_videos( + nested_post, video_id, embed_path=('embeds', 0), record_path='value')) + + if not entries: + raise ExtractorError('No video could be found in this post', expected=True) + if len(entries) == 1: + return entries[0] + return self.playlist_result(entries, video_id) + + @staticmethod + def _build_profile_url(path): + return format_field(path, None, 'https://bsky.app/profile/%s', default=None) + + def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'): + embed_path = variadic(embed_path, (str, bytes, dict, set)) + record_path = variadic(record_path, (str, bytes, dict, set)) + record_subpath = variadic(record_subpath, (str, bytes, dict, set)) + + entries = [] + if external_uri := traverse_obj(root, ( + ((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)): + entries.append(self.url_result(external_uri)) + if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})): + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + playlist, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + return entries + + video_cid = traverse_obj( + root, (*embed_path, 'cid', {str}), + (*record_path, *record_subpath, 'video', 'ref', '$link', {str})) + did = traverse_obj(root, ('author', 'did', {str})) + + if did and video_cid: + endpoint = self._get_service_endpoint(did, video_id) + + formats.append({ + 'format_id': 'blob', + 'url': update_url_query( + self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': video_cid}), + **traverse_obj(root, (*embed_path, 'aspectRatio', { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + **traverse_obj(root, (*record_path, *record_subpath, 'video', { + 'filesize': ('size', {int_or_none}), + 'ext': ('mimeType', {mimetype2ext}), + })), + }) + + for sub_data in traverse_obj(root, ( + *record_path, *record_subpath, 
'captions', lambda _, v: v['file']['ref']['$link'])): + subtitles.setdefault(sub_data.get('lang') or 'und', []).append({ + 'url': update_url_query( + self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': sub_data['file']['ref']['$link']}), + 'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})), + }) + + entries.append({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(root, { + 'id': ('uri', {url_basename}), + 'thumbnail': (*embed_path, 'thumbnail', {url_or_none}), + 'alt_title': (*embed_path, 'alt', {str}, filter), + 'uploader': ('author', 'displayName', {str}), + 'uploader_id': ('author', 'handle', {str}), + 'uploader_url': ('author', 'handle', {self._build_profile_url}), + 'channel_id': ('author', 'did', {str}), + 'channel_url': ('author', 'did', {self._build_profile_url}), + 'like_count': ('likeCount', {int_or_none}), + 'repost_count': ('repostCount', {int_or_none}), + 'comment_count': ('replyCount', {int_or_none}), + 'timestamp': ('indexedAt', {parse_iso8601}), + 'tags': ('labels', ..., 'val', {str}, all, {orderedSet}), + 'age_limit': ( + 'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any), + 'description': (*record_path, 'text', {str}, filter), + 'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}), + }), + }) + return entries diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py index 7fe0899449..d7bf58b366 100644 --- a/yt_dlp/extractor/bpb.py +++ b/yt_dlp/extractor/bpb.py @@ -1,35 +1,20 @@ -import functools import re from .common import InfoExtractor from ..utils import ( clean_html, extract_attributes, - get_element_text_and_html_by_tag, - get_elements_by_class, join_nonempty, js_to_json, mimetype2ext, unified_strdate, url_or_none, urljoin, - variadic, ) -from ..utils.traversal import traverse_obj - - -def html_get_element(tag=None, cls=None): - assert tag or cls, 'One of tag or class is required' - - if cls: - func = functools.partial(get_elements_by_class, cls, tag=tag) - else: - func = functools.partial(get_element_text_and_html_by_tag, tag) - - def html_get_element_wrapper(html): - return variadic(func(html))[0] - - return html_get_element_wrapper +from ..utils.traversal import ( + find_element, + traverse_obj, +) class BpbIE(InfoExtractor): @@ -41,12 +26,12 @@ class BpbIE(InfoExtractor): 'info_dict': { 'id': '297', 'ext': 'mp4', - 'creator': 'Kooperative Berlin', - 'description': 'md5:f4f75885ba009d3e2b156247a8941ce6', - 'release_date': '20160115', + 'creators': ['Kooperative Berlin'], + 'description': r're:Joachim Gauck, .*\n\nKamera: .*', + 'release_date': '20150716', 'series': 'Interview auf dem Geschichtsforum 1989 | 2009', - 'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'], - 'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D', + 'tags': [], + 'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*', 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', 'uploader': 'Bundeszentrale für politische Bildung', }, @@ -55,11 +40,12 @@ class BpbIE(InfoExtractor): 'info_dict': { 'id': '522184', 'ext': 'mp4', - 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'], 'description': 'md5:f83c795ff8f825a69456a9e51fc15903', 'release_date': '20230621', - 'tags': ['Desinformation', 'Ukraine', 'Russland', 
'Geflüchtete'], - 'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB', + 'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)', + 'tags': [], + 'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*', 'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c', 'uploader': 'Bundeszentrale für politische Bildung', }, @@ -68,11 +54,12 @@ class BpbIE(InfoExtractor): 'info_dict': { 'id': '518789', 'ext': 'mp4', - 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'], 'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8', 'release_date': '20230302', - 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'], - 'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D', + 'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)', + 'tags': [], + 'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*', 'title': 'md5:3e956f264bb501f6383f10495a401da4', 'uploader': 'Bundeszentrale für politische Bildung', }, @@ -84,12 +71,12 @@ class BpbIE(InfoExtractor): 'info_dict': { 'id': '315813', 'ext': 'mp3', - 'creator': 'Axel Schröder', + 'creators': ['Axel Schröder'], 'description': 'md5:eda9d1af34e5912efef5baf54fba4427', 'release_date': '20200921', 'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager', 'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'], - 'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94', + 'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*', 'title': 'Folge 1: Eine Einführung', 'uploader': 'Bundeszentrale für politische Bildung', }, @@ -98,12 +85,12 @@ class BpbIE(InfoExtractor): 'info_dict': { 'id': '517806', 'ext': 'mp3', - 'creator': 'Bundeszentrale für politische Bildung', + 'creators': ['Bundeszentrale für politische Bildung'], 'description': 'md5:594689600e919912aade0b2871cc3fed', 'release_date': '20230127', 'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. 
Sechs Jahrzehnte \'Neue Rechte\'"', 'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'], - 'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0', + 'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*', 'title': 'Die Weltanschauung der "Neuen Rechten"', 'uploader': 'Bundeszentrale für politische Bildung', }, @@ -147,7 +134,7 @@ class BpbIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match})) + title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match})) json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False)) return { @@ -156,15 +143,15 @@ class BpbIE(InfoExtractor): # This metadata could be interpreted otherwise, but it fits "series" the most 'series': traverse_obj(title_result, ('series', {str.strip})) or None, 'description': join_nonempty(*traverse_obj(webpage, [( - {html_get_element(cls='opening-intro')}, - [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}], + {find_element(cls='opening-intro')}, + [{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}], ), {clean_html}]), delim='\n\n') or None, - 'creator': self._html_search_meta('author', webpage), + 'creators': traverse_obj(self._html_search_meta('author', webpage), all), 'uploader': self._html_search_meta('publisher', webpage), 'release_date': unified_strdate(self._html_search_meta('date', webpage)), 'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)), **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), { 'formats': (':sources', ..., {self._process_source}), - 'thumbnail': ('poster', {lambda x: urljoin(url, x)}), + 'thumbnail': ('poster', {urljoin(url)}), }), } diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index ec72f0d884..0b2c447987 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -145,10 +145,9 @@ class BravoTVIE(AdobePassIE): tp_metadata = self._download_json( update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - seconds_or_none = lambda x: float_or_none(x, 1000) chapters = traverse_obj(tp_metadata, ('chapters', ..., { - 'start_time': ('startTime', {seconds_or_none}), - 'end_time': ('endTime', {seconds_or_none}), + 'start_time': ('startTime', {float_or_none(scale=1000)}), + 'end_time': ('endTime', {float_or_none(scale=1000)}), })) # prune pointless single chapters that span the entire duration from short videos if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): @@ -168,8 +167,8 @@ class BravoTVIE(AdobePassIE): **merge_dicts(traverse_obj(tp_metadata, { 'title': 'title', 'description': 'description', - 'duration': ('duration', {seconds_or_none}), - 'timestamp': ('pubDate', {seconds_or_none}), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'timestamp': ('pubDate', {float_or_none(scale=1000)}), 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py index 71f7726659..3dacbbd24a 100644 --- a/yt_dlp/extractor/bundestag.py +++ 
b/yt_dlp/extractor/bundestag.py @@ -8,11 +8,13 @@ from ..utils import ( bug_reports_message, clean_html, format_field, - get_element_text_and_html_by_tag, int_or_none, url_or_none, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import ( + find_element, + traverse_obj, +) class BundestagIE(InfoExtractor): @@ -115,9 +117,8 @@ class BundestagIE(InfoExtractor): note='Downloading metadata overlay', fatal=False, ), { 'title': ( - {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0, - {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), - 'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + {find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), + 'description': ({find_element(tag='p')}, {clean_html}), })) return result diff --git a/yt_dlp/extractor/caffeinetv.py b/yt_dlp/extractor/caffeinetv.py index aa107f8585..ea5134d2f3 100644 --- a/yt_dlp/extractor/caffeinetv.py +++ b/yt_dlp/extractor/caffeinetv.py @@ -53,7 +53,7 @@ class CaffeineTVIE(InfoExtractor): 'like_count': ('like_count', {int_or_none}), 'view_count': ('view_count', {int_or_none}), 'comment_count': ('comment_count', {int_or_none}), - 'tags': ('tags', ..., {str}, {lambda x: x or None}), + 'tags': ('tags', ..., {str}, filter), 'uploader': ('user', 'name', {str}), 'uploader_id': (((None, 'user'), 'username'), {str}, any), 'is_live': ('is_live', {bool}), @@ -62,7 +62,7 @@ class CaffeineTVIE(InfoExtractor): 'title': ('broadcast_title', {str}), 'duration': ('content_duration', {int_or_none}), 'timestamp': ('broadcast_start_time', {parse_iso8601}), - 'thumbnail': ('preview_image_path', {lambda x: urljoin(url, x)}), + 'thumbnail': ('preview_image_path', {urljoin(url)}), }), 'age_limit': { # assume Apple Store ratings: https://en.wikipedia.org/wiki/Mobile_software_content_rating_system diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 40224f63f5..c0cf3da3de 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -4,7 +4,6 @@ import json import re import time import urllib.parse -import xml.etree.ElementTree from .common import InfoExtractor from ..networking import HEADRequest @@ -12,7 +11,6 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, - join_nonempty, js_to_json, mimetype2ext, orderedSet, @@ -455,8 +453,8 @@ class CBCPlayerIE(InfoExtractor): chapters = traverse_obj(data, ( 'media', 'chapters', lambda _, v: float(v['startTime']) is not None, { - 'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}), - 'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}), + 'start_time': ('startTime', {float_or_none(scale=1000)}), + 'end_time': ('endTime', {float_or_none(scale=1000)}), 'title': ('name', {str}), })) # Filter out pointless single chapters with start_time==0 and no end_time @@ -467,8 +465,8 @@ class CBCPlayerIE(InfoExtractor): **traverse_obj(data, { 'title': ('title', {str}), 'description': ('description', {str.strip}), - 'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}), - 'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('image', 'url', {url_or_none}, {update_url(query=None)}), + 'timestamp': ('publishedAt', {float_or_none(scale=1000)}), 'media_type': ('media', 'clipType', {str}), 'series': ('showName', {str}), 'season_number': ('media', 'season', {int_or_none}), @@ -524,14 +522,13 @@ class CBCGemIE(InfoExtractor): _TESTS = [{ # This is a normal, 
public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', - 'md5': '93dbb31c74a8e45b378cf13bd3f6f11e', 'info_dict': { 'id': 'schitts-creek/s06e01', 'ext': 'mp4', 'title': 'Smoke Signals', 'description': 'md5:929868d20021c924020641769eb3e7f1', - 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)', - 'duration': 1314, + 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg', + 'duration': 1324, 'categories': ['comedy'], 'series': 'Schitt\'s Creek', 'season': 'Season 6', @@ -539,19 +536,21 @@ class CBCGemIE(InfoExtractor): 'episode': 'Smoke Signals', 'episode_number': 1, 'episode_id': 'schitts-creek/s06e01', + 'upload_date': '20210618', + 'timestamp': 1623988800, + 'release_date': '20200107', + 'release_timestamp': 1578427200, }, 'params': {'format': 'bv'}, - 'skip': 'Geo-restricted to Canada', }, { # This video requires an account in the browser, but works fine in yt-dlp 'url': 'https://gem.cbc.ca/media/schitts-creek/s01e01', - 'md5': '297a9600f554f2258aed01514226a697', 'info_dict': { 'id': 'schitts-creek/s01e01', 'ext': 'mp4', 'title': 'The Cup Runneth Over', 'description': 'md5:9bca14ea49ab808097530eb05a29e797', - 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)', + 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_01e01_thumbnail_v01\.jpg', 'series': 'Schitt\'s Creek', 'season_number': 1, 'season': 'Season 1', @@ -560,9 +559,12 @@ class CBCGemIE(InfoExtractor): 'episode_id': 'schitts-creek/s01e01', 'duration': 1309, 'categories': ['comedy'], + 'upload_date': '20210617', + 'timestamp': 1623902400, + 'release_date': '20151124', + 'release_timestamp': 1448323200, }, 'params': {'format': 'bv'}, - 'skip': 'Geo-restricted to Canada', }, { 'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01', 'only_matching': True, @@ -631,38 +633,6 @@ class CBCGemIE(InfoExtractor): return self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token') - def _find_secret_formats(self, formats, video_id): - """ Find a valid video url and convert it to the secret variant """ - base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) - if not base_format: - return - - base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) - url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) - - secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) - if not isinstance(secret_xml, xml.etree.ElementTree.Element): - return - - for child in secret_xml: - if child.attrib.get('Type') != 'video': - continue - for video_quality in child: - bitrate = int_or_none(video_quality.attrib.get('Bitrate')) - if not bitrate or 'Index' not in video_quality.attrib: - continue - height = int_or_none(video_quality.attrib.get('MaxHeight')) - - yield { - **base_format, - 'format_id': join_nonempty('sec', height), - # Note: \g<1> is necessary instead of \1 since bitrate is a number - 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), - 'width': int_or_none(video_quality.attrib.get('MaxWidth')), - 'tbr': bitrate / 1000.0, - 'height': height, - } - def _real_extract(self, url): video_id = self._match_id(url) video_info = self._download_json( @@ -676,7 +646,6 @@ class CBCGemIE(InfoExtractor): else: headers = {} m3u8_info = 
self._download_json(video_info['playSession']['url'], video_id, headers=headers) - m3u8_url = m3u8_info.get('url') if m3u8_info.get('errorCode') == 1: self.raise_geo_restricted(countries=['CA']) @@ -685,9 +654,9 @@ elif m3u8_info.get('errorCode') != 0: raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') - formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + formats = self._extract_m3u8_formats( + m3u8_info['url'], video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''}) self._remove_duplicate_formats(formats) - formats.extend(self._find_secret_formats(formats, video_id)) for fmt in formats: if fmt.get('vcodec') == 'none': @@ -703,20 +672,21 @@ return { 'id': video_id, - 'title': video_info['title'], - 'description': video_info.get('description'), - 'thumbnail': video_info.get('image'), - 'series': video_info.get('series'), - 'season_number': video_info.get('season'), - 'season': f'Season {video_info.get("season")}', - 'episode_number': video_info.get('episode'), - 'episode': video_info.get('title'), 'episode_id': video_id, - 'duration': video_info.get('duration'), - 'categories': [video_info.get('category')], 'formats': formats, - 'release_timestamp': video_info.get('airDate'), - 'timestamp': video_info.get('availableDate'), + **traverse_obj(video_info, { + 'title': ('title', {str}), + 'episode': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'series': ('series', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'categories': ('category', {str}, all), + 'release_timestamp': ('airDate', {int_or_none(scale=1000)}), + 'timestamp': ('availableDate', {int_or_none(scale=1000)}), + }), } diff --git a/yt_dlp/extractor/cbsnews.py b/yt_dlp/extractor/cbsnews.py index 972e111190..b01c0efd5d 100644 --- a/yt_dlp/extractor/cbsnews.py +++ b/yt_dlp/extractor/cbsnews.py @@ -96,7 +96,7 @@ class CBSNewsBaseIE(InfoExtractor): **traverse_obj(item, { 'title': (None, ('fulltitle', 'title')), 'description': 'dek', - 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('timestamp', {float_or_none(scale=1000)}), 'duration': ('duration', {float_or_none}), 'subtitles': ('captions', {get_subtitles}), 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),
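Note (illustrative sketch, not part of the patch): the CBC and CBS rewrites above replace imperative .get() chains with declarative traverse_obj templates, where {callable} applies a conversion and keys whose value resolves to None are dropped. A small runnable example against dummy data:

    from yt_dlp.utils import int_or_none
    from yt_dlp.utils.traversal import traverse_obj

    data = {'title': 'Smoke Signals', 'duration': '1324', 'airDate': None}
    meta = traverse_obj(data, {
        'title': ('title', {str}),
        'duration': ('duration', {int_or_none}),           # '1324' -> 1324
        'release_timestamp': ('airDate', {int_or_none}),   # None -> key omitted
    })
    # meta == {'title': 'Smoke Signals', 'duration': 1324}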
diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py index ffe4b49c15..7014c208d4 100644 --- a/yt_dlp/extractor/ccma.py +++ b/yt_dlp/extractor/ccma.py @@ -12,53 +12,86 @@ from ..utils import ( class CCMAIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)' + IE_DESC = '3Cat, TV3 and Catalunya Ràdio' + _VALID_URL = r'https?://(?:www\.)?3cat\.cat/(?:3cat|tv3/sx3)/[^/?#]+/(?P<type>video|audio)/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + # ccma.cat/tv3/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/lespot-de-la-marato-de-tv3/video/5630208/', 'md5': '7296ca43977c8ea4469e719c609b0871', 'info_dict': { 'id': '5630208', 'ext': 'mp4', - 'title': 'L\'espot de La Marató de TV3', + 'title': 'L\'espot de La Marató 2016: Ictus i les lesions medul·lars i cerebrals traumàtiques', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', 'timestamp': 1478608140, 'upload_date': '20161108', 'age_limit': 0, + 'alt_title': 'EsportMarató2016WEB_PerPublicar', + 'duration': 79, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/4/6/1478536106664.jpg', + 'series': 'Dedicada a l\'ictus i les lesions medul·lars i cerebrals traumàtiques', + 'categories': ['Divulgació'], + }, + }, { - 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + # ccma.cat/catradio/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/el-consell-de-savis-analitza-el-derbi/audio/943685/', 'md5': 'fa3e38f269329a278271276330261425', 'info_dict': { 'id': '943685', 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20170512', - 'timestamp': 1494622500, + 'upload_date': '20161217', + 'timestamp': 1482011700, 'vcodec': 'none', 'categories': ['Esports'], + 'series': 'Tot gira', + 'duration': 821, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/8/9/1482002602598.jpg', }, }, { - 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', - 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'url': 'https://www.3cat.cat/3cat/crims-josep-tallada-lespereu-me-part-1/video/6031387/', + 'md5': '27493513d08a3e5605814aee9bb778d2', 'info_dict': { 'id': '6031387', 'ext': 'mp4', - 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'title': 'T1xC5 - Josep Talleda, l\'"Espereu-me" (part 1)', 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', - 'timestamp': 1582577700, + 'timestamp': 1582577919, 'upload_date': '20200224', - 'subtitles': 'mincount:4', - 'age_limit': 16, + 'subtitles': 'mincount:1', + 'age_limit': 13, 'series': 'Crims', + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/1/9/1582564376991.jpg', + 'duration': 3203, + 'categories': ['Divulgació'], + 'alt_title': 'Crims - 5 - Josep Talleda, l\'"Espereu-me" (1a part) - Josep Talleda, l\'"Espereu-me" (part 1)', + 'episode_number': 5, + 'episode': 'Episode 5', + }, + }, { + 'url': 'https://www.3cat.cat/tv3/sx3/una-mosca-volava-per-la-llum/video/5759227/', + 'info_dict': { + 'id': '5759227', + 'ext': 'mp4', + 'title': 'Una mosca volava per la llum', + 'alt_title': '17Z004Ç UNA MOSCA VOLAVA PER LA LLUM', + 'description': 'md5:9ab64276944b0825336f4147f13f7854', + 'series': 'Mic', + 'upload_date': '20180411', + 'timestamp': 1523440105, + 'duration': 160, + 'age_limit': 0, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/6/1/1524071667216.jpg', + 'categories': ['Música'], }, }] def _real_extract(self, url): - media_type, media_id = self._match_valid_url(url).groups() + media_type, media_id = self._match_valid_url(url).group('type', 'id') media = self._download_json( - 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'http://api-media.3cat.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, 'format': 'dm',
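Note (illustrative sketch, not part of the patch): the new CDAFolderIE below paginates with OnDemandPagedList, which only invokes its page function as entries are actually consumed (e.g. with --playlist-items). The shape of that API, with a dummy page function:

    from yt_dlp.utils import OnDemandPagedList

    def fetch_page(page):  # 0-based page index
        yield from range(page * 36, (page + 1) * 36)  # stand-in for one page of entries

    entries = OnDemandPagedList(fetch_page, 36)  # second argument is the page size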
diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 62ee8b17f1..b2738e492f 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -12,6 +12,7 @@ from .common import InfoExtractor from ..compat import compat_ord from ..utils import ( ExtractorError, + OnDemandPagedList, float_or_none, int_or_none, merge_dicts, @@ -351,3 +352,50 @@ class CDAIE(InfoExtractor): extract_format(webpage, resolution) return merge_dicts(info_dict, info) + + +class CDAFolderIE(InfoExtractor): + _MAX_PAGE_SIZE = 36 + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://www.cda.pl/domino264/folder/31188385', + 'info_dict': { + 'id': '31188385', + 'title': 'SERIA DRUGA', + }, + 'playlist_mincount': 13, + }, + { + 'url': 'https://www.cda.pl/smiechawaTV/folder/2664592/vfilm', + 'info_dict': { + 'id': '2664592', + 'title': 'VideoDowcipy - wszystkie odcinki', + }, + 'playlist_mincount': 71, + }, + { + 'url': 'https://www.cda.pl/DeliciousBeauty/folder/19129979/vfilm', + 'info_dict': { + 'id': '19129979', + 'title': 'TESTY KOSMETYKÓW', + }, + 'playlist_mincount': 139, + }] + + def _real_extract(self, url): + folder_id, channel = self._match_valid_url(url).group('id', 'channel') + + webpage = self._download_webpage(url, folder_id) + + def extract_page_entries(page): + webpage = self._download_webpage( + f'https://www.cda.pl/{channel}/folder/{folder_id}/vfilm/{page + 1}', folder_id, + f'Downloading page {page + 1}', expected_status=404) + items = re.findall(r'<a[^>]+href="/video/([0-9a-z]+)"', webpage) + for video_id in items: + yield self.url_result(f'https://www.cda.pl/video/{video_id}', CDAIE, video_id) + + return self.playlist_result( + OnDemandPagedList(extract_page_entries, self._MAX_PAGE_SIZE), + folder_id, self._og_search_title(webpage)) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index b49f741efa..a40b7d39c7 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -5,11 +5,12 @@ from ..utils import ( ExtractorError, lowercase_escape, url_or_none, + urlencode_postdata, ) class ChaturbateIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.(?P<tld>com|eu|global)/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.chaturbate.com/siswet19/', 'info_dict': { @@ -29,16 +30,59 @@ }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, + }, { + 'url': 'https://chaturbate.eu/siswet19/', + 'only_matching': True, + }, { + 'url': 'https://chaturbate.eu/fullvideo/?b=caylin', + 'only_matching': True, + }, { + 'url': 'https://chaturbate.global/siswet19/', + 'only_matching': True, }] - _ROOM_OFFLINE = 'Room is currently offline' + _ERROR_MAP = { + 'offline': 'Room is currently offline', + 'private': 'Room is currently in a private show', + 'away': 'Performer is currently away', + 'password protected': 'Room is password protected', + 'hidden': 'Hidden session in progress', + } - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_from_api(self, video_id, tld): + response = self._download_json( + f'https://chaturbate.{tld}/get_edge_hls_url_ajax/', video_id, + data=urlencode_postdata({'room_slug': video_id}), + headers={ + **self.geo_verification_headers(), + 'X-Requested-With': 'XMLHttpRequest', + 'Accept': 'application/json', + }, fatal=False, impersonate=True) or {} + status = response.get('room_status') + if status != 'public': + if error := self._ERROR_MAP.get(status): + raise ExtractorError(error, expected=True) + self.report_warning('Falling back to webpage extraction') + return None + + m3u8_url = response.get('url') + if not m3u8_url: + self.raise_geo_restricted() + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': f'https://roomimg.stream.highwebmedia.com/ri/{video_id}.jpg', + 'is_live': True, + 'age_limit': 18, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), + } + + def _extract_from_html(self, video_id, tld): webpage = self._download_webpage(
f'https://chaturbate.com/{video_id}/', video_id, - headers=self.geo_verification_headers()) + f'https://chaturbate.{tld}/{video_id}/', video_id, + headers=self.geo_verification_headers(), impersonate=True) found_m3u8_urls = [] @@ -76,8 +120,8 @@ class ChaturbateIE(InfoExtractor): webpage, 'error', group='error', default=None) if not error: if any(p in webpage for p in ( - self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): - error = self._ROOM_OFFLINE + self._ERROR_MAP['offline'], 'offline_tipping', 'tip_offline')): + error = self._ERROR_MAP['offline'] if error: raise ExtractorError(error, expected=True) raise ExtractorError('Unable to find stream URL') @@ -104,3 +148,7 @@ class ChaturbateIE(InfoExtractor): 'is_live': True, 'formats': formats, } + + def _real_extract(self, url): + video_id, tld = self._match_valid_url(url).group('id', 'tld') + return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld) diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py index e0b9980afd..aec77ac454 100644 --- a/yt_dlp/extractor/chzzk.py +++ b/yt_dlp/extractor/chzzk.py @@ -1,5 +1,3 @@ -import functools - from .common import InfoExtractor from ..utils import ( UserNotLive, @@ -77,7 +75,7 @@ class CHZZKLiveIE(InfoExtractor): 'thumbnails': thumbnails, **traverse_obj(live_detail, { 'title': ('liveTitle', {str}), - 'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}), + 'timestamp': ('openDate', {parse_iso8601(delimiter=' ')}), 'concurrent_view_count': ('concurrentUserCount', {int_or_none}), 'view_count': ('accumulateCount', {int_or_none}), 'channel': ('channel', 'channelName', {str}), @@ -146,23 +144,37 @@ class CHZZKVideoIE(InfoExtractor): video_meta = self._download_json( f'https://api.chzzk.naver.com/service/v3/videos/{video_id}', video_id, note='Downloading video info', errnote='Unable to download video info')['content'] - formats, subtitles = self._extract_mpd_formats_and_subtitles( - f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id, - query={ - 'key': video_meta['inKey'], - 'env': 'real', - 'lc': 'en_US', - 'cpl': 'en_US', - }, note='Downloading video playback', errnote='Unable to download video playback') + + live_status = 'was_live' if video_meta.get('liveOpenDate') else 'not_live' + video_status = video_meta.get('vodStatus') + if video_status == 'UPLOAD': + playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + playback['media'][0]['path'], video_id, 'mp4', m3u8_id='hls') + elif video_status == 'ABR_HLS': + formats, subtitles = self._extract_mpd_formats_and_subtitles( + f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', + video_id, query={ + 'key': video_meta['inKey'], + 'env': 'real', + 'lc': 'en_US', + 'cpl': 'en_US', + }) + else: + self.raise_no_formats( + f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id) + formats, subtitles = [], {} + live_status = 'post_live' if live_status == 'was_live' else None return { 'id': video_id, 'formats': formats, 'subtitles': subtitles, + 'live_status': live_status, **traverse_obj(video_meta, { 'title': ('videoTitle', {str}), 'thumbnail': ('thumbnailImageUrl', {url_or_none}), - 'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}), + 'timestamp': ('publishDateAt', {float_or_none(scale=1000)}), 'view_count': ('readCount', {int_or_none}), 'duration': ('duration', {int_or_none}), 
'channel': ('channel', 'channelName', {str}), diff --git a/yt_dlp/extractor/cineverse.py b/yt_dlp/extractor/cineverse.py index c8c6c48c27..124c874e2c 100644 --- a/yt_dlp/extractor/cineverse.py +++ b/yt_dlp/extractor/cineverse.py @@ -3,6 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( filter_dict, + float_or_none, int_or_none, parse_age_limit, smuggle_url, @@ -85,7 +86,7 @@ class CineverseIE(CineverseBaseIE): 'title': 'title', 'id': ('details', 'item_id'), 'description': ('details', 'description'), - 'duration': ('duration', {lambda x: x / 1000}), + 'duration': ('duration', {float_or_none(scale=1000)}), 'cast': ('details', 'cast', {lambda x: x.split(', ')}), 'modified_timestamp': ('details', 'updated_by', 0, 'update_time', 'time', {int_or_none}), 'season_number': ('details', 'season', {int_or_none}), diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 8a409461a8..9e9e89a801 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -8,7 +8,7 @@ class CloudflareStreamIE(InfoExtractor): _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' _EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video=' _ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+' - _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})' + _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}(?P<domain>{_DOMAIN_RE})/|{_EMBED_RE})(?P<id>{_ID_RE})' _EMBED_REGEX = [ rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1', rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})', @@ -19,7 +19,7 @@ 'id': '31c9291ab41fac05471db4e73aa11717', 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', - 'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg', + 'thumbnail': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg', }, 'params': { 'skip_download': 'm3u8', @@ -30,7 +30,7 @@ 'id': '0e8e040aec776862e1d632a699edf59e', 'ext': 'mp4', 'title': '0e8e040aec776862e1d632a699edf59e', - 'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg', + 'thumbnail': 'https://cloudflarestream.com/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg', }, }, { 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', @@ -54,7 +54,7 @@ 'id': 'eaef9dea5159cf968be84241b5cedfe7', 'ext': 'mp4', 'title': 'eaef9dea5159cf968be84241b5cedfe7', - 'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg', + 'thumbnail': 'https://cloudflarestream.com/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg', }, 'params': { 'skip_download': 'm3u8', }] def _real_extract(self, url): - video_id = self._match_id(url) - domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' + video_id, domain = self._match_valid_url(url).group('id', 'domain') + if domain != 'bytehighway.net': + domain = 'cloudflarestream.com' base_url = f'https://{domain}/{video_id}/' if '.' in video_id: video_id = self._parse_json(base64.urlsafe_b64decode(
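Note (illustrative sketch, not part of the patch): Cloudflare Stream ids matching eyJ... are signed JWTs rather than raw 32-digit hex ids, which is what the '.' check above detects; the code then base64url-decodes the token's payload segment to recover the real id. Generic payload decoding looks like:

    import base64
    import json

    def jwt_payload(token):
        payload = token.split('.')[1]  # header.payload.signature
        # base64url data must be padded to a multiple of 4 before decoding
        return json.loads(base64.urlsafe_b64decode(payload + '=' * (-len(payload) % 4)))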
in video_id: video_id = self._parse_json(base64.urlsafe_b64decode( diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py index fe7615a891..8148762c54 100644 --- a/yt_dlp/extractor/cnn.py +++ b/yt_dlp/extractor/cnn.py @@ -1,146 +1,225 @@ +import json +import re + from .common import InfoExtractor -from .turner import TurnerBaseIE -from ..utils import merge_dicts, try_call, url_basename +from ..utils import ( + clean_html, + extract_attributes, + int_or_none, + merge_dicts, + parse_duration, + parse_iso8601, + parse_resolution, + try_call, + update_url, + url_or_none, +) +from ..utils.traversal import find_elements, traverse_obj -class CNNIE(TurnerBaseIE): - _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ - (?P<path>.+?/(?P<pagetitle>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' +class CNNIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:edition|www|money|cnnespanol)\.)?cnn\.com/(?!audio/)(?P<display_id>[^?#]+?)(?:[?#]|$|/index\.html)' _TESTS = [{ - 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', - 'md5': '3e6121ea48df7e2259fe73a0628605c4', + 'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl', 'info_dict': { - 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'id': 'med0e97ad0d154f56e29aa96e57192a14226734b6b', + 'display_id': '2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl', 'ext': 'mp4', - 'title': 'Nadal wins 8th French Open title', - 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', - 'duration': 135, - 'upload_date': '20130609', + 'upload_date': '20240531', + 'description': 'md5:844bcdb0629e1877a7a466c913f4c19c', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/gettyimages-2151936122.jpg?c=original', + 'duration': 373.0, + 'timestamp': 1717148586, + 'title': 'Borussia Dortmund star Jadon Sancho seeks Wembley redemption after 2020 Euros hurt', + 'modified_date': '20240531', + 'modified_timestamp': 1717150140, }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', - 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', + 'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid', 'info_dict': { - 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'id': 'me522945c4709b299e5cb8657900a7a21ad3b559f9', + 'display_id': '2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid', 'ext': 'mp4', - 'title': "Student's epic speech stuns new freshmen", - 'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."', - 'upload_date': '20130821', + 'description': 'md5:e0120fe5da9ad8259fd707c1cbb64a60', + 'title': 'Here’s how some inmates in closely divided state are now able to vote from jail', + 'timestamp': 1718158269, + 'upload_date': '20240612', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701554-13565-571-still.jpg?c=original', + 'duration': 202.0, + 'modified_date': '20240612', + 'modified_timestamp': 1718158509, }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url':
'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', - 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'url': 'https://edition.cnn.com/2024/06/11/style/king-charles-portrait-vandalized/index.html', 'info_dict': { - 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'id': 'mef5f52b9e1fe28b1ad192afcbc9206ae984894b68', + 'display_id': '2024/06/11/style/king-charles-portrait-vandalized', 'ext': 'mp4', - 'title': 'Nashville Ep. 1: Hand crafted skateboards', - 'description': 'md5:e7223a503315c9f150acac52e76de086', - 'upload_date': '20141222', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701257-8846-816-still.jpg?c=original', + 'description': 'md5:19f78338ccec533db0fa8a4511012dae', + 'title': 'Video shows King Charles\' portrait being vandalized by activists', + 'timestamp': 1718113852, + 'upload_date': '20240611', + 'duration': 51.0, + 'modified_timestamp': 1718116193, + 'modified_date': '20240611', }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', - 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'url': 'https://edition.cnn.com/videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln', 'info_dict': { - 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'id': 'mefba13799201b084ea3b1d0f7ca820ae94d4bb5b2', + 'display_id': 'videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln', 'ext': 'mp4', - 'title': '5 stunning stats about Netflix', - 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', - 'upload_date': '20160819', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/221205163510-robin-meade-sign-off.jpg?c=original', + 'duration': 158.0, + 'title': 'Robin Meade signs off after HLN\'s last broadcast', + 'description': 'md5:cff3c62d18d2fbc6c5c75cb029b7353b', + 'upload_date': '20221205', + 'timestamp': 1670284296, + 'modified_timestamp': 1670332404, + 'modified_date': '20221206', }, - 'params': { - # m3u8 download - 'skip_download': True, + 'params': {'format': 'direct'}, + }, { + 'url': 'https://cnnespanol.cnn.com/video/ataque-misil-israel-beirut-libano-octubre-trax', + 'info_dict': { + 'id': 'me484a43722642aa00627b812fe928f2e99c6e2997', + 'ext': 'mp4', + 'display_id': 'video/ataque-misil-israel-beirut-libano-octubre-trax', + 'timestamp': 1729501452, + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/ataqeubeirut-1.jpg?c=original', + 'description': 'md5:256ee7137d161f776cda429654135e52', + 'upload_date': '20241021', + 'duration': 31.0, + 'title': 'VIDEO | Israel lanza un nuevo ataque sobre Beirut', + 'modified_date': '20241021', + 'modified_timestamp': 1729501530, }, }, { - 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', - 'only_matching': True, - }, { - 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', - 'only_matching': True, - }, { - 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', - 'only_matching': True, + 'url': 'https://edition.cnn.com/2024/10/16/politics/kamala-harris-fox-news-interview/index.html', + 'info_dict': { + 'id': '2024/10/16/politics/kamala-harris-fox-news-interview', + }, + 
'playlist_count': 2, + 'playlist': [{ + 'md5': '073ffab87b8bef97c9913e71cc18ef9e', + 'info_dict': { + 'id': 'me19d548fdd54df0924087039283128ef473ab397d', + 'ext': 'mp4', + 'title': '\'I\'m not finished\': Harris interview with Fox News gets heated', + 'display_id': 'kamala-harris-fox-news-interview-ebof-digvid', + 'description': 'md5:e7dd3d1a04df916062230b60ca419a0a', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/harris-20241016234916617.jpg?c=original', + 'duration': 173.0, + 'timestamp': 1729122182, + 'upload_date': '20241016', + 'modified_timestamp': 1729194706, + 'modified_date': '20241017', + }, + 'params': {'format': 'direct'}, + }, { + 'md5': '11604ab4af83b650826753f1ccb8ecff', + 'info_dict': { + 'id': 'med04507d8ca3da827001f63d22af321ec29c7d97b', + 'ext': 'mp4', + 'title': '\'Wise\': Buttigieg on Harris\' handling of interview question about gender transition surgery', + 'display_id': 'pete-buttigieg-harris-fox-newssrc-digvid', + 'description': 'md5:602a8a7e853ed5e574acd3159428c98e', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/buttigieg-20241017040412074.jpg?c=original', + 'duration': 145.0, + 'timestamp': 1729137765, + 'upload_date': '20241017', + 'modified_timestamp': 1729138184, + 'modified_date': '20241017', + }, + 'params': {'format': 'direct'}, + }], }] - _CONFIG = { - # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml - 'edition': { - 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', - 'media_src': 'http://pmd.cdn.turner.com/cnn/big', - }, - # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml - 'money': { - 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', - 'media_src': 'http://ht3.cdn.turner.com/money/big', - }, - } - - def _extract_timestamp(self, video_data): - # TODO: fix timestamp extraction - return None - def _real_extract(self, url): - sub_domain, path, page_title = self._match_valid_url(url).groups() - if sub_domain not in ('money', 'edition'): - sub_domain = 'edition' - config = self._CONFIG[sub_domain] - return self._extract_cvp_info( - config['data_src'] % path, page_title, { - 'default': { - 'media_src': config['media_src'], - }, - 'f4m': { - 'host': 'cnn-vh.akamaihd.net', - }, + display_id = self._match_valid_url(url).group('display_id') + webpage = self._download_webpage(url, display_id) + app_id = traverse_obj( + self._search_json(r'window\.env\s*=', webpage, 'window env', display_id, default={}), + ('TOP_AUTH_SERVICE_APP_ID', {str})) + + entries = [] + for player_data in traverse_obj(webpage, ( + {find_elements(tag='div', attr='data-component-name', value='video-player', html=True)}, + ..., {extract_attributes}, all, lambda _, v: v['data-media-id'])): + media_id = player_data['data-media-id'] + parent_uri = player_data.get('data-video-resource-parent-uri') + formats, subtitles = [], {} + + video_data = {} + if parent_uri: + video_data = self._download_json( + 'https://fave.api.cnn.io/v1/video', media_id, fatal=False, + query={ + 'id': media_id, + 'stellarUri': parent_uri, + }) + for direct_url in traverse_obj(video_data, ('files', ..., 'fileUri', {url_or_none})): + resolution, bitrate = None, None + if mobj := re.search(r'-(?P<res>\d+x\d+)_(?P<tbr>\d+)k\.mp4', direct_url): + resolution, bitrate = mobj.group('res', 'tbr') + formats.append({ + 'url': direct_url, + 'format_id': 'direct', + 'quality': 1, + 'tbr': int_or_none(bitrate), + **parse_resolution(resolution), + }) + for sub_data in traverse_obj(video_data, ( + 'closedCaptions', 'types', 
lambda _, v: url_or_none(v['track']['url']), 'track')): + subtitles.setdefault(sub_data.get('lang') or 'en', []).append({ + 'url': sub_data['url'], + 'name': sub_data.get('label'), + }) + + if app_id: + media_data = self._download_json( + f'https://medium.ngtv.io/v2/media/{media_id}/desktop', media_id, fatal=False, + query={'appId': app_id}) + m3u8_url = traverse_obj(media_data, ( + 'media', 'desktop', 'unprotected', 'unencrypted', 'url', {url_or_none})) + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + entries.append({ + **traverse_obj(player_data, { + 'title': ('data-headline', {clean_html}), + 'description': ('data-description', {clean_html}), + 'duration': ('data-duration', {parse_duration}), + 'timestamp': ('data-publish-date', {parse_iso8601}), + 'thumbnail': ( + 'data-poster-image-override', {json.loads}, 'big', 'uri', {url_or_none}, + {update_url(query='c=original')}), + 'display_id': 'data-video-slug', + }), + **traverse_obj(video_data, { + 'timestamp': ('dateCreated', 'uts', {int_or_none(scale=1000)}), + 'description': ('description', {clean_html}), + 'title': ('headline', {str}), + 'modified_timestamp': ('lastModified', 'uts', {int_or_none(scale=1000)}), + 'duration': ('trt', {int_or_none}), + }), + 'id': media_id, + 'formats': formats, + 'subtitles': subtitles, }) + if len(entries) == 1: + return { + **entries[0], + 'display_id': display_id, + } -class CNNBlogsIE(InfoExtractor): - _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' - _TEST = { - 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', - 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', - 'info_dict': { - 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', - 'ext': 'mp4', - 'title': 'Criminalizing journalism?', - 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', - 'upload_date': '20140209', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return self.url_result(cnn_url, CNNIE.ie_key()) - - -class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' - _TEST = { - 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', - 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', - 'info_dict': { - 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', - 'ext': 'mp4', - 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', - 'upload_date': '20141221', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } - - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) + return self.playlist_result(entries, display_id) class CNNIndonesiaIE(InfoExtractor): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 812fbfa9f9..28a3adf936 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -25,7 +25,6 @@ import xml.etree.ElementTree from ..compat import ( 
compat_etree_fromstring, compat_expanduser, - compat_os_name, urllib_req_to_req, ) from ..cookies import LenientSimpleCookie @@ -47,6 +46,7 @@ from ..utils import ( FormatSorter, GeoRestrictedError, GeoUtils, + ISO639Utils, LenientJSONDecoder, Popen, RegexNotFoundError, @@ -278,6 +278,7 @@ class InfoExtractor: thumbnails: A list of dictionaries, with the following entries: * "id" (optional, string) - Thumbnail format ID * "url" + * "ext" (optional, string) - actual image extension if not given in URL * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) @@ -333,7 +334,7 @@ class InfoExtractor: like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video repost_count: Number of reposts of the video - average_rating: Average rating give by users, the scale used depends on the webpage + average_rating: Average rating given by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following properties (all but one of text or html optional): @@ -520,7 +521,7 @@ class InfoExtractor: or _extract_from_webpage as necessary. While these are normally classmethods, _extract_from_webpage is allowed to be an instance method. - _extract_from_webpage may raise self.StopExtraction() to stop further + _extract_from_webpage may raise self.StopExtraction to stop further processing of the webpage and obtain exclusive rights to it. This is useful when the extractor cannot reliably be matched using just the URL, e.g. invidious/peertube instances @@ -1027,7 +1028,7 @@ class InfoExtractor: filename = sanitize_filename(f'{basen}.dump', restricted=True) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': + if os.name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = fR'\\?\{absfilepath}' @@ -1408,6 +1409,13 @@ class InfoExtractor: return None, None self.write_debug(f'Using netrc for {netrc_machine} authentication') + + # compat: <=py3.10: netrc cannot parse tokens as empty strings, will return `""` instead + # Ref: https://github.com/yt-dlp/yt-dlp/issues/11413 + # https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378 + if sys.version_info < (3, 11): + return tuple(x if x != '""' else '' for x in info[::2]) + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): @@ -1570,7 +1578,9 @@ class InfoExtractor: if default is not NO_DEFAULT: fatal = False for mobj in re.finditer(JSON_LD_RE, html): - json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + json_ld_item = self._parse_json( + mobj.group('json_ld'), video_id, fatal=fatal, + errnote=False if default is not NO_DEFAULT else None) for json_ld in variadic(json_ld_item): if isinstance(json_ld, dict): yield json_ld @@ -3071,7 +3081,11 @@ class InfoExtractor: url_pattern = stream.attrib['Url'] stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') - stream_language = stream.get('Language', 'und') + # IsmFD expects ISO 639 Set 2 language codes (3-character length) + # See: https://github.com/yt-dlp/yt-dlp/issues/11356 + stream_language = stream.get('Language') or 'und' + if len(stream_language) != 3: + stream_language = ISO639Utils.short2long(stream_language) or 
'und' for track in stream.findall('QualityLevel'): KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) @@ -3753,7 +3767,7 @@ class InfoExtractor: """ Merge subtitle dictionaries, language by language. """ if target is None: target = {} - for d in dicts: + for d in filter(None, dicts): for lang, subs in d.items(): target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs) return target diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py index 9c02cd3429..0c84cfdab7 100644 --- a/yt_dlp/extractor/condenast.py +++ b/yt_dlp/extractor/condenast.py @@ -12,6 +12,7 @@ from ..utils import ( parse_iso8601, strip_or_none, try_get, + urljoin, ) @@ -112,8 +113,7 @@ class CondeNastIE(InfoExtractor): m_paths = re.finditer( r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage) paths = orderedSet(m.group(1) for m in m_paths) - build_url = lambda path: urllib.parse.urljoin(base_url, path) - entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] + entries = [self.url_result(urljoin(base_url, path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) def _extract_video_params(self, webpage, display_id): diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 1b124c6557..8faed179b7 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -456,7 +456,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): }), }), **traverse_obj(metadata, { - 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration_ms', {float_or_none(scale=1000)}), 'timestamp': ('upload_date', {parse_iso8601}), 'series': ('series_title', {str}), 'series_id': ('series_id', {str}), @@ -484,7 +484,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): }), }), **traverse_obj(metadata, { - 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration_ms', {float_or_none(scale=1000)}), 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), }), } diff --git a/yt_dlp/extractor/ctvnews.py b/yt_dlp/extractor/ctvnews.py index ebed9eb2d3..6d33f85e4a 100644 --- a/yt_dlp/extractor/ctvnews.py +++ b/yt_dlp/extractor/ctvnews.py @@ -1,14 +1,27 @@ +import json import re +import urllib.parse from .common import InfoExtractor -from ..utils import orderedSet +from .ninecninemedia import NineCNineMediaIE +from ..utils import extract_attributes, orderedSet +from ..utils.traversal import find_element, traverse_obj class CTVNewsIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' + _BASE_REGEX = r'https?://(?:[^.]+\.)?ctvnews\.ca/' + _VIDEO_ID_RE = r'(?P<id>\d{5,})' + _PLAYLIST_ID_RE = r'(?P<id>\d\.\d{5,})' + _VALID_URL = [ + rf'{_BASE_REGEX}video/c{_VIDEO_ID_RE}', + rf'{_BASE_REGEX}video(?:-gallery)?/?\?clipId={_VIDEO_ID_RE}', + rf'{_BASE_REGEX}video/?\?(?:playlist|bin)Id={_PLAYLIST_ID_RE}', + rf'{_BASE_REGEX}(?!video/)[^?#]*?{_PLAYLIST_ID_RE}/?(?:$|[?#])', + rf'{_BASE_REGEX}(?!video/)[^?#]+\?binId={_PLAYLIST_ID_RE}', + ] _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', - 'md5': '9b8624ba66351a23e0b6e1391971f9af', + 'md5': 'b608f466c7fa24b9666c6439d766ab7e', 'info_dict': { 'id': '901995', 'ext': 'flv', @@ -16,6 +29,33 @@ class CTVNewsIE(InfoExtractor): 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, 'upload_date': '20160630', + 'categories': [], + 'season_number': 0, + 
'season': 'Season 0', + 'tags': [], + 'series': 'CTV News National | Archive | Stories 2', + 'season_id': '57981', + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 764.631, + }, + }, { + 'url': 'https://barrie.ctvnews.ca/video/c3030933-here_s-what_s-making-news-for-nov--15?binId=1272429', + 'md5': '8b8c2b33c5c1803e3c26bc74ff8694d5', + 'info_dict': { + 'id': '3030933', + 'ext': 'flv', + 'title': 'Here’s what’s making news for Nov. 15', + 'description': 'Here are the top stories we’re working on for CTV News at 11 for Nov. 15', + 'thumbnail': 'http://images2.9c9media.com/image_asset/2021_2_22_a602e68e-1514-410e-a67a-e1f7cccbacab_png_2000x1125.jpg', + 'season_id': '58104', + 'season_number': 0, + 'tags': [], + 'season': 'Season 0', + 'categories': [], + 'series': 'CTV News Barrie', + 'upload_date': '20241116', + 'duration': 42.943, + 'timestamp': 1731722452, }, }, { 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', @@ -31,6 +71,72 @@ class CTVNewsIE(InfoExtractor): 'id': '1.2876780', }, 'playlist_mincount': 100, + }, { + 'url': 'https://www.ctvnews.ca/it-s-been-23-years-since-toronto-called-in-the-army-after-a-major-snowstorm-1.5736957', + 'info_dict': + { + 'id': '1.5736957', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://www.ctvnews.ca/business/respondents-to-bank-of-canada-questionnaire-largely-oppose-creating-a-digital-loonie-1.6665797', + 'md5': '24bc4b88cdc17d8c3fc01dfc228ab72c', + 'info_dict': { + 'id': '2695026', + 'ext': 'flv', + 'season_id': '89852', + 'series': 'From CTV News Channel', + 'description': 'md5:796a985a23cacc7e1e2fafefd94afd0a', + 'season': '2023', + 'title': 'Bank of Canada asks public about digital currency', + 'categories': [], + 'tags': [], + 'upload_date': '20230526', + 'season_number': 2023, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg', + 'timestamp': 1685105157, + 'duration': 253.553, + }, + }, { + 'url': 'https://stox.ctvnews.ca/video-gallery?clipId=582589', + 'md5': '135cc592df607d29dddc931f1b756ae2', + 'info_dict': { + 'id': '582589', + 'ext': 'flv', + 'categories': [], + 'timestamp': 1427906183, + 'season_number': 0, + 'duration': 125.559, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg', + 'series': 'CTV News Stox', + 'description': 'CTV original footage of the rise and fall of the Berlin Wall.', + 'title': 'Berlin Wall', + 'season_id': '63817', + 'season': 'Season 0', + 'tags': [], + 'upload_date': '20150401', + }, + }, { + 'url': 'https://ottawa.ctvnews.ca/features/regional-contact/regional-contact-archive?binId=1.1164587#3023759', + 'md5': 'a14c0603557decc6531260791c23cc5e', + 'info_dict': { + 'id': '3023759', + 'ext': 'flv', + 'season_number': 2024, + 'timestamp': 1731798000, + 'season': '2024', + 'episode': 'Episode 125', + 'description': 'CTV News Ottawa at Six', + 'duration': 2712.076, + 'episode_number': 125, + 'upload_date': '20241116', + 'title': 'CTV News Ottawa at Six for Saturday, November 16, 2024', + 'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg', + 'categories': [], + 'tags': [], + 'series': 'CTV News Ottawa at Six', + 'season_id': '92667', + }, }, { 'url': 'http://www.ctvnews.ca/1.810401', 'only_matching': True, @@ -42,29 +148,35 @@ class CTVNewsIE(InfoExtractor): 'only_matching': True, }] + def _ninecninemedia_url_result(self, clip_id): + return self.url_result(f'9c9media:ctvnews_web:{clip_id}', 
NineCNineMediaIE, clip_id) + def _real_extract(self, url): page_id = self._match_id(url) - def ninecninemedia_url_result(clip_id): - return { - '_type': 'url_transparent', - 'id': clip_id, - 'url': f'9c9media:ctvnews_web:{clip_id}', - 'ie_key': 'NineCNineMedia', - } + if mobj := re.fullmatch(self._VIDEO_ID_RE, urllib.parse.urlparse(url).fragment): + page_id = mobj.group('id') - if page_id.isdigit(): - return ninecninemedia_url_result(page_id) - else: - webpage = self._download_webpage(f'http://www.ctvnews.ca/{page_id}', page_id, query={ - 'ot': 'example.AjaxPageLayout.ot', - 'maxItemsPerPage': 1000000, - }) - entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( - re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] - if not entries: - webpage = self._download_webpage(url, page_id) - if 'getAuthStates("' in webpage: - entries = [ninecninemedia_url_result(clip_id) for clip_id in - self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')] - return self.playlist_result(entries, page_id) + if re.fullmatch(self._VIDEO_ID_RE, page_id): + return self._ninecninemedia_url_result(page_id) + + webpage = self._download_webpage(f'https://www.ctvnews.ca/{page_id}', page_id, query={ + 'ot': 'example.AjaxPageLayout.ot', + 'maxItemsPerPage': 1000000, + }) + entries = [self._ninecninemedia_url_result(clip_id) + for clip_id in orderedSet(re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + if not entries: + webpage = self._download_webpage(url, page_id) + if 'getAuthStates("' in webpage: + entries = [self._ninecninemedia_url_result(clip_id) for clip_id in + self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')] + else: + entries = [ + self._ninecninemedia_url_result(clip_id) for clip_id in + traverse_obj(webpage, ( + {find_element(tag='jasper-player-container', html=True)}, + {extract_attributes}, 'axis-ids', {json.loads}, ..., 'axisId', {str})) + ] + + return self.playlist_result(entries, page_id) diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 632335e5b0..cb1453d3f5 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -10,11 +10,14 @@ from ..utils import ( OnDemandPagedList, age_restricted, clean_html, + extract_attributes, int_or_none, traverse_obj, try_get, unescapeHTML, unsmuggle_url, + update_url, + url_or_none, urlencode_postdata, ) @@ -98,12 +101,20 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// + (?: + dai\.ly/| (?: - (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)| - (?:www\.)?lequipe\.fr/video + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}| + (?:www\.)?lequipe\.fr + )/ + (?: + swf/(?!video)| + (?:(?:crawler|embed|swf)/)?video/| + player(?:/[\da-z]+)?\.html\?(?:video|(?P<is_playlist>playlist))= ) - [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? - ''' + ) + (?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))? 
+ ''' IE_NAME = 'dailymotion' _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _TESTS = [{ @@ -123,7 +134,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'view_count': int, 'like_count': int, 'tags': ['hollywood', 'celeb', 'celebrity', 'movies', 'red carpet'], - 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1aXqIx58LKWQ/x1080', + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1cmt4ZcZ9KiM/x1080', }, }, { 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', @@ -142,7 +153,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'view_count': int, 'like_count': int, 'tags': ['en_quete_d_esprit'], - 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1YNg_RUl7ueu/x1080', + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1clTH6StrxMP/x1080', }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -217,6 +228,66 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video', 'only_matching': True, + }, { # playlist-only + 'url': 'https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj', + 'only_matching': True, + }, { + 'url': 'https://geo.dailymotion.com/player/xmyye.html?video=x93blhi', + 'only_matching': True, + }, { + 'url': 'https://www.dailymotion.com/crawler/video/x8u4owg', + 'only_matching': True, + }, { + 'url': 'https://www.dailymotion.com/embed/video/x8u4owg', + 'only_matching': True, + }, { + 'url': 'https://dai.ly/x94cnnk', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + # https://geo.dailymotion.com/player/xmyye.html?video=x93blhi + 'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/', + 'info_dict': { + 'id': 'x93blhi', + 'ext': 'mp4', + 'title': 'OnAir - 01/08/24', + 'description': '', + 'duration': 217, + 'timestamp': 1722505658, + 'upload_date': '20240801', + 'uploader': 'Financialounge', + 'uploader_id': 'x2vtgmm', + 'age_limit': 0, + 'tags': [], + 'view_count': int, + 'like_count': int, + }, + }, { + # https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj + 'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/', + 'info_dict': { + 'id': 'x7wdsj', + }, + 'playlist_mincount': 50, + }, { + # https://www.dailymotion.com/crawler/video/x8u4owg + 'url': 'https://www.leparisien.fr/environnement/video-le-veloto-la-voiture-a-pedales-qui-aimerait-se-faire-une-place-sur-les-routes-09-03-2024-KCYMCPM4WFHJXMSKBUI66UNFPU.php', + 'info_dict': { + 'id': 'x8u4owg', + 'ext': 'mp4', + 'like_count': int, + 'uploader': 'Le Parisien', + 'thumbnail': 'https://www.leparisien.fr/resizer/ho_GwveeYftNkLwg_cEta--5Bv4=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/BFXJNEBN75EUNHGYJLORUC3TX4.jpg', + 'upload_date': '20240309', + 'view_count': int, + 'timestamp': 1709997866, + 'age_limit': 0, + 'uploader_id': 'x32f7b', + 'title': 'VIDÉO. 
Le «\xa0véloto\xa0», la voiture à pédales qui aimerait se faire une place sur les routes', + 'duration': 428.0, + 'description': 'À bord du « véloto », l’alternative à la voiture pour la campagne', + 'tags': ['biclou', 'vélo', 'véloto', 'campagne', 'voiture', 'environnement', 'véhicules intermédiaires'], + }, }] _GEO_BYPASS = False _COMMON_MEDIA_FIELDS = '''description @@ -232,16 +303,35 @@ class DailymotionIE(DailymotionBaseInfoExtractor): for mobj in re.finditer( r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id') + for mobj in re.finditer( + r'(?s)<script [^>]*\bsrc=(["\'])(?:https?:)?//[\w-]+\.dailymotion\.com/player/(?:(?!\1).)+\1[^>]*>', webpage): + attrs = extract_attributes(mobj.group(0)) + player_url = url_or_none(attrs.get('src')) + if not player_url: + continue + player_url = player_url.replace('.js', '.html') + if player_url.startswith('//'): + player_url = f'https:{player_url}' + if video_id := attrs.get('data-video'): + query_string = f'video={video_id}' + elif playlist_id := attrs.get('data-playlist'): + query_string = f'playlist={playlist_id}' + else: + continue + yield update_url(player_url, query=query_string) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url) - video_id, playlist_id = self._match_valid_url(url).groups() + video_id, is_playlist, playlist_id = self._match_valid_url(url).group('id', 'is_playlist', 'playlist_id') - if playlist_id: - if self._yes_playlist(playlist_id, video_id): - return self.url_result( - 'http://www.dailymotion.com/playlist/' + playlist_id, - 'DailymotionPlaylist', playlist_id) + if is_playlist: # We matched the playlist query param as video_id + playlist_id = video_id + video_id = None + + if self._yes_playlist(playlist_id, video_id): + return self.url_result( + f'http://www.dailymotion.com/playlist/{playlist_id}', + 'DailymotionPlaylist', playlist_id) password = self.get_param('videopassword') media = self._call_api( @@ -282,6 +372,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): title = metadata['title'] is_live = media.get('isOnAir') formats = [] + subtitles = {} + for quality, media_list in metadata['qualities'].items(): for m in media_list: media_url = m.get('url') @@ -289,8 +381,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): if not media_url or media_type == 'application/vnd.lumberjack.manifest': continue if media_type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) + fmt, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False) + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) else: f = { 'url': media_url, @@ -310,20 +404,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor): if not f.get('fps') and f['format_id'].endswith('@60'): f['fps'] = 60 - subtitles = {} subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} for subtitle_lang, subtitle in subtitles_data.items(): subtitles[subtitle_lang] = [{ 'url': subtitle_url, } for subtitle_url in subtitle.get('urls', [])] - thumbnails = [] - for height, poster_url in metadata.get('posters', {}).items(): - thumbnails.append({ - 'height': int_or_none(height), - 'id': height, - 'url': poster_url, - }) + thumbnails = traverse_obj(metadata, ( + ('posters', 'thumbnails'), {dict.items}, lambda _, v: url_or_none(v[1]), { + 'height': (0, 
{int_or_none}), + 'id': (0, {str}), + 'url': 1, + })) owner = metadata.get('owner') or {} stats = media.get('stats') or {} @@ -447,7 +539,7 @@ class DailymotionSearchIE(DailymotionPlaylistBaseIE): class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search|crawler)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { diff --git a/yt_dlp/extractor/dangalplay.py b/yt_dlp/extractor/dangalplay.py index 50e4136b57..f7b243234a 100644 --- a/yt_dlp/extractor/dangalplay.py +++ b/yt_dlp/extractor/dangalplay.py @@ -40,7 +40,7 @@ class DangalPlayBaseIE(InfoExtractor): 'id': ('content_id', {str}), 'title': ('display_title', {str}), 'episode': ('title', {str}), - 'series': ('show_name', {str}, {lambda x: x or None}), + 'series': ('show_name', {str}, filter), 'series_id': ('catalog_id', {str}), 'duration': ('duration', {int_or_none}), 'release_timestamp': ('release_date_uts', {int_or_none}), diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py index edb6fa9c08..4c4fe470da 100644 --- a/yt_dlp/extractor/digitalconcerthall.py +++ b/yt_dlp/extractor/digitalconcerthall.py @@ -1,7 +1,10 @@ +import time + from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, + jwt_decode_hs256, parse_codecs, try_get, url_or_none, @@ -13,9 +16,6 @@ from ..utils.traversal import traverse_obj class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert|work)/(?P<id>[0-9]+)-?(?P<part>[0-9]+)?' 
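# Illustrative sketch, not part of the patch: the DigitalConcertHall hunk below drops password
# login in favour of token-based auth and decides when to refresh by reading the JWT 'exp' claim.
# A minimal stand-alone restatement of that expiry check, assuming only yt_dlp.utils.jwt_decode_hs256
# (which the hunk below imports); the helper name and the 30-second leeway mirror the patch but are
# illustrative only.
import time

from yt_dlp.utils import jwt_decode_hs256

def is_access_token_expired(access_token, leeway=30):
    # jwt_decode_hs256 returns the decoded JWT payload; 'exp' is a Unix timestamp
    expiry = (jwt_decode_hs256(access_token) or {}).get('exp') or 0
    return expiry - leeway <= int(time.time())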
- _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' - _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15' - _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' _TESTS = [{ 'note': 'Playlist with only one video', @@ -69,59 +69,157 @@ class DigitalConcertHallIE(InfoExtractor): 'params': {'skip_download': 'm3u8'}, 'playlist_count': 1, }] + _LOGIN_HINT = ('Use --username token --password ACCESS_TOKEN where ACCESS_TOKEN ' + 'is the "access_token_production" from your browser local storage') + _REFRESH_HINT = 'or else use a "refresh_token" with --username refresh --password REFRESH_TOKEN' + _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' + _CLIENT_ID = 'dch.webapp' + _CLIENT_SECRET = '2ySLN+2Fwb' + _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15' + _OAUTH_HEADERS = { + 'Accept': 'application/json', + 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', + 'Origin': 'https://www.digitalconcerthall.com', + 'Referer': 'https://www.digitalconcerthall.com/', + 'User-Agent': _USER_AGENT, + } + _access_token = None + _access_token_expiry = 0 + _refresh_token = None - def _perform_login(self, username, password): - login_token = self._download_json( - self._OAUTH_URL, - None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({ + @property + def _access_token_is_expired(self): + return self._access_token_expiry - 30 <= int(time.time()) + + def _set_access_token(self, value): + self._access_token = value + self._access_token_expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int})) or 0 + + def _cache_tokens(self, /): + self.cache.store(self._NETRC_MACHINE, 'tokens', { + 'access_token': self._access_token, + 'refresh_token': self._refresh_token, + }) + + def _fetch_new_tokens(self, invalidate=False): + if invalidate: + self.report_warning('Access token has been invalidated') + self._set_access_token(None) + + if not self._access_token_is_expired: + return + + if not self._refresh_token: + self._set_access_token(None) + self._cache_tokens() + raise ExtractorError( + 'Access token has expired or been invalidated. 
' + 'Get a new "access_token_production" value from your browser ' + f'and try again, {self._REFRESH_HINT}', expected=True) + + # If we only have a refresh token, we need a temporary "initial token" for the refresh flow + bearer_token = self._access_token or self._download_json( + self._OAUTH_URL, None, 'Obtaining initial token', 'Unable to obtain initial token', + data=urlencode_postdata({ 'affiliate': 'none', 'grant_type': 'device', 'device_vendor': 'unknown', - # device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio - 'device_model': 'unknown' if self._configuration_arg('prefer_combined_hls') else 'Safari', - 'app_id': 'dch.webapp', + # device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio, + # but this is no longer effective since actual login is not possible anymore + 'device_model': 'unknown', + 'app_id': self._CLIENT_ID, 'app_distributor': 'berlinphil', - 'app_version': '1.84.0', - 'client_secret': '2ySLN+2Fwb', - }), headers={ - 'Accept': 'application/json', - 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', - 'User-Agent': self._USER_AGENT, - })['access_token'] + 'app_version': '1.95.0', + 'client_secret': self._CLIENT_SECRET, + }), headers=self._OAUTH_HEADERS)['access_token'] + try: - login_response = self._download_json( - self._OAUTH_URL, - None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({ - 'grant_type': 'password', - 'username': username, - 'password': password, + response = self._download_json( + self._OAUTH_URL, None, 'Refreshing token', 'Unable to refresh token', + data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': self._refresh_token, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, }), headers={ - 'Accept': 'application/json', - 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', - 'Referer': 'https://www.digitalconcerthall.com', - 'Authorization': f'Bearer {login_token}', - 'User-Agent': self._USER_AGENT, + **self._OAUTH_HEADERS, + 'Authorization': f'Bearer {bearer_token}', }) - except ExtractorError as error: - if isinstance(error.cause, HTTPError) and error.cause.status == 401: - raise ExtractorError('Invalid username or password', expected=True) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self._set_access_token(None) + self._refresh_token = None + self._cache_tokens() + raise ExtractorError('Your tokens have been invalidated', expected=True) raise - self._ACCESS_TOKEN = login_response['access_token'] + + self._set_access_token(response['access_token']) + if refresh_token := traverse_obj(response, ('refresh_token', {str})): + self.write_debug('New refresh token granted') + self._refresh_token = refresh_token + self._cache_tokens() + + def _perform_login(self, username, password): + self.report_login() + + if username == 'refresh': + self._refresh_token = password + self._fetch_new_tokens() + + if username == 'token': + if not traverse_obj(password, {jwt_decode_hs256}): + raise ExtractorError( + f'The access token passed to yt-dlp is not valid. {self._LOGIN_HINT}', expected=True) + self._set_access_token(password) + self._cache_tokens() + + if username in ('refresh', 'token'): + if self.get_param('cachedir') is not False: + token_type = 'access' if username == 'token' else 'refresh' + self.to_screen(f'Your {token_type} token has been cached to disk. 
To use the cached ' + 'token next time, pass --username cache along with any password') + return + + if username != 'cache': + raise ExtractorError( + 'Login with username and password is no longer supported ' + f'for this site. {self._LOGIN_HINT}, {self._REFRESH_HINT}', expected=True) + + # Try cached access_token + cached_tokens = self.cache.load(self._NETRC_MACHINE, 'tokens', default={}) + self._set_access_token(cached_tokens.get('access_token')) + self._refresh_token = cached_tokens.get('refresh_token') + if not self._access_token_is_expired: + return + + # Try cached refresh_token + self._fetch_new_tokens(invalidate=True) def _real_initialize(self): - if not self._ACCESS_TOKEN: - self.raise_login_required(method='password') + if not self._access_token: + self.raise_login_required( + 'All content on this site is only available for registered users. ' + f'{self._LOGIN_HINT}, {self._REFRESH_HINT}', method=None) def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] - stream_info = self._download_json( - self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={ - 'Accept': 'application/json', - 'Authorization': f'Bearer {self._ACCESS_TOKEN}', - 'Accept-Language': language, - 'User-Agent': self._USER_AGENT, - }) + + for should_retry in (True, False): + self._fetch_new_tokens(invalidate=not should_retry) + try: + stream_info = self._download_json( + self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={ + 'Accept': 'application/json', + 'Authorization': f'Bearer {self._access_token}', + 'Accept-Language': language, + 'User-Agent': self._USER_AGENT, + }) + break + except ExtractorError as error: + if should_retry and isinstance(error.cause, HTTPError) and error.cause.status == 401: + continue + raise formats = [] for m3u8_url in traverse_obj(stream_info, ('channel', ..., 'stream', ..., 'url', {url_or_none})): @@ -157,7 +255,6 @@ class DigitalConcertHallIE(InfoExtractor): 'Accept': 'application/json', 'Accept-Language': language, 'User-Agent': self._USER_AGENT, - 'Authorization': f'Bearer {self._ACCESS_TOKEN}', }) videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...)) diff --git a/yt_dlp/extractor/err.py b/yt_dlp/extractor/err.py index 7896cdbdc0..d4139c6f3c 100644 --- a/yt_dlp/extractor/err.py +++ b/yt_dlp/extractor/err.py @@ -207,7 +207,7 @@ class ERRJupiterIE(InfoExtractor): **traverse_obj(data, { 'title': ('heading', {str}), 'alt_title': ('subHeading', {str}), - 'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}), + 'description': (('lead', 'body'), {clean_html}, filter), 'timestamp': ('created', {int_or_none}), 'modified_timestamp': ('updated', {int_or_none}), 'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}), diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 1adb35b5f0..c07efcd581 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -563,12 +563,13 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) - def extract_dash_manifest(video, formats): - dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str) + def extract_dash_manifest(vid_data, formats, mpd_url=None): + dash_manifest = traverse_obj( + vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str) if dash_manifest: formats.extend(self._parse_mpd_formats( 
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), - mpd_url=video.get('dash_manifest_url'))) + mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url)) def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around @@ -618,16 +619,20 @@ class FacebookIE(InfoExtractor): video = video['creation_story'] video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video.update(reel_info) + formats = [] q = qualities(['sd', 'hd']) + + # Legacy formats extraction + fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('browser_native_sd_url', 'sd')): - playable_url = video.get(key) + playable_url = fmt_data.get(key) if not playable_url: continue if determine_ext(playable_url) == 'mpd': - formats.extend(self._extract_mpd_formats(playable_url, video_id)) + formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -635,7 +640,29 @@ class FacebookIE(InfoExtractor): 'quality': q(format_id) - 3, 'url': playable_url, }) - extract_dash_manifest(video, formats) + extract_dash_manifest(fmt_data, formats) + + # New videoDeliveryResponse formats extraction + fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult')) + mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none})) + dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml'])) + for idx, dash_manifest in enumerate(dash_manifests): + extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx)) + if not dash_manifests: + # Only extract from MPD URLs if the manifests are not already provided + for mpd_url in mpd_urls: + formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False)) + for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])): + format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower})) + formats.append({ + 'format_id': format_id, + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, + 'url': prog_fmt['progressive_url'], + }) + for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls')) + if not formats: # Do not append false positive entry w/o any formats return diff --git a/yt_dlp/extractor/funk.py b/yt_dlp/extractor/funk.py index 8bdea3fce7..ef8ea72a8c 100644 --- a/yt_dlp/extractor/funk.py +++ b/yt_dlp/extractor/funk.py @@ -3,7 +3,7 @@ from .nexx import NexxIE class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www|origin|play)\.)?funk\.net/(?:channel|playlist)/[^/?#]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', 'md5': '8610449476156f338761a75391b0017d', @@ -27,6 +27,9 @@ class FunkIE(InfoExtractor): }, { 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', 'only_matching': True, + }, { + 'url': 
'https://play.funk.net/playlist/neuesteVideos/george-floyd-wenn-die-polizei-toetet-der-fall-2004391', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/gamedevtv.py b/yt_dlp/extractor/gamedevtv.py new file mode 100644 index 0000000000..06e8b7356d --- /dev/null +++ b/yt_dlp/extractor/gamedevtv.py @@ -0,0 +1,141 @@ +import json + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + clean_html, + int_or_none, + join_nonempty, + parse_iso8601, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class GameDevTVDashboardIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gamedev\.tv/dashboard/courses/(?P<course_id>\d+)(?:/(?P<lecture_id>\d+))?' + _NETRC_MACHINE = 'gamedevtv' + _TESTS = [{ + 'url': 'https://www.gamedev.tv/dashboard/courses/25', + 'info_dict': { + 'id': '25', + 'title': 'Complete Blender Creator 3: Learn 3D Modelling for Beginners', + 'tags': ['blender', 'course', 'all', 'box modelling', 'sculpting'], + 'categories': ['Blender', '3D Art'], + 'thumbnail': 'https://gamedev-files.b-cdn.net/courses/qisc9pmu1jdc.jpg', + 'upload_date': '20220516', + 'timestamp': 1652694420, + 'modified_date': '20241027', + 'modified_timestamp': 1730049658, + }, + 'playlist_count': 100, + }, { + 'url': 'https://www.gamedev.tv/dashboard/courses/63/2279', + 'info_dict': { + 'id': 'df04f4d8-68a4-4756-a71b-9ca9446c3a01', + 'ext': 'mp4', + 'modified_timestamp': 1701695752, + 'upload_date': '20230504', + 'episode': 'MagicaVoxel Community Course Introduction', + 'series_id': '63', + 'title': 'MagicaVoxel Community Course Introduction', + 'timestamp': 1683195397, + 'modified_date': '20231204', + 'categories': ['3D Art', 'MagicaVoxel'], + 'season': 'MagicaVoxel Community Course', + 'tags': ['MagicaVoxel', 'all', 'course'], + 'series': 'MagicaVoxel 3D Art Mini Course', + 'duration': 1405, + 'episode_number': 1, + 'season_number': 1, + 'season_id': '219', + 'description': 'md5:a378738c5bbec1c785d76c067652d650', + 'display_id': '63-219-2279', + 'alt_title': '1_CC_MVX MagicaVoxel Community Course Introduction.mp4', + 'thumbnail': 'https://vz-23691c65-6fa.b-cdn.net/df04f4d8-68a4-4756-a71b-9ca9446c3a01/thumbnail.jpg', + }, + }] + _API_HEADERS = {} + + def _perform_login(self, username, password): + try: + response = self._download_json( + 'https://api.gamedev.tv/api/students/login', None, 'Logging in', + headers={'Content-Type': 'application/json'}, + data=json.dumps({ + 'email': username, + 'password': password, + 'cart_items': [], + }).encode()) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError('Invalid username/password', expected=True) + raise + + self._API_HEADERS['Authorization'] = f'{response["token_type"]} {response["access_token"]}' + + def _real_initialize(self): + if not self._API_HEADERS.get('Authorization'): + self.raise_login_required( + 'This content is only available with purchase', method='password') + + def _entries(self, data, course_id, course_info, selected_lecture): + for section in traverse_obj(data, ('sections', ..., {dict})): + section_info = traverse_obj(section, { + 'season_id': ('id', {str_or_none}), + 'season': ('title', {str}), + 'season_number': ('order', {int_or_none}), + }) + for lecture in traverse_obj(section, ('lectures', lambda _, v: url_or_none(v['video']['playListUrl']))): + if selected_lecture and str(lecture.get('id')) != selected_lecture: + continue + display_id = 
join_nonempty(course_id, section_info.get('season_id'), lecture.get('id')) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + lecture['video']['playListUrl'], display_id, 'mp4', m3u8_id='hls') + yield { + **course_info, + **section_info, + 'id': display_id, # fallback + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'series': course_info.get('title'), + 'series_id': course_id, + **traverse_obj(lecture, { + 'id': ('video', 'guid', {str}), + 'title': ('title', {str}), + 'alt_title': ('video', 'title', {str}), + 'description': ('description', {clean_html}), + 'episode': ('title', {str}), + 'episode_number': ('order', {int_or_none}), + 'duration': ('video', 'duration_in_sec', {int_or_none}), + 'timestamp': ('video', 'created_at', {parse_iso8601}), + 'modified_timestamp': ('video', 'updated_at', {parse_iso8601}), + 'thumbnail': ('video', 'thumbnailUrl', {url_or_none}), + }), + } + + def _real_extract(self, url): + course_id, lecture_id = self._match_valid_url(url).group('course_id', 'lecture_id') + data = self._download_json( + f'https://api.gamedev.tv/api/courses/my/{course_id}', course_id, + headers=self._API_HEADERS)['data'] + + course_info = traverse_obj(data, { + 'title': ('title', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'categories': ('categories', ..., 'title', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'thumbnail': ('image', {url_or_none}), + }) + + entries = self._entries(data, course_id, course_info, lecture_id) + if lecture_id: + lecture = next(entries, None) + if not lecture: + raise ExtractorError('Lecture not found') + return lecture + return self.playlist_result(entries, course_id, **course_info) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 9b5421e41d..320a47772b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring +from ..cookies import LenientSimpleCookie +from ..networking.exceptions import HTTPError from ..networking.impersonate import ImpersonateTarget from ..utils import ( KNOWN_EXTENSIONS, @@ -2374,10 +2376,9 @@ class GenericIE(InfoExtractor): else: video_id = self._generic_id(url) - # Try to impersonate a web-browser by default if possible - # Skip impersonation if not available to omit the warning - impersonate = self._configuration_arg('impersonate', ['']) - if 'false' in impersonate or not self._downloader._impersonate_target_available(ImpersonateTarget()): + # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335 + impersonate = self._configuration_arg('impersonate', ['false']) + if 'false' in impersonate: impersonate = None # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) @@ -2388,10 +2389,29 @@ class GenericIE(InfoExtractor): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
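# Illustrative sketch, not part of the patch: the generic-extractor change below clears the
# __cf_bm cookie and raises a targeted error only when the failed request looks like a Cloudflare
# anti-bot challenge, i.e. a 403 response carrying a 'cf-mitigated: challenge' header while no
# impersonation was in effect. The function below is a hypothetical stand-alone restatement of
# that condition; the name and signature are assumptions.
def is_cloudflare_challenge(status_code, headers, impersonated=False):
    # Cloudflare marks challenge responses with HTTP 403 plus the cf-mitigated header
    return (status_code == 403
            and headers.get('cf-mitigated') == 'challenge'
            and not impersonated)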
- full_response = self._request_webpage(url, video_id, headers=filter_dict({ - 'Accept-Encoding': 'identity', - 'Referer': smuggled_data.get('referer'), - }), impersonate=impersonate) + try: + full_response = self._request_webpage(url, video_id, headers=filter_dict({ + 'Accept-Encoding': 'identity', + 'Referer': smuggled_data.get('referer'), + }), impersonate=impersonate) + except ExtractorError as e: + if not (isinstance(e.cause, HTTPError) and e.cause.status == 403 + and e.cause.response.get_header('cf-mitigated') == 'challenge' + and e.cause.response.extensions.get('impersonate') is None): + raise + cf_cookie_domain = traverse_obj( + LenientSimpleCookie(e.cause.response.get_header('set-cookie')), + ('__cf_bm', 'domain')) + if cf_cookie_domain: + self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}') + self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm') + msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; ' + if not self._downloader._impersonate_target_available(ImpersonateTarget()): + msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for ' + 'how to install the required impersonation dependency, and ') + raise ExtractorError( + f'{msg}try again with --extractor-args "generic:impersonate"', expected=True) + new_url = full_response.url if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index dfe5afe635..32300f75c2 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -5,56 +5,63 @@ import hashlib import hmac import json import os +import re +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, + js_to_json, + remove_end, traverse_obj, - unescapeHTML, ) class GoPlayIE(InfoExtractor): - _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)' + _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/?#]+/[^/?#]+/|)(?P<id>[^/#]+)' _NETRC_MACHINE = 'goplay' _TESTS = [{ - 'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay', + 'url': 'https://www.goplay.be/video/de-slimste-mens-ter-wereld/de-slimste-mens-ter-wereld-s22/de-slimste-mens-ter-wereld-s22-aflevering-1', 'info_dict': { - 'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811', + 'id': '2baa4560-87a0-421b-bffc-359914e3c387', 'ext': 'mp4', - 'title': 'S3 - Aflevering 2', - 'series': 'De Container Cup', - 'season': 'Season 3', - 'season_number': 3, - 'episode': 'Episode 2', - 'episode_number': 2, + 'title': 'S22 - Aflevering 1', + 'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}', + 'series': 'De Slimste Mens ter Wereld', + 'episode': 'Episode 1', + 'season_number': 22, + 'episode_number': 1, + 'season': 'Season 22', }, + 'params': {'skip_download': True}, 'skip': 'This video is only available for registered users', }, { - 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', + 'url': 'https://www.goplay.be/video/1917', 'info_dict': { - 'id': '74e3ed07-748c-49e4-85a0-393a93337dbf', + 'id': '40cac41d-8d29-4ef5-aa11-75047b9f0907', 'ext': 'mp4', - 'title': 'A Family for the Holidays', + 'title': '1917', + 'description': r're:Op het hoogtepunt van de Eerste Wereldoorlog krijgen twee jonge .{94}', }, + 'params': {'skip_download': True}, 'skip': 'This video is only available for registered users', }, { 'url': 
'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay', 'info_dict': { - 'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656', + 'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee', 'ext': 'mp4', 'title': 'S11 - Aflevering 1', + 'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}', 'episode': 'Episode 1', 'series': 'De Mol', 'season_number': 11, 'episode_number': 1, 'season': 'Season 11', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, 'skip': 'This video is only available for registered users', }] @@ -69,27 +76,42 @@ class GoPlayIE(InfoExtractor): if not self._id_token: raise self.raise_login_required(method='password') - def _real_extract(self, url): - url, display_id = self._match_valid_url(url).group(0, 'display_id') - webpage = self._download_webpage(url, display_id) - video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data') - video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data') + def _find_json(self, s): + return self._search_json( + r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None) - movie = video_data.get('movie') - if movie: - video_id = movie['videoUuid'] - info_dict = { - 'title': movie.get('title'), - } - else: - episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) - video_id = episode['videoUuid'] - info_dict = { - 'title': episode.get('episodeTitle'), - 'series': traverse_obj(episode, ('program', 'title')), - 'season_number': episode.get('seasonNumber'), - 'episode_number': episode.get('episodeNumber'), - } + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nextjs_data = traverse_obj( + re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage), + (..., {js_to_json}, {json.loads}, ..., {self._find_json}, ...)) + meta = traverse_obj(nextjs_data, ( + ..., lambda _, v: v['meta']['path'] == urllib.parse.urlparse(url).path, 'meta', any)) + + video_id = meta['uuid'] + info_dict = traverse_obj(meta, { + 'title': ('title', {str}), + 'description': ('description', {str.strip}), + }) + + if traverse_obj(meta, ('program', 'subtype')) != 'movie': + for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)): + episode_data = traverse_obj( + season_data, ('videos', lambda _, v: v['videoId'] == video_id, any)) + if not episode_data: + continue + + episode_title = traverse_obj( + episode_data, 'contextualTitle', 'episodeTitle', expected_type=str) + info_dict.update({ + 'title': episode_title or info_dict.get('title'), + 'series': remove_end(info_dict.get('title'), f' - {episode_title}'), + 'season_number': traverse_obj(season_data, ('season', {int_or_none})), + 'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})), + }) + break api = self._download_json( f'https://api.goplay.be/web/v1/videos/long-form/{video_id}', diff --git a/yt_dlp/extractor/ilpost.py b/yt_dlp/extractor/ilpost.py index 2868f0c62c..da203cf5ff 100644 --- a/yt_dlp/extractor/ilpost.py +++ b/yt_dlp/extractor/ilpost.py @@ -1,4 +1,3 @@ -import functools from .common import InfoExtractor from ..utils import ( @@ -63,7 +62,7 @@ class IlPostIE(InfoExtractor): 'url': ('podcast_raw_url', {url_or_none}), 'thumbnail': ('image', {url_or_none}), 'timestamp': ('timestamp', {int_or_none}), - 'duration': 
('milliseconds', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('milliseconds', {float_or_none(scale=1000)}), 'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}), }), } diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index f0c3419d49..e2644e6a40 100644 --- a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -37,7 +37,7 @@ class ImgurBaseIE(InfoExtractor): class ImgurIE(ImgurBaseIE): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://imgur.com/A61SaA1', @@ -54,6 +54,22 @@ class ImgurIE(ImgurBaseIE): 'like_count': int, 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', }, + }, { + # Test with URL slug + 'url': 'https://imgur.com/mrw-gifv-is-up-running-without-any-bugs-A61SaA1', + 'info_dict': { + 'id': 'A61SaA1', + 'ext': 'mp4', + 'title': 'MRW gifv is up and running without any bugs', + 'timestamp': 1416446068, + 'upload_date': '20141120', + 'dislike_count': int, + 'comment_count': int, + 'release_timestamp': 1416446068, + 'release_date': '20141120', + 'like_count': int, + 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', + }, }, { 'url': 'https://i.imgur.com/A61SaA1.gifv', 'only_matching': True, @@ -92,6 +108,7 @@ class ImgurIE(ImgurBaseIE): 'comment_count': int, 'release_timestamp': 1710491255, 'release_date': '20240315', + 'thumbnail': 'https://i.imgur.com/zV03bd5h.jpg', }, }] @@ -208,7 +225,10 @@ class ImgurIE(ImgurBaseIE): }), get_all=False), 'id': video_id, 'formats': formats, - 'thumbnail': url_or_none(search('thumbnailUrl')), + 'thumbnails': [{ + 'url': thumbnail_url, + 'http_headers': {'Accept': '*/*'}, + }] if (thumbnail_url := search(['thumbnailUrl', 'twitter:image', 'og:image'])) else None, 'http_headers': {'Accept': '*/*'}, } @@ -252,17 +272,9 @@ class ImgurGalleryBaseIE(ImgurBaseIE): class ImgurGalleryIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:gallery' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://imgur.com/gallery/Q95ko', - 'info_dict': { - 'id': 'Q95ko', - 'title': 'Adding faces make every GIF better', - }, - 'playlist_count': 25, - 'skip': 'Zoinks! 
You\'ve taken a wrong turn.', - }, { # TODO: static images - replace with animated/video gallery 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, @@ -280,7 +292,27 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'release_timestamp': 1358554297, 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', 'release_date': '20130119', - 'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + }, { + # Test with slug + 'url': 'https://imgur.com/gallery/classic-steve-carell-gif-cracks-me-up-everytime-repost-downvotes-YcAQlkx', + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'timestamp': 1358554297, + 'upload_date': '20130119', + 'uploader_id': '1648642', + 'uploader': 'wittyusernamehere', + 'release_timestamp': 1358554297, + 'release_date': '20130119', + 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', + 'uploader_url': 'https://i.imgur.com/N5Flb2v_d.png?maxwidth=290&fidelity=grand', 'comment_count': int, 'dislike_count': int, 'like_count': int, @@ -317,6 +349,13 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): 'title': 'Penguins !', }, 'playlist_count': 3, + }, { + 'url': 'https://imgur.com/t/unmuted/penguins-penguins-6lAn9VQ', + 'info_dict': { + 'id': '6lAn9VQ', + 'title': 'Penguins !', + }, + 'playlist_count': 3, }, { 'url': 'https://imgur.com/t/unmuted/kx2uD3C', 'add_ies': ['Imgur'], @@ -357,7 +396,7 @@ class ImgurGalleryIE(ImgurGalleryBaseIE): class ImgurAlbumIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:album' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?:[^/?#]+-)?(?P<id>[a-zA-Z0-9]+)' _GALLERY = False _TESTS = [{ # TODO: only static images - replace with animated/video gallery @@ -372,6 +411,14 @@ class ImgurAlbumIE(ImgurGalleryBaseIE): 'title': 'enen-no-shouboutai', }, 'playlist_count': 2, + }, { + # Test with URL slug + 'url': 'https://imgur.com/a/enen-no-shouboutai-iX265HX', + 'info_dict': { + 'id': 'iX265HX', + 'title': 'enen-no-shouboutai', + }, + 'playlist_count': 2, }, { 'url': 'https://imgur.com/a/8pih2Ed', 'info_dict': { diff --git a/yt_dlp/extractor/jiocinema.py b/yt_dlp/extractor/jiocinema.py index 30d98ba796..94c85064ef 100644 --- a/yt_dlp/extractor/jiocinema.py +++ b/yt_dlp/extractor/jiocinema.py @@ -326,11 +326,11 @@ class JioCinemaIE(JioCinemaBaseIE): # fallback metadata 'title': ('name', {str}), 'description': ('fullSynopsis', {str}), - 'series': ('show', 'name', {str}, {lambda x: x or None}), + 'series': ('show', 'name', {str}, filter), 'season': ('tournamentName', {str}, {lambda x: x if x != 'Season 0' else None}), - 'season_number': ('episode', 'season', {int_or_none}, {lambda x: x or None}), + 'season_number': ('episode', 'season', {int_or_none}, filter), 'episode': ('fullTitle', {str}), - 'episode_number': ('episode', 'episodeNo', {int_or_none}, {lambda x: x or None}), + 'episode_number': ('episode', 'episodeNo', {int_or_none}, filter), 'age_limit': ('ageNemonic', {parse_age_limit}), 'duration': ('totalDuration', {float_or_none}), 'thumbnail': ('images', {url_or_none}), @@ -338,10 +338,10 @@ class JioCinemaIE(JioCinemaBaseIE): **traverse_obj(metadata, ('result', 0, { 'title': ('fullTitle', {str}), 'description': ('fullSynopsis', {str}), - 'series': ('showName', {str}, {lambda x: x or None}), - 
'season': ('seasonName', {str}, {lambda x: x or None}), + 'series': ('showName', {str}, filter), + 'season': ('seasonName', {str}, filter), 'season_number': ('season', {int_or_none}), - 'season_id': ('seasonId', {str}, {lambda x: x or None}), + 'season_id': ('seasonId', {str}, filter), 'episode': ('fullTitle', {str}), 'episode_number': ('episode', {int_or_none}), 'timestamp': ('uploadTime', {int_or_none}), diff --git a/yt_dlp/extractor/kenh14.py b/yt_dlp/extractor/kenh14.py new file mode 100644 index 0000000000..3c46020e8b --- /dev/null +++ b/yt_dlp/extractor/kenh14.py @@ -0,0 +1,160 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_attribute, + get_elements_html_by_class, + int_or_none, + parse_duration, + parse_iso8601, + remove_start, + strip_or_none, + unescapeHTML, + update_url, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class Kenh14VideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn', + 'md5': '1ed67f9c3a1e74acf15db69590cf6210', + 'info_dict': { + 'id': '316173', + 'ext': 'mp4', + 'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'uploader': 'Unbox Therapy', + 'upload_date': '20220517', + 'view_count': int, + 'duration': 722.86, + 'timestamp': 1652764468, + }, + }, { + 'url': 'https://video.kenh14.vn/video-316174.chn', + 'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd', + 'info_dict': { + 'id': '316174', + 'ext': 'mp4', + 'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu', + 'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc', + 'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$', + 'tags': [], + 'upload_date': '20220517', + 'view_count': int, + 'duration': 70.04, + 'timestamp': 1652766021, + }, + }, { + 'url': 'https://video.kenh14.vn/0-344740.chn', + 'md5': 'b843495d5e728142c8870c09b46df2a9', + 'info_dict': { + 'id': '344740', + 'ext': 'mov', + 'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi', + 'description': 'md5:2a2dbb4a7397169fb21ee68f09160497', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$', + 'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'], + 'uploader': 'Quang Vũ', + 'upload_date': '20241024', + 'view_count': int, + 'duration': 198.88, + 'timestamp': 1729741590, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '') + direct_url = attrs['data-vid'] + + metadata = self._download_json( + 'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format( + remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False) + + formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}] + subtitles = {} + video_data = self._download_json( + f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False) + if hls_url := traverse_obj(video_data, ('hls', {url_or_none})): + fmts, subs 
= self._extract_m3u8_formats_and_subtitles( + hls_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})): + fmts, subs = self._extract_mpd_formats_and_subtitles( + dash_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + **traverse_obj(metadata, { + 'duration': ('duration', {parse_duration}), + 'uploader': ('author', {strip_or_none}), + 'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}), + 'view_count': ('views', {int_or_none}), + }), + 'id': video_id, + 'title': ( + traverse_obj(metadata, ('title', {strip_or_none})) + or clean_html(self._og_search_title(webpage)) + or clean_html(get_element_by_class('vdbw-title', webpage))), + 'formats': formats, + 'subtitles': subtitles, + 'description': ( + clean_html(self._og_search_description(webpage)) + or clean_html(get_element_by_class('vdbw-sapo', webpage))), + 'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')), + 'tags': traverse_obj(self._html_search_meta('keywords', webpage), ( + {lambda x: x.split(';')}, ..., filter)), + } + + +class Kenh14PlaylistIE(InfoExtractor): + _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn' + _TESTS = [{ + 'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn', + 'info_dict': { + 'id': '71', + 'title': 'Trần Tình (Naked love) mùa 2', + 'description': 'md5:e9522339304956dea931722dd72eddb2', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 9, + }, { + 'url': 'https://video.kenh14.vn/playlist/0-72.chn', + 'info_dict': { + 'id': '72', + 'title': 'Lau Lại Đầu Từ', + 'description': 'Cùng xem xưa và nay có gì khác biệt nhé!', + 'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + category_detail = get_element_by_class('category-detail', webpage) or '' + embed_info = traverse_obj( + self._yield_json_ld(webpage, playlist_id), + (lambda _, v: v['name'] and v['alternateName'], any)) or {} + + return self.playlist_from_matches( + get_elements_html_by_class('video-item', webpage), playlist_id, + (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))), + getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']), + ie=Kenh14VideoIE, playlist_description=( + clean_html(get_element_by_class('description', category_detail)) + or unescapeHTML(embed_info.get('alternateName'))), + thumbnail=traverse_obj( + self._og_search_thumbnail(webpage), + ({url_or_none}, {update_url(query=None)}))) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index bd21e59501..1f001d421a 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -1,4 +1,3 @@ -import functools from .common import InfoExtractor from ..networking import HEADRequest @@ -137,7 +136,7 @@ class KickVODIE(KickBaseIE): 'uploader': ('livestream', 'channel', 'user', 'username', {str}), 'uploader_id': ('livestream', 'channel', 'user_id', {int}, {str_or_none}), 'timestamp': ('created_at', {parse_iso8601}), - 'duration': ('livestream', 'duration', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('livestream', 'duration', {float_or_none(scale=1000)}), 'thumbnail': ('livestream', 'thumbnail', 
{url_or_none}), 'categories': ('livestream', 'categories', ..., 'name', {str}), 'view_count': ('views', {int_or_none}), diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py index 852a4de3f2..69f4a3ce03 100644 --- a/yt_dlp/extractor/kika.py +++ b/yt_dlp/extractor/kika.py @@ -119,7 +119,7 @@ class KikaIE(InfoExtractor): 'width': ('frameWidth', {int_or_none}), 'height': ('frameHeight', {int_or_none}), # NB: filesize is 0 if unknown, bitrate is -1 if unknown - 'filesize': ('fileSize', {int_or_none}, {lambda x: x or None}), + 'filesize': ('fileSize', {int_or_none}, filter), 'abr': ('bitrateAudio', {int_or_none}, {lambda x: None if x == -1 else x}), 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), }), diff --git a/yt_dlp/extractor/laracasts.py b/yt_dlp/extractor/laracasts.py index 4494c4b79a..4a61d6ab14 100644 --- a/yt_dlp/extractor/laracasts.py +++ b/yt_dlp/extractor/laracasts.py @@ -32,7 +32,7 @@ class LaracastsBaseIE(InfoExtractor): VimeoIE, url_transparent=True, **traverse_obj(episode, { 'id': ('id', {int}, {str_or_none}), - 'webpage_url': ('path', {lambda x: urljoin('https://laracasts.com', x)}), + 'webpage_url': ('path', {urljoin('https://laracasts.com')}), 'title': ('title', {clean_html}), 'season_number': ('chapter', {int_or_none}), 'episode_number': ('position', {int_or_none}), @@ -104,7 +104,7 @@ class LaracastsPlaylistIE(LaracastsBaseIE): 'description': ('body', {clean_html}), 'thumbnail': (('large_thumbnail', 'thumbnail'), {url_or_none}, any), 'duration': ('runTime', {parse_duration}), - 'categories': ('taxonomy', 'name', {str}, {lambda x: x and [x]}), + 'categories': ('taxonomy', 'name', {str}, all, filter), 'tags': ('topics', ..., 'name', {str}), 'modified_date': ('lastUpdated', {unified_strdate}), }), diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index c764d49611..0445b7cbfc 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -66,7 +66,7 @@ class LBRYBaseIE(InfoExtractor): 'license': ('value', 'license', {str}), 'timestamp': ('timestamp', {int_or_none}), 'release_timestamp': ('value', 'release_time', {int_or_none}), - 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'tags': ('value', 'tags', ..., filter), 'duration': ('value', stream_type, 'duration', {int_or_none}), 'channel': ('signing_channel', 'value', 'title', {str}), 'channel_id': ('signing_channel', 'claim_id', {str}), @@ -136,6 +136,7 @@ class LBRYBaseIE(InfoExtractor): class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' + IE_DESC = 'odysee.com' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' (?:\$/(?:download|embed)/)? 
(?P<id> @@ -364,6 +365,7 @@ class LBRYIE(LBRYBaseIE): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' + IE_DESC = 'odysee.com channels' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P<id>@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', @@ -391,6 +393,7 @@ class LBRYChannelIE(LBRYBaseIE): class LBRYPlaylistIE(LBRYBaseIE): IE_NAME = 'lbry:playlist' + IE_DESC = 'odysee.com playlists' _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)' _TESTS = [{ 'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2', diff --git a/yt_dlp/extractor/learningonscreen.py b/yt_dlp/extractor/learningonscreen.py index dcf83144c8..f4b51e66c3 100644 --- a/yt_dlp/extractor/learningonscreen.py +++ b/yt_dlp/extractor/learningonscreen.py @@ -6,13 +6,11 @@ from ..utils import ( ExtractorError, clean_html, extract_attributes, - get_element_by_class, - get_element_html_by_id, join_nonempty, parse_duration, unified_timestamp, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import find_element, traverse_obj class LearningOnScreenIE(InfoExtractor): @@ -32,28 +30,24 @@ class LearningOnScreenIE(InfoExtractor): def _real_initialize(self): if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'): - self.raise_login_required( - 'Use --cookies for authentication. See ' - ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp ' - 'for how to manually pass cookies', method=None) + self.raise_login_required(method='session_cookies') def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) details = traverse_obj(webpage, ( - {functools.partial(get_element_html_by_id, 'programme-details')}, { - 'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}), + {find_element(id='programme-details', html=True)}, { + 'title': ({find_element(tag='h2')}, {clean_html}), 'timestamp': ( - {functools.partial(get_element_by_class, 'broadcast-date')}, + {find_element(cls='broadcast-date')}, {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}), 'duration': ( - {functools.partial(get_element_by_class, 'prog-running-time')}, - {clean_html}, {parse_duration}), + {find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}), })) title = details.pop('title', None) or traverse_obj(webpage, ( - {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')}, + {find_element(id='add-to-existing-playlist', html=True)}, {extract_attributes}, 'data-record-title', {clean_html})) entries = self._parse_html5_media_entries( diff --git a/yt_dlp/extractor/listennotes.py b/yt_dlp/extractor/listennotes.py index 61eae95edf..9d68e18301 100644 --- a/yt_dlp/extractor/listennotes.py +++ b/yt_dlp/extractor/listennotes.py @@ -6,12 +6,10 @@ from ..utils import ( extract_attributes, get_element_by_class, get_element_html_by_id, - get_element_text_and_html_by_tag, parse_duration, strip_or_none, - traverse_obj, - try_call, ) +from ..utils.traversal import find_element, traverse_obj class ListenNotesIE(InfoExtractor): @@ -22,14 +20,14 @@ class ListenNotesIE(InfoExtractor): 'info_dict': { 'id': 'KrDgvNb_u1n', 'ext': 'mp3', - 'title': 'md5:32236591a921adf17bbdbf0441b6c0e9', - 'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd', - 'duration': 2148.0, - 'channel': 'Thriving on Overload', + 'title': r're:Tim O’Reilly on noticing things other people .{113}', + 'description': r're:(?s)‘’We shape reality 
by what we notice and .{27459}', + 'duration': 2215.0, + 'channel': 'Amplifying Cognition', 'channel_id': 'ed84wITivxF', 'episode_id': 'e1312583fa7b4e24acfbb5131050be00', - 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg', - 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/', + 'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/amplifying-cognition-ross-dawson-Iemft4Gdr0k-ed84wITivxF.300x300.jpg', + 'channel_url': 'https://www.listennotes.com/podcasts/amplifying-cognition-ross-dawson-ed84wITivxF/', 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'], }, }, { @@ -39,13 +37,13 @@ class ListenNotesIE(InfoExtractor): 'id': 'lwEA3154JzG', 'ext': 'mp3', 'title': 'Episode 177: WireGuard with Jason Donenfeld', - 'description': 'md5:24744f36456a3e95f83c1193a3458594', + 'description': r're:(?s)Jason Donenfeld lead developer joins us this hour to discuss WireGuard, .{3169}', 'duration': 3861.0, 'channel': 'Ask Noah Show', 'channel_id': '4DQTzdS5-j7', 'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4', 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/', - 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg', + 'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-gD7vG150cxf-4DQTzdS5-j7.300x300.jpg', 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'], }, }] @@ -70,7 +68,7 @@ class ListenNotesIE(InfoExtractor): 'id': audio_id, 'url': data['audio'], 'title': (data.get('data-title') - or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0]) + or traverse_obj(webpage, ({find_element(tag='h1')}, {clean_html})) or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')), 'description': (self._clean_description(get_element_by_class('ln-text-p', webpage)) or strip_or_none(description)), diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 93f926a9ff..df9d141de3 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -1,30 +1,32 @@ import json +import uuid from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + join_nonempty, smuggle_url, traverse_obj, try_call, unsmuggle_url, + urljoin, ) class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)' - - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s' - + _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:[^/?#]+/watch/|vod/[^/?#]+/content\.do\?content_id=)(?P<id>[\w-]+)' + _URL_TEMPLATE = 'https://www.litv.tv/%s/watch/%s' + _GEO_COUNTRIES = ['TW'] _TESTS = [{ - 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'url': 'https://www.litv.tv/drama/watch/VOD00041610', 'info_dict': { 'id': 'VOD00041606', 'title': '花千骨', }, 'playlist_count': 51, # 50 episodes + 1 trailer }, { - 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'url': 'https://www.litv.tv/drama/watch/VOD00041610', 'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a', 'info_dict': { 'id': 'VOD00041610', @@ -32,16 +34,15 @@ class LiTVIE(InfoExtractor): 'title': '花千骨第1集', 'thumbnail': 
r're:https?://.*\.jpg$', 'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。', - 'categories': ['奇幻', '愛情', '中國', '仙俠'], + 'categories': ['奇幻', '愛情', '仙俠', '古裝'], 'episode': 'Episode 1', 'episode_number': 1, }, 'params': { 'noplaylist': True, }, - 'skip': 'Georestricted to Taiwan', }, { - 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', + 'url': 'https://www.litv.tv/drama/watch/VOD00044841', 'md5': '88322ea132f848d6e3e18b32a832b918', 'info_dict': { 'id': 'VOD00044841', @@ -55,94 +56,62 @@ class LiTVIE(InfoExtractor): def _extract_playlist(self, playlist_data, content_type): all_episodes = [ self.url_result(smuggle_url( - self._URL_TEMPLATE % (content_type, episode['contentId']), + self._URL_TEMPLATE % (content_type, episode['content_id']), {'force_noplaylist': True})) # To prevent infinite recursion - for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))] + for episode in traverse_obj(playlist_data, ('seasons', ..., 'episodes', lambda _, v: v['content_id']))] - return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title')) + return self.playlist_result(all_episodes, playlist_data['content_id'], playlist_data.get('title')) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + vod_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] - if self._search_regex( - r'(?i)<meta\s[^>]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"', - webpage, 'meta refresh redirect', default=False, group=0): - raise ExtractorError('No such content found', expected=True) + program_info = traverse_obj(vod_data, ('programInformation', {dict})) or {} + playlist_data = traverse_obj(vod_data, ('seriesTree')) + if playlist_data and self._yes_playlist(program_info.get('series_id'), video_id, smuggled_data): + return self._extract_playlist(playlist_data, program_info.get('content_type')) - program_info = self._parse_json(self._search_regex( - r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), - video_id) + asset_id = traverse_obj(program_info, ('assets', 0, 'asset_id', {str})) + if asset_id: # This is a VOD + media_type = 'vod' + else: # This is a live stream + asset_id = program_info['content_id'] + media_type = program_info['content_type'] + puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value) + if puid: + endpoint = 'get-urls' + else: + puid = str(uuid.uuid4()) + endpoint = 'get-urls-no-auth' + video_data = self._download_json( + f'https://www.litv.tv/api/{endpoint}', video_id, + data=json.dumps({'AssetId': asset_id, 'MediaType': media_type, 'puid': puid}).encode(), + headers={'Content-Type': 'application/json'}) - # In browsers `getProgramInfo` request is always issued. Usually this - # endpoint gives the same result as the data embedded in the webpage. - # If, for some reason, there are no embedded data, we do an extra request. 
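Note: the LiTV rewrite above replaces the scraped `programInfo`/`uiHlsUrl` page globals with Next.js page data plus a JSON API — `get-urls` when a `PUID` cookie identifies the session, otherwise `get-urls-no-auth` with a freshly generated UUID. A standalone sketch of that request, assuming only the endpoint names and body shape visible in the hunk (the handling below expects either `result.AssetURLs[0]` or an `error` object in the reply):

    import json
    import urllib.request
    import uuid

    def fetch_asset_urls(asset_id, media_type, puid=None):
        # No PUID cookie -> anonymous endpoint with a throwaway UUID
        endpoint = 'get-urls' if puid else 'get-urls-no-auth'
        body = {'AssetId': asset_id, 'MediaType': media_type,
                'puid': puid or str(uuid.uuid4())}
        req = urllib.request.Request(
            f'https://www.litv.tv/api/{endpoint}',
            data=json.dumps(body).encode(),
            headers={'Content-Type': 'application/json'})
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)
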
- if 'assetId' not in program_info: - program_info = self._download_json( - 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, - query={'contentId': video_id}, - headers={'Accept': 'application/json'}) - - series_id = program_info['seriesId'] - if self._yes_playlist(series_id, video_id, smuggled_data): - playlist_data = self._download_json( - 'https://www.litv.tv/vod/ajax/getSeriesTree', video_id, - query={'seriesId': series_id}, headers={'Accept': 'application/json'}) - return self._extract_playlist(playlist_data, program_info['contentType']) - - video_data = self._parse_json(self._search_regex( - r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', - webpage, 'video data', default='{}'), video_id) - if not video_data: - payload = {'assetId': program_info['assetId']} - puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value) - if puid: - payload.update({ - 'type': 'auth', - 'puid': puid, - }) - endpoint = 'getUrl' - else: - payload.update({ - 'watchDevices': program_info['watchDevices'], - 'contentType': program_info['contentType'], - }) - endpoint = 'getMainUrlNoAuth' - video_data = self._download_json( - f'https://www.litv.tv/vod/ajax/{endpoint}', video_id, - data=json.dumps(payload).encode(), - headers={'Content-Type': 'application/json'}) - - if not video_data.get('fullpath'): - error_msg = video_data.get('errorMessage') - if error_msg == 'vod.error.outsideregionerror': + if error := traverse_obj(video_data, ('error', {dict})): + error_msg = traverse_obj(error, ('message', {str})) + if error_msg and 'OutsideRegionError' in error_msg: self.raise_geo_restricted('This video is available in Taiwan only') - if error_msg: + elif error_msg: raise ExtractorError(f'{self.IE_NAME} said: {error_msg}', expected=True) - raise ExtractorError(f'Unexpected result from {self.IE_NAME}') + raise ExtractorError(f'Unexpected error from {self.IE_NAME}') formats = self._extract_m3u8_formats( - video_data['fullpath'], video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + video_data['result']['AssetURLs'][0], video_id, ext='mp4', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' - title = program_info['title'] + program_info.get('secondaryMark', '') - description = program_info.get('description') - thumbnail = program_info.get('imageFile') - categories = [item['name'] for item in program_info.get('category', [])] - episode = int_or_none(program_info.get('episode')) - return { 'id': video_id, 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'episode_number': episode, + 'title': join_nonempty('title', 'secondary_mark', delim='', from_dict=program_info), + **traverse_obj(program_info, { + 'description': ('description', {str}), + 'thumbnail': ('picture', {urljoin('https://p-cdnstatic.svc.litv.tv/')}), + 'categories': ('genres', ..., 'name', {str}), + 'episode_number': ('episode', {int_or_none}), + }), } diff --git a/yt_dlp/extractor/lsm.py b/yt_dlp/extractor/lsm.py index f5be08f97d..56c06d7458 100644 --- a/yt_dlp/extractor/lsm.py +++ b/yt_dlp/extractor/lsm.py @@ -114,7 +114,7 @@ class LSMLREmbedIE(InfoExtractor): def _real_extract(self, url): query = parse_qs(url) video_id = traverse_obj(query, ( - ('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False) + ('show', 'id'), 0, {int_or_none}, filter, {str_or_none}), get_all=False) webpage = 
self._download_webpage(url, video_id) player_data, media_data = self._search_regex( diff --git a/yt_dlp/extractor/magentamusik.py b/yt_dlp/extractor/magentamusik.py index 5bfc0a1545..24c46a1529 100644 --- a/yt_dlp/extractor/magentamusik.py +++ b/yt_dlp/extractor/magentamusik.py @@ -57,6 +57,6 @@ class MagentaMusikIE(InfoExtractor): 'duration': ('runtimeInSeconds', {int_or_none}), 'location': ('countriesOfProduction', {list}, {lambda x: join_nonempty(*x, delim=', ')}), 'release_year': ('yearOfProduction', {int_or_none}), - 'categories': ('mainGenre', {str}, {lambda x: x and [x]}), + 'categories': ('mainGenre', {str}, all, filter), })), } diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index ae0fb2aed2..d2a22f98f3 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -17,7 +17,7 @@ class MediaStreamBaseIE(InfoExtractor): _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' def _extract_mediastream_urls(self, webpage): - yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), ( + yield from traverse_obj(list(self._yield_json_ld(webpage, None, default={})), ( lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py deleted file mode 100644 index 88a2b9e891..0000000000 --- a/yt_dlp/extractor/mildom.py +++ /dev/null @@ -1,291 +0,0 @@ -import functools -import json -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - OnDemandPagedList, - determine_ext, - dict_get, - float_or_none, - traverse_obj, -) - - -class MildomBaseIE(InfoExtractor): - _GUEST_ID = None - - def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): - if not self._GUEST_ID: - self._GUEST_ID = f'pc-gp-{uuid.uuid4()}' - - content = self._download_json( - url, video_id, note=note, data=json.dumps(body).encode() if body else None, - headers={'Content-Type': 'application/json'} if body else {}, - query={ - '__guest_id': self._GUEST_ID, - '__platform': 'web', - **(query or {}), - }) - - if content['code'] != 0: - raise ExtractorError( - f'Mildom says: {content["message"]} (code {content["code"]})', - expected=True) - return content['body'] - - -class MildomIE(MildomBaseIE): - IE_NAME = 'mildom' - IE_DESC = 'Record ongoing live by specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P<id>\d+)' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id) - - enterstudio = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id, - note='Downloading live metadata', query={'user_id': video_id}) - result_video_id = enterstudio.get('log_id', video_id) - - servers = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id, - note='Downloading live server list', query={ - 'user_id': video_id, - 'live_server_type': 'hls', - }) - - playback_token = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id, - note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'}) - playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False) - if not playback_token: - raise ExtractorError('Failed to obtain live playback token') - - formats = self._extract_m3u8_formats( - 
f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}', - result_video_id, 'mp4', headers={ - 'Referer': 'https://www.mildom.com/', - 'Origin': 'https://www.mildom.com', - }) - - for fmt in formats: - fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' - - return { - 'id': result_video_id, - 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), - 'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str), - 'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000), - 'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'), - 'uploader_id': video_id, - 'formats': formats, - 'is_live': True, - } - - -class MildomVodIE(MildomBaseIE): - IE_NAME = 'mildom:vod' - IE_DESC = 'VOD in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)' - _TESTS = [{ - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269', - 'info_dict': { - 'id': '10882672-1597662269', - 'ext': 'mp4', - 'title': '始めてのミルダム配信じゃぃ!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'upload_date': '20200817', - 'duration': 4138.37, - 'description': 'ゲームをしたくて!', - 'timestamp': 1597662269.0, - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477', - 'info_dict': { - 'id': '10882672-1597758589870-477', - 'ext': 'mp4', - 'title': '【kson】感染メイズ!麻酔銃で無双する', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'timestamp': 1597759093.0, - 'uploader': 'kson組長(けいそん)', - 'duration': 4302.58, - 'uploader_id': '10882672', - 'description': 'このステージ絶対乗り越えたい', - 'upload_date': '20200818', - }, - }, { - 'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0', - 'info_dict': { - 'id': '10882672-buha9td2lrn97fk2jme0', - 'ext': 'mp4', - 'title': '【kson組長】CART RACER!!!', - 'thumbnail': r're:^https?://.*\.(png|jpg)$', - 'uploader_id': '10882672', - 'uploader': 'kson組長(けいそん)', - 'upload_date': '20201104', - 'timestamp': 1604494797.0, - 'duration': 4657.25, - 'description': 'WTF', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id) - - autoplay = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id, - note='Downloading playback metadata', query={ - 'v_id': video_id, - })['playback'] - - formats = [{ - 'url': autoplay['audio_url'], - 'format_id': 'audio', - 'protocol': 'm3u8_native', - 'vcodec': 'none', - 'acodec': 'aac', - 'ext': 'm4a', - }] - for fmt in autoplay['video_link']: - formats.append({ - 'format_id': 'video-{}'.format(fmt['name']), - 'url': fmt['url'], - 'protocol': 'm3u8_native', - 'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'], - 'height': fmt['level'], - 'vcodec': 'h264', - 'acodec': 'aac', - 'ext': 'mp4', - }) - - return { - 'id': video_id, - 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), - 'description': traverse_obj(autoplay, 'video_intro'), - 'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000), - 'duration': float_or_none(autoplay.get('video_length'), scale=1000), - 'thumbnail': dict_get(autoplay, 
('upload_pic', 'video_pic')), - 'uploader': traverse_obj(autoplay, ('author_info', 'login_name')), - 'uploader_id': user_id, - 'formats': formats, - } - - -class MildomClipIE(MildomBaseIE): - IE_NAME = 'mildom:clip' - IE_DESC = 'Clip in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'info_dict': { - 'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9', - 'title': '全然違ったよ', - 'timestamp': 1619181890, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': 'ざきんぽ', - 'uploader_id': '10042245', - }, - }, { - 'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'info_dict': { - 'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864', - 'title': 'かっこいい', - 'timestamp': 1621094003, - 'duration': 59, - 'thumbnail': r're:https?://.+', - 'uploader': '(ルーキー', - 'uploader_id': '10111524', - }, - }, { - 'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'info_dict': { - 'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902', - 'title': 'あ', - 'timestamp': 1614769431, - 'duration': 31, - 'thumbnail': r're:https?://.+', - 'uploader': 'ドルゴルスレンギーン=ダグワドルジ', - 'uploader_id': '10660174', - }, - }] - - def _real_extract(self, url): - user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id) - - clip_detail = self._call_api( - 'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id, - note='Downloading playback metadata', query={ - 'clip_id': video_id, - }) - - return { - 'id': video_id, - 'title': self._html_search_meta( - ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'), - 'timestamp': float_or_none(clip_detail.get('create_time')), - 'duration': float_or_none(clip_detail.get('length')), - 'thumbnail': clip_detail.get('cover'), - 'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')), - 'uploader_id': user_id, - - 'url': clip_detail['url'], - 'ext': determine_ext(clip_detail.get('url'), 'mp4'), - } - - -class MildomUserVodIE(MildomBaseIE): - IE_NAME = 'mildom:user:vod' - IE_DESC = 'Download all VODs from specific user in Mildom' - _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.mildom.com/profile/10093333', - 'info_dict': { - 'id': '10093333', - 'title': 'Uploads from ねこばたけ', - }, - 'playlist_mincount': 732, - }, { - 'url': 'https://www.mildom.com/profile/10882672', - 'info_dict': { - 'id': '10882672', - 'title': 'Uploads from kson組長(けいそん)', - }, - 'playlist_mincount': 201, - }] - - def _fetch_page(self, user_id, page): - page += 1 - reply = self._call_api( - 'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList', - user_id, note=f'Downloading page {page}', query={ - 'user_id': user_id, - 'page': page, - 'limit': '30', - }) - if not reply: - return - for x in reply: - v_id = x.get('v_id') - if not v_id: - continue - yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}') - - def _real_extract(self, url): - user_id = self._match_id(url) - self.to_screen(f'This will download all VODs belonging to user. 
To download ongoing live video, use "https://www.mildom.com/{user_id}" instead') - - profile = self._call_api( - 'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id, - query={'user_id': user_id}, note='Downloading user profile')['user_info'] - - return self.playlist_result( - OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30), - user_id, f'Uploads from {profile["loginname"]}') diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index ea29986729..3573a2a3fd 100644 --- a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -1,14 +1,13 @@ -from .telecinco import TelecincoIE +from .telecinco import TelecincoBaseIE from ..utils import ( int_or_none, parse_iso8601, ) -class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE +class MiTeleIE(TelecincoBaseIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' - _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'info_dict': { @@ -27,6 +26,7 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE 'timestamp': 1471209401, 'upload_date': '20160814', }, + 'skip': 'HTTP Error 404 Not Found', }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -49,6 +49,26 @@ class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404 Not Found', + }, { + 'url': 'https://www.mitele.es/programas-tv/horizonte/temporada-5/programa-171-40_013480051/player/', + 'info_dict': { + 'id': '7adbe22e-cd41-4787-afa4-36f3da7c2c6f', + 'ext': 'mp4', + 'title': 'Horizonte Temporada 5 Programa 171', + 'description': 'md5:97f1fb712c5ac27e5693a8b3c5c0c6e3', + 'episode': 'Las Zonas de Bajas Emisiones, a debate', + 'episode_number': 171, + 'season': 'Season 5', + 'season_number': 5, + 'series': 'Horizonte', + 'duration': 7012, + 'upload_date': '20240927', + 'timestamp': 1727416450, + 'thumbnail': 'https://album.mediaset.es/eimg/2024/09/27/horizonte-171_9f02.jpg', + 'age_limit': 12, + }, + 'params': {'geo_bypass_country': 'ES'}, }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 9b7c7b89b9..4bccc81bdc 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj class MixchIE(InfoExtractor): IE_NAME = 'mixch' - _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' + _VALID_URL = r'https?://mixch\.tv/u/(?P<id>\d+)' _TESTS = [{ 'url': 'https://mixch.tv/u/16943797/live', @@ -66,7 +66,7 @@ class MixchIE(InfoExtractor): note='Downloading comments', errnote='Failed to download comments'), (..., { 'author': ('name', {str}), 'author_id': ('user_id', {str_or_none}), - 'id': ('message_id', {str}, {lambda x: x or None}), + 'id': ('message_id', {str}, filter), 'text': ('body', {str}), 'timestamp': ('created', {int}), })) @@ -74,7 +74,7 @@ class MixchIE(InfoExtractor): class MixchArchiveIE(InfoExtractor): IE_NAME = 'mixch:archive' - _VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)' + _VALID_URL = r'https?://mixch\.tv/archive/(?P<id>\d+)' _TESTS = [{ 'url': 'https://mixch.tv/archive/421', @@ -116,3 +116,56 @@ class MixchArchiveIE(InfoExtractor): 'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id), 'thumbnail': traverse_obj(info_json, 
('thumbnailURL', {url_or_none})), } + + +class MixchMovieIE(InfoExtractor): + IE_NAME = 'mixch:movie' + _VALID_URL = r'https?://mixch\.tv/m/(?P<id>\w+)' + + _TESTS = [{ + 'url': 'https://mixch.tv/m/Ve8KNkJ5', + 'info_dict': { + 'id': 'Ve8KNkJ5', + 'title': '夏☀️\nムービーへのポイントは本イベントに加算されないので配信にてお願い致します🙇🏻\u200d♀️\n#TGCCAMPUS #ミス東大 #ミス東大2024 ', + 'ext': 'mp4', + 'uploader': 'ミス東大No.5 松藤百香🍑💫', + 'uploader_id': '12299174', + 'channel_follower_count': int, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'timestamp': 1724070828, + 'uploader_url': 'https://mixch.tv/u/12299174', + 'live_status': 'not_live', + 'upload_date': '20240819', + }, + }, { + 'url': 'https://mixch.tv/m/61DzpIKE', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + f'https://mixch.tv/api-web/movies/{video_id}', video_id) + return { + 'id': video_id, + 'formats': [{ + 'format_id': 'mp4', + 'url': data['movie']['file'], + 'ext': 'mp4', + }], + **traverse_obj(data, { + 'title': ('movie', 'title', {str}), + 'thumbnail': ('movie', 'thumbnailURL', {url_or_none}), + 'uploader': ('ownerInfo', 'name', {str}), + 'uploader_id': ('ownerInfo', 'id', {int}, {str_or_none}), + 'channel_follower_count': ('ownerInfo', 'fan', {int_or_none}), + 'view_count': ('ownerInfo', 'view', {int_or_none}), + 'like_count': ('movie', 'favCount', {int_or_none}), + 'comment_count': ('movie', 'commentCount', {int_or_none}), + 'timestamp': ('movie', 'published', {int_or_none}), + 'uploader_url': ('ownerInfo', 'id', {lambda x: x and f'https://mixch.tv/u/{x}'}, filter), + }), + 'live_status': 'not_live', + } diff --git a/yt_dlp/extractor/monstercat.py b/yt_dlp/extractor/monstercat.py index 930c13e278..f17b91f5a2 100644 --- a/yt_dlp/extractor/monstercat.py +++ b/yt_dlp/extractor/monstercat.py @@ -4,15 +4,11 @@ from .common import InfoExtractor from ..utils import ( clean_html, extract_attributes, - get_element_by_class, - get_element_html_by_class, - get_element_text_and_html_by_tag, int_or_none, strip_or_none, - traverse_obj, - try_call, unified_strdate, ) +from ..utils.traversal import find_element, traverse_obj class MonstercatIE(InfoExtractor): @@ -26,19 +22,21 @@ class MonstercatIE(InfoExtractor): 'thumbnail': 'https://www.monstercat.com/release/742779548009/cover', 'release_date': '20230711', 'album': 'The Secret Language of Trees', - 'album_artist': 'BT', + 'album_artists': ['BT'], }, }] def _extract_tracks(self, table, album_meta): for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag - title = clean_html(try_call( - lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0])) - ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '') + title = traverse_obj(td, ( + {find_element(cls='d-inline-flex flex-column')}, + {lambda x: x.partition(' <span')}, 0, {clean_html})) + ids = traverse_obj(td, ( + {find_element(cls='btn-play cursor-pointer mr-small', html=True)}, {extract_attributes})) or {} track_id = ids.get('data-track-id') release_id = ids.get('data-release-id') - track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td))) + track_number = traverse_obj(td, ({find_element(cls='py-xsmall')}, {int_or_none})) if not track_id or not release_id: self.report_warning(f'Skipping track {track_number}, ID(s) not found') self.write_debug(f'release_id={release_id!r} track_id={track_id!r}') @@ -48,7 +46,7 
@@ class MonstercatIE(InfoExtractor): 'title': title, 'track': title, 'track_number': track_number, - 'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))), + 'artists': traverse_obj(td, ({find_element(cls='d-block fs-xxsmall')}, {clean_html}, all)), 'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}', 'id': track_id, 'ext': 'mp3', @@ -57,20 +55,19 @@ class MonstercatIE(InfoExtractor): def _real_extract(self, url): url_id = self._match_id(url) html = self._download_webpage(url, url_id) - # wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html - tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or '' - - title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0]) - date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block', - html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate})) + # NB: HTMLParser may choke on this html; use {find_element} or try_call(lambda: get_element...) + tracklist_table = traverse_obj(html, {find_element(cls='table table-small')}) or '' + title = traverse_obj(html, ({find_element(tag='h1')}, {clean_html})) album_meta = { 'title': title, 'album': title, 'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover', - 'album_artist': try_call( - lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)), - 'release_date': date, + 'album_artists': traverse_obj(html, ( + {find_element(cls='h-normal text-uppercase mb-desktop-medium mb-smallish')}, {clean_html}, all)), + 'release_date': traverse_obj(html, ( + {find_element(cls='font-italic mb-medium d-tablet-none d-phone-block')}, + {lambda x: x.partition('Released ')}, 2, {strip_or_none}, {unified_strdate})), } return self.playlist_result( diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index cb8f6a67d4..42ef25f17f 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -86,7 +86,7 @@ class NebulaBaseIE(InfoExtractor): def _extract_video_metadata(self, episode): channel_url = traverse_obj( - episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False) + episode, (('channel_slug', 'class_slug'), {urljoin('https://nebula.tv/')}), get_all=False) return { 'id': episode['id'].partition(':')[2], **traverse_obj(episode, { diff --git a/yt_dlp/extractor/nekohacker.py b/yt_dlp/extractor/nekohacker.py index 537158e87b..7168a2080e 100644 --- a/yt_dlp/extractor/nekohacker.py +++ b/yt_dlp/extractor/nekohacker.py @@ -6,12 +6,10 @@ from ..utils import ( determine_ext, extract_attributes, get_element_by_class, - get_element_text_and_html_by_tag, parse_duration, - traverse_obj, - try_call, url_or_none, ) +from ..utils.traversal import find_element, traverse_obj class NekoHackerIE(InfoExtractor): @@ -35,7 +33,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20221101', 'album': 'Nekoverse', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'Spaceship', 'track_number': 1, 'duration': 195.0, @@ -53,7 +51,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20221101', 'album': 'Nekoverse', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'City Runner', 'track_number': 2, 'duration': 148.0, @@ -71,7 +69,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20221101', 'album': 'Nekoverse', - 'artist': 'Neko Hacker', + 
'artists': ['Neko Hacker'], 'track': 'Nature Talk', 'track_number': 3, 'duration': 174.0, @@ -89,7 +87,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20221101', 'album': 'Nekoverse', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'Crystal World', 'track_number': 4, 'duration': 199.0, @@ -115,7 +113,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20210115', 'album': '進め!むじなカンパニー', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', 'track_number': 1, }, @@ -132,7 +130,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20210115', 'album': '進め!むじなカンパニー', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', 'track_number': 2, }, @@ -149,7 +147,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20210115', 'album': '進め!むじなカンパニー', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': '進め!むじなカンパニー (instrumental)', 'track_number': 3, }, @@ -166,7 +164,7 @@ class NekoHackerIE(InfoExtractor): 'acodec': 'mp3', 'release_date': '20210115', 'album': '進め!むじなカンパニー', - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'track': 'むじな de なじむ (instrumental)', 'track_number': 4, }, @@ -181,14 +179,17 @@ class NekoHackerIE(InfoExtractor): playlist = get_element_by_class('playlist', webpage) if not playlist: - iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or '' - iframe_src = url_or_none(extract_attributes(iframe).get('src')) + iframe_src = traverse_obj(webpage, ( + {find_element(tag='iframe', html=True)}, {extract_attributes}, 'src', {url_or_none})) if not iframe_src: raise ExtractorError('No playlist or embed found in webpage') elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src): raise ExtractorError('Spotify embeds are not supported', expected=True) return self.url_result(url, 'Generic') + player_params = self._search_json( + r'var srp_player_params_[\da-f]+\s*=', webpage, 'player params', playlist_id, default={}) + entries = [] for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1): entry = traverse_obj(extract_attributes(track), { @@ -200,12 +201,12 @@ class NekoHackerIE(InfoExtractor): 'album': 'data-albumtitle', 'duration': ('data-tracktime', {parse_duration}), 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), - 'thumbnail': ('data-albumart', {url_or_none}), }) entries.append({ **entry, + 'thumbnail': url_or_none(player_params.get('artwork')), 'track_number': track_number, - 'artist': 'Neko Hacker', + 'artists': ['Neko Hacker'], 'vcodec': 'none', 'acodec': 'mp3' if entry['ext'] == 'mp3' else None, }) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index a759da2147..900b8b2a30 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -36,10 +36,6 @@ class NetEaseMusicBaseIE(InfoExtractor): _API_BASE = 'http://music.163.com/api/' _GEO_BYPASS = False - @staticmethod - def _kilo_or_none(value): - return int_or_none(value, scale=1000) - def _create_eapi_cipher(self, api_path, query_body, cookies): request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) @@ -101,7 +97,7 @@ class NetEaseMusicBaseIE(InfoExtractor): 'vcodec': 'none', **traverse_obj(song, { 'ext': ('type', {str}), - 'abr': ('br', {self._kilo_or_none}), + 'abr': ('br', {int_or_none(scale=1000)}), 
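    # NB: the replacements in this hunk rely on two newer traversal idioms
    # used throughout this diff: yt-dlp's int_or_none/float_or_none support
    # partial application, so int_or_none(scale=1000) returns a callable
    # equivalent to functools.partial(int_or_none, scale=1000); and the bare
    # builtin `filter` as a path step drops falsy values, replacing the old
    # {lambda x: x or None} idiom.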
'filesize': ('size', {int_or_none}), }), }) @@ -282,9 +278,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): **lyric_data, **traverse_obj(info, { 'title': ('name', {str}), - 'timestamp': ('album', 'publishTime', {self._kilo_or_none}), + 'timestamp': ('album', 'publishTime', {int_or_none(scale=1000)}), 'thumbnail': ('album', 'picUrl', {url_or_none}), - 'duration': ('duration', {self._kilo_or_none}), + 'duration': ('duration', {int_or_none(scale=1000)}), 'album': ('album', 'name', {str}), 'average_rating': ('score', {int_or_none}), }), @@ -440,7 +436,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): 'tags': ('tags', ..., {str}), 'uploader': ('creator', 'nickname', {str}), 'uploader_id': ('creator', 'userId', {str_or_none}), - 'timestamp': ('updateTime', {self._kilo_or_none}), + 'timestamp': ('updateTime', {int_or_none(scale=1000)}), })) if traverse_obj(info, ('playlist', 'specialType')) == 10: metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' @@ -517,10 +513,10 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): 'creators': traverse_obj(info, ('artists', ..., 'name')) or [info.get('artistName')], **traverse_obj(info, { 'title': ('name', {str}), - 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), + 'description': (('desc', 'briefDesc'), {str}, filter), 'upload_date': ('publishTime', {unified_strdate}), 'thumbnail': ('cover', {url_or_none}), - 'duration': ('duration', {self._kilo_or_none}), + 'duration': ('duration', {int_or_none(scale=1000)}), 'view_count': ('playCount', {int_or_none}), 'like_count': ('likeCount', {int_or_none}), 'comment_count': ('commentCount', {int_or_none}), @@ -588,7 +584,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'description': ('description', {str}), 'creator': ('dj', 'brand', {str}), 'thumbnail': ('coverUrl', {url_or_none}), - 'timestamp': ('createTime', {self._kilo_or_none}), + 'timestamp': ('createTime', {int_or_none(scale=1000)}), }) if not self._yes_playlist( @@ -598,7 +594,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): return { 'id': str(info['mainSong']['id']), 'formats': formats, - 'duration': traverse_obj(info, ('mainSong', 'duration', {self._kilo_or_none})), + 'duration': traverse_obj(info, ('mainSong', 'duration', {int_or_none(scale=1000)})), **metainfo, } diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index cd32892fa0..ee1bc281c6 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -371,7 +371,7 @@ class NexxIE(InfoExtractor): # not all videos work via arc, e.g. 
nexx:741:1269984 if not video: # Reverse engineered from JS code (see getDeviceID function) - device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(1e4, 99999)}{random.randint(1, 9)}' + device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(10000, 99999)}{random.randint(1, 9)}' result = self._call_api(domain_id, 'session/init', video_id, data={ 'nxp_devh': device_id, diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index c537c1c47c..59213a44be 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -11,9 +11,12 @@ from ..utils import ( clean_html, determine_ext, get_element_by_class, - traverse_obj, + int_or_none, + make_archive_id, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class NFLBaseIE(InfoExtractor): @@ -75,22 +78,15 @@ class NFLBaseIE(InfoExtractor): 'osVersion': '10.0', }, separators=(',', ':')).encode()).decode(), 'networkType': 'other', - 'nflClaimGroupsToAdd': [], - 'nflClaimGroupsToRemove': [], + 'peacockUUID': 'undefined', } _ACCOUNT_INFO = {} - _API_KEY = None + _API_KEY = '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f' _TOKEN = None _TOKEN_EXPIRY = 0 - def _get_account_info(self, url, slug): - if not self._API_KEY: - webpage = self._download_webpage(url, slug, fatal=False) or '' - self._API_KEY = self._search_regex( - r'window\.gigyaApiKey\s*=\s*["\'](\w+)["\'];', webpage, 'API key', - fatal=False) or '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f' - + def _get_account_info(self): cookies = self._get_cookies('https://auth-id.nfl.com/') login_token = traverse_obj(cookies, ( (f'glt_{self._API_KEY}', lambda k, _: k.startswith('glt_')), {lambda x: x.value}), get_all=False) @@ -103,7 +99,7 @@ class NFLBaseIE(InfoExtractor): 'or else try using --cookies-from-browser instead', expected=True) account = self._download_json( - 'https://auth-id.nfl.com/accounts.getAccountInfo', slug, + 'https://auth-id.nfl.com/accounts.getAccountInfo', None, note='Downloading account info', data=urlencode_postdata({ 'include': 'profile,data', 'lang': 'en', @@ -111,7 +107,7 @@ class NFLBaseIE(InfoExtractor): 'sdk': 'js_latest', 'login_token': login_token, 'authMode': 'cookie', - 'pageURL': url, + 'pageURL': 'https://www.nfl.com/', 'sdkBuild': traverse_obj(cookies, ( 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='15170'), 'format': 'json', @@ -126,55 +122,78 @@ class NFLBaseIE(InfoExtractor): if len(self._ACCOUNT_INFO) != 3: raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) - def _get_auth_token(self, url, slug): + def _get_auth_token(self): if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30): return - if not self._ACCOUNT_INFO: - self._get_account_info(url, slug) - token = self._download_json( 'https://api.nfl.com/identity/v3/token%s' % ( '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), - slug, headers={'Content-Type': 'application/json'}, note='Downloading access token', + None, headers={'Content-Type': 'application/json'}, note='Downloading access token', data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) self._TOKEN = token['accessToken'] self._TOKEN_EXPIRY = token['expiresIn'] self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + def _extract_video(self, mcp_id, is_live=False): + self._get_auth_token() + data = self._download_json( + f'https://api.nfl.com/play/v1/asset/{mcp_id}', mcp_id, headers={ + 'Authorization': f'Bearer 
{self._TOKEN}', + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }, data=json.dumps({'init': True, 'live': is_live}, separators=(',', ':')).encode()) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + data['accessUrl'], mcp_id, 'mp4', m3u8_id='hls') + + return { + 'id': mcp_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + '_old_archive_ids': [make_archive_id(AnvatoIE, mcp_id)], + **traverse_obj(data, ('metadata', { + 'title': ('event', ('def_title', 'friendlyName'), {str}, any), + 'description': ('event', 'def_description', {str}), + 'duration': ('event', 'duration', {int_or_none}), + 'thumbnails': ('thumbnails', ..., 'url', {'url': {url_or_none}}), + })), + } + def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) + is_live = traverse_obj(video_config, ('live', {bool})) or False item = video_config['playlist'][0] - mcp_id = item.get('mcpID') - if mcp_id: - info = self.url_result(f'{self._ANVATO_PREFIX}{mcp_id}', AnvatoIE, mcp_id) + if mcp_id := item.get('mcpID'): + return self._extract_video(mcp_id, is_live=is_live) + + info = {'id': item.get('id') or item['entityId']} + + item_url = item['url'] + ext = determine_ext(item_url) + if ext == 'm3u8': + info['formats'] = self._extract_m3u8_formats(item_url, info['id'], 'mp4') else: - media_id = item.get('id') or item['entityId'] - title = item.get('title') - item_url = item['url'] - info = {'id': media_id} - ext = determine_ext(item_url) - if ext == 'm3u8': - info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4') - else: - info['url'] = item_url - if item.get('audio') is True: - info['vcodec'] = 'none' - is_live = video_config.get('live') is True - thumbnails = None - image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage')) - if image_url: - thumbnails = [{ - 'url': image_url, - 'ext': determine_ext(image_url, 'jpg'), - }] - info.update({ - 'title': title, - 'is_live': is_live, - 'description': clean_html(item.get('description')), - 'thumbnails': thumbnails, - }) + info['url'] = item_url + if item.get('audio') is True: + info['vcodec'] = 'none' + + thumbnails = None + if image_url := traverse_obj(item, 'imageSrc', 'posterImage', expected_type=url_or_none): + thumbnails = [{ + 'url': image_url, + 'ext': determine_ext(image_url, 'jpg'), + }] + + info.update({ + **traverse_obj(item, { + 'title': ('title', {str}), + 'description': ('description', {clean_html}), + }), + 'is_live': is_live, + 'thumbnails': thumbnails, + }) return info @@ -188,24 +207,20 @@ class NFLIE(NFLBaseIE): 'ext': 'mp4', 'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14", 'description': 'md5:85e05a3cc163f8c344340f220521136d', - 'upload_date': '20201215', - 'timestamp': 1608009755, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'NFL', - 'tags': 'count:6', + 'thumbnail': r're:https?://.+\.jpg', 'duration': 157, - 'categories': 'count:3', + '_old_archive_ids': ['anvato 899441'], }, }, { 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', - 'md5': '6886b32c24b463038c760ceb55a34566', + 'md5': '92a517f05bd3eb50fe50244bc621aec8', 'info_dict': { - 'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99', + 'id': '8b7c3625-a461-4751-8db4-85f536f2bbd0', 'ext': 'mp3', 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', 'description': 'md5:12ada8ee70e6762658c30e223e095075', + 'thumbnail': 
'https://static.clubs.nfl.com/image/private/t_editorial_landscape_12_desktop/v1571153441/chiefs/rfljejccnyhhkpkfq855', }, - 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, @@ -236,13 +251,16 @@ class NFLArticleIE(NFLBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - entries = [] - for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage): - entries.append(self._parse_video_config(video_config, display_id)) + + def entries(): + for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage): + yield self._parse_video_config(video_config, display_id) + title = clean_html(get_element_by_class( 'nfl-c-article__title', webpage)) or self._html_search_meta( ['og:title', 'twitter:title'], webpage) - return self.playlist_result(entries, display_id, title) + + return self.playlist_result(entries(), display_id, title) class NFLPlusReplayIE(NFLBaseIE): @@ -307,6 +325,9 @@ class NFLPlusReplayIE(NFLBaseIE): 'all_22': 'All-22', } + def _real_initialize(self): + self._get_account_info() + def _real_extract(self, url): slug, video_id = self._match_valid_url(url).group('slug', 'id') requested_types = self._configuration_arg('type', ['all']) @@ -315,7 +336,7 @@ class NFLPlusReplayIE(NFLBaseIE): requested_types = traverse_obj(self._REPLAY_TYPES, (None, requested_types)) if not video_id: - self._get_auth_token(url, slug) + self._get_auth_token() headers = {'Authorization': f'Bearer {self._TOKEN}'} game_id = self._download_json( f'https://api.nfl.com/football/v2/games/externalId/slug/{slug}', slug, @@ -328,14 +349,13 @@ class NFLPlusReplayIE(NFLBaseIE): 'items', lambda _, v: v['subType'] == requested_types[0], 'mcpPlaybackId'), get_all=False) if video_id: - return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + return self._extract_video(video_id) def entries(): for replay in traverse_obj( replays, ('items', lambda _, v: v['mcpPlaybackId'] and v['subType'] in requested_types), ): - video_id = replay['mcpPlaybackId'] - yield self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + yield self._extract_video(replay['mcpPlaybackId']) return self.playlist_result(entries(), slug) @@ -362,12 +382,15 @@ class NFLPlusEpisodeIE(NFLBaseIE): 'params': {'skip_download': 'm3u8'}, }] + def _real_initialize(self): + self._get_account_info() + def _real_extract(self, url): slug = self._match_id(url) - self._get_auth_token(url, slug) + self._get_auth_token() video_id = self._download_json( f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ 'Authorization': f'Bearer {self._TOKEN}', })['mcpPlaybackId'] - return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + return self._extract_video(video_id) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index e06740d62e..29fc1da1e2 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -371,11 +371,11 @@ class NiconicoIE(InfoExtractor): 'acodec': 'aac', 'vcodec': 'h264', **traverse_obj(audio_quality, ('metadata', { - 'abr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'abr': ('bitrate', {float_or_none(scale=1000)}), 'asr': ('samplingRate', {int_or_none}), })), **traverse_obj(video_quality, ('metadata', { - 'vbr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'vbr': ('bitrate', {float_or_none(scale=1000)}), 'height': ('resolution', 
'height', {int_or_none}), 'width': ('resolution', 'width', {int_or_none}), })), @@ -428,7 +428,7 @@ class NiconicoIE(InfoExtractor): **audio_fmt, **traverse_obj(audios, (lambda _, v: audio_fmt['format_id'].startswith(v['id']), { 'format_id': ('id', {str}), - 'abr': ('bitRate', {functools.partial(float_or_none, scale=1000)}), + 'abr': ('bitRate', {float_or_none(scale=1000)}), 'asr': ('samplingRate', {int_or_none}), }), get_all=False), 'acodec': 'aac', @@ -869,7 +869,7 @@ class NicovideoTagURLIE(NicovideoSearchBaseIE): class NiconicoUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)(?:/video)?/?(?:$|[#?])' _TEST = { 'url': 'https://www.nicovideo.jp/user/419948', 'info_dict': { @@ -877,7 +877,7 @@ class NiconicoUserIE(InfoExtractor): }, 'playlist_mincount': 101, } - _API_URL = 'https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' + _API_URL = 'https://nvapi.nicovideo.jp/v2/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' _PAGE_SIZE = 100 _API_HEADERS = { @@ -897,12 +897,13 @@ class NiconicoUserIE(InfoExtractor): total_count = int_or_none(json_parsed['data'].get('totalCount')) for entry in json_parsed['data']['items']: count += 1 - yield self.url_result('https://www.nicovideo.jp/watch/{}'.format(entry['id'])) + yield self.url_result( + f'https://www.nicovideo.jp/watch/{entry["essential"]["id"]}', ie=NiconicoIE) page_num += 1 def _real_extract(self, url): list_id = self._match_id(url) - return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key()) + return self.playlist_result(self._entries(list_id), list_id) class NiconicoLiveIE(InfoExtractor): diff --git a/yt_dlp/extractor/nubilesporn.py b/yt_dlp/extractor/nubilesporn.py index c2079d8b07..47c7be61d5 100644 --- a/yt_dlp/extractor/nubilesporn.py +++ b/yt_dlp/extractor/nubilesporn.py @@ -10,10 +10,10 @@ from ..utils import ( get_element_html_by_class, get_elements_by_class, int_or_none, - try_call, unified_timestamp, urlencode_postdata, ) +from ..utils.traversal import find_element, find_elements, traverse_obj class NubilesPornIE(InfoExtractor): @@ -70,9 +70,8 @@ class NubilesPornIE(InfoExtractor): url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0] channel_id, channel_name = self._search_regex( - r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page), + r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page) or '', 'channel', fatal=False, group=('id', 'name')) or (None, None) - channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) return { 'id': video_id, @@ -82,14 +81,14 @@ class NubilesPornIE(InfoExtractor): 'thumbnail': media_entries.get('thumbnail'), 'description': clean_html(get_element_html_by_class('content-pane-description', page)), 'timestamp': unified_timestamp(get_element_by_class('date', page)), - 'channel': channel_name, + 'channel': re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) if channel_name else None, 'channel_id': channel_id, 'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'), 'like_count': int_or_none(get_element_by_id('likecount', page)), 'average_rating': float_or_none(get_element_by_class('score', page)), 'age_limit': 18, - 'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', 
page))))), - 'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))), + 'categories': traverse_obj(page, ({find_element(cls='categories')}, {find_elements(cls='btn')}, ..., {clean_html})), + 'tags': traverse_obj(page, ({find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html})), 'cast': get_elements_by_class('content-pane-performer', page), 'availability': 'needs_auth', 'series': channel_name, diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 5ec3cdd675..9ef57410ac 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -235,7 +235,7 @@ class NYTimesArticleIE(NYTimesBaseIE): details = traverse_obj(block, { 'id': ('sourceId', {str}), 'uploader': ('bylines', ..., 'renderedRepresentation', {str}), - 'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))), + 'duration': (None, (('duration', {float_or_none(scale=1000)}), ('length', {int_or_none}))), 'timestamp': ('firstPublished', {parse_iso8601}), 'series': ('podcastSeries', {str}), }, get_all=False) diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index 591b4147eb..1921f3fd8a 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -115,7 +115,7 @@ class OnDemandKoreaIE(InfoExtractor): **traverse_obj(data, { 'thumbnail': ('episode', 'images', 'thumbnail', {url_or_none}), 'release_date': ('episode', 'release_date', {lambda x: x.replace('-', '')}, {unified_strdate}), - 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('duration', {float_or_none(scale=1000)}), 'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}), 'series': ('episode', {if_series(key='program')}, 'title'), 'series_id': ('episode', {if_series(key='program')}, 'id', {str_or_none}), diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index 9c37a54d62..12c4a21041 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -1,5 +1,4 @@ import base64 -import functools import re from .common import InfoExtractor @@ -192,7 +191,7 @@ class ORFPodcastIE(InfoExtractor): 'ext': ('enclosures', 0, 'type', {mimetype2ext}), 'title': 'title', 'description': ('description', {clean_html}), - 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('duration', {float_or_none(scale=1000)}), 'series': ('podcast', 'title'), })), } @@ -494,7 +493,7 @@ class ORFONIE(InfoExtractor): return traverse_obj(api_json, { 'id': ('id', {int}, {str_or_none}), 'age_limit': ('age_classification', {parse_age_limit}), - 'duration': ('exact_duration', {functools.partial(float_or_none, scale=1000)}), + 'duration': ('exact_duration', {float_or_none(scale=1000)}), 'title': (('title', 'headline'), {str}), 'description': (('description', 'teaser_text'), {str}), 'media_type': ('video_type', {str}), diff --git a/yt_dlp/extractor/parler.py b/yt_dlp/extractor/parler.py index 9be288a7d0..e5bb3be4ee 100644 --- a/yt_dlp/extractor/parler.py +++ b/yt_dlp/extractor/parler.py @@ -1,5 +1,3 @@ -import functools - from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( @@ -83,7 +81,7 @@ class ParlerIE(InfoExtractor): 'timestamp': ('date_created', {unified_timestamp}), 'uploader': ('user', 'name', {strip_or_none}), 'uploader_id': ('user', 'username', {str}), - 'uploader_url': ('user', 'username', {functools.partial(urljoin, 
'https://parler.com/')}), + 'uploader_url': ('user', 'username', {urljoin('https://parler.com/')}), 'view_count': ('views', {int_or_none}), 'comment_count': ('total_comments', {int_or_none}), 'repost_count': ('echos', {int_or_none}), diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 4d668cd37d..6bdeaf1571 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -16,10 +16,10 @@ from ..utils import ( parse_iso8601, smuggle_url, str_or_none, - traverse_obj, url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj, value class PatreonBaseIE(InfoExtractor): @@ -252,6 +252,27 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'skip': 'Patron-only content', + }, { + # Contains a comment reply in the 'included' section + 'url': 'https://www.patreon.com/posts/114721679', + 'info_dict': { + 'id': '114721679', + 'ext': 'mp4', + 'upload_date': '20241025', + 'uploader': 'Japanalysis', + 'like_count': int, + 'thumbnail': r're:^https?://.+', + 'comment_count': int, + 'title': 'Karasawa Part 2', + 'description': 'Part 2 of this video https://www.youtube.com/watch?v=Azms2-VTASk', + 'uploader_url': 'https://www.patreon.com/japanalysis', + 'uploader_id': '80504268', + 'channel_url': 'https://www.patreon.com/japanalysis', + 'channel_follower_count': int, + 'timestamp': 1729897015, + 'channel_id': '9346307', + }, + 'params': {'getcomments': True}, }] _RETURN_TYPE = 'video' @@ -404,26 +425,24 @@ class PatreonIE(PatreonBaseIE): f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}') cursor = None - for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)): + for comment in traverse_obj(response, (('data', 'included'), lambda _, v: v['type'] == 'comment' and v['id'])): count += 1 - comment_id = comment.get('id') - attributes = comment.get('attributes') or {} - if comment_id is None: - continue author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id')) - author_info = traverse_obj( - response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'), - get_all=False, expected_type=dict, default={}) yield { - 'id': comment_id, - 'text': attributes.get('body'), - 'timestamp': parse_iso8601(attributes.get('created')), - 'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'), - 'author_is_uploader': attributes.get('is_by_creator'), + **traverse_obj(comment, { + 'id': ('id', {str_or_none}), + 'text': ('attributes', 'body', {str}), + 'timestamp': ('attributes', 'created', {parse_iso8601}), + 'parent': ('relationships', 'parent', 'data', ('id', {value('root')}), {str}, any), + 'author_is_uploader': ('attributes', 'is_by_creator', {bool}), + }), + **traverse_obj(response, ( + 'included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes', { + 'author': ('full_name', {str}), + 'author_thumbnail': ('image_url', {url_or_none}), + }), get_all=False), 'author_id': author_id, - 'author': author_info.get('full_name'), - 'author_thumbnail': author_info.get('image_url'), } if count < traverse_obj(response, ('meta', 'count')): diff --git a/yt_dlp/extractor/pialive.py b/yt_dlp/extractor/pialive.py new file mode 100644 index 0000000000..7469135c1b --- /dev/null +++ b/yt_dlp/extractor/pialive.py @@ -0,0 +1,122 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + 
get_element_html_by_class, + multipart_encode, + str_or_none, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class PiaLiveIE(InfoExtractor): + _VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P<id>[\w-]+)' + _PLAYER_ROOT_URL = 'https://player.pia-live.jp/' + _PIA_LIVE_API_URL = 'https://api.pia-live.jp' + _API_KEY = 'kfds)FKFps-dms9e' + _TESTS = [{ + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76', + 'info_dict': { + 'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84', + 'display_id': '2431867_001', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }, { + 'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ', + 'info_dict': { + 'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93', + 'display_id': '2431867_002', + 'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)', + 'live_status': 'was_live', + 'comment_count': int, + }, + 'params': { + 'getcomments': True, + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'skip': 'The video is no longer available', + }] + + def _extract_var(self, variable, html): + return self._search_regex( + rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + html, f'variable {variable}', group='value') + + def _real_extract(self, url): + video_key = self._match_id(url) + webpage = self._download_webpage(url, video_key) + + program_code = self._extract_var('programCode', webpage) + article_code = self._extract_var('articleCode', webpage) + title = self._html_extract_title(webpage) + + if get_element_html_by_class('play-end', webpage): + raise ExtractorError('The video is no longer available', expected=True, video_id=program_code) + + if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)): + date, time = self._search_regex( + r'(?P<date>\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P<time>\d{2}:\d{2})', + start_info, 'start_info', fatal=False, group=('date', 'time')) + if date and time: + release_timestamp_str = f'{date} {time} +09:00' + release_timestamp = unified_timestamp(release_timestamp_str) + self.raise_no_formats(f'The video will be available after {release_timestamp_str}', expected=True) + return { + 'id': program_code, + 'title': title, + 'live_status': 'is_upcoming', + 'release_timestamp': release_timestamp, + } + + payload, content_type = multipart_encode({ + 'play_url': video_key, + 'api_key': self._API_KEY, + }) + api_data_and_headers = { + 'data': payload, + 'headers': {'Content-Type': content_type, 'Referer': self._PLAYER_ROOT_URL}, + } + + player_tag_list = self._download_json( + f'{self._PIA_LIVE_API_URL}/perf/player-tag-list/{program_code}', program_code, + 'Fetching player tag list', 'Unable to fetch player tag list', **api_data_and_headers) + + return self.url_result( + extract_attributes(player_tag_list['data']['movie_one_tag'])['src'], + url_transparent=True, title=title, display_id=program_code, + __post_extractor=self.extract_comments(program_code, article_code, api_data_and_headers)) + + def _get_comments(self, program_code, article_code, api_data_and_headers): + chat_room_url = 
traverse_obj(self._download_json( + f'{self._PIA_LIVE_API_URL}/perf/chat-tag-list/{program_code}/{article_code}', program_code, + 'Fetching chat info', 'Unable to fetch chat info', fatal=False, **api_data_and_headers), + ('data', 'chat_one_tag', {extract_attributes}, 'src', {url_or_none})) + if not chat_room_url: + return + comment_page = self._download_webpage( + chat_room_url, program_code, 'Fetching comment page', 'Unable to fetch comment page', + fatal=False, headers={'Referer': self._PLAYER_ROOT_URL}) + if not comment_page: + return + yield from traverse_obj(self._search_json( + r'var\s+_history\s*=', comment_page, 'comment list', + program_code, contains_pattern=r'\[(?s:.+)\]', fatal=False), (..., { + 'timestamp': (0, {int}), + 'author_is_uploader': (1, {lambda x: x == 2}), + 'author': (2, {str}), + 'text': (3, {str}), + 'id': (4, {str_or_none}), + })) diff --git a/yt_dlp/extractor/piaulizaportal.py b/yt_dlp/extractor/piaulizaportal.py deleted file mode 100644 index 1eb6d92b72..0000000000 --- a/yt_dlp/extractor/piaulizaportal.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - parse_qs, - time_seconds, - traverse_obj, -) - - -class PIAULIZAPortalIE(InfoExtractor): - IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM' - _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' - _TESTS = [{ - 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', - 'info_dict': { - 'id': '005f18b7-e810-5618-cb82-0987c5755d44', - 'title': 'プレゼンテーションプレイヤーのサンプル', - 'live_status': 'not_live', - }, - 'params': { - 'skip_download': True, - 'ignore_no_formats_error': True, - }, - }, { - 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', - 'info_dict': { - 'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', - 'title': '【確認用】視聴サンプルページ(ULIZA)', - 'live_status': 'not_live', - }, - 'params': { - 'skip_download': True, - 'ignore_no_formats_error': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0))) - if expires and expires <= time_seconds(): - raise ExtractorError('The link is expired.', video_id=video_id, expected=True) - - webpage = self._download_webpage(url, video_id) - - player_data = self._download_webpage( - self._search_regex( - r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"', - webpage, 'player data url'), - video_id, headers={'Referer': 'https://ulizaportal.jp/'}, - note='Fetching player data', errnote='Unable to fetch player data') - - formats = self._extract_m3u8_formats( - self._search_regex( - r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, - 'm3u8 url', default=None), - video_id, fatal=False) - m3u8_type = self._search_regex( - r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None) - - return { - 'id': video_id, - 'title': self._html_extract_title(webpage), - 'formats': formats, - 'live_status': { - 'video': 'is_live', - 'dvr': 'was_live', # short-term archives - }.get(m3u8_type, 'not_live'), # VOD or long-term archives - } diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py deleted file mode 100644 index 1769684f72..0000000000 --- a/yt_dlp/extractor/pokemon.py +++ /dev/null @@ -1,136 +0,0 @@ -from .common import InfoExtractor -from 
..utils import ( - ExtractorError, - extract_attributes, - int_or_none, - js_to_json, - merge_dicts, -) - - -class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' - _TESTS = [{ - 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', - 'md5': '2fe8eaec69768b25ef898cda9c43062e', - 'info_dict': { - 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', - 'ext': 'mp4', - 'title': 'The Ol’ Raise and Switch!', - 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', - }, - 'add_ie': ['LimelightMedia'], - }, { - # no data-video-title - 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', - 'info_dict': { - 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', - 'ext': 'mp4', - 'title': "Pokémon : L'ascension de Darkrai", - 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', - }, - 'add_ie': ['LimelightMedia'], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', - 'only_matching': True, - }, { - 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, video_id or display_id) - video_data = extract_attributes(self._search_regex( - r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'), - webpage, 'video data element')) - video_id = video_data['data-video-id'] - title = video_data.get('data-video-title') or self._html_search_meta( - 'pkm-title', webpage, ' title', default=None) or self._search_regex( - r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 'title': title, - 'description': video_data.get('data-video-summary'), - 'thumbnail': video_data.get('data-video-poster'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('data-video-season')), - 'episode': title, - 'episode_number': int_or_none(video_data.get('data-video-episode')), - 'ie_key': 'LimelightMedia', - } - - -class PokemonWatchIE(InfoExtractor): - _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})' - _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' - _TESTS = [{ - 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', - 'md5': '62833938a31e61ab49ada92f524c42ff', - 'info_dict': { - 'id': '8309a40969894a8e8d5bc1311e9c5667', - 'ext': 'mp4', - 'title': 'Lillier and the Staff!', - 'description': 'md5:338841b8c21b283d24bdc9b568849f04', - }, - }, { - 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', - 'only_matching': True, - }, { - 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', - 'only_matching': True, - }] - - def _extract_media(self, channel_array, video_id): - for channel in channel_array: - for media in channel.get('media'): - if media.get('id') == video_id: - return media - return None - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = { - '_type': 'url', - 'id': video_id, - 'url': f'limelight:media:{video_id}', - 
'ie_key': 'LimelightMedia', - } - - # API call can be avoided entirely if we are listing formats - if self.get_param('listformats', False): - return info - - webpage = self._download_webpage(url, video_id) - build_vars = self._parse_json(self._search_regex( - r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), - video_id, transform_source=js_to_json) - region = build_vars.get('region') - channel_array = self._download_json(self._API_URL.format(region), video_id) - video_data = self._extract_media(channel_array, video_id) - - if video_data is None: - raise ExtractorError( - f'Video {video_id} does not exist', expected=True) - - info['_type'] = 'url_transparent' - images = video_data.get('images') - - return merge_dicts(info, { - 'title': video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': images.get('medium') or images.get('small'), - 'series': 'Pokémon', - 'season_number': int_or_none(video_data.get('season')), - 'episode': video_data.get('title'), - 'episode_number': int_or_none(video_data.get('episode')), - }) diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py index e15244dac0..0996e4d979 100644 --- a/yt_dlp/extractor/pornbox.py +++ b/yt_dlp/extractor/pornbox.py @@ -1,5 +1,5 @@ + from .common import InfoExtractor -from ..compat import functools from ..utils import ( int_or_none, parse_duration, @@ -104,7 +104,7 @@ class PornboxIE(InfoExtractor): get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k']) metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], { 'url': 'src', - 'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'vbr': ('bitrate', {int_or_none(scale=1000)}), 'format_id': ('quality', {str_or_none}), 'quality': ('quality', {get_quality}), 'width': ('size', {lambda x: int(x[:-1])}), diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index b0d6475fe4..d5d6ecdfd8 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -198,6 +198,6 @@ class Pr0grammIE(InfoExtractor): 'dislike_count': ('down', {int}), 'timestamp': ('created', {int}), 'upload_date': ('created', {int}, {dt.date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}), - 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}), + 'thumbnail': ('thumb', {urljoin('https://thumb.pr0gramm.com')}), }), } diff --git a/yt_dlp/extractor/qdance.py b/yt_dlp/extractor/qdance.py index 934ebbfd70..4f71657c3f 100644 --- a/yt_dlp/extractor/qdance.py +++ b/yt_dlp/extractor/qdance.py @@ -140,7 +140,7 @@ class QDanceIE(InfoExtractor): 'description': ('description', {str.strip}), 'display_id': ('slug', {str}), 'thumbnail': ('thumbnail', {url_or_none}), - 'duration': ('durationInSeconds', {int_or_none}, {lambda x: x or None}), + 'duration': ('durationInSeconds', {int_or_none}, filter), 'availability': ('subscription', 'level', {extract_availability}), 'is_live': ('type', {lambda x: x.lower() == 'live'}), 'artist': ('acts', ..., {str}), diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py index d0238692f6..fb46e0d124 100644 --- a/yt_dlp/extractor/qqmusic.py +++ b/yt_dlp/extractor/qqmusic.py @@ -211,10 +211,10 @@ class QQMusicIE(QQMusicBaseIE): 'formats': formats, **traverse_obj(info_data, { 'title': ('title', {str}), - 'album': ('album', 'title', {str}, {lambda x: x or None}), + 'album': ('album', 'title', {str}, filter), 'release_date': ('time_public', {lambda x: x.replace('-', '') or None}), 'creators': ('singer', ..., 'name', {str}), - 
'alt_title': ('subtitle', {str}, {lambda x: x or None}), + 'alt_title': ('subtitle', {str}, filter), 'duration': ('interval', {int_or_none}), }), **traverse_obj(init_data, ('detail', { diff --git a/yt_dlp/extractor/radioradicale.py b/yt_dlp/extractor/radioradicale.py new file mode 100644 index 0000000000..472e25c45f --- /dev/null +++ b/yt_dlp/extractor/radioradicale.py @@ -0,0 +1,105 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class RadioRadicaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radioradicale\.it/scheda/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.radioradicale.it/scheda/471591', + 'md5': 'eb0fbe43a601f1a361cbd00f3c45af4a', + 'info_dict': { + 'id': '471591', + 'ext': 'mp4', + 'title': 'md5:e8fbb8de57011a3255db0beca69af73d', + 'description': 'md5:5e15a789a2fe4d67da8d1366996e89ef', + 'location': 'Napoli', + 'duration': 2852.0, + 'timestamp': 1459987200, + 'upload_date': '20160407', + 'thumbnail': 'https://www.radioradicale.it/photo400/0/0/9/0/1/00901768.jpg', + }, + }, { + 'url': 'https://www.radioradicale.it/scheda/742783/parlamento-riunito-in-seduta-comune-11a-della-xix-legislatura', + 'info_dict': { + 'id': '742783', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + 'description': '-) Votazione per l\'elezione di un giudice della Corte Costituzionale (nono scrutinio)', + 'location': 'CAMERA', + 'duration': 5868.0, + 'timestamp': 1730246400, + 'upload_date': '20241030', + }, + 'playlist': [{ + 'md5': 'aa48de55dcc45478e4cd200f299aab7d', + 'info_dict': { + 'id': '742783-0', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }, { + 'md5': 'be915c189c70ad2920e5810f32260ff5', + 'info_dict': { + 'id': '742783-1', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }, { + 'md5': 'f0ee4047342baf8ed3128a8417ac5e0a', + 'info_dict': { + 'id': '742783-2', + 'ext': 'mp4', + 'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)', + }, + }], + }] + + def _entries(self, videos_info, page_id): + for idx, video in enumerate(traverse_obj( + videos_info, ('playlist', lambda _, v: v['sources']))): + video_id = f'{page_id}-{idx}' + formats = [] + subtitles = {} + + for m3u8_url in traverse_obj(video, ('sources', ..., 'src', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + for sub in traverse_obj(video, ('subtitles', ..., lambda _, v: url_or_none(v['src']))): + self._merge_subtitles({sub.get('srclang') or 'und': [{ + 'url': sub['src'], + 'name': sub.get('label'), + }]}, target=subtitles) + + yield { + 'id': video_id, + 'title': video.get('title'), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + videos_info = self._search_json( + r'jQuery\.extend\(Drupal\.settings\s*,', + webpage, 'videos_info', page_id)['RRscheda'] + + entries = list(self._entries(videos_info, page_id)) + + common_info = { + 'id': page_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'location': videos_info.get('luogo'), + **self._search_json_ld(webpage, page_id), + } + + if len(entries) == 1: + return { + **entries[0], + **common_info, + } + + return self.playlist_result(entries, multi_video=True, 
**common_info) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index b633dc48af..7325e547b3 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -259,6 +259,8 @@ class RedditIE(InfoExtractor): f'https://www.reddit.com/{slug}/.json', video_id, expected_status=403) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError): + if self._get_cookies('https://www.reddit.com/').get('reddit_session'): + raise ExtractorError('Your IP address is unable to access the Reddit API', expected=True) self.raise_login_required('Account authentication is required') raise diff --git a/yt_dlp/extractor/redge.py b/yt_dlp/extractor/redge.py index 7cb91eea48..5ae09a096b 100644 --- a/yt_dlp/extractor/redge.py +++ b/yt_dlp/extractor/redge.py @@ -1,4 +1,3 @@ -import functools from .common import InfoExtractor from ..networking import HEADRequest @@ -118,7 +117,7 @@ class RedCDNLivxIE(InfoExtractor): time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000 duration = traverse_obj( - ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None + ism_doc, ('@Duration', {float_or_none(scale=time_scale)})) or None live_status = None if traverse_obj(ism_doc, '@IsLive') == 'TRUE': diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index 50138ab12c..b11ea273de 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -213,7 +213,7 @@ class RedGifsSearchIE(RedGifsBaseInfoExtractor): class RedGifsUserIE(RedGifsBaseInfoExtractor): IE_DESC = 'Redgifs user' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?' - _PAGE_SIZE = 30 + _PAGE_SIZE = 80 _TESTS = [ { 'url': 'https://www.redgifs.com/users/lamsinka89', @@ -222,7 +222,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by recent', }, - 'playlist_mincount': 100, + 'playlist_mincount': 391, }, { 'url': 'https://www.redgifs.com/users/lamsinka89?page=3', @@ -231,7 +231,7 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by recent', }, - 'playlist_count': 30, + 'playlist_count': 80, }, { 'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g', @@ -240,7 +240,17 @@ class RedGifsUserIE(RedGifsBaseInfoExtractor): 'title': 'lamsinka89', 'description': 'RedGifs user lamsinka89, ordered by best', }, - 'playlist_mincount': 100, + 'playlist_mincount': 391, + }, + { + 'url': 'https://www.redgifs.com/users/ignored52', + 'note': 'https://github.com/yt-dlp/yt-dlp/issues/7382', + 'info_dict': { + 'id': 'ignored52', + 'title': 'ignored52', + 'description': 'RedGifs user ignored52, ordered by recent', + }, + 'playlist_mincount': 121, }, ] diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py index 9c2e6fb6b5..49bebb178a 100644 --- a/yt_dlp/extractor/rtvslo.py +++ b/yt_dlp/extractor/rtvslo.py @@ -187,4 +187,4 @@ class RTVSLOShowIE(InfoExtractor): return self.playlist_from_matches( re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage), playlist_id, self._html_extract_title(webpage), - getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE) + getter=urljoin('https://365.rtvslo.si'), ie=RTVSLOIE) diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index 2c416811af..20365ac5d1 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -2,15 +2,21 @@ import itertools from .common import InfoExtractor 
from ..utils import ( + UnsupportedError, bool_or_none, determine_ext, int_or_none, + js_to_json, parse_qs, - traverse_obj, + str_or_none, try_get, unified_timestamp, url_or_none, ) +from ..utils.traversal import ( + subs_list_to_dict, + traverse_obj, +) class RutubeBaseIE(InfoExtractor): @@ -19,7 +25,7 @@ class RutubeBaseIE(InfoExtractor): query = {} query['format'] = 'json' return self._download_json( - f'http://rutube.ru/api/video/{video_id}/', + f'https://rutube.ru/api/video/{video_id}/', video_id, 'Downloading video JSON', 'Unable to download video JSON', query=query) @@ -61,18 +67,21 @@ class RutubeBaseIE(InfoExtractor): query = {} query['format'] = 'json' return self._download_json( - f'http://rutube.ru/api/play/options/{video_id}/', + f'https://rutube.ru/api/play/options/{video_id}/', video_id, 'Downloading options JSON', 'Unable to download options JSON', headers=self.geo_verification_headers(), query=query) - def _extract_formats(self, options, video_id): + def _extract_formats_and_subtitles(self, options, video_id): formats = [] + subtitles = {} for format_id, format_url in options['video_balancer'].items(): ext = determine_ext(format_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( format_url, video_id, f4m_id=format_id, fatal=False)) @@ -82,11 +91,19 @@ class RutubeBaseIE(InfoExtractor): 'format_id': format_id, }) for hls_url in traverse_obj(options, ('live_streams', 'hls', ..., 'url', {url_or_none})): - formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4', fatal=False)) - return formats + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + self._merge_subtitles(traverse_obj(options, ('captions', ..., { + 'id': 'code', + 'url': 'file', + 'name': ('langTitle', {str}), + }, all, {subs_list_to_dict(lang='ru')})), target=subtitles) + return formats, subtitles - def _download_and_extract_formats(self, video_id, query=None): - return self._extract_formats( + def _download_and_extract_formats_and_subtitles(self, video_id, query=None): + return self._extract_formats_and_subtitles( self._download_api_options(video_id, query=query), video_id) @@ -97,8 +114,8 @@ class RutubeIE(RutubeBaseIE): _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ - 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': 'e33ac625efca66aba86cbec9851f2692', + 'url': 'https://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '3d73fdfe5bb81b9aef139e22ef3de26a', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', 'ext': 'mp4', @@ -111,26 +128,25 @@ class RutubeIE(RutubeBaseIE): 'upload_date': '20131016', 'age_limit': 0, 'view_count': int, - 'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', + 'thumbnail': 'https://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', 'categories': ['Новости и СМИ'], 'chapters': [], }, - 'expected_warnings': ['Unable to download f4m'], }, { - 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'url': 
'https://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, }, { - 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'url': 'https://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, }, { - 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'url': 'https://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', 'only_matching': True, }, { 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', 'only_matching': True, }, { 'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg', - 'md5': 'd106225f15d625538fe22971158e896f', + 'md5': '4fce7b4fcc7b1bcaa3f45eb1e1ad0dd7', 'info_dict': { 'id': '884fb55f07a97ab673c7d654553e0f48', 'ext': 'mp4', @@ -143,11 +159,10 @@ class RutubeIE(RutubeBaseIE): 'upload_date': '20221210', 'age_limit': 0, 'view_count': int, - 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', + 'thumbnail': 'https://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', 'categories': ['Видеоигры'], 'chapters': [], }, - 'expected_warnings': ['Unable to download f4m'], }, { 'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/', 'info_dict': { @@ -156,17 +171,16 @@ class RutubeIE(RutubeBaseIE): 'chapters': 'count:4', 'categories': ['Бизнес и предпринимательство'], 'description': 'md5:252feac1305257d8c1bab215cedde75d', - 'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', + 'thumbnail': 'https://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', 'duration': 782, 'age_limit': 0, 'uploader_id': '23491359', 'timestamp': 1677153329, 'view_count': int, 'upload_date': '20230223', - 'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании', + 'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании #1', 'uploader': 'Стас Быков', }, - 'expected_warnings': ['Unable to download f4m'], }, { 'url': 'https://rutube.ru/live/video/c58f502c7bb34a8fcdd976b221fca292/', 'info_dict': { @@ -174,7 +188,7 @@ class RutubeIE(RutubeBaseIE): 'ext': 'mp4', 'categories': ['Телепередачи'], 'description': '', - 'thumbnail': 'http://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg', + 'thumbnail': 'https://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg', 'live_status': 'is_live', 'age_limit': 0, 'uploader_id': '23460655', @@ -184,6 +198,24 @@ class RutubeIE(RutubeBaseIE): 'title': r're:Первый канал. 
Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'uploader': 'Первый канал', }, + }, { + 'url': 'https://rutube.ru/play/embed/03a9cb54bac3376af4c5cb0f18444e01/', + 'info_dict': { + 'id': '03a9cb54bac3376af4c5cb0f18444e01', + 'ext': 'mp4', + 'age_limit': 0, + 'description': '', + 'title': 'Церемония начала торгов акциями ПАО «ЕвроТранс»', + 'chapters': [], + 'upload_date': '20240829', + 'duration': 293, + 'uploader': 'MOEX - Московская биржа', + 'timestamp': 1724946628, + 'thumbnail': 'https://pic.rutubelist.ru/video/2e/24/2e241fddb459baf0fa54acfca44874f4.jpg', + 'view_count': int, + 'uploader_id': '38420507', + 'categories': ['Интервью'], + }, }, { 'url': 'https://rutube.ru/video/5ab908fccfac5bb43ef2b1e4182256b0/', 'only_matching': True, @@ -192,40 +224,46 @@ class RutubeIE(RutubeBaseIE): 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if RutubePlaylistIE.suitable(url) else super().suitable(url) - def _real_extract(self, url): video_id = self._match_id(url) query = parse_qs(url) info = self._download_and_extract_info(video_id, query) - info['formats'] = self._download_and_extract_formats(video_id, query) - return info + formats, subtitles = self._download_and_extract_formats_and_subtitles(video_id, query) + return { + **info, + 'formats': formats, + 'subtitles': subtitles, + } class RutubeEmbedIE(RutubeBaseIE): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)(?:[?#/]|$)' _TESTS = [{ - 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'url': 'https://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', 'ext': 'mp4', 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', - 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', 'uploader': 'subziro89 ILya', 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + 'age_limit': 0, + 'duration': 1395, + 'chapters': [], + 'description': 'md5:a5acea57bbc3ccdc3cacd1f11a014b5b', + 'view_count': int, + 'thumbnail': 'https://pic.rutubelist.ru/video/d3/03/d3031f4670a6e6170d88fb3607948418.jpg', + 'categories': ['Сериалы'], }, 'params': { 'skip_download': True, }, }, { - 'url': 'http://rutube.ru/play/embed/8083783', + 'url': 'https://rutube.ru/play/embed/8083783', 'only_matching': True, }, { # private video @@ -240,11 +278,12 @@ class RutubeEmbedIE(RutubeBaseIE): query = parse_qs(url) options = self._download_api_options(embed_id, query) video_id = options['effective_video'] - formats = self._extract_formats(options, video_id) + formats, subtitles = self._extract_formats_and_subtitles(options, video_id) info = self._download_and_extract_info(video_id, query) info.update({ 'extractor_key': 'Rutube', 'formats': formats, + 'subtitles': subtitles, }) return info @@ -295,14 +334,14 @@ class RutubeTagsIE(RutubePlaylistBaseIE): IE_DESC = 'Rutube tags' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://rutube.ru/tags/video/1800/', + 'url': 'https://rutube.ru/tags/video/1800/', 'info_dict': { 'id': '1800', }, 'playlist_mincount': 68, }] - _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + _PAGE_TEMPLATE = 'https://rutube.ru/api/tags/video/%s/?page=%s&format=json' class 
RutubeMovieIE(RutubePlaylistBaseIE): @@ -310,8 +349,8 @@ class RutubeMovieIE(RutubePlaylistBaseIE): IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' - _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' - _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + _MOVIE_TEMPLATE = 'https://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'https://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' def _real_extract(self, url): movie_id = self._match_id(url) @@ -327,62 +366,82 @@ class RutubePersonIE(RutubePlaylistBaseIE): IE_DESC = 'Rutube person videos' _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://rutube.ru/video/person/313878/', + 'url': 'https://rutube.ru/video/person/313878/', 'info_dict': { 'id': '313878', }, - 'playlist_mincount': 37, + 'playlist_mincount': 36, }] - _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + _PAGE_TEMPLATE = 'https://rutube.ru/api/video/person/%s/?page=%s&format=json' class RutubePlaylistIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:playlist' IE_DESC = 'Rutube playlists' - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/plst/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'url': 'https://rutube.ru/plst/308547/', 'info_dict': { - 'id': '3097', + 'id': '308547', }, - 'playlist_count': 27, - }, { - 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', - 'only_matching': True, + 'playlist_mincount': 22, }] - - _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' - - @classmethod - def suitable(cls, url): - from ..utils import int_or_none, parse_qs - - if not super().suitable(url): - return False - params = parse_qs(url) - return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) - - def _next_page_url(self, page_num, playlist_id, item_kind): - return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) - - def _real_extract(self, url): - qs = parse_qs(url) - playlist_kind = qs['pl_type'][0] - playlist_id = qs['pl_id'][0] - return self._extract_playlist(playlist_id, item_kind=playlist_kind) + _PAGE_TEMPLATE = 'https://rutube.ru/api/playlist/custom/%s/videos?page=%s&format=json' class RutubeChannelIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channel' - _VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos' + _VALID_URL = r'https?://rutube\.ru/(?:channel/(?P<id>\d+)|u/(?P<slug>\w+))(?:/(?P<section>videos|shorts|playlists))?' 
     _TESTS = [{
         'url': 'https://rutube.ru/channel/639184/videos/',
         'info_dict': {
-            'id': '639184',
+            'id': '639184_videos',
         },
-        'playlist_mincount': 133,
+        'playlist_mincount': 129,
+    }, {
+        'url': 'https://rutube.ru/channel/25902603/shorts/',
+        'info_dict': {
+            'id': '25902603_shorts',
+        },
+        'playlist_mincount': 277,
+    }, {
+        'url': 'https://rutube.ru/channel/25902603/',
+        'info_dict': {
+            'id': '25902603',
+        },
+        'playlist_mincount': 406,
+    }, {
+        'url': 'https://rutube.ru/u/rutube/videos/',
+        'info_dict': {
+            'id': '23704195_videos',
+        },
+        'playlist_mincount': 113,
     }]
 
-    _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
+    _PAGE_TEMPLATE = 'https://rutube.ru/api/video/person/%s/?page=%s&format=json&origin__type=%s'
+
+    def _next_page_url(self, page_num, playlist_id, section):
+        origin_type = {
+            'videos': 'rtb,rst,ifrm,rspa',
+            'shorts': 'rshorts',
+            None: '',
+        }.get(section)
+        return self._PAGE_TEMPLATE % (playlist_id, page_num, origin_type)
+
+    def _real_extract(self, url):
+        playlist_id, slug, section = self._match_valid_url(url).group('id', 'slug', 'section')
+        if section == 'playlists':
+            raise UnsupportedError(url)
+        if slug:
+            webpage = self._download_webpage(url, slug)
+            redux_state = self._search_json(
+                r'window\.reduxState\s*=', webpage, 'redux state', slug, transform_source=js_to_json)
+            playlist_id = traverse_obj(redux_state, (
+                'api', 'queries', lambda k, _: k.startswith('channelIdBySlug'),
+                'data', 'channel_id', {int}, {str_or_none}, any))
+        playlist = self._extract_playlist(playlist_id, section=section)
+        if section:
+            playlist['id'] = f'{playlist_id}_{section}'
+        return playlist
diff --git a/yt_dlp/extractor/shemaroome.py b/yt_dlp/extractor/shemaroome.py
index 284b2f89c1..3ab322f67d 100644
--- a/yt_dlp/extractor/shemaroome.py
+++ b/yt_dlp/extractor/shemaroome.py
@@ -1,11 +1,9 @@
 import base64
 
 from .common import InfoExtractor
-from ..aes import aes_cbc_decrypt, unpad_pkcs7
+from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
 from ..utils import (
     ExtractorError,
-    bytes_to_intlist,
-    intlist_to_bytes,
     unified_strdate,
 )
 
@@ -68,10 +66,10 @@ class ShemarooMeIE(InfoExtractor):
         data_json = self._download_json('https://www.shemaroome.com/users/user_all_lists', video_id, data=data.encode())
         if not data_json.get('status'):
             raise ExtractorError('Premium videos cannot be downloaded yet.', expected=True)
-        url_data = bytes_to_intlist(base64.b64decode(data_json['new_play_url']))
-        key = bytes_to_intlist(base64.b64decode(data_json['key']))
-        iv = [0] * 16
-        m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii')
+        url_data = base64.b64decode(data_json['new_play_url'])
+        key = base64.b64decode(data_json['key'])
+        iv = bytes(16)
+        m3u8_url = unpad_pkcs7(aes_cbc_decrypt_bytes(url_data, key, iv)).decode('ascii')
         headers = {'stream_key': data_json['stream_key']}
         formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers)
         for fmt in formats:
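The shemaroome change above swaps the int-list AES helpers for the byte-oriented ones; `bytes(16)` is the same all-zero IV as the old `[0] * 16`. A minimal round-trip sketch of the new API, assuming a `yt_dlp` checkout is importable (key and plaintext are made up):

```python
from yt_dlp.aes import aes_cbc_decrypt_bytes, aes_cbc_encrypt_bytes, unpad_pkcs7

key = bytes(range(16))  # dummy 128-bit key
iv = bytes(16)          # 16 zero bytes, equivalent to the old [0] * 16
secret = b'https://example.invalid/master.m3u8'

ciphertext = aes_cbc_encrypt_bytes(secret, key, iv)  # pads with PKCS#7 by default
assert unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, iv)) == secret
```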
diff --git a/yt_dlp/extractor/snapchat.py b/yt_dlp/extractor/snapchat.py
index 732677c190..09e5766d4f 100644
--- a/yt_dlp/extractor/snapchat.py
+++ b/yt_dlp/extractor/snapchat.py
@@ -56,13 +56,13 @@ class SnapchatSpotlightIE(InfoExtractor):
             **traverse_obj(video_data, ('videoMetadata', {
                 'title': ('name', {str}),
                 'description': ('description', {str}),
-                'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}),
+                'timestamp': ('uploadDateMs', {float_or_none(scale=1000)}),
                 'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}),
                 'repost_count': ('shareCount', {int_or_none}),
                 'url': ('contentUrl', {url_or_none}),
                 'width': ('width', {int_or_none}),
                 'height': ('height', {int_or_none}),
-                'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}),
+                'duration': ('durationMs', {float_or_none(scale=1000)}),
                 'thumbnail': ('thumbnailUrl', {url_or_none}),
                 'uploader': ('creator', 'personCreator', 'username', {str}),
                 'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}),
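`{float_or_none(scale=1000)}` relies on yt-dlp's partially applicable utilities: calling `float_or_none` (likewise `int_or_none`, `urljoin` and `format_field` elsewhere in this diff) with only keyword arguments returns a reusable callable rather than a value, which reads better inside `traverse_obj` paths than the old wrapper lambdas. A quick check, assuming `yt_dlp` is importable (sample data made up):

```python
from yt_dlp.utils import float_or_none
from yt_dlp.utils.traversal import traverse_obj

video_data = {'videoMetadata': {'durationMs': '5400'}}  # hypothetical API response

# float_or_none(scale=1000) is a partial: it still expects the value to convert
assert float_or_none(scale=1000)(5400) == 5.4
assert traverse_obj(video_data, ('videoMetadata', 'durationMs', {float_or_none(scale=1000)})) == 5.4
```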
diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py
index a0a051e972..0cd914cbba 100644
--- a/yt_dlp/extractor/sonyliv.py
+++ b/yt_dlp/extractor/sonyliv.py
@@ -199,8 +199,9 @@ class SonyLIVSeriesIE(InfoExtractor):
         },
     }]
     _API_BASE = 'https://apiv2.sonyliv.com/AGL'
+    _SORT_ORDERS = ('asc', 'desc')
 
-    def _entries(self, show_id):
+    def _entries(self, show_id, sort_order):
         headers = {
             'Accept': 'application/json, text/plain, */*',
             'Referer': 'https://www.sonyliv.com',
@@ -215,6 +216,9 @@ class SonyLIVSeriesIE(InfoExtractor):
             'from': '0',
             'to': '49',
         }), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
+
+        if sort_order == 'desc':
+            seasons = reversed(seasons)
         for season in seasons:
             season_id = str(season['id'])
             note = traverse_obj(season, ('metadata', 'title', {str})) or 'season'
@@ -226,7 +230,7 @@ class SonyLIVSeriesIE(InfoExtractor):
                     'from': str(cursor),
                     'to': str(cursor + 99),
                     'orderBy': 'episodeNumber',
-                    'sortOrder': 'asc',
+                    'sortOrder': sort_order,
                 }), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
                 if not episodes:
                     break
@@ -237,4 +241,10 @@ class SonyLIVSeriesIE(InfoExtractor):
     def _real_extract(self, url):
         show_id = self._match_id(url)
-        return self.playlist_result(self._entries(show_id), playlist_id=show_id)
+
+        sort_order = self._configuration_arg('sort_order', [self._SORT_ORDERS[0]])[0]
+        if sort_order not in self._SORT_ORDERS:
+            raise ValueError(
+                f'Invalid sort order "{sort_order}". Allowed values are: {", ".join(self._SORT_ORDERS)}')
+
+        return self.playlist_result(self._entries(show_id, sort_order), playlist_id=show_id)
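The new `sort_order` handling above is driven by a `sonyliv` extractor argument, i.e. `--extractor-args "sonyliv:sort_order=desc"` on the CLI. The same thing through the embedding API, as a hedged sketch:

```python
from yt_dlp import YoutubeDL

# extractor_args maps extractor key -> argument name -> list of values;
# anything outside ('asc', 'desc') trips the ValueError added above
ydl = YoutubeDL({'extractor_args': {'sonyliv': {'sort_order': ['desc']}}})
```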
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 4f8d96407d..03089e98ea 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -208,7 +208,6 @@ class SoundcloudBaseIE(InfoExtractor):
 
     def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False):
         track_id = str(info['id'])
-        title = info['title']
 
         format_urls = set()
         formats = []
@@ -242,7 +241,7 @@ class SoundcloudBaseIE(InfoExtractor):
                     format_urls.add(format_url)
                     formats.append({
                         'format_id': 'download',
-                        'ext': urlhandle_detect_ext(urlh) or 'mp3',
+                        'ext': urlhandle_detect_ext(urlh, default='mp3'),
                         'filesize': int_or_none(urlh.headers.get('Content-Length')),
                         'url': format_url,
                         'quality': 10,
@@ -367,7 +366,7 @@ class SoundcloudBaseIE(InfoExtractor):
             'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
             'uploader_url': user.get('permalink_url'),
             'timestamp': unified_timestamp(info.get('created_at')),
-            'title': title,
+            'title': info.get('title'),
             'description': info.get('description'),
             'thumbnails': thumbnails,
             'duration': float_or_none(info.get('duration'), 1000),
@@ -377,7 +376,8 @@ class SoundcloudBaseIE(InfoExtractor):
             'like_count': extract_count('favoritings') or extract_count('likes'),
             'comment_count': extract_count('comment'),
             'repost_count': extract_count('reposts'),
-            'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
+            'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)),
+            'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)),
             'formats': formats if not extract_flat else None,
         }
 
@@ -429,7 +429,6 @@ class SoundcloudIE(SoundcloudBaseIE):
             'repost_count': int,
             'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
             'uploader_url': 'https://soundcloud.com/ethmusic',
-            'genres': [],
         },
     },
     # geo-restricted
@@ -453,6 +452,7 @@ class SoundcloudIE(SoundcloudBaseIE):
             'uploader_url': 'https://soundcloud.com/the-concept-band',
             'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
             'genres': ['Alternative'],
+            'artists': ['The Royal Concept'],
         },
     },
     # private link
@@ -525,6 +525,7 @@ class SoundcloudIE(SoundcloudBaseIE):
             'repost_count': int,
             'view_count': int,
             'genres': ['Dance & EDM'],
+            'artists': ['80M'],
         },
     },
     # private link, downloadable format
@@ -549,6 +550,7 @@ class SoundcloudIE(SoundcloudBaseIE):
             'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
             'uploader_url': 'https://soundcloud.com/oriuplift',
             'genres': ['Trance'],
+            'artists': ['Ori Uplift'],
         },
     },
     # no album art, use avatar pic for thumbnail
@@ -572,7 +574,7 @@ class SoundcloudIE(SoundcloudBaseIE):
             'comment_count': int,
             'repost_count': int,
             'uploader_url': 'https://soundcloud.com/garyvee',
-            'genres': [],
+            'artists': ['MadReal'],
         },
         'params': {
             'skip_download': True,
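The `('genre', {str}, filter, all, filter)` path above uses the `filter` builtin as a `traverse_obj` path element: the first `filter` drops falsy values such as `''`, `all` collects the branch into a list, and the trailing `filter` collapses an empty list to `None` so the field is omitted rather than emitted as `[]` (hence the deleted `'genres': []` test expectations). A sketch of the assumed semantics:

```python
from yt_dlp.utils.traversal import traverse_obj

path = ('genre', {str}, filter, all, filter)
assert traverse_obj({'genre': 'Trance'}, path) == ['Trance']
assert traverse_obj({'genre': ''}, path) is None  # empty string filtered, empty list collapsed
```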
diff --git a/yt_dlp/extractor/spankbang.py b/yt_dlp/extractor/spankbang.py
index 6805a72deb..05f0bb1468 100644
--- a/yt_dlp/extractor/spankbang.py
+++ b/yt_dlp/extractor/spankbang.py
@@ -71,9 +71,11 @@ class SpankBangIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         video_id = mobj.group('id') or mobj.group('id_2')
+        country = self.get_param('geo_bypass_country') or 'US'
+        self._set_cookie('.spankbang.com', 'country', country.upper())
         webpage = self._download_webpage(
             url.replace(f'/{video_id}/embed', f'/{video_id}/video'),
-            video_id, headers={'Cookie': 'country=US'})
+            video_id, impersonate=True)
 
         if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage):
             raise ExtractorError(
diff --git a/yt_dlp/extractor/spreaker.py b/yt_dlp/extractor/spreaker.py
index d1df45969b..c64c2fcd2e 100644
--- a/yt_dlp/extractor/spreaker.py
+++ b/yt_dlp/extractor/spreaker.py
@@ -2,13 +2,16 @@ import itertools
 
 from .common import InfoExtractor
 from ..utils import (
+    filter_dict,
     float_or_none,
     int_or_none,
+    parse_qs,
     str_or_none,
     try_get,
     unified_timestamp,
     url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
 def _extract_episode(data, episode_id=None):
@@ -58,15 +61,10 @@ def _extract_episode(data, episode_id=None):
 
 
 class SpreakerIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-        https?://
-            api\.spreaker\.com/
-            (?:
-                (?:download/)?episode|
-                v2/episodes
-            )/
-            (?P<id>\d+)
-        '''
+    _VALID_URL = [
+        r'https?://api\.spreaker\.com/(?:(?:download/)?episode|v2/episodes)/(?P<id>\d+)',
+        r'https?://(?:www\.)?spreaker\.com/episode/[^#?/]*?(?P<id>\d+)/?(?:[?#]|$)',
+    ]
     _TESTS = [{
         'url': 'https://api.spreaker.com/episode/12534508',
         'info_dict': {
@@ -83,7 +81,9 @@ class SpreakerIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'comment_count': int,
-            'series': 'Success With Music (SWM)',
+            'series': 'Success With Music | SWM',
+            'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/t_square_limited_160/images.spreaker.com/original/777ce4f96b71b0e1b7c09a5e625210e3.jpg',
+            'creators': ['SWM'],
         },
     }, {
         'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
@@ -91,52 +91,75 @@ class SpreakerIE(InfoExtractor):
    }, {
         'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
         'only_matching': True,
+    }, {
+        'note': 'episode',
+        'url': 'https://www.spreaker.com/episode/grunge-music-origins-the-raw-sound-that-defined-a-generation--60269615',
+        'info_dict': {
+            'id': '60269615',
+            'display_id': 'grunge-music-origins-the-raw-sound-that-',
+            'ext': 'mp3',
+            'title': 'Grunge Music Origins - The Raw Sound that Defined a Generation',
+            'description': str,
+            'timestamp': 1717468905,
+            'upload_date': '20240604',
+            'uploader': 'Katie Brown 2',
+            'uploader_id': '17733249',
+            'duration': 818.83,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'series': '90s Grunge',
+            'thumbnail': 'https://d3wo5wojvuv7l.cloudfront.net/t_square_limited_160/images.spreaker.com/original/bb0d4178f7cf57cc8786dedbd9c5d969.jpg',
+            'creators': ['Katie Brown 2'],
+        },
+    }, {
+        'url': 'https://www.spreaker.com/episode/60269615',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         episode_id = self._match_id(url)
         data = self._download_json(
-            f'https://api.spreaker.com/v2/episodes/{episode_id}',
-            episode_id)['response']['episode']
+            f'https://api.spreaker.com/v2/episodes/{episode_id}', episode_id,
+            query=traverse_obj(parse_qs(url), {'key': ('key', 0)}))['response']['episode']
         return _extract_episode(data, episode_id)
 
 
-class SpreakerPageIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        episode_id = self._search_regex(
-            (r'data-episode_id=["\'](?P<id>\d+)',
-             r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id')
-        return self.url_result(
-            f'https://api.spreaker.com/episode/{episode_id}',
-            ie=SpreakerIE.ie_key(), video_id=episode_id)
-
-
 class SpreakerShowIE(InfoExtractor):
-    _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
+    _VALID_URL = [
+        r'https?://api\.spreaker\.com/show/(?P<id>\d+)',
+        r'https?://(?:www\.)?spreaker\.com/podcast/[\w-]+--(?P<id>[\d]+)',
+        r'https?://(?:www\.)?spreaker\.com/show/(?P<id>\d+)/episodes/feed',
+    ]
     _TESTS = [{
         'url': 'https://api.spreaker.com/show/4652058',
         'info_dict': {
             'id': '4652058',
         },
         'playlist_mincount': 118,
+    }, {
+        'url': 'https://www.spreaker.com/podcast/health-wealth--5918323',
+        'info_dict': {
+            'id': '5918323',
+        },
+        'playlist_mincount': 60,
+    }, {
+        'url': 'https://www.spreaker.com/show/5887186/episodes/feed',
+        'info_dict': {
+            'id': '5887186',
+        },
+        'playlist_mincount': 290,
     }]
 
-    def _entries(self, show_id):
+    def _entries(self, show_id, key=None):
         for page_num in itertools.count(1):
             episodes = self._download_json(
                 f'https://api.spreaker.com/show/{show_id}/episodes',
-                show_id, note=f'Downloading JSON page {page_num}', query={
+                show_id, note=f'Downloading JSON page {page_num}', query=filter_dict({
                     'page': page_num,
                     'max_per_page': 100,
-                })
+                    'key': key,
+                }))
             pager = try_get(episodes, lambda x: x['response']['pager'], dict)
             if not pager:
                 break
@@ -152,21 +175,5 @@ class SpreakerShowIE(InfoExtractor):
     def _real_extract(self, url):
         show_id = self._match_id(url)
-        return self.playlist_result(self._entries(show_id), playlist_id=show_id)
-
-
-class SpreakerShowPageIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://www.spreaker.com/show/success-with-music',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        show_id = self._search_regex(
-            r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id')
-        return self.url_result(
-            f'https://api.spreaker.com/show/{show_id}',
-            ie=SpreakerShowIE.ie_key(), video_id=show_id)
+        key = traverse_obj(parse_qs(url), ('key', 0))
+        return self.playlist_result(self._entries(show_id, key), playlist_id=show_id)
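`filter_dict` above keeps the episodes query clean: when the input URL carried no `key` parameter, `key` is `None` and is dropped from the request instead of being sent as a literal. For illustration (assuming `yt_dlp` is importable):

```python
from yt_dlp.utils import filter_dict

query = filter_dict({'page': 1, 'max_per_page': 100, 'key': None})
assert query == {'page': 1, 'max_per_page': 100}  # None-valued keys are removed
```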
diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py
index 30cb322dc2..b70d40f2ca 100644
--- a/yt_dlp/extractor/substack.py
+++ b/yt_dlp/extractor/substack.py
@@ -2,7 +2,13 @@ import re
 import urllib.parse
 
 from .common import InfoExtractor
-from ..utils import js_to_json, str_or_none, traverse_obj
+from ..networking import HEADRequest
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    str_or_none,
+)
+from ..utils.traversal import traverse_obj
 
 
 class SubstackIE(InfoExtractor):
@@ -43,6 +49,19 @@ class SubstackIE(InfoExtractor):
             'uploader': "Andrew Zimmern's Spilled Milk ",
             'uploader_id': '577659',
         },
+    }, {
+        # Podcast that needs its file extension resolved to mp3
+        'url': 'https://persuasion1.substack.com/p/summers',
+        'md5': '1456a755d46084744facdfac9edf900f',
+        'info_dict': {
+            'id': '141970405',
+            'ext': 'mp3',
+            'title': 'Larry Summers on What Went Wrong on Campus',
+            'description': 'Yascha Mounk and Larry Summers also discuss the promise and perils of artificial intelligence.',
+            'thumbnail': r're:https://substackcdn\.com/image/.+\.jpeg',
+            'uploader': 'Persuasion',
+            'uploader_id': '61579',
+        },
     }]
 
     @classmethod
@@ -89,7 +108,15 @@ class SubstackIE(InfoExtractor):
         post_type = webpage_info['post']['type']
         formats, subtitles = [], {}
         if post_type == 'podcast':
-            formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {}
+            fmt = {'url': webpage_info['post']['podcast_url']}
+            if not determine_ext(fmt['url'], default_ext=None):
+                # The redirected format URL expires but the original URL doesn't,
+                # so we only want to extract the extension from this request
+                fmt['ext'] = determine_ext(self._request_webpage(
+                    HEADRequest(fmt['url']), display_id,
+                    'Resolving podcast file extension',
+                    'Podcast URL is invalid').url)
+            formats.append(fmt)
         elif post_type == 'video':
             formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url)
         else:
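`determine_ext` only looks at the URL path, so an extension-less `podcast_url` yields nothing and the code above falls back to a `HEADRequest`, reading the extension off the final post-redirect response URL while keeping the non-expiring original as the download URL. The guard in isolation (URLs made up):

```python
from yt_dlp.utils import determine_ext

assert determine_ext('https://example.com/ep15.mp3') == 'mp3'
# no extension in the path -> None instead of the 'unknown_video' default
assert determine_ext('https://example.com/api/v1/audio/141970405', default_ext=None) is None
```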
diff --git a/yt_dlp/extractor/tbsjp.py b/yt_dlp/extractor/tbsjp.py
index 32f9cfbdec..0d521f1061 100644
--- a/yt_dlp/extractor/tbsjp.py
+++ b/yt_dlp/extractor/tbsjp.py
@@ -3,14 +3,12 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     clean_html,
-    get_element_text_and_html_by_tag,
     int_or_none,
     str_or_none,
-    traverse_obj,
-    try_call,
     unified_timestamp,
     urljoin,
 )
+from ..utils.traversal import find_element, traverse_obj
 
 
 class TBSJPEpisodeIE(InfoExtractor):
@@ -64,7 +62,7 @@ class TBSJPEpisodeIE(InfoExtractor):
             self._merge_subtitles(subs, target=subtitles)
 
         return {
-            'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
+            'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
             'id': video_id,
             **traverse_obj(episode, {
                 'categories': ('keywords', {list}),
diff --git a/yt_dlp/extractor/teamcoco.py b/yt_dlp/extractor/teamcoco.py
index 3fb899cac5..a94ff9b332 100644
--- a/yt_dlp/extractor/teamcoco.py
+++ b/yt_dlp/extractor/teamcoco.py
@@ -136,7 +136,7 @@ class TeamcocoIE(TeamcocoBaseIE):
             'blocks', lambda _, v: v['name'] in ('meta-tags', 'video-player', 'video-info'), 'props', {dict})))
 
         thumbnail = traverse_obj(
-            info, (('image', 'poster'), {lambda x: urljoin('https://teamcoco.com/', x)}), get_all=False)
+            info, (('image', 'poster'), {urljoin('https://teamcoco.com/')}), get_all=False)
         video_id = traverse_obj(parse_qs(thumbnail), ('id', 0)) or display_id
 
         formats, subtitles = self._get_formats_and_subtitles(info, video_id)
diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py
index 7a9dcd71c5..9ef621446d 100644
--- a/yt_dlp/extractor/telecinco.py
+++ b/yt_dlp/extractor/telecinco.py
@@ -2,15 +2,69 @@ import json
 import re
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
+    ExtractorError,
     clean_html,
     int_or_none,
+    join_nonempty,
     str_or_none,
-    try_get,
+    traverse_obj,
+    update_url,
+    url_or_none,
 )
 
 
-class TelecincoIE(InfoExtractor):
+class TelecincoBaseIE(InfoExtractor):
+    def _parse_content(self, content, url):
+        video_id = content['dataMediaId']
+        config = self._download_json(
+            content['dataConfig'], video_id, 'Downloading config JSON')
+        services = config['services']
+        caronte = self._download_json(services['caronte'], video_id)
+        if traverse_obj(caronte, ('dls', 0, 'drm', {bool})):
+            self.report_drm(video_id)
+
+        stream = caronte['dls'][0]['stream']
+        headers = {
+            'Referer': url,
+            'Origin': re.match(r'https?://[^/]+', url).group(0),
+        }
+        geo_headers = {**headers, **self.geo_verification_headers()}
+
+        try:
+            cdn = self._download_json(
+                caronte['cerbero'], video_id, data=json.dumps({
+                    'bbx': caronte['bbx'],
+                    'gbx': self._download_json(services['gbx'], video_id)['gbx'],
+                }).encode(), headers={
+                    'Content-Type': 'application/json',
+                    **geo_headers,
+                })['tokens']['1']['cdn']
+        except ExtractorError as error:
+            if isinstance(error.cause, HTTPError) and error.cause.status == 403:
+                error_code = traverse_obj(
+                    self._webpage_read_content(error.cause.response, caronte['cerbero'], video_id, fatal=False),
+                    ({json.loads}, 'code', {int}))
+                if error_code == 4038:
+                    self.raise_geo_restricted(countries=['ES'])
+            raise
+
+        formats = self._extract_m3u8_formats(
+            update_url(stream, query=cdn), video_id, 'mp4', m3u8_id='hls', headers=geo_headers)
+
+        return {
+            'id': video_id,
+            'title': traverse_obj(config, ('info', 'title', {str})),
+            'formats': formats,
+            'thumbnail': (traverse_obj(content, ('dataPoster', {url_or_none}))
+                          or traverse_obj(config, 'poster', 'imageUrl', expected_type=url_or_none)),
+            'duration': traverse_obj(content, ('dataDuration', {int_or_none})),
+            'http_headers': headers,
+        }
+
+
+class TelecincoIE(TelecincoBaseIE):
     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
     _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
 
@@ -30,6 +84,7 @@ class TelecincoIE(InfoExtractor):
                 'duration': 662,
             },
         }],
+        'skip': 'HTTP Error 410 Gone',
    }, {
         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
         'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a',
@@ -40,23 +95,24 @@ class TelecincoIE(InfoExtractor):
             'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
             'duration': 79,
         },
+        'skip': 'Redirects to main page',
    }, {
         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-        'md5': 'eddb50291df704ce23c74821b995bcac',
+        'md5': '5ce057f43f30b634fbaf0f18c71a140a',
         'info_dict': {
             'id': 'aywerkD2Sv1vGNqq9b85Q2',
             'ext': 'mp4',
             'title': '#DOYLACARA. Con la trata no hay trato',
-            'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
             'duration': 50,
+            'thumbnail': 'https://album.mediaset.es/eimg/2017/11/02/1tlQLO5Q3mtKT24f3EaC24.jpg',
         },
    }, {
         # video in opening's content
         'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html',
         'info_dict': {
-            'id': '2907195140',
+            'id': '1691427',
             'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
-            'description': 'md5:73f340a7320143d37ab895375b2bf13a',
+            'description': r're:Fiorella, la sobrina de Edmundo Arrocet, concedió .{727}',
         },
         'playlist': [{
             'md5': 'adb28c37238b675dad0f042292f209a7',
@@ -65,6 +121,7 @@ class TelecincoIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
                 'duration': 1015,
+                'thumbnail': 'https://album.mediaset.es/eimg/2020/02/29/5opaC37lUhKlZ7FoDhiVC.jpg',
             },
         }],
         'params': {
@@ -81,66 +138,29 @@ class TelecincoIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    def _parse_content(self, content, url):
-        video_id = content['dataMediaId']
-        config = self._download_json(
-            content['dataConfig'], video_id, 'Downloading config JSON')
-        title = config['info']['title']
-        services = config['services']
-        caronte = self._download_json(services['caronte'], video_id)
-        stream = caronte['dls'][0]['stream']
-        headers = self.geo_verification_headers()
-        headers.update({
-            'Content-Type': 'application/json;charset=UTF-8',
-            'Origin': re.match(r'https?://[^/]+', url).group(0),
-        })
-        cdn = self._download_json(
-            caronte['cerbero'], video_id, data=json.dumps({
-                'bbx': caronte['bbx'],
-                'gbx': self._download_json(services['gbx'], video_id)['gbx'],
-            }).encode(), headers=headers)['tokens']['1']['cdn']
-        formats = self._extract_m3u8_formats(
-            stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
-            'duration': int_or_none(content.get('dataDuration')),
-        }
-
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        article = self._parse_json(self._search_regex(
-            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})',
-            webpage, 'article'), display_id)['article']
-        title = article.get('title')
-        description = clean_html(article.get('leadParagraph')) or ''
+        article = self._search_json(
+            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=',
+            webpage, 'article', display_id)['article']
+        description = traverse_obj(article, ('leadParagraph', {clean_html}, filter))
+
         if article.get('editorialType') != 'VID':
             entries = []
-            body = [article.get('opening')]
-            body.extend(try_get(article, lambda x: x['body'], list) or [])
-            for p in body:
-                if not isinstance(p, dict):
-                    continue
-                content = p.get('content')
-                if not content:
-                    continue
+
+            for p in traverse_obj(article, ((('opening', all), 'body'), lambda _, v: v['content'])):
+                content = p['content']
                 type_ = p.get('type')
-                if type_ == 'paragraph':
-                    content_str = str_or_none(content)
-                    if content_str:
-                        description += content_str
-                    continue
-                if type_ == 'video' and isinstance(content, dict):
+                if type_ == 'paragraph' and isinstance(content, str):
+                    description = join_nonempty(description, content, delim='')
+                elif type_ == 'video' and isinstance(content, dict):
                     entries.append(self._parse_content(content, url))
+
             return self.playlist_result(
-                entries, str_or_none(article.get('id')), title, description)
-        content = article['opening']['content']
-        info = self._parse_content(content, url)
-        info.update({
-            'description': description,
-        })
+                entries, str_or_none(article.get('id')),
+                traverse_obj(article, ('title', {str})), clean_html(description))
+
+        info = self._parse_content(article['opening']['content'], url)
+        info['description'] = description
         return info
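The old `stream + '?' + cdn` concatenation breaks as soon as `stream` already carries a query string; `update_url` above replaces the query component properly. Behaviour sketch (URL made up):

```python
from yt_dlp.utils import update_url

stream = 'https://cdn.example/video/master.m3u8'
assert update_url(stream, query='token=abc') == 'https://cdn.example/video/master.m3u8?token=abc'
```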
diff --git a/yt_dlp/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py
index b651160240..02a6ea85bc 100644
--- a/yt_dlp/extractor/telewebion.py
+++ b/yt_dlp/extractor/telewebion.py
@@ -10,10 +10,11 @@ from ..utils.traversal import traverse_obj
 
 
 def _fmt_url(url):
-    return functools.partial(format_field, template=url, default=None)
+    return format_field(template=url, default=None)
 
 
 class TelewebionIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))'
     _TESTS = [{
         'url': 'http://www.telewebion.com/episode/0x1b3139c/',
diff --git a/yt_dlp/extractor/tencent.py b/yt_dlp/extractor/tencent.py
index fc2b07ac27..b281ad1a9f 100644
--- a/yt_dlp/extractor/tencent.py
+++ b/yt_dlp/extractor/tencent.py
@@ -1,4 +1,3 @@
-import functools
 import random
 import re
 import string
@@ -278,7 +277,7 @@ class VQQSeriesIE(VQQBaseIE):
             webpage)]
 
         return self.playlist_from_matches(
-            episode_paths, series_id, ie=VQQVideoIE, getter=functools.partial(urljoin, url),
+            episode_paths, series_id, ie=VQQVideoIE, getter=urljoin(url),
             title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title'))
                                         or self._og_search_title(webpage)),
             description=(traverse_obj(webpage_metadata, ('coverInfo', 'description'))
@@ -328,7 +327,7 @@ class WeTvBaseIE(TencentBaseIE):
             or re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage))
 
         return self.playlist_from_matches(
-            episode_paths, series_id, ie=ie, getter=functools.partial(urljoin, url),
+            episode_paths, series_id, ie=ie, getter=urljoin(url),
             title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title'))
                                         or self._og_search_title(webpage)),
             description=(traverse_obj(webpage_metadata, ('coverInfo', 'description'))
diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py
index 07db583470..cc7bc3b2fc 100644
--- a/yt_dlp/extractor/tenplay.py
+++ b/yt_dlp/extractor/tenplay.py
@@ -1,4 +1,3 @@
-import functools
 import itertools
 
 from .common import InfoExtractor
@@ -161,4 +160,4 @@ class TenPlaySeasonIE(InfoExtractor):
         return self.playlist_from_matches(
             self._entries(urljoin(url, episodes_carousel['loadMoreUrl']), playlist_id),
             playlist_id, traverse_obj(season_info, ('content', 0, 'title', {str})),
-            getter=functools.partial(urljoin, url))
+            getter=urljoin(url))
diff --git a/yt_dlp/extractor/theguardian.py b/yt_dlp/extractor/theguardian.py
index a9e4990649..7e8f9fef26 100644
--- a/yt_dlp/extractor/theguardian.py
+++ b/yt_dlp/extractor/theguardian.py
@@ -131,4 +131,4 @@ class TheGuardianPodcastPlaylistIE(InfoExtractor):
         return self.playlist_from_matches(
             self._entries(url, podcast_id), podcast_id, title,
             description=description,
-            ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))
+            ie=TheGuardianPodcastIE, getter=urljoin('https://www.theguardian.com'))
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index f7e103fe9f..ba15f08b6d 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -469,7 +469,7 @@ class TikTokBaseIE(InfoExtractor):
                 aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
             'thumbnails': thumbnails,
             'duration': (traverse_obj(video_info, (
-                (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
+                (None, 'download_addr'), 'duration', {int_or_none(scale=1000)}, any))
                          or traverse_obj(music_info, ('duration', {int_or_none}))),
             'availability': self._availability(
                 is_private='Private' in labels,
@@ -583,7 +583,7 @@ class TikTokBaseIE(InfoExtractor):
                 author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
             **traverse_obj(aweme_detail, ('music', {
                 'track': ('title', {str}),
-                'album': ('album', {str}, {lambda x: x or None}),
+                'album': ('album', {str}, filter),
                 'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
                 'duration': ('duration', {int_or_none}),
             })),
@@ -591,7 +591,7 @@ class TikTokBaseIE(InfoExtractor):
                 'title': ('desc', {str}),
                 'description': ('desc', {str}),
                 # audio-only slideshows have a video duration of 0 and an actual audio duration
-                'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
+                'duration': ('video', 'duration', {int_or_none}, filter),
                 'timestamp': ('createTime', {int_or_none}),
             }),
             **traverse_obj(aweme_detail, ('stats', {
@@ -1493,7 +1493,7 @@ class TikTokLiveIE(TikTokBaseIE):
 
             sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
                 'vcodec': ('VCodec', {str}),
-                'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
+                'tbr': ('vbitrate', {int_or_none(scale=1000)}),
                 'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
             }))
diff --git a/yt_dlp/extractor/tubetugraz.py b/yt_dlp/extractor/tubetugraz.py
index d5dbf007b1..805e2686f7 100644
--- a/yt_dlp/extractor/tubetugraz.py
+++ b/yt_dlp/extractor/tubetugraz.py
@@ -236,7 +236,7 @@ class TubeTuGrazSeriesIE(TubeTuGrazBaseIE):
             },
         },
     ],
-        'min_playlist_count': 4,
+        'playlist_mincount': 4,
     }]
 
     def _real_extract(self, url):
diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py
index 85eb3a211c..694a92fcd4 100644
--- a/yt_dlp/extractor/tubitv.py
+++ b/yt_dlp/extractor/tubitv.py
@@ -6,6 +6,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     js_to_json,
+    strip_or_none,
     traverse_obj,
     url_or_none,
     urlencode_postdata,
@@ -132,12 +133,12 @@ class TubiTvIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': title,
+            'title': strip_or_none(title),
             'formats': formats,
             'subtitles': subtitles,
             'season_number': int_or_none(season_number),
             'episode_number': int_or_none(episode_number),
-            'episode': episode_title,
+            'episode': strip_or_none(episode_title),
             **traverse_obj(video_data, {
                 'description': ('description', {str}),
                 'duration': ('duration', {int_or_none}),
diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py
index 7f851bf63b..d6d4368839 100644
--- a/yt_dlp/extractor/tumblr.py
+++ b/yt_dlp/extractor/tumblr.py
@@ -3,12 +3,13 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     traverse_obj,
+    url_or_none,
     urlencode_postdata,
 )
 
 
 class TumblrIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
+    _VALID_URL = r'https?://(?P<blog_name_1>[^/?#&]+)\.tumblr\.com/(?:post|video|(?P<blog_name_2>[a-zA-Z\d-]+))/(?P<id>[0-9]+)(?:$|[/?#])'
     _NETRC_MACHINE = 'tumblr'
     _LOGIN_URL = 'https://www.tumblr.com/login'
     _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token'
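The widened `_VALID_URL` above keeps the classic `<blog>.tumblr.com/post/<id>` form (captured as `blog_name_1`) and adds the newer `www.tumblr.com/<blog>/<id>` form (captured as `blog_name_2`, with `blog_name_1` then matching the meaningless `www`), which is why `_real_extract` later picks `blog_2 or blog_1`. A standalone check with the pattern from the hunk:

```python
import re

PATTERN = r'https?://(?P<blog_name_1>[^/?#&]+)\.tumblr\.com/(?:post|video|(?P<blog_name_2>[a-zA-Z\d-]+))/(?P<id>[0-9]+)(?:$|[/?#])'

old_style = re.match(PATTERN, 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating')
assert old_style.group('blog_name_1', 'blog_name_2', 'id') == ('jujanon', None, '159704441298')

new_style = re.match(PATTERN, 'https://www.tumblr.com/autumnsister/765162750456578048?source=share')
assert new_style.group('blog_name_1', 'blog_name_2', 'id') == ('www', 'autumnsister', '765162750456578048')
```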
@@ -66,6 +67,7 @@ class TumblrIE(InfoExtractor):
             'age_limit': 0,
             'tags': [],
         },
+        'skip': '404',
    }, {
         'note': 'dashboard only (original post)',
         'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating',
@@ -98,7 +100,6 @@ class TumblrIE(InfoExtractor):
             'like_count': int,
             'repost_count': int,
             'age_limit': 0,
-            'tags': [],
         },
    }, {
         'note': 'dashboard only (external)',
@@ -109,14 +110,13 @@ class TumblrIE(InfoExtractor):
             'title': 'The Blues Remembers Everything the Country Forgot',
             'alt_title': 'The Blues Remembers Everything the Country Forgot',
             'description': 'md5:1a6b4097e451216835a24c1023707c79',
-            'release_date': '20201224',
             'creator': 'md5:c2239ba15430e87c3b971ba450773272',
             'uploader': 'Moor Mother - Topic',
             'upload_date': '20201223',
             'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
             'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
             'thumbnail': r're:^https?://i.ytimg.com/.*',
-            'channel': 'Moor Mother - Topic',
+            'channel': 'Moor Mother',
             'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
             'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
             'channel_follower_count': int,
@@ -135,24 +135,10 @@ class TumblrIE(InfoExtractor):
             'release_year': 2020,
         },
         'add_ie': ['Youtube'],
-    }, {
-        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
-        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
-        'info_dict': {
-            'id': 'Wmur',
-            'ext': 'mp4',
-            'title': 'naked smoking & stretching',
-            'upload_date': '20150506',
-            'timestamp': 1430931613,
-            'age_limit': 18,
-            'uploader_id': '1638622',
-            'uploader': 'naked-yogi',
-        },
-        # 'add_ie': ['Vidme'],
-        'skip': 'dead embedded video host',
+        'skip': 'Video Unavailable',
    }, {
         'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like',
-        'md5': 'a0063fc8110e6c9afe44065b4ea68177',
+        'md5': 'cb8328a6723c30556cef59e370202918',
         'info_dict': {
             'id': 'eomhW5MLGWA',
             'ext': 'mp4',
@@ -160,8 +146,8 @@ class TumblrIE(InfoExtractor):
             'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798',
             'uploader': 'ProZD',
             'upload_date': '20220112',
-            'uploader_id': 'ProZD',
-            'uploader_url': 'http://www.youtube.com/user/ProZD',
+            'uploader_id': '@ProZD',
+            'uploader_url': 'https://www.youtube.com/@ProZD',
             'thumbnail': r're:^https?://i.ytimg.com/.*',
             'channel': 'ProZD',
             'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA',
@@ -176,6 +162,10 @@ class TumblrIE(InfoExtractor):
             'live_status': 'not_live',
             'playable_in_embed': True,
             'availability': 'public',
+            'heatmap': 'count:100',
+            'channel_is_verified': True,
+            'timestamp': 1642014562,
+            'comment_count': int,
         },
         'add_ie': ['Youtube'],
    }, {
@@ -183,16 +173,20 @@ class TumblrIE(InfoExtractor):
         'md5': '203e9eb8077e3f45bfaeb4c86c1467b8',
         'info_dict': {
             'id': '87816359',
-            'ext': 'mov',
+            'ext': 'mp4',
             'title': 'Harold Ramis',
-            'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c',
+            'description': 'md5:c99882405fcca0b1d348ad093f8f1672',
             'uploader': 'Resolution Productions Group',
             'uploader_id': 'resolutionproductions',
             'uploader_url': 'https://vimeo.com/resolutionproductions',
             'upload_date': '20140227',
             'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
-            'timestamp': 1393523719,
+            'timestamp': 1393541719,
             'duration': 291,
+            'comment_count': int,
+            'like_count': int,
+            'release_timestamp': 1393541719,
+            'release_date': '20140227',
         },
         'add_ie': ['Vimeo'],
    }, {
@@ -214,6 +208,7 @@ class TumblrIE(InfoExtractor):
             'view_count': int,
         },
         'add_ie': ['Vine'],
+        'skip': 'Vine is unavailable',
    }, {
         'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
         'md5': '3c92d7c3d867f14ccbeefa2119022277',
@@ -232,6 +227,140 @@ class TumblrIE(InfoExtractor):
             'upload_date': '20140429',
         },
         'add_ie': ['Instagram'],
+    }, {
+        'note': 'new url scheme',
+        'url': 'https://www.tumblr.com/autumnsister/765162750456578048?source=share',
+        'info_dict': {
+            'id': '765162750456578048',
+            'ext': 'mp4',
+            'uploader_url': 'https://autumnsister.tumblr.com/',
+            'tags': ['autumn', 'food', 'curators on tumblr'],
+            'like_count': int,
+            'thumbnail': 'https://64.media.tumblr.com/tumblr_sklad89N3x1ygquow_frame1.jpg',
+            'title': '🪹',
+            'uploader_id': 'autumnsister',
+            'repost_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        'note': 'bandcamp album embed',
+        'url': 'https://patricia-taxxon.tumblr.com/post/704473755725004800/patricia-taxxon-agnes-hilda-patricia-taxxon',
+        'info_dict': {
+            'id': 'agnes-hilda',
+            'title': 'Agnes & Hilda',
+            'description': 'The inexplicable joy of an artist. Wash paws after listening.',
+            'uploader_id': 'patriciataxxon',
+        },
+        'playlist_count': 8,
+    }, {
+        'note': 'bandcamp track embeds (many)',
+        'url': 'https://www.tumblr.com/felixcosm/730460905855467520/if-youre-looking-for-new-music-to-write-or',
+        'info_dict': {
+            'id': '730460905855467520',
+            'uploader_id': 'felixcosm',
+            'repost_count': int,
+            'tags': 'count:15',
+            'description': 'md5:2eb3482a3c6987280cbefb6839068f32',
+            'like_count': int,
+            'age_limit': 0,
+            'title': 'If you\'re looking for new music to write or imagine scenerios to: STOP. This is for you.',
+            'uploader_url': 'https://felixcosm.tumblr.com/',
+        },
+        'playlist_count': 10,
+    }, {
+        'note': 'soundcloud track embed',
+        'url': 'https://silverfoxstole.tumblr.com/post/765305403763556352/jamie-robertson-doctor-who-8th-doctor',
+        'info_dict': {
+            'id': '1218136399',
+            'ext': 'opus',
+            'comment_count': int,
+            'genres': [],
+            'repost_count': int,
+            'uploader': 'Jamie Robertson',
+            'title': 'Doctor Who - 8th doctor -  Stranded Theme never released and used.',
+            'duration': 46.106,
+            'uploader_id': '2731064',
+            'thumbnail': 'https://i1.sndcdn.com/artworks-MVgcPm5jN42isC5M-6Dz22w-original.jpg',
+            'timestamp': 1645181261,
+            'uploader_url': 'https://soundcloud.com/jamierobertson',
+            'view_count': int,
+            'upload_date': '20220218',
+            'description': 'md5:ab924dd9994d0a7d64d6d31bf2af4625',
+            'license': 'all-rights-reserved',
+            'like_count': int,
+        },
+    }, {
+        'note': 'soundcloud set embed',
+        'url': 'https://www.tumblr.com/beyourselfchulanmaria/703505323122638848/chu-lan-maria-the-playlist-%E5%BF%83%E7%9A%84%E5%91%BC%E5%96%9A-call-of-the',
+        'info_dict': {
+            'id': '691222680',
+            'title': '心的呼喚 Call of the heart I',
+            'description': 'md5:25952a8d178a3aa55e40fcbb646a38c3',
+        },
+        'playlist_mincount': 19,
+    }, {
+        'note': 'dailymotion video embed',
+        'url': 'https://www.tumblr.com/funvibecentral/759390024460632064',
+        'info_dict': {
+            'id': 'x94cnnk',
+            'ext': 'mp4',
+            'description': 'Funny dailymotion shorts.\n#funny #fun#comedy #romantic #exciting',
+            'uploader': 'FunVibe Central',
+            'like_count': int,
+            'view_count': int,
+            'timestamp': 1724210553,
+            'title': 'Woman watching other Woman',
+            'tags': [],
+            'upload_date': '20240821',
+            'age_limit': 0,
+            'uploader_id': 'x32m6ye',
+            'duration': 20,
+            'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Wtqh01cnxKNXLG1N8/x1080',
+        },
+    }, {
+        'note': 'tiktok video embed',
+        'url': 'https://fansofcolor.tumblr.com/post/660637918605475840/blockquote-class-tiktok-embed',
+        'info_dict': {
+            'id': '7000937272010935558',
+            'ext': 'mp4',
+            'artists': ['Alicia Dreaming'],
+            'like_count': int,
+            'repost_count': int,
+            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+            'channel_id': 'MS4wLjABAAAAsJohwz_dU4KfAOc61cbGDAZ46-5hg2ANTXVQlRe1ipDhpX08PywR3PPiple1NTAo',
+            'uploader': 'aliciadreaming',
+            'description': 'huge casting news Greyworm will be #louisdulac #racebending #interviewwiththevampire',
+            'title': 'huge casting news Greyworm will be #louisdulac #racebending #interviewwiththevampire',
+            'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAsJohwz_dU4KfAOc61cbGDAZ46-5hg2ANTXVQlRe1ipDhpX08PywR3PPiple1NTAo',
+            'uploader_id': '7000478462196990982',
+            'uploader_url': 'https://www.tiktok.com/@aliciadreaming',
+            'timestamp': 1630032733,
+            'channel': 'Alicia Dreaming',
+            'track': 'original sound',
+            'upload_date': '20210827',
+            'view_count': int,
+            'comment_count': int,
+            'duration': 59,
+        },
+    }, {
+        'note': 'tumblr video AND youtube embed',
+        'url': 'https://www.tumblr.com/anyaboz/765332564457209856/my-music-video-for-selkie-by-nobodys-wolf-child',
+        'info_dict': {
+            'id': '765332564457209856',
+            'uploader_id': 'anyaboz',
+            'repost_count': int,
+            'age_limit': 0,
+            'uploader_url': 'https://anyaboz.tumblr.com/',
+            'description': 'md5:9a129cf6ce9d87a80ffd3c6dedd4d1e6',
+            'like_count': int,
+            'title': 'md5:b18a2ac9387681d20303e485db85c1b5',
+            'tags': ['music video', 'nobodys wolf child', 'selkie', 'Stop Motion Animation', 'stop Motion', 'room guardians', 'Youtube'],
+        },
+        'playlist_count': 2,
+    }, {
+        # twitch_live provider - error when linked account is not live
+        'url': 'https://www.tumblr.com/anarcho-skamunist/722224493650722816/hollow-knight-stream-right-now-going-to-fight',
+        'only_matching': True,
     }]
 
     _providers = {
@@ -239,6 +368,16 @@ class TumblrIE(InfoExtractor):
         'vimeo': 'Vimeo',
         'vine': 'Vine',
         'youtube': 'Youtube',
+        'dailymotion': 'Dailymotion',
+        'tiktok': 'TikTok',
+        'twitch_live': 'TwitchStream',
+        'bandcamp': None,
+        'soundcloud': None,
     }
+    # known not to be supported
+    _unsupported_providers = {
+        # seems like podcasts can't be embedded
+        'spotify',
+    }
 
     _ACCESS_TOKEN = None
 
@@ -256,23 +395,40 @@ class TumblrIE(InfoExtractor):
         if not self._ACCESS_TOKEN:
             return
 
-        self._download_json(
-            self._OAUTH_URL, None, 'Logging in',
-            data=urlencode_postdata({
-                'password': password,
-                'grant_type': 'password',
-                'username': username,
-            }), headers={
-                'Content-Type': 'application/x-www-form-urlencoded',
-                'Authorization': f'Bearer {self._ACCESS_TOKEN}',
-            },
-            errnote='Login failed', fatal=False)
+        data = {
+            'password': password,
+            'grant_type': 'password',
+            'username': username,
+        }
+        if self.get_param('twofactor'):
+            data['tfa_token'] = self.get_param('twofactor')
+
+        def _call_login():
+            return self._download_json(
+                self._OAUTH_URL, None, 'Logging in',
+                data=urlencode_postdata(data),
+                headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Authorization': f'Bearer {self._ACCESS_TOKEN}',
+                },
+                errnote='Login failed', fatal=False,
+                expected_status=lambda s: 400 <= s < 500)
+
+        response = _call_login()
+        if traverse_obj(response, 'error') == 'tfa_required':
+            data['tfa_token'] = self._get_tfa_info()
+            response = _call_login()
+        if traverse_obj(response, 'error'):
+            raise ExtractorError(
+                f'API returned error {": ".join(traverse_obj(response, (("error", "error_description"), {str})))}')
 
     def _real_extract(self, url):
-        blog, video_id = self._match_valid_url(url).groups()
+        blog_1, blog_2, video_id = self._match_valid_url(url).groups()
+        blog = blog_2 or blog_1
 
-        url = f'http://{blog}.tumblr.com/post/{video_id}/'
-        webpage, urlh = self._download_webpage_handle(url, video_id)
+        url = f'http://{blog}.tumblr.com/post/{video_id}'
+        webpage, urlh = self._download_webpage_handle(
+            url, video_id, headers={'User-Agent': 'WhatsApp/2.0'})  # whatsapp ua bypasses problems
 
         redirect_url = urlh.url
 
@@ -289,23 +445,69 @@ class TumblrIE(InfoExtractor):
             self._download_json(
                 f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink',
                 video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False),
-            ('response', 'timeline', 'elements', 0)) or {}
-        content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or []
-        video_json = next(
-            (item for item in content_json if item.get('type') == 'video'), {})
-        media_json = video_json.get('media') or {}
-        if api_only and not media_json.get('url') and not video_json.get('url'):
-            raise ExtractorError('Failed to find video data for dashboard-only post')
+            ('response', 'timeline', 'elements', 0, {dict})) or {}
+        content_json = traverse_obj(post_json, ((('trail', 0), None), 'content', ..., {dict}))
 
-        if not media_json.get('url') and video_json.get('url'):
-            # external video host
-            return self.url_result(
-                video_json['url'],
-                self._providers.get(video_json.get('provider'), 'Generic'))
+        # the url we're extracting from might be an original post or it might be a reblog.
+        # if it's a reblog, og:description will be the reblogger's comment, not the uploader's.
+        # content_json is always the op, so if it exists but has no text, there's no description
+        if content_json:
+            description = '\n\n'.join(
+                item.get('text') for item in content_json if item.get('type') == 'text') or None
+        else:
+            description = self._og_search_description(webpage, default=None)
+        uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name')
 
-        video_url = self._og_search_video_url(webpage, default=None)
-        duration = None
+        info_dict = {
+            'id': video_id,
+            'title': post_json.get('summary') or (blog if api_only else self._html_search_regex(
+                r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title', default=blog)),
+            'description': description,
+            'uploader_id': uploader_id,
+            'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None,
+            **traverse_obj(post_json, {
+                'like_count': ('like_count', {int_or_none}),
+                'repost_count': ('reblog_count', {int_or_none}),
+                'tags': ('tags', ..., {str}),
+            }),
+            'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')),
+        }
+
+        # for tumblr's own video hosting
+        fallback_format = None
         formats = []
+        video_url = self._og_search_video_url(webpage, default=None)
+        # for external video hosts
+        entries = []
+        ignored_providers = set()
+        unknown_providers = set()
+
+        for video_json in traverse_obj(content_json, lambda _, v: v['type'] in ('video', 'audio')):
+            media_json = video_json.get('media') or {}
+            if api_only and not media_json.get('url') and not video_json.get('url'):
+                raise ExtractorError('Failed to find video data for dashboard-only post')
+            provider = video_json.get('provider')
+
+            if provider in ('tumblr', None):
+                fallback_format = {
+                    'url': media_json.get('url') or video_url,
+                    'width': int_or_none(
+                        media_json.get('width') or self._og_search_property('video:width', webpage, default=None)),
+                    'height': int_or_none(
+                        media_json.get('height') or self._og_search_property('video:height', webpage, default=None)),
+                }
+                continue
+            elif provider in self._unsupported_providers:
+                ignored_providers.add(provider)
+                continue
+            elif provider and provider not in self._providers:
+                unknown_providers.add(provider)
+            if video_json.get('url'):
+                # external video host
+                entries.append(self.url_result(
+                    video_json['url'], self._providers.get(provider)))
+
+        duration = None
 
         # iframes can supply duration and sometimes additional formats, so check for one
         iframe_url = self._search_regex(
@@ -344,44 +546,36 @@ class TumblrIE(InfoExtractor):
             'quality': quality,
         } for quality, (video_url, format_id) in enumerate(sources)]
 
-        if not media_json.get('url') and not video_url and not iframe_url:
-            # external video host (but we weren't able to figure it out from the api)
-            iframe_url = self._search_regex(
-                r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
-                webpage, 'embed iframe url', default=None)
-            return self.url_result(iframe_url or redirect_url, 'Generic')
+        if not formats and fallback_format:
+            formats.append(fallback_format)
 
-        formats = formats or [{
-            'url': media_json.get('url') or video_url,
-            'width': int_or_none(
-                media_json.get('width') or self._og_search_property('video:width', webpage, default=None)),
-            'height': int_or_none(
-                media_json.get('height') or self._og_search_property('video:height', webpage, default=None)),
-        }]
+        if formats:
+            # tumblr's own video is always above embeds
+            entries.insert(0, {
+                **info_dict,
+                'formats': formats,
+                'duration': duration,
+                'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url', {url_or_none}))
+                              or self._og_search_thumbnail(webpage, default=None)),
+            })
 
-        # the url we're extracting from might be an original post or it might be a reblog.
-        # if it's a reblog, og:description will be the reblogger's comment, not the uploader's.
-        # content_json is always the op, so if it exists but has no text, there's no description
-        if content_json:
-            description = '\n\n'.join(
-                item.get('text') for item in content_json if item.get('type') == 'text') or None
-        else:
-            description = self._og_search_description(webpage, default=None)
-        uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name')
+        if ignored_providers:
+            if not entries:
+                raise ExtractorError(f'None of the embed providers are supported: {", ".join(ignored_providers)!s}', video_id=video_id, expected=True)
+            else:
+                self.report_warning(f'Skipped embeds from unsupported providers: {", ".join(ignored_providers)!s}', video_id)
+        if unknown_providers:
+            self.report_warning(f'Unrecognized providers, please report: {", ".join(unknown_providers)!s}', video_id)
 
+        if not entries:
+            self.raise_no_formats('No video could be found in this post', expected=True, video_id=video_id)
+        if len(entries) == 1:
+            return {
+                **info_dict,
+                **entries[0],
+            }
         return {
-            'id': video_id,
-            'title': post_json.get('summary') or (blog if api_only else self._html_search_regex(
-                r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')),
-            'description': description,
-            'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url'))
-                          or self._og_search_thumbnail(webpage, default=None)),
-            'uploader_id': uploader_id,
-            'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None,
-            'duration': duration,
-            'like_count': post_json.get('like_count'),
-            'repost_count': post_json.get('reblog_count'),
-            'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')),
-            'tags': post_json.get('tags'),
-            'formats': formats,
+            **info_dict,
+            '_type': 'playlist',
+            'entries': entries,
         }
diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py
index d702640f33..48c4e9cba6 100644
--- a/yt_dlp/extractor/tva.py
+++ b/yt_dlp/extractor/tva.py
@@ -1,4 +1,3 @@
-import functools
 import re
 
 from .brightcove import BrightcoveNewIE
@@ -68,7 +67,7 @@ class TVAIE(InfoExtractor):
             'episode': episode,
             **traverse_obj(entity, {
                 'description': ('longDescription', {str}),
-                'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}),
+                'duration': ('durationMillis', {float_or_none(scale=1000)}),
                 'channel': ('knownEntities', 'channel', 'name', {str}),
                 'series': ('knownEntities', 'videoShow', 'name', {str}),
                 'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}),
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index aca94df2dd..8196ce6c32 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -150,14 +150,6 @@ class TwitterBaseIE(InfoExtractor):
     def is_logged_in(self):
         return bool(self._get_cookies(self._API_BASE).get('auth_token'))
 
-    # XXX: Temporary workaround until twitter.com => x.com migration is completed
-    def _real_initialize(self):
-        if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'):
-            return
-        # User has not yet been migrated to x.com and has passed twitter.com cookies
-        TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/'
-        TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
-
     @functools.cached_property
     def _selected_api(self):
         return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0]
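`_selected_api` above reads the `api` extractor argument, defaulting to `graphql`. Switching backends through the embedding API would look like the following sketch; `legacy` is only an illustrative value, the accepted set is documented in yt-dlp's extractor-args section:

```python
from yt_dlp import YoutubeDL

# note the ie_key above: the argument is namespaced under 'twitter' for all Twitter extractors
ydl = YoutubeDL({'extractor_args': {'twitter': {'api': ['legacy']}}})
```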
@@ -934,14 +926,13 @@ class TwitterIE(TwitterBaseIE):
             'uploader_id': 'MoniqueCamarra',
             'live_status': 'was_live',
             'release_timestamp': 1658417414,
-            'description': 'md5:acce559345fd49f129c20dbcda3f1201',
+            'description': r're:Twitter Space participated by Sergej Sumlenny.+',
             'timestamp': 1658407771,
             'release_date': '20220721',
             'upload_date': '20220721',
         },
         'add_ie': ['TwitterSpaces'],
         'params': {'skip_download': 'm3u8'},
-        'skip': 'Requires authentication',
    }, {
         # URL specifies video number but --yes-playlist
         'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1',
@@ -1856,8 +1847,6 @@ class TwitterSpacesIE(TwitterBaseIE):
 
     def _real_extract(self, url):
         space_id = self._match_id(url)
-        if not self.is_logged_in:
-            self.raise_login_required('Twitter Spaces require authentication')
         space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace']
         if not space_data:
             raise ExtractorError('Twitter Space not found', expected=True)
diff --git a/yt_dlp/extractor/uliza.py b/yt_dlp/extractor/uliza.py
new file mode 100644
index 0000000000..5766bd378f
--- /dev/null
+++ b/yt_dlp/extractor/uliza.py
@@ -0,0 +1,113 @@
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    make_archive_id,
+    parse_qs,
+    time_seconds,
+)
+from ..utils.traversal import traverse_obj
+
+
+class UlizaPlayerIE(InfoExtractor):
+    _VALID_URL = r'https://player-api\.p\.uliza\.jp/v1/players/[^?#]+\?(?:[^#]*&)?name=(?P<id>[^#&]+)'
+    _TESTS = [{
+        'url': 'https://player-api.p.uliza.jp/v1/players/timeshift-disabled/pia/admin?type=normal&playerobjectname=ulizaPlayer&name=livestream01_dvr&repeatable=true',
+        'info_dict': {
+            'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
+            'ext': 'mp4',
+            'title': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
+            'live_status': 'was_live',
+            '_old_archive_ids': ['piaulizaportal 88f3109a-f503-4d0f-a9f7-9f39ac745d84'],
+        },
+    }, {
+        'url': 'https://player-api.p.uliza.jp/v1/players/uliza_jp_gallery_normal/promotion/admin?type=presentation&name=cookings&targetid=player1',
+        'info_dict': {
+            'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
+            'ext': 'mp4',
+            'title': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
+            'live_status': 'not_live',
+            '_old_archive_ids': ['piaulizaportal ae350126-5e22-4a7f-a8ac-8d0fd448b800'],
+        },
+    }, {
+        'url': 'https://player-api.p.uliza.jp/v1/players/default-player/pia/admin?type=normal&name=pia_movie_uliza_fix&targetid=ulizahtml5&repeatable=true',
+        'info_dict': {
+            'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
+            'ext': 'mp4',
+            'title': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
+            'live_status': 'not_live',
+            '_old_archive_ids': ['piaulizaportal 0644ecc8-e354-41b4-b957-3b08a2d63df1'],
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        player_data = self._download_webpage(
+            url, display_id, headers={'Referer': 'https://player-api.p.uliza.jp/'},
+            note='Fetching player data', errnote='Unable to fetch player data')
+
+        m3u8_url = self._search_regex(
+            r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, 'm3u8 url')
+        video_id = parse_qs(m3u8_url).get('ss', [display_id])[0]
+
+        formats = self._extract_m3u8_formats(m3u8_url, video_id)
+        m3u8_type = self._search_regex(
+            r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
+        return {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+            'live_status': {
+                'video': 'is_live',
+                'dvr': 'was_live',  # short-term archives
+            }.get(m3u8_type, 'not_live'),  # VOD or long-term archives
+            '_old_archive_ids': [make_archive_id('PIAULIZAPortal', video_id)],
+        }
+
+
+class UlizaPortalIE(InfoExtractor):
+    IE_DESC = 'ulizaportal.jp'
+    _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
+    _TESTS = [{
+        'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44',
+        'info_dict': {
+            'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
+            'display_id': '005f18b7-e810-5618-cb82-0987c5755d44',
+            'title': 'プレゼンテーションプレイヤーのサンプル',
+            'live_status': 'not_live',
+            '_old_archive_ids': ['piaulizaportal ae350126-5e22-4a7f-a8ac-8d0fd448b800'],
+        },
+        'params': {
+            'skip_download': True,
+            'ignore_no_formats_error': True,
+        },
+    }, {
+        'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1',
+        'info_dict': {
+            'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
+            'display_id': '005e1b23-fe93-5780-19a0-98e917cc4b7d',
+            'title': '【確認用】視聴サンプルページ(ULIZA)',
+            'live_status': 'not_live',
+            '_old_archive_ids': ['piaulizaportal 0644ecc8-e354-41b4-b957-3b08a2d63df1'],
+        },
+        'params': {
+            'skip_download': True,
+            'ignore_no_formats_error': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0)))
+        if expires and expires <= time_seconds():
+            raise ExtractorError('The link is expired', video_id=video_id, expected=True)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_data_url = self._search_regex(
+            r'