From 9f62eaf4ef87cd379318a1330373317cd6d4d63c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 3 Mar 2014 12:53:11 +0100 Subject: [PATCH] [canal13cl] Add test and improve extraction (#2498) --- youtube_dl/extractor/canal13cl.py | 48 ++++++++++++++++++++----------- youtube_dl/extractor/common.py | 4 +-- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py index 781c1b5037..93241fefec 100644 --- a/youtube_dl/extractor/canal13cl.py +++ b/youtube_dl/extractor/canal13cl.py @@ -1,32 +1,48 @@ +# coding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/' - IE_NAME = 'Canal13cl' + _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': '1403022125', + 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', + 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. 
Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', + } + } def _real_extract(self, url): - webpage = self._download_webpage(url, url) - video_id = self._html_search_regex( - r'http://streaming.13.cl/(.*)\.mp4', - webpage, u'video_id') - title = self._html_search_regex( - r'(articuloTitulo = \"(.*?)\"|(.*?)\|)', - webpage, u'title') + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + description = self._html_search_meta( + 'twitter:description', webpage, 'description') url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', - webpage, u'url') - thumbnail = self._html_search_regex ( - r'articuloImagen = \"(.*?)\"', - webpage, u'thumbnail') + r'articuloVideo = \"(.*?)\"', webpage, 'url') + real_id = self._search_regex( + r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) + thumbnail = self._html_search_regex( + r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') return { - 'video_id': video_id, + 'id': real_id, + 'display_id': display_id, 'url': url, 'title': title, + 'description': description, 'ext': 'mp4', - 'thumbnail': thumbnail + 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 080c9bdfad..7ee95fe391 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -436,14 +436,14 @@ class InfoExtractor(object): if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) - def _html_search_meta(self, name, html, display_name=None): + def _html_search_meta(self, name, html, display_name=None, fatal=False): if display_name is None: display_name = name return self._html_search_regex( r'''(?ix)<meta (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) 
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=False) + html, display_name, fatal=fatal) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader')