youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

telecinco.py (6226B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import json
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from ..utils import (
      9     clean_html,
     10     int_or_none,
     11     str_or_none,
     12     try_get,
     13 )
     14 
     15 
     16 class TelecincoIE(InfoExtractor):
     17     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
     18     _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
     19 
     20     _TESTS = [{
     21         'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
     22         'info_dict': {
     23             'id': '1876350223',
     24             'title': 'Bacalao con kokotxas al pil-pil',
     25             'description': 'md5:716caf5601e25c3c5ab6605b1ae71529',
     26         },
     27         'playlist': [{
     28             'md5': '7ee56d665cfd241c0e6d80fd175068b0',
     29             'info_dict': {
     30                 'id': 'JEA5ijCnF6p5W08A1rNKn7',
     31                 'ext': 'mp4',
     32                 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
     33                 'duration': 662,
     34             },
     35         }]
     36     }, {
     37         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
     38         'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a',
     39         'info_dict': {
     40             'id': 'jn24Od1zGLG4XUZcnUnZB6',
     41             'ext': 'mp4',
     42             'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?',
     43             'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
     44             'duration': 79,
     45         },
     46     }, {
     47         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
     48         'md5': 'eddb50291df704ce23c74821b995bcac',
     49         'info_dict': {
     50             'id': 'aywerkD2Sv1vGNqq9b85Q2',
     51             'ext': 'mp4',
     52             'title': '#DOYLACARA. Con la trata no hay trato',
     53             'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
     54             'duration': 50,
     55         },
     56     }, {
     57         # video in opening's content
     58         'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html',
     59         'info_dict': {
     60             'id': '2907195140',
     61             'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
     62             'description': 'md5:73f340a7320143d37ab895375b2bf13a',
     63         },
     64         'playlist': [{
     65             'md5': 'adb28c37238b675dad0f042292f209a7',
     66             'info_dict': {
     67                 'id': 'TpI2EttSDAReWpJ1o0NVh2',
     68                 'ext': 'mp4',
     69                 'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
     70                 'duration': 1015,
     71             },
     72         }],
     73         'params': {
     74             'skip_download': True,
     75         },
     76     }, {
     77         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
     78         'only_matching': True,
     79     }, {
     80         'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
     81         'only_matching': True,
     82     }, {
     83         # ooyala video
     84         'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
     85         'only_matching': True,
     86     }]
     87 
     88     def _parse_content(self, content, url):
     89         video_id = content['dataMediaId']
     90         config = self._download_json(
     91             content['dataConfig'], video_id, 'Downloading config JSON')
     92         title = config['info']['title']
     93         services = config['services']
     94         caronte = self._download_json(services['caronte'], video_id)
     95         stream = caronte['dls'][0]['stream']
     96         headers = self.geo_verification_headers()
     97         headers.update({
     98             'Content-Type': 'application/json;charset=UTF-8',
     99             'Origin': re.match(r'https?://[^/]+', url).group(0),
    100         })
    101         cdn = self._download_json(
    102             caronte['cerbero'], video_id, data=json.dumps({
    103                 'bbx': caronte['bbx'],
    104                 'gbx': self._download_json(services['gbx'], video_id)['gbx'],
    105             }).encode(), headers=headers)['tokens']['1']['cdn']
    106         formats = self._extract_m3u8_formats(
    107             stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
    108         self._sort_formats(formats)
    109 
    110         return {
    111             'id': video_id,
    112             'title': title,
    113             'formats': formats,
    114             'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
    115             'duration': int_or_none(content.get('dataDuration')),
    116         }
    117 
    118     def _real_extract(self, url):
    119         display_id = self._match_id(url)
    120         webpage = self._download_webpage(url, display_id)
    121         article = self._parse_json(self._search_regex(
    122             r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})',
    123             webpage, 'article'), display_id)['article']
    124         title = article.get('title')
    125         description = clean_html(article.get('leadParagraph')) or ''
    126         if article.get('editorialType') != 'VID':
    127             entries = []
    128             body = [article.get('opening')]
    129             body.extend(try_get(article, lambda x: x['body'], list) or [])
    130             for p in body:
    131                 if not isinstance(p, dict):
    132                     continue
    133                 content = p.get('content')
    134                 if not content:
    135                     continue
    136                 type_ = p.get('type')
    137                 if type_ == 'paragraph':
    138                     content_str = str_or_none(content)
    139                     if content_str:
    140                         description += content_str
    141                     continue
    142                 if type_ == 'video' and isinstance(content, dict):
    143                     entries.append(self._parse_content(content, url))
    144             return self.playlist_result(
    145                 entries, str_or_none(article.get('id')), title, description)
    146         content = article['opening']['content']
    147         info = self._parse_content(content, url)
    148         info.update({
    149             'description': description,
    150         })
    151         return info