youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

ceskatelevize.py (10657B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..compat import (
      8     compat_urllib_parse_unquote,
      9     compat_urllib_parse_urlparse,
     10 )
     11 from ..utils import (
     12     ExtractorError,
     13     float_or_none,
     14     sanitized_Request,
     15     unescapeHTML,
     16     update_url_query,
     17     urlencode_postdata,
     18     USER_AGENTS,
     19 )
     20 
     21 
     22 class CeskaTelevizeIE(InfoExtractor):
     23     _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
     24     _TESTS = [{
     25         'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
     26         'info_dict': {
     27             'id': '61924494877246241',
     28             'ext': 'mp4',
     29             'title': 'Hyde Park Civilizace: Život v Grónsku',
     30             'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
     31             'thumbnail': r're:^https?://.*\.jpg',
     32             'duration': 3350,
     33         },
     34         'params': {
     35             # m3u8 download
     36             'skip_download': True,
     37         },
     38     }, {
     39         'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
     40         'info_dict': {
     41             'id': '61924494877028507',
     42             'ext': 'mp4',
     43             'title': 'Hyde Park Civilizace: Bonus 01 - En',
     44             'description': 'English Subtittles',
     45             'thumbnail': r're:^https?://.*\.jpg',
     46             'duration': 81.3,
     47         },
     48         'params': {
     49             # m3u8 download
     50             'skip_download': True,
     51         },
     52     }, {
     53         # live stream
     54         'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
     55         'info_dict': {
     56             'id': 402,
     57             'ext': 'mp4',
     58             'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
     59             'is_live': True,
     60         },
     61         'params': {
     62             # m3u8 download
     63             'skip_download': True,
     64         },
     65         'skip': 'Georestricted to Czech Republic',
     66     }, {
     67         'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
     68         'only_matching': True,
     69     }]
     70 
     71     def _real_extract(self, url):
     72         playlist_id = self._match_id(url)
     73 
     74         webpage = self._download_webpage(url, playlist_id)
     75 
     76         NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
     77         if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
     78             raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
     79 
     80         type_ = None
     81         episode_id = None
     82 
     83         playlist = self._parse_json(
     84             self._search_regex(
     85                 r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
     86                 default='{}'), playlist_id)
     87         if playlist:
     88             type_ = playlist.get('type')
     89             episode_id = playlist.get('id')
     90 
     91         if not type_:
     92             type_ = self._html_search_regex(
     93                 r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
     94                 webpage, 'type')
     95         if not episode_id:
     96             episode_id = self._html_search_regex(
     97                 r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
     98                 webpage, 'episode_id')
     99 
    100         data = {
    101             'playlist[0][type]': type_,
    102             'playlist[0][id]': episode_id,
    103             'requestUrl': compat_urllib_parse_urlparse(url).path,
    104             'requestSource': 'iVysilani',
    105         }
    106 
    107         entries = []
    108 
    109         for user_agent in (None, USER_AGENTS['Safari']):
    110             req = sanitized_Request(
    111                 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
    112                 data=urlencode_postdata(data))
    113 
    114             req.add_header('Content-type', 'application/x-www-form-urlencoded')
    115             req.add_header('x-addr', '127.0.0.1')
    116             req.add_header('X-Requested-With', 'XMLHttpRequest')
    117             if user_agent:
    118                 req.add_header('User-Agent', user_agent)
    119             req.add_header('Referer', url)
    120 
    121             playlistpage = self._download_json(req, playlist_id, fatal=False)
    122 
    123             if not playlistpage:
    124                 continue
    125 
    126             playlist_url = playlistpage['url']
    127             if playlist_url == 'error_region':
    128                 raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
    129 
    130             req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
    131             req.add_header('Referer', url)
    132 
    133             playlist_title = self._og_search_title(webpage, default=None)
    134             playlist_description = self._og_search_description(webpage, default=None)
    135 
    136             playlist = self._download_json(req, playlist_id, fatal=False)
    137             if not playlist:
    138                 continue
    139 
    140             playlist = playlist.get('playlist')
    141             if not isinstance(playlist, list):
    142                 continue
    143 
    144             playlist_len = len(playlist)
    145 
    146             for num, item in enumerate(playlist):
    147                 is_live = item.get('type') == 'LIVE'
    148                 formats = []
    149                 for format_id, stream_url in item.get('streamUrls', {}).items():
    150                     if 'drmOnly=true' in stream_url:
    151                         continue
    152                     if 'playerType=flash' in stream_url:
    153                         stream_formats = self._extract_m3u8_formats(
    154                             stream_url, playlist_id, 'mp4', 'm3u8_native',
    155                             m3u8_id='hls-%s' % format_id, fatal=False)
    156                     else:
    157                         stream_formats = self._extract_mpd_formats(
    158                             stream_url, playlist_id,
    159                             mpd_id='dash-%s' % format_id, fatal=False)
    160                     # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
    161                     if format_id == 'audioDescription':
    162                         for f in stream_formats:
    163                             f['source_preference'] = -10
    164                     formats.extend(stream_formats)
    165 
    166                 if user_agent and len(entries) == playlist_len:
    167                     entries[num]['formats'].extend(formats)
    168                     continue
    169 
    170                 item_id = item.get('id') or item['assetId']
    171                 title = item['title']
    172 
    173                 duration = float_or_none(item.get('duration'))
    174                 thumbnail = item.get('previewImageUrl')
    175 
    176                 subtitles = {}
    177                 if item.get('type') == 'VOD':
    178                     subs = item.get('subtitles')
    179                     if subs:
    180                         subtitles = self.extract_subtitles(episode_id, subs)
    181 
    182                 if playlist_len == 1:
    183                     final_title = playlist_title or title
    184                     if is_live:
    185                         final_title = self._live_title(final_title)
    186                 else:
    187                     final_title = '%s (%s)' % (playlist_title, title)
    188 
    189                 entries.append({
    190                     'id': item_id,
    191                     'title': final_title,
    192                     'description': playlist_description if playlist_len == 1 else None,
    193                     'thumbnail': thumbnail,
    194                     'duration': duration,
    195                     'formats': formats,
    196                     'subtitles': subtitles,
    197                     'is_live': is_live,
    198                 })
    199 
    200         for e in entries:
    201             self._sort_formats(e['formats'])
    202 
    203         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
    204 
    205     def _get_subtitles(self, episode_id, subs):
    206         original_subtitles = self._download_webpage(
    207             subs[0]['url'], episode_id, 'Downloading subtitles')
    208         srt_subs = self._fix_subtitles(original_subtitles)
    209         return {
    210             'cs': [{
    211                 'ext': 'srt',
    212                 'data': srt_subs,
    213             }]
    214         }
    215 
    216     @staticmethod
    217     def _fix_subtitles(subtitles):
    218         """ Convert millisecond-based subtitles to SRT """
    219 
    220         def _msectotimecode(msec):
    221             """ Helper utility to convert milliseconds to timecode """
    222             components = []
    223             for divider in [1000, 60, 60, 100]:
    224                 components.append(msec % divider)
    225                 msec //= divider
    226             return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)
    227 
    228         def _fix_subtitle(subtitle):
    229             for line in subtitle.splitlines():
    230                 m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
    231                 if m:
    232                     yield m.group(1)
    233                     start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
    234                     yield '{0} --> {1}'.format(start, stop)
    235                 else:
    236                     yield line
    237 
    238         return '\r\n'.join(_fix_subtitle(subtitles))
    239 
    240 
    241 class CeskaTelevizePoradyIE(InfoExtractor):
    242     _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
    243     _TESTS = [{
    244         # video with 18+ caution trailer
    245         'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
    246         'info_dict': {
    247             'id': '215562210900007-bogotart',
    248             'title': 'Queer: Bogotart',
    249             'description': 'Alternativní průvodce současným queer světem',
    250         },
    251         'playlist': [{
    252             'info_dict': {
    253                 'id': '61924494876844842',
    254                 'ext': 'mp4',
    255                 'title': 'Queer: Bogotart (Varování 18+)',
    256                 'duration': 10.2,
    257             },
    258         }, {
    259             'info_dict': {
    260                 'id': '61924494877068022',
    261                 'ext': 'mp4',
    262                 'title': 'Queer: Bogotart (Queer)',
    263                 'thumbnail': r're:^https?://.*\.jpg',
    264                 'duration': 1558.3,
    265             },
    266         }],
    267         'params': {
    268             # m3u8 download
    269             'skip_download': True,
    270         },
    271     }, {
    272         # iframe embed
    273         'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
    274         'only_matching': True,
    275     }]
    276 
    277     def _real_extract(self, url):
    278         video_id = self._match_id(url)
    279 
    280         webpage = self._download_webpage(url, video_id)
    281 
    282         data_url = update_url_query(unescapeHTML(self._search_regex(
    283             (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
    284              r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
    285             webpage, 'iframe player url', group='url')), query={
    286                 'autoStart': 'true',
    287         })
    288 
    289         return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())