youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

ted.py (13942B)


      1 from __future__ import unicode_literals
      2 
      3 import json
      4 import re
      5 
      6 from .common import InfoExtractor
      7 
      8 from ..compat import (
      9     compat_str,
     10     compat_urlparse
     11 )
     12 from ..utils import (
     13     extract_attributes,
     14     float_or_none,
     15     int_or_none,
     16     try_get,
     17     url_or_none,
     18 )
     19 
     20 
     21 class TEDIE(InfoExtractor):
     22     IE_NAME = 'ted'
     23     _VALID_URL = r'''(?x)
     24         (?P<proto>https?://)
     25         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
     26         (
     27             (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
     28             |
     29             ((?P<type_talk>talks)) # We have a simple talk
     30             |
     31             (?P<type_watch>watch)/[^/]+/[^/]+
     32         )
     33         (/lang/(.*?))? # The url may contain the language
     34         /(?P<name>[\w-]+) # Here goes the name and then ".html"
     35         .*)$
     36         '''
     37     _TESTS = [{
     38         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
     39         'md5': 'b0ce2b05ca215042124fbc9e3886493a',
     40         'info_dict': {
     41             'id': '102',
     42             'ext': 'mp4',
     43             'title': 'The illusion of consciousness',
     44             'description': ('Philosopher Dan Dennett makes a compelling '
     45                             'argument that not only don\'t we understand our own '
     46                             'consciousness, but that half the time our brains are '
     47                             'actively fooling us.'),
     48             'uploader': 'Dan Dennett',
     49             'width': 853,
     50             'duration': 1308,
     51             'view_count': int,
     52             'comment_count': int,
     53             'tags': list,
     54         },
     55         'params': {
     56             'skip_download': True,
     57         },
     58     }, {
     59         # missing HTTP bitrates
     60         'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
     61         'info_dict': {
     62             'id': '6069',
     63             'ext': 'mp4',
     64             'title': 'The beauty and power of algorithms',
     65             'thumbnail': r're:^https?://.+\.jpg',
     66             'description': 'md5:734e352710fb00d840ab87ae31aaf688',
     67             'uploader': 'Vishal Sikka',
     68         },
     69         'params': {
     70             'skip_download': True,
     71         },
     72     }, {
     73         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
     74         'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
     75         'info_dict': {
     76             'id': '1972',
     77             'ext': 'mp4',
     78             'title': 'Be passionate. Be courageous. Be your best.',
     79             'uploader': 'Gabby Giffords and Mark Kelly',
     80             'description': 'md5:5174aed4d0f16021b704120360f72b92',
     81             'duration': 1128,
     82         },
     83         'params': {
     84             'skip_download': True,
     85         },
     86     }, {
     87         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
     88         'info_dict': {
     89             'id': '10',
     90             'title': 'Who are the hackers?',
     91             'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
     92         },
     93         'playlist_mincount': 6,
     94     }, {
     95         # contains a youtube video
     96         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
     97         'add_ie': ['Youtube'],
     98         'info_dict': {
     99             'id': '_ZG8HBuDjgc',
    100             'ext': 'webm',
    101             'title': 'Douglas Adams: Parrots the Universe and Everything',
    102             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
    103             'uploader': 'University of California Television (UCTV)',
    104             'uploader_id': 'UCtelevision',
    105             'upload_date': '20080522',
    106         },
    107         'params': {
    108             'skip_download': True,
    109         },
    110     }, {
    111         # no nativeDownloads
    112         'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
    113         'info_dict': {
    114             'id': '1792',
    115             'ext': 'mp4',
    116             'title': 'The orchestra in my mouth',
    117             'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
    118             'uploader': 'Tom Thum',
    119             'view_count': int,
    120             'comment_count': int,
    121             'tags': list,
    122         },
    123         'params': {
    124             'skip_download': True,
    125         },
    126     }, {
    127         # with own formats and private Youtube external
    128         'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
    129         'only_matching': True,
    130     }]
    131 
    132     _NATIVE_FORMATS = {
    133         'low': {'width': 320, 'height': 180},
    134         'medium': {'width': 512, 'height': 288},
    135         'high': {'width': 854, 'height': 480},
    136     }
    137 
    138     def _extract_info(self, webpage):
    139         info_json = self._search_regex(
    140             r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
    141             webpage, 'info json')
    142         return json.loads(info_json)
    143 
    144     def _real_extract(self, url):
    145         m = re.match(self._VALID_URL, url, re.VERBOSE)
    146         if m.group('type').startswith('embed'):
    147             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
    148             return self.url_result(desktop_url, 'TED')
    149         name = m.group('name')
    150         if m.group('type_talk'):
    151             return self._talk_info(url, name)
    152         elif m.group('type_watch'):
    153             return self._watch_info(url, name)
    154         else:
    155             return self._playlist_videos_info(url, name)
    156 
    157     def _playlist_videos_info(self, url, name):
    158         '''Returns the videos of the playlist'''
    159 
    160         webpage = self._download_webpage(url, name,
    161                                          'Downloading playlist webpage')
    162 
    163         playlist_entries = []
    164         for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
    165             attrs = extract_attributes(entry)
    166             entry_url = compat_urlparse.urljoin(url, attrs['href'])
    167             playlist_entries.append(self.url_result(entry_url, self.ie_key()))
    168 
    169         final_url = self._og_search_url(webpage, fatal=False)
    170         playlist_id = (
    171             re.match(self._VALID_URL, final_url).group('playlist_id')
    172             if final_url else None)
    173 
    174         return self.playlist_result(
    175             playlist_entries, playlist_id=playlist_id,
    176             playlist_title=self._og_search_title(webpage, fatal=False),
    177             playlist_description=self._og_search_description(webpage))
    178 
    179     def _talk_info(self, url, video_name):
    180         webpage = self._download_webpage(url, video_name)
    181 
    182         info = self._extract_info(webpage)
    183 
    184         data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
    185         talk_info = data['talks'][0]
    186 
    187         title = talk_info['title'].strip()
    188 
    189         downloads = talk_info.get('downloads') or {}
    190         native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
    191 
    192         formats = [{
    193             'url': format_url,
    194             'format_id': format_id,
    195         } for (format_id, format_url) in native_downloads.items() if format_url is not None]
    196 
    197         subtitled_downloads = downloads.get('subtitledDownloads') or {}
    198         for lang, subtitled_download in subtitled_downloads.items():
    199             for q in self._NATIVE_FORMATS:
    200                 q_url = subtitled_download.get(q)
    201                 if not q_url:
    202                     continue
    203                 formats.append({
    204                     'url': q_url,
    205                     'format_id': '%s-%s' % (q, lang),
    206                     'language': lang,
    207                 })
    208 
    209         if formats:
    210             for f in formats:
    211                 finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
    212                 if finfo:
    213                     f.update(finfo)
    214 
    215         player_talk = talk_info['player_talks'][0]
    216 
    217         resources_ = player_talk.get('resources') or talk_info.get('resources')
    218 
    219         http_url = None
    220         for format_id, resources in resources_.items():
    221             if format_id == 'hls':
    222                 if not isinstance(resources, dict):
    223                     continue
    224                 stream_url = url_or_none(resources.get('stream'))
    225                 if not stream_url:
    226                     continue
    227                 formats.extend(self._extract_m3u8_formats(
    228                     stream_url, video_name, 'mp4', m3u8_id=format_id,
    229                     fatal=False))
    230             else:
    231                 if not isinstance(resources, list):
    232                     continue
    233                 if format_id == 'h264':
    234                     for resource in resources:
    235                         h264_url = resource.get('file')
    236                         if not h264_url:
    237                             continue
    238                         bitrate = int_or_none(resource.get('bitrate'))
    239                         formats.append({
    240                             'url': h264_url,
    241                             'format_id': '%s-%sk' % (format_id, bitrate),
    242                             'tbr': bitrate,
    243                         })
    244                         if re.search(r'\d+k', h264_url):
    245                             http_url = h264_url
    246                 elif format_id == 'rtmp':
    247                     streamer = talk_info.get('streamer')
    248                     if not streamer:
    249                         continue
    250                     for resource in resources:
    251                         formats.append({
    252                             'format_id': '%s-%s' % (format_id, resource.get('name')),
    253                             'url': streamer,
    254                             'play_path': resource['file'],
    255                             'ext': 'flv',
    256                             'width': int_or_none(resource.get('width')),
    257                             'height': int_or_none(resource.get('height')),
    258                             'tbr': int_or_none(resource.get('bitrate')),
    259                         })
    260 
    261         m3u8_formats = list(filter(
    262             lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
    263             formats))
    264         if http_url:
    265             for m3u8_format in m3u8_formats:
    266                 bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
    267                 if not bitrate:
    268                     continue
    269                 bitrate_url = re.sub(r'\d+k', bitrate, http_url)
    270                 if not self._is_valid_url(
    271                         bitrate_url, video_name, '%s bitrate' % bitrate):
    272                     continue
    273                 f = m3u8_format.copy()
    274                 f.update({
    275                     'url': bitrate_url,
    276                     'format_id': m3u8_format['format_id'].replace('hls', 'http'),
    277                     'protocol': 'http',
    278                 })
    279                 if f.get('acodec') == 'none':
    280                     del f['acodec']
    281                 formats.append(f)
    282 
    283         audio_download = talk_info.get('audioDownload')
    284         if audio_download:
    285             formats.append({
    286                 'url': audio_download,
    287                 'format_id': 'audio',
    288                 'vcodec': 'none',
    289             })
    290 
    291         if not formats:
    292             external = player_talk.get('external')
    293             if isinstance(external, dict):
    294                 service = external.get('service')
    295                 if isinstance(service, compat_str):
    296                     ext_url = None
    297                     if service.lower() == 'youtube':
    298                         ext_url = external.get('code')
    299                     return self.url_result(ext_url or external['uri'])
    300 
    301         self._sort_formats(formats)
    302 
    303         video_id = compat_str(talk_info['id'])
    304 
    305         return {
    306             'id': video_id,
    307             'title': title,
    308             'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
    309             'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
    310             'description': self._og_search_description(webpage),
    311             'subtitles': self._get_subtitles(video_id, talk_info),
    312             'formats': formats,
    313             'duration': float_or_none(talk_info.get('duration')),
    314             'view_count': int_or_none(data.get('viewed_count')),
    315             'comment_count': int_or_none(
    316                 try_get(data, lambda x: x['comments']['count'])),
    317             'tags': try_get(talk_info, lambda x: x['tags'], list),
    318         }
    319 
    320     def _get_subtitles(self, video_id, talk_info):
    321         sub_lang_list = {}
    322         for language in try_get(
    323                 talk_info,
    324                 (lambda x: x['downloads']['languages'],
    325                  lambda x: x['languages']), list):
    326             lang_code = language.get('languageCode') or language.get('ianaCode')
    327             if not lang_code:
    328                 continue
    329             sub_lang_list[lang_code] = [
    330                 {
    331                     'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
    332                     'ext': ext,
    333                 }
    334                 for ext in ['ted', 'srt']
    335             ]
    336         return sub_lang_list
    337 
    338     def _watch_info(self, url, name):
    339         webpage = self._download_webpage(url, name)
    340 
    341         config_json = self._html_search_regex(
    342             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
    343             webpage, 'config', default=None)
    344         if not config_json:
    345             embed_url = self._search_regex(
    346                 r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
    347             return self.url_result(self._proto_relative_url(embed_url))
    348         config = json.loads(config_json)['config']
    349         video_url = config['video']['url']
    350         thumbnail = config.get('image', {}).get('url')
    351 
    352         title = self._html_search_regex(
    353             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
    354         description = self._html_search_regex(
    355             [
    356                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
    357                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
    358             ],
    359             webpage, 'description', fatal=False)
    360 
    361         return {
    362             'id': name,
    363             'url': video_url,
    364             'title': title,
    365             'thumbnail': thumbnail,
    366             'description': description,
    367         }