ted.py (13942B)
from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import (
    compat_str,
    compat_urlparse
)
from ..utils import (
    extract_attributes,
    float_or_none,
    int_or_none,
    try_get,
    url_or_none,
)


class TEDIE(InfoExtractor):
    """Extractor for ted.com talk, playlist, watch and embed URLs.

    ``_real_extract`` dispatches on the named groups of ``_VALID_URL``:
    talks go to ``_talk_info``, "watch" pages to ``_watch_info``, playlists
    to ``_playlist_videos_info``; embed hosts are redirected to the
    equivalent ``www`` URL.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # missing HTTP bitrates
        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
        'info_dict': {
            'id': '6069',
            'ext': 'mp4',
            'title': 'The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
            'uploader': 'Vishal Sikka',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # no nativeDownloads
        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
        'info_dict': {
            'id': '1792',
            'ext': 'mp4',
            'title': 'The orchestra in my mouth',
            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
            'uploader': 'Tom Thum',
            'view_count': int,
            'comment_count': int,
            'tags': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with own formats and private Youtube external
        'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
        'only_matching': True,
    }]

    # Dimensions merged into every format whose format_id starts with one of
    # these quality names (see the update loop in _talk_info).
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object passed to ``q("<name>.init", {...})``.

        The object literal is located in *webpage* with ``_search_regex`` and
        decoded with ``json.loads`` (which raises ``ValueError`` on malformed
        JSON).
        """
        info_json = self._search_regex(
            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the handler matching its ``_VALID_URL`` groups."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            # Embed hosts are re-extracted through the regular www URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result for the talks linked from a playlist page.

        Every tag carrying ``data-ga-context="playlist"`` and an ``href``
        attribute becomes one entry, resolved relative to *url*. The playlist
        id is the ``playlist_id`` group of the page's og:url when that URL
        matches ``_VALID_URL``, otherwise ``None``.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')

        playlist_entries = []
        for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
            href = extract_attributes(entry).get('href')
            if not href:
                # A matched tag without a link target cannot yield an entry;
                # skip it rather than abort the whole playlist.
                continue
            entry_url = compat_urlparse.urljoin(url, href)
            playlist_entries.append(self.url_result(entry_url, self.ie_key()))

        final_url = self._og_search_url(webpage, fatal=False)
        # re.match returns None when og:url is not a URL this extractor
        # recognises; treat that the same as a missing og:url.
        final_mobj = re.match(self._VALID_URL, final_url) if final_url else None
        playlist_id = final_mobj.group('playlist_id') if final_mobj else None

        return self.playlist_result(
            playlist_entries, playlist_id=playlist_id,
            playlist_title=self._og_search_title(webpage, fatal=False),
            playlist_description=self._og_search_description(webpage))

    def _talk_info(self, url, video_name):
        """Extract a single talk page.

        Formats are gathered from, in order: native downloads, subtitled
        downloads, the player ``resources`` (``hls``, ``h264``, ``rtmp``),
        HTTP URLs derived from the HLS variants, and the audio download.

        Returns an info dict, or -- when no format was found and the player
        data names an external service -- a ``url_result`` pointing at that
        external video.

        Raises ``KeyError``/``IndexError`` if the page data lacks ``talks``,
        ``title``, ``player_talks`` or ``id``.
        """
        webpage = self._download_webpage(url, video_name)

        info = self._extract_info(webpage)

        # The talk data is either nested under __INITIAL_DATA__ or top-level.
        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
        talk_info = data['talks'][0]

        title = talk_info['title'].strip()

        downloads = talk_info.get('downloads') or {}
        native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}

        formats = [{
            'url': format_url,
            'format_id': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]

        subtitled_downloads = downloads.get('subtitledDownloads') or {}
        for lang, subtitled_download in subtitled_downloads.items():
            for q in self._NATIVE_FORMATS:
                q_url = subtitled_download.get(q)
                if not q_url:
                    continue
                formats.append({
                    'url': q_url,
                    'format_id': '%s-%s' % (q, lang),
                    'language': lang,
                })

        # Attach the known dimensions; the quality name is the part of the
        # format_id before the first '-'.
        for f in formats:
            finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
            if finfo:
                f.update(finfo)

        player_talk = talk_info['player_talks'][0]

        # Fall back to an empty mapping so that talks without any resources
        # still reach the audio/external handling below instead of failing
        # on None.items().
        resources_ = player_talk.get('resources') or talk_info.get('resources') or {}

        # Last h264 URL containing a "<digits>k" token; used below as the
        # template for deriving HTTP URLs from the HLS variants.
        http_url = None
        for format_id, resources in resources_.items():
            if format_id == 'hls':
                if not isinstance(resources, dict):
                    continue
                stream_url = url_or_none(resources.get('stream'))
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id,
                    fatal=False))
            else:
                if not isinstance(resources, list):
                    continue
                if format_id == 'h264':
                    for resource in resources:
                        h264_url = resource.get('file')
                        if not h264_url:
                            continue
                        bitrate = int_or_none(resource.get('bitrate'))
                        formats.append({
                            'url': h264_url,
                            'format_id': '%s-%sk' % (format_id, bitrate),
                            'tbr': bitrate,
                        })
                        if re.search(r'\d+k', h264_url):
                            http_url = h264_url
                elif format_id == 'rtmp':
                    streamer = talk_info.get('streamer')
                    if not streamer:
                        continue
                    for resource in resources:
                        formats.append({
                            'format_id': '%s-%s' % (format_id, resource.get('name')),
                            'url': streamer,
                            'play_path': resource['file'],
                            'ext': 'flv',
                            'width': int_or_none(resource.get('width')),
                            'height': int_or_none(resource.get('height')),
                            'tbr': int_or_none(resource.get('bitrate')),
                        })

        if http_url:
            # Snapshot the HLS video variants first: the loop appends to
            # `formats`, so it must not iterate over that list itself.
            m3u8_formats = [
                f for f in formats
                if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                # Substitute the variant's bitrate token into the h264 URL
                # and keep the result only if _is_valid_url accepts it.
                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
                if not self._is_valid_url(
                        bitrate_url, video_name, '%s bitrate' % bitrate):
                    continue
                f = m3u8_format.copy()
                f.update({
                    'url': bitrate_url,
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                # Drop an inherited 'none' audio codec from the derived
                # HTTP format.
                if f.get('acodec') == 'none':
                    del f['acodec']
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        if not formats:
            # Nothing hosted by TED itself: delegate to the external service
            # named in the player data, if any.
            external = player_talk.get('external')
            if isinstance(external, dict):
                service = external.get('service')
                if isinstance(service, compat_str):
                    ext_url = None
                    if service.lower() == 'youtube':
                        ext_url = external.get('code')
                    return self.url_result(ext_url or external['uri'])

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        return {
            'id': video_id,
            'title': title,
            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': float_or_none(talk_info.get('duration')),
            'view_count': int_or_none(data.get('viewed_count')),
            'comment_count': int_or_none(
                try_get(data, lambda x: x['comments']['count'])),
            'tags': try_get(talk_info, lambda x: x['tags'], list),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Return ``{lang_code: [{'url': ..., 'ext': ...}, ...]}`` for a talk.

        Languages are read from ``talk_info['downloads']['languages']`` or,
        failing that, ``talk_info['languages']``. Each language gets one
        entry per format in ``('ted', 'srt')``. An empty dict is returned
        when the talk lists no languages.
        """
        # try_get yields None when neither location holds a list; iterate
        # over an empty list in that case instead of raising TypeError.
        languages = try_get(
            talk_info,
            (lambda x: x['downloads']['languages'],
             lambda x: x['languages']), list) or []

        sub_lang_list = {}
        for language in languages:
            lang_code = language.get('languageCode') or language.get('ianaCode')
            if not lang_code:
                continue
            sub_lang_list[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return sub_lang_list

    def _watch_info(self, url, name):
        """Extract a "watch" page.

        When the page carries a ``"pages.jwplayer"`` config object, an info
        dict is built from it (id is *name*). Otherwise the ``src`` of the
        embedded ``pages-video-embed__video__object`` iframe is returned as
        a ``url_result``.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config', default=None)
        if not config_json:
            embed_url = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
            return self.url_result(self._proto_relative_url(embed_url))
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # try_get also covers an 'image' key that is present but null, which
        # a chained .get() would fail on.
        thumbnail = try_get(config, lambda x: x['image']['url'])

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }