prosiebensat1.py (21578B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from hashlib import sha1 7 from .common import InfoExtractor 8 from ..compat import compat_str 9 from ..utils import ( 10 ExtractorError, 11 determine_ext, 12 float_or_none, 13 int_or_none, 14 merge_dicts, 15 unified_strdate, 16 ) 17 18 19 class ProSiebenSat1BaseIE(InfoExtractor): 20 _GEO_BYPASS = False 21 _ACCESS_ID = None 22 _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' 23 _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' 24 25 def _extract_video_info(self, url, clip_id): 26 client_location = url 27 28 video = self._download_json( 29 'http://vas.sim-technik.de/vas/live/v2/videos', 30 clip_id, 'Downloading videos JSON', query={ 31 'access_token': self._TOKEN, 32 'client_location': client_location, 33 'client_name': self._CLIENT_NAME, 34 'ids': clip_id, 35 })[0] 36 37 if video.get('is_protected') is True: 38 raise ExtractorError('This video is DRM protected.', expected=True) 39 40 formats = [] 41 if self._ACCESS_ID: 42 raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID 43 protocols = self._download_json( 44 self._V4_BASE_URL + 'protocols', clip_id, 45 'Downloading protocols JSON', 46 headers=self.geo_verification_headers(), query={ 47 'access_id': self._ACCESS_ID, 48 'client_token': sha1((raw_ct).encode()).hexdigest(), 49 'video_id': clip_id, 50 }, fatal=False, expected_status=(403,)) or {} 51 error = protocols.get('error') or {} 52 if error.get('title') == 'Geo check failed': 53 self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) 54 server_token = protocols.get('server_token') 55 if server_token: 56 urls = (self._download_json( 57 self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ 58 'access_id': self._ACCESS_ID, 59 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), 60 'protocols': self._SUPPORTED_PROTOCOLS, 61 'server_token': server_token, 62 'video_id': clip_id, 63 }, fatal=False) or {}).get('urls') or {} 64 for protocol, variant in urls.items(): 65 source_url = variant.get('clear', {}).get('url') 66 if not source_url: 67 continue 68 if protocol == 'dash': 69 formats.extend(self._extract_mpd_formats( 70 source_url, clip_id, mpd_id=protocol, fatal=False)) 71 elif protocol == 'hls': 72 formats.extend(self._extract_m3u8_formats( 73 source_url, clip_id, 'mp4', 'm3u8_native', 74 m3u8_id=protocol, fatal=False)) 75 else: 76 formats.append({ 77 'url': source_url, 78 'format_id': protocol, 79 }) 80 if not formats: 81 source_ids = [compat_str(source['id']) for source in video['sources']] 82 83 client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() 84 85 sources = self._download_json( 86 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, 87 clip_id, 'Downloading sources JSON', query={ 88 'access_token': self._TOKEN, 89 'client_id': client_id, 90 'client_location': client_location, 91 'client_name': self._CLIENT_NAME, 92 }) 93 server_id = sources['server_id'] 94 95 def fix_bitrate(bitrate): 96 bitrate = int_or_none(bitrate) 97 if not bitrate: 98 return None 99 return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate 100 101 for source_id in source_ids: 102 client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() 103 urls = self._download_json( 104 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, 105 clip_id, 'Downloading urls JSON', fatal=False, query={ 106 'access_token': self._TOKEN, 107 'client_id': client_id, 108 'client_location': client_location, 109 'client_name': self._CLIENT_NAME, 110 'server_id': server_id, 111 'source_ids': source_id, 112 }) 113 if not urls: 114 continue 115 if urls.get('status_code') != 0: 116 raise ExtractorError('This video is unavailable', expected=True) 117 urls_sources = urls['sources'] 118 if isinstance(urls_sources, dict): 119 urls_sources = urls_sources.values() 120 for source in urls_sources: 121 source_url = source.get('url') 122 if not source_url: 123 continue 124 protocol = source.get('protocol') 125 mimetype = source.get('mimetype') 126 if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': 127 formats.extend(self._extract_f4m_formats( 128 source_url, clip_id, f4m_id='hds', fatal=False)) 129 elif mimetype == 'application/x-mpegURL': 130 formats.extend(self._extract_m3u8_formats( 131 source_url, clip_id, 'mp4', 'm3u8_native', 132 m3u8_id='hls', fatal=False)) 133 elif mimetype == 'application/dash+xml': 134 formats.extend(self._extract_mpd_formats( 135 source_url, clip_id, mpd_id='dash', fatal=False)) 136 else: 137 tbr = fix_bitrate(source['bitrate']) 138 if protocol in ('rtmp', 'rtmpe'): 139 mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) 140 if not mobj: 141 continue 142 path = mobj.group('path') 143 mp4colon_index = path.rfind('mp4:') 144 app = path[:mp4colon_index] 145 play_path = path[mp4colon_index:] 146 formats.append({ 147 'url': '%s/%s' % (mobj.group('url'), app), 148 'app': app, 149 'play_path': play_path, 150 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', 151 'page_url': 'http://www.prosieben.de', 152 'tbr': tbr, 153 'ext': 'flv', 154 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), 155 }) 156 else: 157 formats.append({ 158 'url': source_url, 159 'tbr': tbr, 160 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), 161 }) 162 self._sort_formats(formats) 163 164 return { 165 'duration': float_or_none(video.get('duration')), 166 'formats': formats, 167 } 168 169 170 class ProSiebenSat1IE(ProSiebenSat1BaseIE): 171 IE_NAME = 'prosiebensat1' 172 IE_DESC = 'ProSiebenSat.1 Digital' 173 _VALID_URL = r'''(?x) 174 https?:// 175 (?:www\.)? 176 (?: 177 (?:beta\.)? 178 (?: 179 prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia 180 )\.(?:de|at|ch)| 181 ran\.de|fem\.com|advopedia\.de|galileo\.tv/video 182 ) 183 /(?P<id>.+) 184 ''' 185 186 _TESTS = [ 187 { 188 # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 189 # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: 190 # - malformed f4m manifest support 191 # - proper handling of URLs starting with `https?://` in 2.0 manifests 192 # - recursive child f4m manifests extraction 193 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 194 'info_dict': { 195 'id': '2104602', 196 'ext': 'mp4', 197 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', 198 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 199 'upload_date': '20131231', 200 'duration': 5845.04, 201 'series': 'CIRCUS HALLIGALLI', 202 'season_number': 2, 203 'episode': 'Episode 18 - Staffel 2', 204 'episode_number': 18, 205 }, 206 }, 207 { 208 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', 209 'info_dict': { 210 'id': '2570327', 211 'ext': 'mp4', 212 'title': 'Lady-Umstyling für Audrina', 213 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', 214 'upload_date': '20131014', 215 'duration': 606.76, 216 }, 217 'params': { 218 # rtmp download 219 'skip_download': True, 220 }, 221 'skip': 'Seems to be broken', 222 }, 223 { 224 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', 225 'info_dict': { 226 'id': '2429369', 227 'ext': 'mp4', 228 'title': 'Countdown für die Autowerkstatt', 229 'description': 'md5:809fc051a457b5d8666013bc40698817', 230 'upload_date': '20140223', 231 'duration': 2595.04, 232 }, 233 'params': { 234 # rtmp download 235 'skip_download': True, 236 }, 237 'skip': 'This video is unavailable', 238 }, 239 { 240 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', 241 'info_dict': { 242 'id': '2904997', 243 'ext': 'mp4', 244 'title': 'Sexy laufen in Ugg Boots', 245 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', 246 'upload_date': '20140122', 247 'duration': 245.32, 248 }, 249 'params': { 250 # rtmp download 251 'skip_download': True, 252 }, 253 'skip': 'This video is unavailable', 254 }, 255 { 256 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', 257 'info_dict': { 258 'id': '2906572', 259 'ext': 'mp4', 260 'title': 'Im Interview: Kai Wiesinger', 261 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', 262 'upload_date': '20140203', 263 'duration': 522.56, 264 }, 265 'params': { 266 # rtmp download 267 'skip_download': True, 268 }, 269 'skip': 'This video is unavailable', 270 }, 271 { 272 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', 273 'info_dict': { 274 'id': '2992323', 275 'ext': 'mp4', 276 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', 277 'description': 'md5:2669cde3febe9bce13904f701e774eb6', 278 'upload_date': '20141014', 279 'duration': 2410.44, 280 }, 281 'params': { 282 # rtmp download 283 'skip_download': True, 284 }, 285 'skip': 'This video is unavailable', 286 }, 287 { 288 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', 289 'info_dict': { 290 'id': '3004256', 291 'ext': 'mp4', 292 'title': 'Schalke: Tönnies möchte Raul zurück', 293 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', 294 'upload_date': '20140226', 295 'duration': 228.96, 296 }, 297 'params': { 298 # rtmp download 299 'skip_download': True, 300 }, 301 'skip': 'This video is unavailable', 302 }, 303 { 304 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 305 'info_dict': { 306 'id': '2572814', 307 'ext': 'mp4', 308 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', 309 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 310 'timestamp': 1382041620, 311 'upload_date': '20131017', 312 'duration': 469.88, 313 }, 314 'params': { 315 'skip_download': True, 316 }, 317 }, 318 { 319 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', 320 'info_dict': { 321 'id': '2156342', 322 'ext': 'mp4', 323 'title': 'Kurztrips zum Valentinstag', 324 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', 325 'duration': 307.24, 326 }, 327 'params': { 328 'skip_download': True, 329 }, 330 }, 331 { 332 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', 333 'info_dict': { 334 'id': '439664', 335 'title': 'Episode 8 - Ganze Folge - Playlist', 336 'description': 'md5:63b8963e71f481782aeea877658dec84', 337 }, 338 'playlist_count': 2, 339 'skip': 'This video is unavailable', 340 }, 341 { 342 # title in <h2 class="subtitle"> 343 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', 344 'info_dict': { 345 'id': '4895826', 346 'ext': 'mp4', 347 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', 348 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', 349 'upload_date': '20170302', 350 }, 351 'params': { 352 'skip_download': True, 353 }, 354 'skip': 'geo restricted to Germany', 355 }, 356 { 357 # geo restricted to Germany 358 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', 359 'only_matching': True, 360 }, 361 { 362 # geo restricted to Germany 363 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', 364 'only_matching': True, 365 }, 366 { 367 # geo restricted to Germany 368 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', 369 'only_matching': True, 370 }, 371 { 372 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', 373 'only_matching': True, 374 }, 375 { 376 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', 377 'only_matching': True, 378 }, 379 ] 380 381 _TOKEN = 'prosieben' 382 _SALT = '01!8d8F_)r9]4s[qeuXfP%' 383 _CLIENT_NAME = 'kolibri-2.0.19-splec4' 384 385 _ACCESS_ID = 'x_prosiebenmaxx-de' 386 _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' 387 _IV = 'Aeluchoc6aevechuipiexeeboowedaok' 388 389 _CLIPID_REGEXES = [ 390 r'"clip_id"\s*:\s+"(\d+)"', 391 r'clipid: "(\d+)"', 392 r'clip[iI]d=(\d+)', 393 r'clip[iI][dD]\s*=\s*["\'](\d+)', 394 r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", 395 r'proMamsId"\s*:\s*"(\d+)', 396 r'proMamsId"\s*:\s*"(\d+)', 397 ] 398 _TITLE_REGEXES = [ 399 r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', 400 r'<header class="clearfix">\s*<h3>(.+?)</h3>', 401 r'<!-- start video -->\s*<h1>(.+?)</h1>', 402 r'<h1 class="att-name">\s*(.+?)</h1>', 403 r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', 404 r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', 405 r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', 406 r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', 407 ] 408 _DESCRIPTION_REGEXES = [ 409 r'<p itemprop="description">\s*(.+?)</p>', 410 r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', 411 r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', 412 r'<p class="att-description">\s*(.+?)\s*</p>', 413 r'<p class="video-description" itemprop="description">\s*(.+?)</p>', 414 r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', 415 ] 416 _UPLOAD_DATE_REGEXES = [ 417 r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', 418 r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', 419 r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', 420 r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', 421 ] 422 _PAGE_TYPE_REGEXES = [ 423 r'<meta name="page_type" content="([^"]+)">', 424 r"'itemType'\s*:\s*'([^']*)'", 425 ] 426 _PLAYLIST_ID_REGEXES = [ 427 r'content[iI]d=(\d+)', 428 r"'itemId'\s*:\s*'([^']*)'", 429 ] 430 _PLAYLIST_CLIP_REGEXES = [ 431 r'(?s)data-qvt=.+?<a href="([^"]+)"', 432 ] 433 434 def _extract_clip(self, url, webpage): 435 clip_id = self._html_search_regex( 436 self._CLIPID_REGEXES, webpage, 'clip id') 437 title = self._html_search_regex( 438 self._TITLE_REGEXES, webpage, 'title', 439 default=None) or self._og_search_title(webpage) 440 info = self._extract_video_info(url, clip_id) 441 description = self._html_search_regex( 442 self._DESCRIPTION_REGEXES, webpage, 'description', default=None) 443 if description is None: 444 description = self._og_search_description(webpage) 445 thumbnail = self._og_search_thumbnail(webpage) 446 upload_date = unified_strdate( 447 self._html_search_meta('og:published_time', webpage, 448 'upload date', default=None) 449 or self._html_search_regex(self._UPLOAD_DATE_REGEXES, 450 webpage, 'upload date', default=None)) 451 452 json_ld = self._search_json_ld(webpage, clip_id, default={}) 453 454 return merge_dicts(info, { 455 'id': clip_id, 456 'title': title, 457 'description': description, 458 'thumbnail': thumbnail, 459 'upload_date': upload_date, 460 }, json_ld) 461 462 def _extract_playlist(self, url, webpage): 463 playlist_id = self._html_search_regex( 464 self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') 465 playlist = self._parse_json( 466 self._search_regex( 467 r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', 468 webpage, 'playlist'), 469 playlist_id) 470 entries = [] 471 for item in playlist: 472 clip_id = item.get('id') or item.get('upc') 473 if not clip_id: 474 continue 475 info = self._extract_video_info(url, clip_id) 476 info.update({ 477 'id': clip_id, 478 'title': item.get('title') or item.get('teaser', {}).get('headline'), 479 'description': item.get('teaser', {}).get('description'), 480 'thumbnail': item.get('poster'), 481 'duration': float_or_none(item.get('duration')), 482 'series': item.get('tvShowTitle'), 483 'uploader': item.get('broadcastPublisher'), 484 }) 485 entries.append(info) 486 return self.playlist_result(entries, playlist_id) 487 488 def _real_extract(self, url): 489 video_id = self._match_id(url) 490 webpage = self._download_webpage(url, video_id) 491 page_type = self._search_regex( 492 self._PAGE_TYPE_REGEXES, webpage, 493 'page type', default='clip').lower() 494 if page_type == 'clip': 495 return self._extract_clip(url, webpage) 496 elif page_type == 'playlist': 497 return self._extract_playlist(url, webpage) 498 else: 499 raise ExtractorError( 500 'Unsupported page type %s' % page_type, expected=True)