orf.py (20485B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..compat import compat_str 8 from ..utils import ( 9 clean_html, 10 determine_ext, 11 float_or_none, 12 HEADRequest, 13 int_or_none, 14 orderedSet, 15 remove_end, 16 str_or_none, 17 strip_jsonp, 18 unescapeHTML, 19 unified_strdate, 20 url_or_none, 21 ) 22 23 24 class ORFTVthekIE(InfoExtractor): 25 IE_NAME = 'orf:tvthek' 26 IE_DESC = 'ORF TVthek' 27 _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)' 28 29 _TESTS = [{ 30 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', 31 'playlist': [{ 32 'md5': '2942210346ed779588f428a92db88712', 33 'info_dict': { 34 'id': '8896777', 35 'ext': 'mp4', 36 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde', 37 'description': 'md5:c1272f0245537812d4e36419c207b67d', 38 'duration': 2668, 39 'upload_date': '20141208', 40 }, 41 }], 42 'skip': 'Blocked outside of Austria / Germany', 43 }, { 44 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256', 45 'info_dict': { 46 'id': '7982259', 47 'ext': 'mp4', 48 'title': 'Best of Ingrid Thurnher', 49 'upload_date': '20140527', 50 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', 51 }, 52 'params': { 53 'skip_download': True, # rtsp downloads 54 }, 55 'skip': 'Blocked outside of Austria / Germany', 56 }, { 57 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', 58 'only_matching': True, 59 }, { 60 'url': 'http://tvthek.orf.at/profile/Universum/35429', 61 'only_matching': True, 62 }] 63 64 def _real_extract(self, url): 65 playlist_id = self._match_id(url) 66 webpage = self._download_webpage(url, playlist_id) 67 68 data_jsb = self._parse_json( 69 self._search_regex( 70 r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2', 71 webpage, 'playlist', group='json'), 72 playlist_id, transform_source=unescapeHTML)['playlist']['videos'] 73 74 entries = [] 75 for sd in data_jsb: 76 video_id, title = sd.get('id'), sd.get('title') 77 if not video_id or not title: 78 continue 79 video_id = compat_str(video_id) 80 formats = [] 81 for fd in sd['sources']: 82 src = url_or_none(fd.get('src')) 83 if not src: 84 continue 85 format_id_list = [] 86 for key in ('delivery', 'quality', 'quality_string'): 87 value = fd.get(key) 88 if value: 89 format_id_list.append(value) 90 format_id = '-'.join(format_id_list) 91 ext = determine_ext(src) 92 if ext == 'm3u8': 93 m3u8_formats = self._extract_m3u8_formats( 94 src, video_id, 'mp4', m3u8_id=format_id, fatal=False) 95 if any('/geoprotection' in f['url'] for f in m3u8_formats): 96 self.raise_geo_restricted() 97 formats.extend(m3u8_formats) 98 elif ext == 'f4m': 99 formats.extend(self._extract_f4m_formats( 100 src, video_id, f4m_id=format_id, fatal=False)) 101 elif ext == 'mpd': 102 formats.extend(self._extract_mpd_formats( 103 src, video_id, mpd_id=format_id, fatal=False)) 104 else: 105 formats.append({ 106 'format_id': format_id, 107 'url': src, 108 'protocol': fd.get('protocol'), 109 }) 110 111 # Check for geoblocking. 112 # There is a property is_geoprotection, but that's always false 113 geo_str = sd.get('geoprotection_string') 114 if geo_str: 115 try: 116 http_url = next( 117 f['url'] 118 for f in formats 119 if re.match(r'^https?://.*\.mp4$', f['url'])) 120 except StopIteration: 121 pass 122 else: 123 req = HEADRequest(http_url) 124 self._request_webpage( 125 req, video_id, 126 note='Testing for geoblocking', 127 errnote=(( 128 'This video seems to be blocked outside of %s. ' 129 'You may want to try the streaming-* formats.') 130 % geo_str), 131 fatal=False) 132 133 self._check_formats(formats, video_id) 134 self._sort_formats(formats) 135 136 subtitles = {} 137 for sub in sd.get('subtitles', []): 138 sub_src = sub.get('src') 139 if not sub_src: 140 continue 141 subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ 142 'url': sub_src, 143 }) 144 145 upload_date = unified_strdate(sd.get('created_date')) 146 147 thumbnails = [] 148 preview = sd.get('preview_image_url') 149 if preview: 150 thumbnails.append({ 151 'id': 'preview', 152 'url': preview, 153 'preference': 0, 154 }) 155 image = sd.get('image_full_url') 156 if not image and len(data_jsb) == 1: 157 image = self._og_search_thumbnail(webpage) 158 if image: 159 thumbnails.append({ 160 'id': 'full', 161 'url': image, 162 'preference': 1, 163 }) 164 165 entries.append({ 166 '_type': 'video', 167 'id': video_id, 168 'title': title, 169 'formats': formats, 170 'subtitles': subtitles, 171 'description': sd.get('description'), 172 'duration': int_or_none(sd.get('duration_in_seconds')), 173 'upload_date': upload_date, 174 'thumbnails': thumbnails, 175 }) 176 177 return { 178 '_type': 'playlist', 179 'entries': entries, 180 'id': playlist_id, 181 } 182 183 184 class ORFRadioIE(InfoExtractor): 185 def _real_extract(self, url): 186 mobj = re.match(self._VALID_URL, url) 187 show_date = mobj.group('date') 188 show_id = mobj.group('show') 189 190 data = self._download_json( 191 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' 192 % (self._API_STATION, show_id, show_date), show_id) 193 194 entries = [] 195 for info in data['streams']: 196 loop_stream_id = str_or_none(info.get('loopStreamId')) 197 if not loop_stream_id: 198 continue 199 title = str_or_none(data.get('title')) 200 if not title: 201 continue 202 start = int_or_none(info.get('start'), scale=1000) 203 end = int_or_none(info.get('end'), scale=1000) 204 duration = end - start if end and start else None 205 entries.append({ 206 'id': loop_stream_id.replace('.mp3', ''), 207 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), 208 'title': title, 209 'description': clean_html(data.get('subtitle')), 210 'duration': duration, 211 'timestamp': start, 212 'ext': 'mp3', 213 'series': data.get('programTitle'), 214 }) 215 216 return { 217 '_type': 'playlist', 218 'id': show_id, 219 'title': data.get('title'), 220 'description': clean_html(data.get('subtitle')), 221 'entries': entries, 222 } 223 224 225 class ORFFM4IE(ORFRadioIE): 226 IE_NAME = 'orf:fm4' 227 IE_DESC = 'radio FM4' 228 _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)' 229 _API_STATION = 'fm4' 230 _LOOP_STATION = 'fm4' 231 232 _TEST = { 233 'url': 'http://fm4.orf.at/player/20170107/4CC', 234 'md5': '2b0be47375432a7ef104453432a19212', 235 'info_dict': { 236 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', 237 'ext': 'mp3', 238 'title': 'Solid Steel Radioshow', 239 'description': 'Die Mixshow von Coldcut und Ninja Tune.', 240 'duration': 3599, 241 'timestamp': 1483819257, 242 'upload_date': '20170107', 243 }, 244 'skip': 'Shows from ORF radios are only available for 7 days.', 245 'only_matching': True, 246 } 247 248 249 class ORFNOEIE(ORFRadioIE): 250 IE_NAME = 'orf:noe' 251 IE_DESC = 'Radio Niederösterreich' 252 _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 253 _API_STATION = 'noe' 254 _LOOP_STATION = 'oe2n' 255 256 _TEST = { 257 'url': 'https://noe.orf.at/player/20200423/NGM', 258 'only_matching': True, 259 } 260 261 262 class ORFWIEIE(ORFRadioIE): 263 IE_NAME = 'orf:wien' 264 IE_DESC = 'Radio Wien' 265 _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 266 _API_STATION = 'wie' 267 _LOOP_STATION = 'oe2w' 268 269 _TEST = { 270 'url': 'https://wien.orf.at/player/20200423/WGUM', 271 'only_matching': True, 272 } 273 274 275 class ORFBGLIE(ORFRadioIE): 276 IE_NAME = 'orf:burgenland' 277 IE_DESC = 'Radio Burgenland' 278 _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 279 _API_STATION = 'bgl' 280 _LOOP_STATION = 'oe2b' 281 282 _TEST = { 283 'url': 'https://burgenland.orf.at/player/20200423/BGM', 284 'only_matching': True, 285 } 286 287 288 class ORFOOEIE(ORFRadioIE): 289 IE_NAME = 'orf:oberoesterreich' 290 IE_DESC = 'Radio Oberösterreich' 291 _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 292 _API_STATION = 'ooe' 293 _LOOP_STATION = 'oe2o' 294 295 _TEST = { 296 'url': 'https://ooe.orf.at/player/20200423/OGMO', 297 'only_matching': True, 298 } 299 300 301 class ORFSTMIE(ORFRadioIE): 302 IE_NAME = 'orf:steiermark' 303 IE_DESC = 'Radio Steiermark' 304 _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 305 _API_STATION = 'stm' 306 _LOOP_STATION = 'oe2st' 307 308 _TEST = { 309 'url': 'https://steiermark.orf.at/player/20200423/STGMS', 310 'only_matching': True, 311 } 312 313 314 class ORFKTNIE(ORFRadioIE): 315 IE_NAME = 'orf:kaernten' 316 IE_DESC = 'Radio Kärnten' 317 _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 318 _API_STATION = 'ktn' 319 _LOOP_STATION = 'oe2k' 320 321 _TEST = { 322 'url': 'https://kaernten.orf.at/player/20200423/KGUMO', 323 'only_matching': True, 324 } 325 326 327 class ORFSBGIE(ORFRadioIE): 328 IE_NAME = 'orf:salzburg' 329 IE_DESC = 'Radio Salzburg' 330 _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 331 _API_STATION = 'sbg' 332 _LOOP_STATION = 'oe2s' 333 334 _TEST = { 335 'url': 'https://salzburg.orf.at/player/20200423/SGUM', 336 'only_matching': True, 337 } 338 339 340 class ORFTIRIE(ORFRadioIE): 341 IE_NAME = 'orf:tirol' 342 IE_DESC = 'Radio Tirol' 343 _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 344 _API_STATION = 'tir' 345 _LOOP_STATION = 'oe2t' 346 347 _TEST = { 348 'url': 'https://tirol.orf.at/player/20200423/TGUMO', 349 'only_matching': True, 350 } 351 352 353 class ORFVBGIE(ORFRadioIE): 354 IE_NAME = 'orf:vorarlberg' 355 IE_DESC = 'Radio Vorarlberg' 356 _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 357 _API_STATION = 'vbg' 358 _LOOP_STATION = 'oe2v' 359 360 _TEST = { 361 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM', 362 'only_matching': True, 363 } 364 365 366 class ORFOE3IE(ORFRadioIE): 367 IE_NAME = 'orf:oe3' 368 IE_DESC = 'Radio Österreich 3' 369 _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 370 _API_STATION = 'oe3' 371 _LOOP_STATION = 'oe3' 372 373 _TEST = { 374 'url': 'https://oe3.orf.at/player/20200424/3WEK', 375 'only_matching': True, 376 } 377 378 379 class ORFOE1IE(ORFRadioIE): 380 IE_NAME = 'orf:oe1' 381 IE_DESC = 'Radio Österreich 1' 382 _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' 383 _API_STATION = 'oe1' 384 _LOOP_STATION = 'oe1' 385 386 _TEST = { 387 'url': 'http://oe1.orf.at/player/20170108/456544', 388 'md5': '34d8a6e67ea888293741c86a099b745b', 389 'info_dict': { 390 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', 391 'ext': 'mp3', 392 'title': 'Morgenjournal', 393 'duration': 609, 394 'timestamp': 1483858796, 395 'upload_date': '20170108', 396 }, 397 'skip': 'Shows from ORF radios are only available for 7 days.' 398 } 399 400 401 class ORFIPTVIE(InfoExtractor): 402 IE_NAME = 'orf:iptv' 403 IE_DESC = 'iptv.ORF.at' 404 _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' 405 406 _TEST = { 407 'url': 'http://iptv.orf.at/stories/2275236/', 408 'md5': 'c8b22af4718a4b4af58342529453e3e5', 409 'info_dict': { 410 'id': '350612', 411 'ext': 'flv', 412 'title': 'Weitere Evakuierungen um Vulkan Calbuco', 413 'description': 'md5:d689c959bdbcf04efeddedbf2299d633', 414 'duration': 68.197, 415 'thumbnail': r're:^https?://.*\.jpg$', 416 'upload_date': '20150425', 417 }, 418 } 419 420 def _real_extract(self, url): 421 story_id = self._match_id(url) 422 423 webpage = self._download_webpage( 424 'http://iptv.orf.at/stories/%s' % story_id, story_id) 425 426 video_id = self._search_regex( 427 r'data-video(?:id)?="(\d+)"', webpage, 'video id') 428 429 data = self._download_json( 430 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, 431 video_id)[0] 432 433 duration = float_or_none(data['duration'], 1000) 434 435 video = data['sources']['default'] 436 load_balancer_url = video['loadBalancerUrl'] 437 abr = int_or_none(video.get('audioBitrate')) 438 vbr = int_or_none(video.get('bitrate')) 439 fps = int_or_none(video.get('videoFps')) 440 width = int_or_none(video.get('videoWidth')) 441 height = int_or_none(video.get('videoHeight')) 442 thumbnail = video.get('preview') 443 444 rendition = self._download_json( 445 load_balancer_url, video_id, transform_source=strip_jsonp) 446 447 f = { 448 'abr': abr, 449 'vbr': vbr, 450 'fps': fps, 451 'width': width, 452 'height': height, 453 } 454 455 formats = [] 456 for format_id, format_url in rendition['redirect'].items(): 457 if format_id == 'rtmp': 458 ff = f.copy() 459 ff.update({ 460 'url': format_url, 461 'format_id': format_id, 462 }) 463 formats.append(ff) 464 elif determine_ext(format_url) == 'f4m': 465 formats.extend(self._extract_f4m_formats( 466 format_url, video_id, f4m_id=format_id)) 467 elif determine_ext(format_url) == 'm3u8': 468 formats.extend(self._extract_m3u8_formats( 469 format_url, video_id, 'mp4', m3u8_id=format_id)) 470 else: 471 continue 472 self._sort_formats(formats) 473 474 title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') 475 description = self._og_search_description(webpage) 476 upload_date = unified_strdate(self._html_search_meta( 477 'dc.date', webpage, 'upload date')) 478 479 return { 480 'id': video_id, 481 'title': title, 482 'description': description, 483 'duration': duration, 484 'thumbnail': thumbnail, 485 'upload_date': upload_date, 486 'formats': formats, 487 } 488 489 490 class ORFFM4StoryIE(InfoExtractor): 491 IE_NAME = 'orf:fm4:story' 492 IE_DESC = 'fm4.orf.at stories' 493 _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' 494 495 _TEST = { 496 'url': 'http://fm4.orf.at/stories/2865738/', 497 'playlist': [{ 498 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', 499 'info_dict': { 500 'id': '547792', 501 'ext': 'flv', 502 'title': 'Manu Delago und Inner Tongue live', 503 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', 504 'duration': 1748.52, 505 'thumbnail': r're:^https?://.*\.jpg$', 506 'upload_date': '20170913', 507 }, 508 }, { 509 'md5': 'c6dd2179731f86f4f55a7b49899d515f', 510 'info_dict': { 511 'id': '547798', 512 'ext': 'flv', 513 'title': 'Manu Delago und Inner Tongue live (2)', 514 'duration': 1504.08, 515 'thumbnail': r're:^https?://.*\.jpg$', 516 'upload_date': '20170913', 517 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', 518 }, 519 }], 520 } 521 522 def _real_extract(self, url): 523 story_id = self._match_id(url) 524 webpage = self._download_webpage(url, story_id) 525 526 entries = [] 527 all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) 528 for idx, video_id in enumerate(all_ids): 529 data = self._download_json( 530 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, 531 video_id)[0] 532 533 duration = float_or_none(data['duration'], 1000) 534 535 video = data['sources']['q8c'] 536 load_balancer_url = video['loadBalancerUrl'] 537 abr = int_or_none(video.get('audioBitrate')) 538 vbr = int_or_none(video.get('bitrate')) 539 fps = int_or_none(video.get('videoFps')) 540 width = int_or_none(video.get('videoWidth')) 541 height = int_or_none(video.get('videoHeight')) 542 thumbnail = video.get('preview') 543 544 rendition = self._download_json( 545 load_balancer_url, video_id, transform_source=strip_jsonp) 546 547 f = { 548 'abr': abr, 549 'vbr': vbr, 550 'fps': fps, 551 'width': width, 552 'height': height, 553 } 554 555 formats = [] 556 for format_id, format_url in rendition['redirect'].items(): 557 if format_id == 'rtmp': 558 ff = f.copy() 559 ff.update({ 560 'url': format_url, 561 'format_id': format_id, 562 }) 563 formats.append(ff) 564 elif determine_ext(format_url) == 'f4m': 565 formats.extend(self._extract_f4m_formats( 566 format_url, video_id, f4m_id=format_id)) 567 elif determine_ext(format_url) == 'm3u8': 568 formats.extend(self._extract_m3u8_formats( 569 format_url, video_id, 'mp4', m3u8_id=format_id)) 570 else: 571 continue 572 self._sort_formats(formats) 573 574 title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') 575 if idx >= 1: 576 # Titles are duplicates, make them unique 577 title += ' (' + str(idx + 1) + ')' 578 description = self._og_search_description(webpage) 579 upload_date = unified_strdate(self._html_search_meta( 580 'dc.date', webpage, 'upload date')) 581 582 entries.append({ 583 'id': video_id, 584 'title': title, 585 'description': description, 586 'duration': duration, 587 'thumbnail': thumbnail, 588 'upload_date': upload_date, 589 'formats': formats, 590 }) 591 592 return self.playlist_result(entries)