youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

orf.py (20485B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..compat import compat_str
      8 from ..utils import (
      9     clean_html,
     10     determine_ext,
     11     float_or_none,
     12     HEADRequest,
     13     int_or_none,
     14     orderedSet,
     15     remove_end,
     16     str_or_none,
     17     strip_jsonp,
     18     unescapeHTML,
     19     unified_strdate,
     20     url_or_none,
     21 )
     22 
     23 
     24 class ORFTVthekIE(InfoExtractor):
     25     IE_NAME = 'orf:tvthek'
     26     IE_DESC = 'ORF TVthek'
     27     _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
     28 
     29     _TESTS = [{
     30         'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
     31         'playlist': [{
     32             'md5': '2942210346ed779588f428a92db88712',
     33             'info_dict': {
     34                 'id': '8896777',
     35                 'ext': 'mp4',
     36                 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
     37                 'description': 'md5:c1272f0245537812d4e36419c207b67d',
     38                 'duration': 2668,
     39                 'upload_date': '20141208',
     40             },
     41         }],
     42         'skip': 'Blocked outside of Austria / Germany',
     43     }, {
     44         'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
     45         'info_dict': {
     46             'id': '7982259',
     47             'ext': 'mp4',
     48             'title': 'Best of Ingrid Thurnher',
     49             'upload_date': '20140527',
     50             'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
     51         },
     52         'params': {
     53             'skip_download': True,  # rtsp downloads
     54         },
     55         'skip': 'Blocked outside of Austria / Germany',
     56     }, {
     57         'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
     58         'only_matching': True,
     59     }, {
     60         'url': 'http://tvthek.orf.at/profile/Universum/35429',
     61         'only_matching': True,
     62     }]
     63 
     64     def _real_extract(self, url):
     65         playlist_id = self._match_id(url)
     66         webpage = self._download_webpage(url, playlist_id)
     67 
     68         data_jsb = self._parse_json(
     69             self._search_regex(
     70                 r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
     71                 webpage, 'playlist', group='json'),
     72             playlist_id, transform_source=unescapeHTML)['playlist']['videos']
     73 
     74         entries = []
     75         for sd in data_jsb:
     76             video_id, title = sd.get('id'), sd.get('title')
     77             if not video_id or not title:
     78                 continue
     79             video_id = compat_str(video_id)
     80             formats = []
     81             for fd in sd['sources']:
     82                 src = url_or_none(fd.get('src'))
     83                 if not src:
     84                     continue
     85                 format_id_list = []
     86                 for key in ('delivery', 'quality', 'quality_string'):
     87                     value = fd.get(key)
     88                     if value:
     89                         format_id_list.append(value)
     90                 format_id = '-'.join(format_id_list)
     91                 ext = determine_ext(src)
     92                 if ext == 'm3u8':
     93                     m3u8_formats = self._extract_m3u8_formats(
     94                         src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
     95                     if any('/geoprotection' in f['url'] for f in m3u8_formats):
     96                         self.raise_geo_restricted()
     97                     formats.extend(m3u8_formats)
     98                 elif ext == 'f4m':
     99                     formats.extend(self._extract_f4m_formats(
    100                         src, video_id, f4m_id=format_id, fatal=False))
    101                 elif ext == 'mpd':
    102                     formats.extend(self._extract_mpd_formats(
    103                         src, video_id, mpd_id=format_id, fatal=False))
    104                 else:
    105                     formats.append({
    106                         'format_id': format_id,
    107                         'url': src,
    108                         'protocol': fd.get('protocol'),
    109                     })
    110 
    111             # Check for geoblocking.
    112             # There is a property is_geoprotection, but that's always false
    113             geo_str = sd.get('geoprotection_string')
    114             if geo_str:
    115                 try:
    116                     http_url = next(
    117                         f['url']
    118                         for f in formats
    119                         if re.match(r'^https?://.*\.mp4$', f['url']))
    120                 except StopIteration:
    121                     pass
    122                 else:
    123                     req = HEADRequest(http_url)
    124                     self._request_webpage(
    125                         req, video_id,
    126                         note='Testing for geoblocking',
    127                         errnote=((
    128                             'This video seems to be blocked outside of %s. '
    129                             'You may want to try the streaming-* formats.')
    130                             % geo_str),
    131                         fatal=False)
    132 
    133             self._check_formats(formats, video_id)
    134             self._sort_formats(formats)
    135 
    136             subtitles = {}
    137             for sub in sd.get('subtitles', []):
    138                 sub_src = sub.get('src')
    139                 if not sub_src:
    140                     continue
    141                 subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
    142                     'url': sub_src,
    143                 })
    144 
    145             upload_date = unified_strdate(sd.get('created_date'))
    146 
    147             thumbnails = []
    148             preview = sd.get('preview_image_url')
    149             if preview:
    150                 thumbnails.append({
    151                     'id': 'preview',
    152                     'url': preview,
    153                     'preference': 0,
    154                 })
    155             image = sd.get('image_full_url')
    156             if not image and len(data_jsb) == 1:
    157                 image = self._og_search_thumbnail(webpage)
    158             if image:
    159                 thumbnails.append({
    160                     'id': 'full',
    161                     'url': image,
    162                     'preference': 1,
    163                 })
    164 
    165             entries.append({
    166                 '_type': 'video',
    167                 'id': video_id,
    168                 'title': title,
    169                 'formats': formats,
    170                 'subtitles': subtitles,
    171                 'description': sd.get('description'),
    172                 'duration': int_or_none(sd.get('duration_in_seconds')),
    173                 'upload_date': upload_date,
    174                 'thumbnails': thumbnails,
    175             })
    176 
    177         return {
    178             '_type': 'playlist',
    179             'entries': entries,
    180             'id': playlist_id,
    181         }
    182 
    183 
    184 class ORFRadioIE(InfoExtractor):
    185     def _real_extract(self, url):
    186         mobj = re.match(self._VALID_URL, url)
    187         show_date = mobj.group('date')
    188         show_id = mobj.group('show')
    189 
    190         data = self._download_json(
    191             'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
    192             % (self._API_STATION, show_id, show_date), show_id)
    193 
    194         entries = []
    195         for info in data['streams']:
    196             loop_stream_id = str_or_none(info.get('loopStreamId'))
    197             if not loop_stream_id:
    198                 continue
    199             title = str_or_none(data.get('title'))
    200             if not title:
    201                 continue
    202             start = int_or_none(info.get('start'), scale=1000)
    203             end = int_or_none(info.get('end'), scale=1000)
    204             duration = end - start if end and start else None
    205             entries.append({
    206                 'id': loop_stream_id.replace('.mp3', ''),
    207                 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
    208                 'title': title,
    209                 'description': clean_html(data.get('subtitle')),
    210                 'duration': duration,
    211                 'timestamp': start,
    212                 'ext': 'mp3',
    213                 'series': data.get('programTitle'),
    214             })
    215 
    216         return {
    217             '_type': 'playlist',
    218             'id': show_id,
    219             'title': data.get('title'),
    220             'description': clean_html(data.get('subtitle')),
    221             'entries': entries,
    222         }
    223 
    224 
    225 class ORFFM4IE(ORFRadioIE):
    226     IE_NAME = 'orf:fm4'
    227     IE_DESC = 'radio FM4'
    228     _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'
    229     _API_STATION = 'fm4'
    230     _LOOP_STATION = 'fm4'
    231 
    232     _TEST = {
    233         'url': 'http://fm4.orf.at/player/20170107/4CC',
    234         'md5': '2b0be47375432a7ef104453432a19212',
    235         'info_dict': {
    236             'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
    237             'ext': 'mp3',
    238             'title': 'Solid Steel Radioshow',
    239             'description': 'Die Mixshow von Coldcut und Ninja Tune.',
    240             'duration': 3599,
    241             'timestamp': 1483819257,
    242             'upload_date': '20170107',
    243         },
    244         'skip': 'Shows from ORF radios are only available for 7 days.',
    245         'only_matching': True,
    246     }
    247 
    248 
    249 class ORFNOEIE(ORFRadioIE):
    250     IE_NAME = 'orf:noe'
    251     IE_DESC = 'Radio Niederösterreich'
    252     _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    253     _API_STATION = 'noe'
    254     _LOOP_STATION = 'oe2n'
    255 
    256     _TEST = {
    257         'url': 'https://noe.orf.at/player/20200423/NGM',
    258         'only_matching': True,
    259     }
    260 
    261 
    262 class ORFWIEIE(ORFRadioIE):
    263     IE_NAME = 'orf:wien'
    264     IE_DESC = 'Radio Wien'
    265     _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    266     _API_STATION = 'wie'
    267     _LOOP_STATION = 'oe2w'
    268 
    269     _TEST = {
    270         'url': 'https://wien.orf.at/player/20200423/WGUM',
    271         'only_matching': True,
    272     }
    273 
    274 
    275 class ORFBGLIE(ORFRadioIE):
    276     IE_NAME = 'orf:burgenland'
    277     IE_DESC = 'Radio Burgenland'
    278     _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    279     _API_STATION = 'bgl'
    280     _LOOP_STATION = 'oe2b'
    281 
    282     _TEST = {
    283         'url': 'https://burgenland.orf.at/player/20200423/BGM',
    284         'only_matching': True,
    285     }
    286 
    287 
    288 class ORFOOEIE(ORFRadioIE):
    289     IE_NAME = 'orf:oberoesterreich'
    290     IE_DESC = 'Radio Oberösterreich'
    291     _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    292     _API_STATION = 'ooe'
    293     _LOOP_STATION = 'oe2o'
    294 
    295     _TEST = {
    296         'url': 'https://ooe.orf.at/player/20200423/OGMO',
    297         'only_matching': True,
    298     }
    299 
    300 
    301 class ORFSTMIE(ORFRadioIE):
    302     IE_NAME = 'orf:steiermark'
    303     IE_DESC = 'Radio Steiermark'
    304     _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    305     _API_STATION = 'stm'
    306     _LOOP_STATION = 'oe2st'
    307 
    308     _TEST = {
    309         'url': 'https://steiermark.orf.at/player/20200423/STGMS',
    310         'only_matching': True,
    311     }
    312 
    313 
    314 class ORFKTNIE(ORFRadioIE):
    315     IE_NAME = 'orf:kaernten'
    316     IE_DESC = 'Radio Kärnten'
    317     _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    318     _API_STATION = 'ktn'
    319     _LOOP_STATION = 'oe2k'
    320 
    321     _TEST = {
    322         'url': 'https://kaernten.orf.at/player/20200423/KGUMO',
    323         'only_matching': True,
    324     }
    325 
    326 
    327 class ORFSBGIE(ORFRadioIE):
    328     IE_NAME = 'orf:salzburg'
    329     IE_DESC = 'Radio Salzburg'
    330     _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    331     _API_STATION = 'sbg'
    332     _LOOP_STATION = 'oe2s'
    333 
    334     _TEST = {
    335         'url': 'https://salzburg.orf.at/player/20200423/SGUM',
    336         'only_matching': True,
    337     }
    338 
    339 
    340 class ORFTIRIE(ORFRadioIE):
    341     IE_NAME = 'orf:tirol'
    342     IE_DESC = 'Radio Tirol'
    343     _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    344     _API_STATION = 'tir'
    345     _LOOP_STATION = 'oe2t'
    346 
    347     _TEST = {
    348         'url': 'https://tirol.orf.at/player/20200423/TGUMO',
    349         'only_matching': True,
    350     }
    351 
    352 
    353 class ORFVBGIE(ORFRadioIE):
    354     IE_NAME = 'orf:vorarlberg'
    355     IE_DESC = 'Radio Vorarlberg'
    356     _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    357     _API_STATION = 'vbg'
    358     _LOOP_STATION = 'oe2v'
    359 
    360     _TEST = {
    361         'url': 'https://vorarlberg.orf.at/player/20200423/VGUM',
    362         'only_matching': True,
    363     }
    364 
    365 
    366 class ORFOE3IE(ORFRadioIE):
    367     IE_NAME = 'orf:oe3'
    368     IE_DESC = 'Radio Österreich 3'
    369     _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    370     _API_STATION = 'oe3'
    371     _LOOP_STATION = 'oe3'
    372 
    373     _TEST = {
    374         'url': 'https://oe3.orf.at/player/20200424/3WEK',
    375         'only_matching': True,
    376     }
    377 
    378 
    379 class ORFOE1IE(ORFRadioIE):
    380     IE_NAME = 'orf:oe1'
    381     IE_DESC = 'Radio Österreich 1'
    382     _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
    383     _API_STATION = 'oe1'
    384     _LOOP_STATION = 'oe1'
    385 
    386     _TEST = {
    387         'url': 'http://oe1.orf.at/player/20170108/456544',
    388         'md5': '34d8a6e67ea888293741c86a099b745b',
    389         'info_dict': {
    390             'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
    391             'ext': 'mp3',
    392             'title': 'Morgenjournal',
    393             'duration': 609,
    394             'timestamp': 1483858796,
    395             'upload_date': '20170108',
    396         },
    397         'skip': 'Shows from ORF radios are only available for 7 days.'
    398     }
    399 
    400 
    401 class ORFIPTVIE(InfoExtractor):
    402     IE_NAME = 'orf:iptv'
    403     IE_DESC = 'iptv.ORF.at'
    404     _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
    405 
    406     _TEST = {
    407         'url': 'http://iptv.orf.at/stories/2275236/',
    408         'md5': 'c8b22af4718a4b4af58342529453e3e5',
    409         'info_dict': {
    410             'id': '350612',
    411             'ext': 'flv',
    412             'title': 'Weitere Evakuierungen um Vulkan Calbuco',
    413             'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
    414             'duration': 68.197,
    415             'thumbnail': r're:^https?://.*\.jpg$',
    416             'upload_date': '20150425',
    417         },
    418     }
    419 
    420     def _real_extract(self, url):
    421         story_id = self._match_id(url)
    422 
    423         webpage = self._download_webpage(
    424             'http://iptv.orf.at/stories/%s' % story_id, story_id)
    425 
    426         video_id = self._search_regex(
    427             r'data-video(?:id)?="(\d+)"', webpage, 'video id')
    428 
    429         data = self._download_json(
    430             'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
    431             video_id)[0]
    432 
    433         duration = float_or_none(data['duration'], 1000)
    434 
    435         video = data['sources']['default']
    436         load_balancer_url = video['loadBalancerUrl']
    437         abr = int_or_none(video.get('audioBitrate'))
    438         vbr = int_or_none(video.get('bitrate'))
    439         fps = int_or_none(video.get('videoFps'))
    440         width = int_or_none(video.get('videoWidth'))
    441         height = int_or_none(video.get('videoHeight'))
    442         thumbnail = video.get('preview')
    443 
    444         rendition = self._download_json(
    445             load_balancer_url, video_id, transform_source=strip_jsonp)
    446 
    447         f = {
    448             'abr': abr,
    449             'vbr': vbr,
    450             'fps': fps,
    451             'width': width,
    452             'height': height,
    453         }
    454 
    455         formats = []
    456         for format_id, format_url in rendition['redirect'].items():
    457             if format_id == 'rtmp':
    458                 ff = f.copy()
    459                 ff.update({
    460                     'url': format_url,
    461                     'format_id': format_id,
    462                 })
    463                 formats.append(ff)
    464             elif determine_ext(format_url) == 'f4m':
    465                 formats.extend(self._extract_f4m_formats(
    466                     format_url, video_id, f4m_id=format_id))
    467             elif determine_ext(format_url) == 'm3u8':
    468                 formats.extend(self._extract_m3u8_formats(
    469                     format_url, video_id, 'mp4', m3u8_id=format_id))
    470             else:
    471                 continue
    472         self._sort_formats(formats)
    473 
    474         title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
    475         description = self._og_search_description(webpage)
    476         upload_date = unified_strdate(self._html_search_meta(
    477             'dc.date', webpage, 'upload date'))
    478 
    479         return {
    480             'id': video_id,
    481             'title': title,
    482             'description': description,
    483             'duration': duration,
    484             'thumbnail': thumbnail,
    485             'upload_date': upload_date,
    486             'formats': formats,
    487         }
    488 
    489 
    490 class ORFFM4StoryIE(InfoExtractor):
    491     IE_NAME = 'orf:fm4:story'
    492     IE_DESC = 'fm4.orf.at stories'
    493     _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'
    494 
    495     _TEST = {
    496         'url': 'http://fm4.orf.at/stories/2865738/',
    497         'playlist': [{
    498             'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
    499             'info_dict': {
    500                 'id': '547792',
    501                 'ext': 'flv',
    502                 'title': 'Manu Delago und Inner Tongue live',
    503                 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
    504                 'duration': 1748.52,
    505                 'thumbnail': r're:^https?://.*\.jpg$',
    506                 'upload_date': '20170913',
    507             },
    508         }, {
    509             'md5': 'c6dd2179731f86f4f55a7b49899d515f',
    510             'info_dict': {
    511                 'id': '547798',
    512                 'ext': 'flv',
    513                 'title': 'Manu Delago und Inner Tongue live (2)',
    514                 'duration': 1504.08,
    515                 'thumbnail': r're:^https?://.*\.jpg$',
    516                 'upload_date': '20170913',
    517                 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
    518             },
    519         }],
    520     }
    521 
    522     def _real_extract(self, url):
    523         story_id = self._match_id(url)
    524         webpage = self._download_webpage(url, story_id)
    525 
    526         entries = []
    527         all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
    528         for idx, video_id in enumerate(all_ids):
    529             data = self._download_json(
    530                 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
    531                 video_id)[0]
    532 
    533             duration = float_or_none(data['duration'], 1000)
    534 
    535             video = data['sources']['q8c']
    536             load_balancer_url = video['loadBalancerUrl']
    537             abr = int_or_none(video.get('audioBitrate'))
    538             vbr = int_or_none(video.get('bitrate'))
    539             fps = int_or_none(video.get('videoFps'))
    540             width = int_or_none(video.get('videoWidth'))
    541             height = int_or_none(video.get('videoHeight'))
    542             thumbnail = video.get('preview')
    543 
    544             rendition = self._download_json(
    545                 load_balancer_url, video_id, transform_source=strip_jsonp)
    546 
    547             f = {
    548                 'abr': abr,
    549                 'vbr': vbr,
    550                 'fps': fps,
    551                 'width': width,
    552                 'height': height,
    553             }
    554 
    555             formats = []
    556             for format_id, format_url in rendition['redirect'].items():
    557                 if format_id == 'rtmp':
    558                     ff = f.copy()
    559                     ff.update({
    560                         'url': format_url,
    561                         'format_id': format_id,
    562                     })
    563                     formats.append(ff)
    564                 elif determine_ext(format_url) == 'f4m':
    565                     formats.extend(self._extract_f4m_formats(
    566                         format_url, video_id, f4m_id=format_id))
    567                 elif determine_ext(format_url) == 'm3u8':
    568                     formats.extend(self._extract_m3u8_formats(
    569                         format_url, video_id, 'mp4', m3u8_id=format_id))
    570                 else:
    571                     continue
    572             self._sort_formats(formats)
    573 
    574             title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
    575             if idx >= 1:
    576                 # Titles are duplicates, make them unique
    577                 title += ' (' + str(idx + 1) + ')'
    578             description = self._og_search_description(webpage)
    579             upload_date = unified_strdate(self._html_search_meta(
    580                 'dc.date', webpage, 'upload date'))
    581 
    582             entries.append({
    583                 'id': video_id,
    584                 'title': title,
    585                 'description': description,
    586                 'duration': duration,
    587                 'thumbnail': thumbnail,
    588                 'upload_date': upload_date,
    589                 'formats': formats,
    590             })
    591 
    592         return self.playlist_result(entries)