youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

canvas.py (15030B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 import json
      5 
      6 from .common import InfoExtractor
      7 from .gigya import GigyaBaseIE
      8 from ..compat import compat_HTTPError
      9 from ..utils import (
     10     ExtractorError,
     11     clean_html,
     12     extract_attributes,
     13     float_or_none,
     14     get_element_by_class,
     15     int_or_none,
     16     merge_dicts,
     17     str_or_none,
     18     strip_or_none,
     19     url_or_none,
     20 )
     21 
     22 
     23 class CanvasIE(InfoExtractor):
     24     _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
     25     _TESTS = [{
     26         'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
     27         'md5': '68993eda72ef62386a15ea2cf3c93107',
     28         'info_dict': {
     29             'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
     30             'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
     31             'ext': 'mp4',
     32             'title': 'Nachtwacht: De Greystook',
     33             'description': 'Nachtwacht: De Greystook',
     34             'thumbnail': r're:^https?://.*\.jpg$',
     35             'duration': 1468.04,
     36         },
     37         'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
     38     }, {
     39         'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
     40         'only_matching': True,
     41     }]
     42     _GEO_BYPASS = False
     43     _HLS_ENTRY_PROTOCOLS_MAP = {
     44         'HLS': 'm3u8_native',
     45         'HLS_AES': 'm3u8',
     46     }
     47     _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
     48 
     49     def _real_extract(self, url):
     50         mobj = re.match(self._VALID_URL, url)
     51         site_id, video_id = mobj.group('site_id'), mobj.group('id')
     52 
     53         data = None
     54         if site_id != 'vrtvideo':
     55             # Old API endpoint, serves more formats but may fail for some videos
     56             data = self._download_json(
     57                 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
     58                 % (site_id, video_id), video_id, 'Downloading asset JSON',
     59                 'Unable to download asset JSON', fatal=False)
     60 
     61         # New API endpoint
     62         if not data:
     63             headers = self.geo_verification_headers()
     64             headers.update({'Content-Type': 'application/json'})
     65             token = self._download_json(
     66                 '%s/tokens' % self._REST_API_BASE, video_id,
     67                 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
     68             data = self._download_json(
     69                 '%s/videos/%s' % (self._REST_API_BASE, video_id),
     70                 video_id, 'Downloading video JSON', query={
     71                     'vrtPlayerToken': token,
     72                     'client': '%s@PROD' % site_id,
     73                 }, expected_status=400)
     74             if not data.get('title'):
     75                 code = data.get('code')
     76                 if code == 'AUTHENTICATION_REQUIRED':
     77                     self.raise_login_required()
     78                 elif code == 'INVALID_LOCATION':
     79                     self.raise_geo_restricted(countries=['BE'])
     80                 raise ExtractorError(data.get('message') or code, expected=True)
     81 
     82         title = data['title']
     83         description = data.get('description')
     84 
     85         formats = []
     86         for target in data['targetUrls']:
     87             format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
     88             if not format_url or not format_type:
     89                 continue
     90             format_type = format_type.upper()
     91             if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
     92                 formats.extend(self._extract_m3u8_formats(
     93                     format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
     94                     m3u8_id=format_type, fatal=False))
     95             elif format_type == 'HDS':
     96                 formats.extend(self._extract_f4m_formats(
     97                     format_url, video_id, f4m_id=format_type, fatal=False))
     98             elif format_type == 'MPEG_DASH':
     99                 formats.extend(self._extract_mpd_formats(
    100                     format_url, video_id, mpd_id=format_type, fatal=False))
    101             elif format_type == 'HSS':
    102                 formats.extend(self._extract_ism_formats(
    103                     format_url, video_id, ism_id='mss', fatal=False))
    104             else:
    105                 formats.append({
    106                     'format_id': format_type,
    107                     'url': format_url,
    108                 })
    109         self._sort_formats(formats)
    110 
    111         subtitles = {}
    112         subtitle_urls = data.get('subtitleUrls')
    113         if isinstance(subtitle_urls, list):
    114             for subtitle in subtitle_urls:
    115                 subtitle_url = subtitle.get('url')
    116                 if subtitle_url and subtitle.get('type') == 'CLOSED':
    117                     subtitles.setdefault('nl', []).append({'url': subtitle_url})
    118 
    119         return {
    120             'id': video_id,
    121             'display_id': video_id,
    122             'title': title,
    123             'description': description,
    124             'formats': formats,
    125             'duration': float_or_none(data.get('duration'), 1000),
    126             'thumbnail': data.get('posterImageUrl'),
    127             'subtitles': subtitles,
    128         }
    129 
    130 
    131 class CanvasEenIE(InfoExtractor):
    132     IE_DESC = 'canvas.be and een.be'
    133     _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    134     _TESTS = [{
    135         'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
    136         'md5': 'ed66976748d12350b118455979cca293',
    137         'info_dict': {
    138             'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
    139             'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
    140             'ext': 'flv',
    141             'title': 'De afspraak veilt voor de Warmste Week',
    142             'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
    143             'thumbnail': r're:^https?://.*\.jpg$',
    144             'duration': 49.02,
    145         },
    146         'expected_warnings': ['is not a supported codec'],
    147     }, {
    148         # with subtitles
    149         'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
    150         'info_dict': {
    151             'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
    152             'display_id': 'pieter-0167',
    153             'ext': 'mp4',
    154             'title': 'Pieter 0167',
    155             'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
    156             'thumbnail': r're:^https?://.*\.jpg$',
    157             'duration': 2553.08,
    158             'subtitles': {
    159                 'nl': [{
    160                     'ext': 'vtt',
    161                 }],
    162             },
    163         },
    164         'params': {
    165             'skip_download': True,
    166         },
    167         'skip': 'Pagina niet gevonden',
    168     }, {
    169         'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
    170         'info_dict': {
    171             'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
    172             'display_id': 'emma-pakt-thilly-aan',
    173             'ext': 'mp4',
    174             'title': 'Emma pakt Thilly aan',
    175             'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
    176             'thumbnail': r're:^https?://.*\.jpg$',
    177             'duration': 118.24,
    178         },
    179         'params': {
    180             'skip_download': True,
    181         },
    182         'expected_warnings': ['is not a supported codec'],
    183     }, {
    184         'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
    185         'only_matching': True,
    186     }]
    187 
    188     def _real_extract(self, url):
    189         mobj = re.match(self._VALID_URL, url)
    190         site_id, display_id = mobj.group('site_id'), mobj.group('id')
    191 
    192         webpage = self._download_webpage(url, display_id)
    193 
    194         title = strip_or_none(self._search_regex(
    195             r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
    196             webpage, 'title', default=None) or self._og_search_title(
    197             webpage, default=None))
    198 
    199         video_id = self._html_search_regex(
    200             r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
    201             group='id')
    202 
    203         return {
    204             '_type': 'url_transparent',
    205             'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
    206             'ie_key': CanvasIE.ie_key(),
    207             'id': video_id,
    208             'display_id': display_id,
    209             'title': title,
    210             'description': self._og_search_description(webpage),
    211         }
    212 
    213 
    214 class VrtNUIE(GigyaBaseIE):
    215     IE_DESC = 'VrtNU.be'
    216     _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
    217     _TESTS = [{
    218         # Available via old API endpoint
    219         'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
    220         'info_dict': {
    221             'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
    222             'ext': 'mp4',
    223             'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
    224             'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
    225             'duration': 1457.04,
    226             'thumbnail': r're:^https?://.*\.jpg$',
    227             'series': 'Postbus X',
    228             'season': 'Seizoen 1989',
    229             'season_number': 1989,
    230             'episode': 'De zwarte weduwe',
    231             'episode_number': 1,
    232             'timestamp': 1595822400,
    233             'upload_date': '20200727',
    234         },
    235         'skip': 'This video is only available for registered users',
    236         'params': {
    237             'username': '<snip>',
    238             'password': '<snip>',
    239         },
    240         'expected_warnings': ['is not a supported codec'],
    241     }, {
    242         # Only available via new API endpoint
    243         'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
    244         'info_dict': {
    245             'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
    246             'ext': 'mp4',
    247             'title': 'Aflevering 5',
    248             'description': 'Wie valt door de mand tijdens een missie?',
    249             'duration': 2967.06,
    250             'season': 'Season 1',
    251             'season_number': 1,
    252             'episode_number': 5,
    253         },
    254         'skip': 'This video is only available for registered users',
    255         'params': {
    256             'username': '<snip>',
    257             'password': '<snip>',
    258         },
    259         'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
    260     }]
    261     _NETRC_MACHINE = 'vrtnu'
    262     _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
    263     _CONTEXT_ID = 'R3595707040'
    264 
    265     def _real_initialize(self):
    266         self._login()
    267 
    268     def _login(self):
    269         username, password = self._get_login_info()
    270         if username is None:
    271             return
    272 
    273         auth_data = {
    274             'APIKey': self._APIKEY,
    275             'targetEnv': 'jssdk',
    276             'loginID': username,
    277             'password': password,
    278             'authMode': 'cookie',
    279         }
    280 
    281         auth_info = self._gigya_login(auth_data)
    282 
    283         # Sometimes authentication fails for no good reason, retry
    284         login_attempt = 1
    285         while login_attempt <= 3:
    286             try:
    287                 # When requesting a token, no actual token is returned, but the
    288                 # necessary cookies are set.
    289                 self._request_webpage(
    290                     'https://token.vrt.be',
    291                     None, note='Requesting a token', errnote='Could not get a token',
    292                     headers={
    293                         'Content-Type': 'application/json',
    294                         'Referer': 'https://www.vrt.be/vrtnu/',
    295                     },
    296                     data=json.dumps({
    297                         'uid': auth_info['UID'],
    298                         'uidsig': auth_info['UIDSignature'],
    299                         'ts': auth_info['signatureTimestamp'],
    300                         'email': auth_info['profile']['email'],
    301                     }).encode('utf-8'))
    302             except ExtractorError as e:
    303                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
    304                     login_attempt += 1
    305                     self.report_warning('Authentication failed')
    306                     self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
    307                 else:
    308                     raise e
    309             else:
    310                 break
    311 
    312     def _real_extract(self, url):
    313         display_id = self._match_id(url)
    314 
    315         webpage = self._download_webpage(url, display_id)
    316 
    317         attrs = extract_attributes(self._search_regex(
    318             r'(<nui-media[^>]+>)', webpage, 'media element'))
    319         video_id = attrs['videoid']
    320         publication_id = attrs.get('publicationid')
    321         if publication_id:
    322             video_id = publication_id + '$' + video_id
    323 
    324         page = (self._parse_json(self._search_regex(
    325             r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
    326             default='{}'), video_id, fatal=False) or {}).get('page') or {}
    327 
    328         info = self._search_json_ld(webpage, display_id, default={})
    329         return merge_dicts(info, {
    330             '_type': 'url_transparent',
    331             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
    332             'ie_key': CanvasIE.ie_key(),
    333             'id': video_id,
    334             'display_id': display_id,
    335             'season_number': int_or_none(page.get('episode_season')),
    336         })
    337 
    338 
    339 class DagelijkseKostIE(InfoExtractor):
    340     IE_DESC = 'dagelijksekost.een.be'
    341     _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
    342     _TEST = {
    343         'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
    344         'md5': '30bfffc323009a3e5f689bef6efa2365',
    345         'info_dict': {
    346             'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
    347             'display_id': 'hachis-parmentier-met-witloof',
    348             'ext': 'mp4',
    349             'title': 'Hachis parmentier met witloof',
    350             'description': 'md5:9960478392d87f63567b5b117688cdc5',
    351             'thumbnail': r're:^https?://.*\.jpg$',
    352             'duration': 283.02,
    353         },
    354         'expected_warnings': ['is not a supported codec'],
    355     }
    356 
    357     def _real_extract(self, url):
    358         display_id = self._match_id(url)
    359         webpage = self._download_webpage(url, display_id)
    360 
    361         title = strip_or_none(get_element_by_class(
    362             'dish-metadata__title', webpage
    363         ) or self._html_search_meta(
    364             'twitter:title', webpage))
    365 
    366         description = clean_html(get_element_by_class(
    367             'dish-description', webpage)
    368         ) or self._html_search_meta(
    369             ('description', 'twitter:description', 'og:description'),
    370             webpage)
    371 
    372         video_id = self._html_search_regex(
    373             r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
    374             group='id')
    375 
    376         return {
    377             '_type': 'url_transparent',
    378             'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
    379             'ie_key': CanvasIE.ie_key(),
    380             'id': video_id,
    381             'display_id': display_id,
    382             'title': title,
    383             'description': description,
    384         }