canvas.py (15030B)
1 from __future__ import unicode_literals 2 3 import re 4 import json 5 6 from .common import InfoExtractor 7 from .gigya import GigyaBaseIE 8 from ..compat import compat_HTTPError 9 from ..utils import ( 10 ExtractorError, 11 clean_html, 12 extract_attributes, 13 float_or_none, 14 get_element_by_class, 15 int_or_none, 16 merge_dicts, 17 str_or_none, 18 strip_or_none, 19 url_or_none, 20 ) 21 22 23 class CanvasIE(InfoExtractor): 24 _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' 25 _TESTS = [{ 26 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 27 'md5': '68993eda72ef62386a15ea2cf3c93107', 28 'info_dict': { 29 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 30 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 31 'ext': 'mp4', 32 'title': 'Nachtwacht: De Greystook', 33 'description': 'Nachtwacht: De Greystook', 34 'thumbnail': r're:^https?://.*\.jpg$', 35 'duration': 1468.04, 36 }, 37 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], 38 }, { 39 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 40 'only_matching': True, 41 }] 42 _GEO_BYPASS = False 43 _HLS_ENTRY_PROTOCOLS_MAP = { 44 'HLS': 'm3u8_native', 45 'HLS_AES': 'm3u8', 46 } 47 _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' 48 49 def _real_extract(self, url): 50 mobj = re.match(self._VALID_URL, url) 51 site_id, video_id = mobj.group('site_id'), mobj.group('id') 52 53 data = None 54 if site_id != 'vrtvideo': 55 # Old API endpoint, serves more formats but may fail for some videos 56 data = self._download_json( 57 'https://mediazone.vrt.be/api/v1/%s/assets/%s' 58 % (site_id, video_id), video_id, 'Downloading asset JSON', 59 'Unable to download asset JSON', fatal=False) 60 61 # New API endpoint 62 if not data: 63 headers = self.geo_verification_headers() 64 headers.update({'Content-Type': 'application/json'}) 65 token = self._download_json( 66 '%s/tokens' % self._REST_API_BASE, video_id, 67 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] 68 data = self._download_json( 69 '%s/videos/%s' % (self._REST_API_BASE, video_id), 70 video_id, 'Downloading video JSON', query={ 71 'vrtPlayerToken': token, 72 'client': '%s@PROD' % site_id, 73 }, expected_status=400) 74 if not data.get('title'): 75 code = data.get('code') 76 if code == 'AUTHENTICATION_REQUIRED': 77 self.raise_login_required() 78 elif code == 'INVALID_LOCATION': 79 self.raise_geo_restricted(countries=['BE']) 80 raise ExtractorError(data.get('message') or code, expected=True) 81 82 title = data['title'] 83 description = data.get('description') 84 85 formats = [] 86 for target in data['targetUrls']: 87 format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) 88 if not format_url or not format_type: 89 continue 90 format_type = format_type.upper() 91 if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: 92 formats.extend(self._extract_m3u8_formats( 93 format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], 94 m3u8_id=format_type, fatal=False)) 95 elif format_type == 'HDS': 96 formats.extend(self._extract_f4m_formats( 97 format_url, video_id, f4m_id=format_type, fatal=False)) 98 elif format_type == 'MPEG_DASH': 99 formats.extend(self._extract_mpd_formats( 100 format_url, video_id, mpd_id=format_type, fatal=False)) 101 elif format_type == 'HSS': 102 formats.extend(self._extract_ism_formats( 103 format_url, video_id, ism_id='mss', fatal=False)) 104 else: 105 formats.append({ 106 'format_id': format_type, 107 'url': format_url, 108 }) 109 self._sort_formats(formats) 110 111 subtitles = {} 112 subtitle_urls = data.get('subtitleUrls') 113 if isinstance(subtitle_urls, list): 114 for subtitle in subtitle_urls: 115 subtitle_url = subtitle.get('url') 116 if subtitle_url and subtitle.get('type') == 'CLOSED': 117 subtitles.setdefault('nl', []).append({'url': subtitle_url}) 118 119 return { 120 'id': video_id, 121 'display_id': video_id, 122 'title': title, 123 'description': description, 124 'formats': formats, 125 'duration': float_or_none(data.get('duration'), 1000), 126 'thumbnail': data.get('posterImageUrl'), 127 'subtitles': subtitles, 128 } 129 130 131 class CanvasEenIE(InfoExtractor): 132 IE_DESC = 'canvas.be and een.be' 133 _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' 134 _TESTS = [{ 135 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', 136 'md5': 'ed66976748d12350b118455979cca293', 137 'info_dict': { 138 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 139 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', 140 'ext': 'flv', 141 'title': 'De afspraak veilt voor de Warmste Week', 142 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', 143 'thumbnail': r're:^https?://.*\.jpg$', 144 'duration': 49.02, 145 }, 146 'expected_warnings': ['is not a supported codec'], 147 }, { 148 # with subtitles 149 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', 150 'info_dict': { 151 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', 152 'display_id': 'pieter-0167', 153 'ext': 'mp4', 154 'title': 'Pieter 0167', 155 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', 156 'thumbnail': r're:^https?://.*\.jpg$', 157 'duration': 2553.08, 158 'subtitles': { 159 'nl': [{ 160 'ext': 'vtt', 161 }], 162 }, 163 }, 164 'params': { 165 'skip_download': True, 166 }, 167 'skip': 'Pagina niet gevonden', 168 }, { 169 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', 170 'info_dict': { 171 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', 172 'display_id': 'emma-pakt-thilly-aan', 173 'ext': 'mp4', 174 'title': 'Emma pakt Thilly aan', 175 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', 176 'thumbnail': r're:^https?://.*\.jpg$', 177 'duration': 118.24, 178 }, 179 'params': { 180 'skip_download': True, 181 }, 182 'expected_warnings': ['is not a supported codec'], 183 }, { 184 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', 185 'only_matching': True, 186 }] 187 188 def _real_extract(self, url): 189 mobj = re.match(self._VALID_URL, url) 190 site_id, display_id = mobj.group('site_id'), mobj.group('id') 191 192 webpage = self._download_webpage(url, display_id) 193 194 title = strip_or_none(self._search_regex( 195 r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', 196 webpage, 'title', default=None) or self._og_search_title( 197 webpage, default=None)) 198 199 video_id = self._html_search_regex( 200 r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', 201 group='id') 202 203 return { 204 '_type': 'url_transparent', 205 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), 206 'ie_key': CanvasIE.ie_key(), 207 'id': video_id, 208 'display_id': display_id, 209 'title': title, 210 'description': self._og_search_description(webpage), 211 } 212 213 214 class VrtNUIE(GigyaBaseIE): 215 IE_DESC = 'VrtNU.be' 216 _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' 217 _TESTS = [{ 218 # Available via old API endpoint 219 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 220 'info_dict': { 221 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 222 'ext': 'mp4', 223 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', 224 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 225 'duration': 1457.04, 226 'thumbnail': r're:^https?://.*\.jpg$', 227 'series': 'Postbus X', 228 'season': 'Seizoen 1989', 229 'season_number': 1989, 230 'episode': 'De zwarte weduwe', 231 'episode_number': 1, 232 'timestamp': 1595822400, 233 'upload_date': '20200727', 234 }, 235 'skip': 'This video is only available for registered users', 236 'params': { 237 'username': '<snip>', 238 'password': '<snip>', 239 }, 240 'expected_warnings': ['is not a supported codec'], 241 }, { 242 # Only available via new API endpoint 243 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', 244 'info_dict': { 245 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', 246 'ext': 'mp4', 247 'title': 'Aflevering 5', 248 'description': 'Wie valt door de mand tijdens een missie?', 249 'duration': 2967.06, 250 'season': 'Season 1', 251 'season_number': 1, 252 'episode_number': 5, 253 }, 254 'skip': 'This video is only available for registered users', 255 'params': { 256 'username': '<snip>', 257 'password': '<snip>', 258 }, 259 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], 260 }] 261 _NETRC_MACHINE = 'vrtnu' 262 _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' 263 _CONTEXT_ID = 'R3595707040' 264 265 def _real_initialize(self): 266 self._login() 267 268 def _login(self): 269 username, password = self._get_login_info() 270 if username is None: 271 return 272 273 auth_data = { 274 'APIKey': self._APIKEY, 275 'targetEnv': 'jssdk', 276 'loginID': username, 277 'password': password, 278 'authMode': 'cookie', 279 } 280 281 auth_info = self._gigya_login(auth_data) 282 283 # Sometimes authentication fails for no good reason, retry 284 login_attempt = 1 285 while login_attempt <= 3: 286 try: 287 # When requesting a token, no actual token is returned, but the 288 # necessary cookies are set. 289 self._request_webpage( 290 'https://token.vrt.be', 291 None, note='Requesting a token', errnote='Could not get a token', 292 headers={ 293 'Content-Type': 'application/json', 294 'Referer': 'https://www.vrt.be/vrtnu/', 295 }, 296 data=json.dumps({ 297 'uid': auth_info['UID'], 298 'uidsig': auth_info['UIDSignature'], 299 'ts': auth_info['signatureTimestamp'], 300 'email': auth_info['profile']['email'], 301 }).encode('utf-8')) 302 except ExtractorError as e: 303 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: 304 login_attempt += 1 305 self.report_warning('Authentication failed') 306 self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') 307 else: 308 raise e 309 else: 310 break 311 312 def _real_extract(self, url): 313 display_id = self._match_id(url) 314 315 webpage = self._download_webpage(url, display_id) 316 317 attrs = extract_attributes(self._search_regex( 318 r'(<nui-media[^>]+>)', webpage, 'media element')) 319 video_id = attrs['videoid'] 320 publication_id = attrs.get('publicationid') 321 if publication_id: 322 video_id = publication_id + '$' + video_id 323 324 page = (self._parse_json(self._search_regex( 325 r'digitalData\s*=\s*({.+?});', webpage, 'digial data', 326 default='{}'), video_id, fatal=False) or {}).get('page') or {} 327 328 info = self._search_json_ld(webpage, display_id, default={}) 329 return merge_dicts(info, { 330 '_type': 'url_transparent', 331 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 332 'ie_key': CanvasIE.ie_key(), 333 'id': video_id, 334 'display_id': display_id, 335 'season_number': int_or_none(page.get('episode_season')), 336 }) 337 338 339 class DagelijkseKostIE(InfoExtractor): 340 IE_DESC = 'dagelijksekost.een.be' 341 _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' 342 _TEST = { 343 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', 344 'md5': '30bfffc323009a3e5f689bef6efa2365', 345 'info_dict': { 346 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', 347 'display_id': 'hachis-parmentier-met-witloof', 348 'ext': 'mp4', 349 'title': 'Hachis parmentier met witloof', 350 'description': 'md5:9960478392d87f63567b5b117688cdc5', 351 'thumbnail': r're:^https?://.*\.jpg$', 352 'duration': 283.02, 353 }, 354 'expected_warnings': ['is not a supported codec'], 355 } 356 357 def _real_extract(self, url): 358 display_id = self._match_id(url) 359 webpage = self._download_webpage(url, display_id) 360 361 title = strip_or_none(get_element_by_class( 362 'dish-metadata__title', webpage 363 ) or self._html_search_meta( 364 'twitter:title', webpage)) 365 366 description = clean_html(get_element_by_class( 367 'dish-description', webpage) 368 ) or self._html_search_meta( 369 ('description', 'twitter:description', 'og:description'), 370 webpage) 371 372 video_id = self._html_search_regex( 373 r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', 374 group='id') 375 376 return { 377 '_type': 'url_transparent', 378 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, 379 'ie_key': CanvasIE.ie_key(), 380 'id': video_id, 381 'display_id': display_id, 382 'title': title, 383 'description': description, 384 }