mediasite.py (14672B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 import json 6 7 from .common import InfoExtractor 8 from ..compat import ( 9 compat_str, 10 compat_urlparse, 11 ) 12 from ..utils import ( 13 ExtractorError, 14 float_or_none, 15 mimetype2ext, 16 str_or_none, 17 try_get, 18 unescapeHTML, 19 unsmuggle_url, 20 url_or_none, 21 urljoin, 22 ) 23 24 25 _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' 26 27 28 class MediasiteIE(InfoExtractor): 29 _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE 30 _TESTS = [ 31 { 32 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', 33 'info_dict': { 34 'id': '2db6c271681e4f199af3c60d1f82869b1d', 35 'ext': 'mp4', 36 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', 37 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', 38 'timestamp': 1474268400.0, 39 'upload_date': '20160919', 40 }, 41 }, 42 { 43 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', 44 'info_dict': { 45 'id': '90bb363295d945d6b548c867d01181361d', 46 'ext': 'mp4', 47 'upload_date': '20150429', 48 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', 49 'timestamp': 1430311380.0, 50 }, 51 }, 52 { 53 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', 54 'md5': '481fda1c11f67588c0d9d8fbdced4e39', 55 'info_dict': { 56 'id': '585a43626e544bdd97aeb71a0ec907a01d', 57 'ext': 'mp4', 58 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 59 'description': '', 60 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', 61 'duration': 7713.088, 62 'timestamp': 1413309600, 63 'upload_date': '20141014', 64 }, 65 }, 66 { 67 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', 68 'md5': 'ef1fdded95bdf19b12c5999949419c92', 69 'info_dict': { 70 'id': '86a9ea9f53e149079fbdb4202b521ed21d', 71 'ext': 'wmv', 72 'title': '64ste Vakantiecursus: Afvalwater', 73 'description': 'md5:7fd774865cc69d972f542b157c328305', 74 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 75 'duration': 10853, 76 'timestamp': 1326446400, 77 'upload_date': '20120113', 78 }, 79 }, 80 { 81 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', 82 'md5': '9422edc9b9a60151727e4b6d8bef393d', 83 'info_dict': { 84 'id': '24aace4429fc450fb5b38cdbf424a66e1d', 85 'ext': 'mp4', 86 'title': 'Xyce Software Training - Section 1', 87 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', 88 'upload_date': '20120409', 89 'timestamp': 1333983600, 90 'duration': 7794, 91 } 92 }, 93 { 94 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', 95 'only_matching': True, 96 }, 97 { 98 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', 99 'only_matching': True, 100 }, 101 { 102 # dashed id 103 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', 104 'only_matching': True, 105 } 106 ] 107 108 # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) 109 _STREAM_TYPES = { 110 0: 'video1', # the main video 111 2: 'slide', 112 3: 'presentation', 113 4: 'video2', # screencast? 114 5: 'video3', 115 } 116 117 @staticmethod 118 def _extract_urls(webpage): 119 return [ 120 unescapeHTML(mobj.group('url')) 121 for mobj in re.finditer( 122 r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, 123 webpage)] 124 125 def _real_extract(self, url): 126 url, data = unsmuggle_url(url, {}) 127 mobj = re.match(self._VALID_URL, url) 128 resource_id = mobj.group('id') 129 query = mobj.group('query') 130 131 webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? 132 redirect_url = urlh.geturl() 133 134 # XXX: might have also extracted UrlReferrer and QueryString from the html 135 service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( 136 r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id, 137 default='/Mediasite/PlayerService/PlayerService.svc/json')) 138 139 player_options = self._download_json( 140 '%s/GetPlayerOptions' % service_path, resource_id, 141 headers={ 142 'Content-type': 'application/json; charset=utf-8', 143 'X-Requested-With': 'XMLHttpRequest', 144 }, 145 data=json.dumps({ 146 'getPlayerOptionsRequest': { 147 'ResourceId': resource_id, 148 'QueryString': query, 149 'UrlReferrer': data.get('UrlReferrer', ''), 150 'UseScreenReader': False, 151 } 152 }).encode('utf-8'))['d'] 153 154 presentation = player_options['Presentation'] 155 title = presentation['Title'] 156 157 if presentation is None: 158 raise ExtractorError( 159 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], 160 expected=True) 161 162 thumbnails = [] 163 formats = [] 164 for snum, Stream in enumerate(presentation['Streams']): 165 stream_type = Stream.get('StreamType') 166 if stream_type is None: 167 continue 168 169 video_urls = Stream.get('VideoUrls') 170 if not isinstance(video_urls, list): 171 video_urls = [] 172 173 stream_id = self._STREAM_TYPES.get( 174 stream_type, 'type%u' % stream_type) 175 176 stream_formats = [] 177 for unum, VideoUrl in enumerate(video_urls): 178 video_url = url_or_none(VideoUrl.get('Location')) 179 if not video_url: 180 continue 181 # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS 182 183 media_type = VideoUrl.get('MediaType') 184 if media_type == 'SS': 185 stream_formats.extend(self._extract_ism_formats( 186 video_url, resource_id, 187 ism_id='%s-%u.%u' % (stream_id, snum, unum), 188 fatal=False)) 189 elif media_type == 'Dash': 190 stream_formats.extend(self._extract_mpd_formats( 191 video_url, resource_id, 192 mpd_id='%s-%u.%u' % (stream_id, snum, unum), 193 fatal=False)) 194 else: 195 stream_formats.append({ 196 'format_id': '%s-%u.%u' % (stream_id, snum, unum), 197 'url': video_url, 198 'ext': mimetype2ext(VideoUrl.get('MimeType')), 199 }) 200 201 # TODO: if Stream['HasSlideContent']: 202 # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) 203 # from Stream['Slides'] 204 # this will require writing a custom downloader... 205 206 # disprefer 'secondary' streams 207 if stream_type != 0: 208 for fmt in stream_formats: 209 fmt['preference'] = -1 210 211 thumbnail_url = Stream.get('ThumbnailUrl') 212 if thumbnail_url: 213 thumbnails.append({ 214 'id': '%s-%u' % (stream_id, snum), 215 'url': urljoin(redirect_url, thumbnail_url), 216 'preference': -1 if stream_type != 0 else 0, 217 }) 218 formats.extend(stream_formats) 219 220 self._sort_formats(formats) 221 222 # XXX: Presentation['Presenters'] 223 # XXX: Presentation['Transcript'] 224 225 return { 226 'id': resource_id, 227 'title': title, 228 'description': presentation.get('Description'), 229 'duration': float_or_none(presentation.get('Duration'), 1000), 230 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), 231 'formats': formats, 232 'thumbnails': thumbnails, 233 } 234 235 236 class MediasiteCatalogIE(InfoExtractor): 237 _VALID_URL = r'''(?xi) 238 (?P<url>https?://[^/]+/Mediasite) 239 /Catalog/Full/ 240 (?P<catalog_id>{0}) 241 (?: 242 /(?P<current_folder_id>{0}) 243 /(?P<root_dynamic_folder_id>{0}) 244 )? 245 '''.format(_ID_RE) 246 _TESTS = [{ 247 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', 248 'info_dict': { 249 'id': '631f9e48530d454381549f955d08c75e21', 250 'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', 251 }, 252 'playlist_count': 6, 253 'expected_warnings': ['is not a supported codec'], 254 }, { 255 # with CurrentFolderId and RootDynamicFolderId 256 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', 257 'info_dict': { 258 'id': '9518c4a6c5cf4993b21cbd53e828a92521', 259 'title': 'IUSM Family and Friends Sessions', 260 }, 261 'playlist_count': 2, 262 }, { 263 'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', 264 'only_matching': True, 265 }, { 266 # no AntiForgeryToken 267 'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', 268 'only_matching': True, 269 }, { 270 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', 271 'only_matching': True, 272 }, { 273 # dashed id 274 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', 275 'only_matching': True, 276 }] 277 278 def _real_extract(self, url): 279 mobj = re.match(self._VALID_URL, url) 280 mediasite_url = mobj.group('url') 281 catalog_id = mobj.group('catalog_id') 282 current_folder_id = mobj.group('current_folder_id') or catalog_id 283 root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') 284 285 webpage = self._download_webpage(url, catalog_id) 286 287 # AntiForgeryToken is optional (e.g. [1]) 288 # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 289 anti_forgery_token = self._search_regex( 290 r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', 291 webpage, 'anti forgery token', default=None, group='value') 292 if anti_forgery_token: 293 anti_forgery_header = self._search_regex( 294 r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', 295 webpage, 'anti forgery header name', 296 default='X-SOFO-AntiForgeryHeader', group='value') 297 298 data = { 299 'IsViewPage': True, 300 'IsNewFolder': True, 301 'AuthTicket': None, 302 'CatalogId': catalog_id, 303 'CurrentFolderId': current_folder_id, 304 'RootDynamicFolderId': root_dynamic_folder_id, 305 'ItemsPerPage': 1000, 306 'PageIndex': 0, 307 'PermissionMask': 'Execute', 308 'CatalogSearchType': 'SearchInFolder', 309 'SortBy': 'Date', 310 'SortDirection': 'Descending', 311 'StartDate': None, 312 'EndDate': None, 313 'StatusFilterList': None, 314 'PreviewKey': None, 315 'Tags': [], 316 } 317 318 headers = { 319 'Content-Type': 'application/json; charset=UTF-8', 320 'Referer': url, 321 'X-Requested-With': 'XMLHttpRequest', 322 } 323 if anti_forgery_token: 324 headers[anti_forgery_header] = anti_forgery_token 325 326 catalog = self._download_json( 327 '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, 328 catalog_id, data=json.dumps(data).encode(), headers=headers) 329 330 entries = [] 331 for video in catalog['PresentationDetailsList']: 332 if not isinstance(video, dict): 333 continue 334 video_id = str_or_none(video.get('Id')) 335 if not video_id: 336 continue 337 entries.append(self.url_result( 338 '%s/Play/%s' % (mediasite_url, video_id), 339 ie=MediasiteIE.ie_key(), video_id=video_id)) 340 341 title = try_get( 342 catalog, lambda x: x['CurrentFolder']['Name'], compat_str) 343 344 return self.playlist_result(entries, catalog_id, title,) 345 346 347 class MediasiteNamedCatalogIE(InfoExtractor): 348 _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)' 349 _TESTS = [{ 350 'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', 351 'only_matching': True, 352 }] 353 354 def _real_extract(self, url): 355 mobj = re.match(self._VALID_URL, url) 356 mediasite_url = mobj.group('url') 357 catalog_name = mobj.group('catalog_name') 358 359 webpage = self._download_webpage(url, catalog_name) 360 361 catalog_id = self._search_regex( 362 r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') 363 364 return self.url_result( 365 '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), 366 ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id)