# pluralsight.py (18654B)
1 from __future__ import unicode_literals 2 3 import collections 4 import json 5 import os 6 import random 7 import re 8 9 from .common import InfoExtractor 10 from ..compat import ( 11 compat_str, 12 compat_urlparse, 13 ) 14 from ..utils import ( 15 dict_get, 16 ExtractorError, 17 float_or_none, 18 int_or_none, 19 parse_duration, 20 qualities, 21 srt_subtitles_timecode, 22 try_get, 23 update_url_query, 24 urlencode_postdata, 25 ) 26 27 28 class PluralsightBaseIE(InfoExtractor): 29 _API_BASE = 'https://app.pluralsight.com' 30 31 _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE 32 _GRAPHQL_HEADERS = { 33 'Content-Type': 'application/json;charset=UTF-8', 34 } 35 _GRAPHQL_COURSE_TMPL = ''' 36 query BootstrapPlayer { 37 rpc { 38 bootstrapPlayer { 39 profile { 40 firstName 41 lastName 42 email 43 username 44 userHandle 45 authed 46 isAuthed 47 plan 48 } 49 course(courseId: "%s") { 50 name 51 title 52 courseHasCaptions 53 translationLanguages { 54 code 55 name 56 } 57 supportsWideScreenVideoFormats 58 timestamp 59 modules { 60 name 61 title 62 duration 63 formattedDuration 64 author 65 authorized 66 clips { 67 authorized 68 clipId 69 duration 70 formattedDuration 71 id 72 index 73 moduleIndex 74 moduleTitle 75 name 76 title 77 watched 78 } 79 } 80 } 81 } 82 } 83 }''' 84 85 def _download_course(self, course_id, url, display_id): 86 try: 87 return self._download_course_rpc(course_id, url, display_id) 88 except ExtractorError: 89 # Old API fallback 90 return self._download_json( 91 'https://app.pluralsight.com/player/user/api/v1/player/payload', 92 display_id, data=urlencode_postdata({'courseId': course_id}), 93 headers={'Referer': url}) 94 95 def _download_course_rpc(self, course_id, url, display_id): 96 response = self._download_json( 97 self._GRAPHQL_EP, display_id, data=json.dumps({ 98 'query': self._GRAPHQL_COURSE_TMPL % course_id, 99 'variables': {} 100 }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) 101 102 course = try_get( 103 response, lambda x: 
x['data']['rpc']['bootstrapPlayer']['course'], 104 dict) 105 if course: 106 return course 107 108 raise ExtractorError( 109 '%s said: %s' % (self.IE_NAME, response['error']['message']), 110 expected=True) 111 112 113 class PluralsightIE(PluralsightBaseIE): 114 IE_NAME = 'pluralsight' 115 _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' 116 _LOGIN_URL = 'https://app.pluralsight.com/id/' 117 118 _NETRC_MACHINE = 'pluralsight' 119 120 _TESTS = [{ 121 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', 122 'md5': '4d458cf5cf4c593788672419a8dd4cf8', 123 'info_dict': { 124 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', 125 'ext': 'mp4', 126 'title': 'Demo Monitoring', 127 'duration': 338, 128 }, 129 'skip': 'Requires pluralsight account credentials', 130 }, { 131 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', 132 'only_matching': True, 133 }, { 134 # available without pluralsight account 135 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', 136 'only_matching': True, 137 }, { 138 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', 139 'only_matching': True, 140 }] 141 142 GRAPHQL_VIEWCLIP_TMPL = ''' 143 query viewClip { 144 viewClip(input: { 145 author: "%(author)s", 146 clipIndex: %(clipIndex)d, 147 courseName: "%(courseName)s", 148 includeCaptions: %(includeCaptions)s, 149 locale: "%(locale)s", 150 mediaType: "%(mediaType)s", 151 moduleName: "%(moduleName)s", 152 quality: "%(quality)s" 153 }) { 154 urls { 155 url 156 cdn 157 rank 158 source 159 }, 160 status 161 } 162 }''' 163 164 def 
_real_initialize(self): 165 self._login() 166 167 def _login(self): 168 username, password = self._get_login_info() 169 if username is None: 170 return 171 172 login_page = self._download_webpage( 173 self._LOGIN_URL, None, 'Downloading login page') 174 175 login_form = self._hidden_inputs(login_page) 176 177 login_form.update({ 178 'Username': username, 179 'Password': password, 180 }) 181 182 post_url = self._search_regex( 183 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 184 'post url', default=self._LOGIN_URL, group='url') 185 186 if not post_url.startswith('http'): 187 post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) 188 189 response = self._download_webpage( 190 post_url, None, 'Logging in', 191 data=urlencode_postdata(login_form), 192 headers={'Content-Type': 'application/x-www-form-urlencoded'}) 193 194 error = self._search_regex( 195 r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', 196 response, 'error message', default=None) 197 if error: 198 raise ExtractorError('Unable to login: %s' % error, expected=True) 199 200 if all(not re.search(p, response) for p in ( 201 r'__INITIAL_STATE__', r'["\']currentUser["\']', 202 # new layout? 203 r'>\s*Sign out\s*<')): 204 BLOCKED = 'Your account has been blocked due to suspicious activity' 205 if BLOCKED in response: 206 raise ExtractorError( 207 'Unable to login: %s' % BLOCKED, expected=True) 208 MUST_AGREE = 'To continue using Pluralsight, you must agree to' 209 if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): 210 raise ExtractorError( 211 'Unable to login: %s some documents. Go to pluralsight.com, ' 212 'log in and agree with what Pluralsight requires.' 
213 % MUST_AGREE, expected=True) 214 215 raise ExtractorError('Unable to log in') 216 217 def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): 218 captions = None 219 if clip_id: 220 captions = self._download_json( 221 '%s/transcript/api/v1/caption/json/%s/%s' 222 % (self._API_BASE, clip_id, lang), video_id, 223 'Downloading captions JSON', 'Unable to download captions JSON', 224 fatal=False) 225 if not captions: 226 captions_post = { 227 'a': author, 228 'cn': int(clip_idx), 229 'lc': lang, 230 'm': name, 231 } 232 captions = self._download_json( 233 '%s/player/retrieve-captions' % self._API_BASE, video_id, 234 'Downloading captions JSON', 'Unable to download captions JSON', 235 fatal=False, data=json.dumps(captions_post).encode('utf-8'), 236 headers={'Content-Type': 'application/json;charset=utf-8'}) 237 if captions: 238 return { 239 lang: [{ 240 'ext': 'json', 241 'data': json.dumps(captions), 242 }, { 243 'ext': 'srt', 244 'data': self._convert_subtitles(duration, captions), 245 }] 246 } 247 248 @staticmethod 249 def _convert_subtitles(duration, subs): 250 srt = '' 251 TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') 252 TEXT_KEYS = ('text', 'Text') 253 for num, current in enumerate(subs): 254 current = subs[num] 255 start, text = ( 256 float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), 257 dict_get(current, TEXT_KEYS)) 258 if start is None or text is None: 259 continue 260 end = duration if num == len(subs) - 1 else float_or_none( 261 dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) 262 if end is None: 263 continue 264 srt += os.linesep.join( 265 ( 266 '%d' % num, 267 '%s --> %s' % ( 268 srt_subtitles_timecode(start), 269 srt_subtitles_timecode(end)), 270 text, 271 os.linesep, 272 )) 273 return srt 274 275 def _real_extract(self, url): 276 qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) 277 278 author = qs.get('author', [None])[0] 279 name = 
qs.get('name', [None])[0] 280 clip_idx = qs.get('clip', [None])[0] 281 course_name = qs.get('course', [None])[0] 282 283 if any(not f for f in (author, name, clip_idx, course_name,)): 284 raise ExtractorError('Invalid URL', expected=True) 285 286 display_id = '%s-%s' % (name, clip_idx) 287 288 course = self._download_course(course_name, url, display_id) 289 290 collection = course['modules'] 291 292 clip = None 293 294 for module_ in collection: 295 if name in (module_.get('moduleName'), module_.get('name')): 296 for clip_ in module_.get('clips', []): 297 clip_index = clip_.get('clipIndex') 298 if clip_index is None: 299 clip_index = clip_.get('index') 300 if clip_index is None: 301 continue 302 if compat_str(clip_index) == clip_idx: 303 clip = clip_ 304 break 305 306 if not clip: 307 raise ExtractorError('Unable to resolve clip') 308 309 title = clip['title'] 310 clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] 311 312 QUALITIES = { 313 'low': {'width': 640, 'height': 480}, 314 'medium': {'width': 848, 'height': 640}, 315 'high': {'width': 1024, 'height': 768}, 316 'high-widescreen': {'width': 1280, 'height': 720}, 317 } 318 319 QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) 320 quality_key = qualities(QUALITIES_PREFERENCE) 321 322 AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) 323 324 ALLOWED_QUALITIES = ( 325 AllowedQuality('webm', ['high', ]), 326 AllowedQuality('mp4', ['low', 'medium', 'high', ]), 327 ) 328 329 # Some courses also offer widescreen resolution for high quality (see 330 # https://github.com/ytdl-org/youtube-dl/issues/7766) 331 widescreen = course.get('supportsWideScreenVideoFormats') is True 332 best_quality = 'high-widescreen' if widescreen else 'high' 333 if widescreen: 334 for allowed_quality in ALLOWED_QUALITIES: 335 allowed_quality.qualities.append(best_quality) 336 337 # In order to minimize the number of calls to ViewClip API and reduce 338 # the probability of 
being throttled or banned by Pluralsight we will request 339 # only single format until formats listing was explicitly requested. 340 if self._downloader.params.get('listformats', False): 341 allowed_qualities = ALLOWED_QUALITIES 342 else: 343 def guess_allowed_qualities(): 344 req_format = self._downloader.params.get('format') or 'best' 345 req_format_split = req_format.split('-', 1) 346 if len(req_format_split) > 1: 347 req_ext, req_quality = req_format_split 348 req_quality = '-'.join(req_quality.split('-')[:2]) 349 for allowed_quality in ALLOWED_QUALITIES: 350 if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: 351 return (AllowedQuality(req_ext, (req_quality, )), ) 352 req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' 353 return (AllowedQuality(req_ext, (best_quality, )), ) 354 allowed_qualities = guess_allowed_qualities() 355 356 formats = [] 357 for ext, qualities_ in allowed_qualities: 358 for quality in qualities_: 359 f = QUALITIES[quality].copy() 360 clip_post = { 361 'author': author, 362 'includeCaptions': 'false', 363 'clipIndex': int(clip_idx), 364 'courseName': course_name, 365 'locale': 'en', 366 'moduleName': name, 367 'mediaType': ext, 368 'quality': '%dx%d' % (f['width'], f['height']), 369 } 370 format_id = '%s-%s' % (ext, quality) 371 372 try: 373 viewclip = self._download_json( 374 self._GRAPHQL_EP, display_id, 375 'Downloading %s viewclip graphql' % format_id, 376 data=json.dumps({ 377 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, 378 'variables': {} 379 }).encode('utf-8'), 380 headers=self._GRAPHQL_HEADERS)['data']['viewClip'] 381 except ExtractorError: 382 # Still works but most likely will go soon 383 viewclip = self._download_json( 384 '%s/video/clips/viewclip' % self._API_BASE, display_id, 385 'Downloading %s viewclip JSON' % format_id, fatal=False, 386 data=json.dumps(clip_post).encode('utf-8'), 387 headers={'Content-Type': 'application/json;charset=utf-8'}) 388 389 # 
Pluralsight tracks multiple sequential calls to ViewClip API and start 390 # to return 429 HTTP errors after some time (see 391 # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead 392 # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). 393 # To somewhat reduce the probability of these consequences 394 # we will sleep random amount of time before each call to ViewClip. 395 self._sleep( 396 random.randint(5, 10), display_id, 397 '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') 398 399 if not viewclip: 400 continue 401 402 clip_urls = viewclip.get('urls') 403 if not isinstance(clip_urls, list): 404 continue 405 406 for clip_url_data in clip_urls: 407 clip_url = clip_url_data.get('url') 408 if not clip_url: 409 continue 410 cdn = clip_url_data.get('cdn') 411 clip_f = f.copy() 412 clip_f.update({ 413 'url': clip_url, 414 'ext': ext, 415 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, 416 'quality': quality_key(quality), 417 'source_preference': int_or_none(clip_url_data.get('rank')), 418 }) 419 formats.append(clip_f) 420 421 self._sort_formats(formats) 422 423 duration = int_or_none( 424 clip.get('duration')) or parse_duration(clip.get('formattedDuration')) 425 426 # TODO: other languages? 
427 subtitles = self.extract_subtitles( 428 author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) 429 430 return { 431 'id': clip_id, 432 'title': title, 433 'duration': duration, 434 'creator': author, 435 'formats': formats, 436 'subtitles': subtitles, 437 } 438 439 440 class PluralsightCourseIE(PluralsightBaseIE): 441 IE_NAME = 'pluralsight:course' 442 _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' 443 _TESTS = [{ 444 # Free course from Pluralsight Starter Subscription for Microsoft TechNet 445 # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz 446 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', 447 'info_dict': { 448 'id': 'hosting-sql-server-windows-azure-iaas', 449 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', 450 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', 451 }, 452 'playlist_count': 31, 453 }, { 454 # available without pluralsight account 455 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', 456 'only_matching': True, 457 }, { 458 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', 459 'only_matching': True, 460 }] 461 462 def _real_extract(self, url): 463 course_id = self._match_id(url) 464 465 # TODO: PSM cookie 466 467 course = self._download_course(course_id, url, course_id) 468 469 title = course['title'] 470 course_name = course['name'] 471 course_data = course['modules'] 472 description = course.get('description') or course.get('shortDescription') 473 474 entries = [] 475 for num, module in enumerate(course_data, 1): 476 author = module.get('author') 477 module_name = module.get('name') 478 if not author or not module_name: 479 continue 480 for clip in module.get('clips', []): 481 clip_index = int_or_none(clip.get('index')) 482 if clip_index is None: 483 
continue 484 clip_url = update_url_query( 485 '%s/player' % self._API_BASE, query={ 486 'mode': 'live', 487 'course': course_name, 488 'author': author, 489 'name': module_name, 490 'clip': clip_index, 491 }) 492 entries.append({ 493 '_type': 'url_transparent', 494 'url': clip_url, 495 'ie_key': PluralsightIE.ie_key(), 496 'chapter': module.get('title'), 497 'chapter_number': num, 498 'chapter_id': module.get('moduleRef'), 499 }) 500 501 return self.playlist_result(entries, course_id, title, description)