linuxacademy.py (9511B)
1 from __future__ import unicode_literals 2 3 import json 4 import random 5 import re 6 7 from .common import InfoExtractor 8 from ..compat import ( 9 compat_b64decode, 10 compat_HTTPError, 11 compat_str, 12 ) 13 from ..utils import ( 14 clean_html, 15 ExtractorError, 16 js_to_json, 17 parse_duration, 18 try_get, 19 unified_timestamp, 20 urlencode_postdata, 21 urljoin, 22 ) 23 24 25 class LinuxAcademyIE(InfoExtractor): 26 _VALID_URL = r'''(?x) 27 https?:// 28 (?:www\.)?linuxacademy\.com/cp/ 29 (?: 30 courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| 31 modules/view/id/(?P<course_id>\d+) 32 ) 33 ''' 34 _TESTS = [{ 35 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', 36 'info_dict': { 37 'id': '7971-2', 38 'ext': 'mp4', 39 'title': 'What Is Data Science', 40 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', 41 'timestamp': 1607387907, 42 'upload_date': '20201208', 43 'duration': 304, 44 }, 45 'params': { 46 'skip_download': True, 47 }, 48 'skip': 'Requires Linux Academy account credentials', 49 }, { 50 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', 51 'only_matching': True, 52 }, { 53 'url': 'https://linuxacademy.com/cp/modules/view/id/154', 54 'info_dict': { 55 'id': '154', 56 'title': 'AWS Certified Cloud Practitioner', 57 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', 58 'duration': 28835, 59 }, 60 'playlist_count': 41, 61 'skip': 'Requires Linux Academy account credentials', 62 }] 63 64 _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' 65 _ORIGIN_URL = 'https://linuxacademy.com' 66 _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' 67 _NETRC_MACHINE = 'linuxacademy' 68 69 def _real_initialize(self): 70 self._login() 71 72 def _login(self): 73 username, password = self._get_login_info() 74 if username is None: 75 return 76 77 def random_string(): 78 return ''.join([ 79 random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') 80 for _ in range(32)]) 81 82 webpage, urlh = self._download_webpage_handle( 83 self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ 84 'client_id': self._CLIENT_ID, 85 'response_type': 'token id_token', 86 'response_mode': 'web_message', 87 'redirect_uri': self._ORIGIN_URL, 88 'scope': 'openid email user_impersonation profile', 89 'audience': self._ORIGIN_URL, 90 'state': random_string(), 91 'nonce': random_string(), 92 }) 93 94 login_data = self._parse_json( 95 self._search_regex( 96 r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 97 'login info', group='value'), None, 98 transform_source=lambda x: compat_b64decode(x).decode('utf-8') 99 )['extraParams'] 100 101 login_data.update({ 102 'client_id': self._CLIENT_ID, 103 'redirect_uri': self._ORIGIN_URL, 104 'tenant': 'lacausers', 105 'connection': 'Username-Password-Authentication', 106 'username': username, 107 'password': password, 108 'sso': 'true', 109 }) 110 111 login_state_url = urlh.geturl() 112 113 try: 114 login_page = self._download_webpage( 115 'https://login.linuxacademy.com/usernamepassword/login', None, 116 'Downloading login page', data=json.dumps(login_data).encode(), 117 headers={ 118 'Content-Type': 'application/json', 119 'Origin': 'https://login.linuxacademy.com', 120 'Referer': login_state_url, 121 }) 122 except ExtractorError as e: 123 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: 124 error = self._parse_json(e.cause.read(), None) 125 message = error.get('description') or error['code'] 126 raise ExtractorError( 127 '%s said: %s' % (self.IE_NAME, message), expected=True) 128 raise 129 130 callback_page, urlh = self._download_webpage_handle( 131 'https://login.linuxacademy.com/login/callback', None, 132 'Downloading callback page', 133 data=urlencode_postdata(self._hidden_inputs(login_page)), 134 headers={ 135 'Content-Type': 'application/x-www-form-urlencoded', 136 'Origin': 'https://login.linuxacademy.com', 137 'Referer': login_state_url, 138 }) 139 140 access_token = self._search_regex( 141 r'access_token=([^=&]+)', urlh.geturl(), 142 'access token', default=None) 143 if not access_token: 144 access_token = self._parse_json( 145 self._search_regex( 146 r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, 147 'authorization response'), None, 148 transform_source=js_to_json)['response']['access_token'] 149 150 self._download_webpage( 151 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' 152 % access_token, None, 'Downloading token validation page') 153 154 def _real_extract(self, url): 155 mobj = re.match(self._VALID_URL, url) 156 chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') 157 item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) 158 159 webpage = self._download_webpage(url, item_id) 160 161 # course path 162 if course_id: 163 module = self._parse_json( 164 self._search_regex( 165 r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'), 166 item_id) 167 entries = [] 168 chapter_number = None 169 chapter = None 170 chapter_id = None 171 for item in module['items']: 172 if not isinstance(item, dict): 173 continue 174 175 def type_field(key): 176 return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() 177 type_fields = (type_field('name'), type_field('slug')) 178 # Move to next module section 179 if 'section' in type_fields: 180 chapter = item.get('course_name') 181 chapter_id = item.get('course_module') 182 chapter_number = 1 if not chapter_number else chapter_number + 1 183 continue 184 # Skip non-lessons 185 if 'lesson' not in type_fields: 186 continue 187 lesson_url = urljoin(url, item.get('url')) 188 if not lesson_url: 189 continue 190 title = item.get('title') or item.get('lesson_name') 191 description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) 192 entries.append({ 193 '_type': 'url_transparent', 194 'url': lesson_url, 195 'ie_key': LinuxAcademyIE.ie_key(), 196 'title': title, 197 'description': description, 198 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), 199 'duration': parse_duration(item.get('duration')), 200 'chapter': chapter, 201 'chapter_id': chapter_id, 202 'chapter_number': chapter_number, 203 }) 204 return { 205 '_type': 'playlist', 206 'entries': entries, 207 'id': course_id, 208 'title': module.get('title'), 209 'description': module.get('md_desc') or clean_html(module.get('desc')), 210 'duration': parse_duration(module.get('duration')), 211 } 212 213 # single video path 214 m3u8_url = self._parse_json( 215 self._search_regex( 216 r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), 217 item_id)[0]['file'] 218 formats = self._extract_m3u8_formats( 219 m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', 220 m3u8_id='hls') 221 self._sort_formats(formats) 222 info = { 223 'id': item_id, 224 'formats': formats, 225 } 226 lesson = self._parse_json( 227 self._search_regex( 228 (r'window\.lesson\s*=\s*({.+?})\s*;', 229 r'player\.lesson\s*=\s*({.+?})\s*;'), 230 webpage, 'lesson', default='{}'), item_id, fatal=False) 231 if lesson: 232 info.update({ 233 'title': lesson.get('lesson_name'), 234 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), 235 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), 236 'duration': parse_duration(lesson.get('duration')), 237 }) 238 if not info.get('title'): 239 info['title'] = self._search_regex( 240 (r'>Lecture\s*:\s*(?P<value>[^<]+)', 241 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, 242 'title', group='value') 243 return info