lecturio.py (8437B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 clean_html, 9 determine_ext, 10 ExtractorError, 11 float_or_none, 12 int_or_none, 13 str_or_none, 14 url_or_none, 15 urlencode_postdata, 16 urljoin, 17 ) 18 19 20 class LecturioBaseIE(InfoExtractor): 21 _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/' 22 _LOGIN_URL = 'https://app.lecturio.com/en/login' 23 _NETRC_MACHINE = 'lecturio' 24 25 def _real_initialize(self): 26 self._login() 27 28 def _login(self): 29 username, password = self._get_login_info() 30 if username is None: 31 return 32 33 # Sets some cookies 34 _, urlh = self._download_webpage_handle( 35 self._LOGIN_URL, None, 'Downloading login popup') 36 37 def is_logged(url_handle): 38 return self._LOGIN_URL not in url_handle.geturl() 39 40 # Already logged in 41 if is_logged(urlh): 42 return 43 44 login_form = { 45 'signin[email]': username, 46 'signin[password]': password, 47 'signin[remember]': 'on', 48 } 49 50 response, urlh = self._download_webpage_handle( 51 self._LOGIN_URL, None, 'Logging in', 52 data=urlencode_postdata(login_form)) 53 54 # Logged in successfully 55 if is_logged(urlh): 56 return 57 58 errors = self._html_search_regex( 59 r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, 60 'errors', default=None) 61 if errors: 62 raise ExtractorError('Unable to login: %s' % errors, expected=True) 63 raise ExtractorError('Unable to log in') 64 65 66 class LecturioIE(LecturioBaseIE): 67 _VALID_URL = r'''(?x) 68 https:// 69 (?: 70 app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| 71 (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag 72 ) 73 ''' 74 _TESTS = [{ 75 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', 76 'md5': '9a42cf1d8282a6311bf7211bbde26fde', 77 'info_dict': { 78 'id': '39634', 79 'ext': 'mp4', 80 'title': 'Important Concepts and Terms — Introduction to Microbiology', 81 }, 82 'skip': 'Requires lecturio account credentials', 83 }, { 84 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', 85 'only_matching': True, 86 }, { 87 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', 88 'only_matching': True, 89 }] 90 91 _CC_LANGS = { 92 'Arabic': 'ar', 93 'Bulgarian': 'bg', 94 'German': 'de', 95 'English': 'en', 96 'Spanish': 'es', 97 'Persian': 'fa', 98 'French': 'fr', 99 'Japanese': 'ja', 100 'Polish': 'pl', 101 'Pashto': 'ps', 102 'Russian': 'ru', 103 } 104 105 def _real_extract(self, url): 106 mobj = re.match(self._VALID_URL, url) 107 nt = mobj.group('nt') or mobj.group('nt_de') 108 lecture_id = mobj.group('id') 109 display_id = nt or lecture_id 110 api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json' 111 video = self._download_json( 112 self._API_BASE_URL + api_path, display_id) 113 title = video['title'].strip() 114 if not lecture_id: 115 pid = video.get('productId') or video.get('uid') 116 if pid: 117 spid = pid.split('_') 118 if spid and len(spid) == 2: 119 lecture_id = spid[1] 120 121 formats = [] 122 for format_ in video['content']['media']: 123 if not isinstance(format_, dict): 124 continue 125 file_ = format_.get('file') 126 if not file_: 127 continue 128 ext = determine_ext(file_) 129 if ext == 'smil': 130 # smil contains only broken RTMP formats anyway 131 continue 132 file_url = url_or_none(file_) 133 if not file_url: 134 continue 135 label = str_or_none(format_.get('label')) 136 filesize = int_or_none(format_.get('fileSize')) 137 f = { 138 'url': file_url, 139 'format_id': label, 140 'filesize': float_or_none(filesize, invscale=1000) 141 } 142 if label: 143 mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label) 144 if mobj: 145 f.update({ 146 'format_id': mobj.group(2), 147 'height': int(mobj.group(1)), 148 }) 149 formats.append(f) 150 self._sort_formats(formats) 151 152 subtitles = {} 153 automatic_captions = {} 154 captions = video.get('captions') or [] 155 for cc in captions: 156 cc_url = cc.get('url') 157 if not cc_url: 158 continue 159 cc_label = cc.get('translatedCode') 160 lang = cc.get('languageCode') or self._search_regex( 161 r'/([a-z]{2})_', cc_url, 'lang', 162 default=cc_label.split()[0] if cc_label else 'en') 163 original_lang = self._search_regex( 164 r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', 165 default=None) 166 sub_dict = (automatic_captions 167 if 'auto-translated' in cc_label or original_lang 168 else subtitles) 169 sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ 170 'url': cc_url, 171 }) 172 173 return { 174 'id': lecture_id or nt, 175 'title': title, 176 'formats': formats, 177 'subtitles': subtitles, 178 'automatic_captions': automatic_captions, 179 } 180 181 182 class LecturioCourseIE(LecturioBaseIE): 183 _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))' 184 _TESTS = [{ 185 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', 186 'info_dict': { 187 'id': 'microbiology-introduction', 188 'title': 'Microbiology: Introduction', 189 'description': 'md5:13da8500c25880c6016ae1e6d78c386a', 190 }, 191 'playlist_count': 45, 192 'skip': 'Requires lecturio account credentials', 193 }, { 194 'url': 'https://app.lecturio.com/#/course/c/6434', 195 'only_matching': True, 196 }] 197 198 def _real_extract(self, url): 199 nt, course_id = re.match(self._VALID_URL, url).groups() 200 display_id = nt or course_id 201 api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json' 202 course = self._download_json( 203 self._API_BASE_URL + api_path, display_id) 204 entries = [] 205 for lecture in course.get('lectures', []): 206 lecture_id = str_or_none(lecture.get('id')) 207 lecture_url = lecture.get('url') 208 if lecture_url: 209 lecture_url = urljoin(url, lecture_url) 210 else: 211 lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id) 212 entries.append(self.url_result( 213 lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) 214 return self.playlist_result( 215 entries, display_id, course.get('title'), 216 clean_html(course.get('description'))) 217 218 219 class LecturioDeCourseIE(LecturioBaseIE): 220 _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' 221 _TEST = { 222 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', 223 'only_matching': True, 224 } 225 226 def _real_extract(self, url): 227 display_id = self._match_id(url) 228 229 webpage = self._download_webpage(url, display_id) 230 231 entries = [] 232 for mobj in re.finditer( 233 r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>', 234 webpage): 235 lecture_url = urljoin(url, mobj.group('url')) 236 lecture_id = mobj.group('id') 237 entries.append(self.url_result( 238 lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) 239 240 title = self._search_regex( 241 r'<h1[^>]*>([^<]+)', webpage, 'title', default=None) 242 243 return self.playlist_result(entries, display_id, title)