teachable.py (10499B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from .wistia import WistiaIE
from ..utils import (
    clean_html,
    ExtractorError,
    int_or_none,
    get_element_by_class,
    strip_or_none,
    urlencode_postdata,
    urljoin,
)


class TeachableBaseIE(InfoExtractor):
    # Common base for the lecture and course extractors below: holds the list
    # of known sites, the URL prefix used for arbitrary hosts and the login
    # routine shared by both.
    _NETRC_MACHINE = 'teachable'
    _URL_PREFIX = 'teachable:'

    # Maps a known host name to the netrc machine name used to look up its
    # credentials (see _login).
    _SITES = {
        # Only notable ones here
        'v1.upskillcourses.com': 'upskill',
        'gns3.teachable.com': 'gns3',
        'academyhacker.com': 'academyhacker',
        'stackskills.com': 'stackskills',
        'market.saleshacker.com': 'saleshacker',
        'learnability.org': 'learnability',
        'edurila.com': 'edurila',
        'courses.workitdaily.com': 'workitdaily',
    }

    # (prefix, alternation of escaped known hosts); interpolated into the two
    # %s placeholders of each subclass' _VALID_URL.
    _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES))

    # Markup that only shows up on a page served to a signed-in user.
    _LOGGED_IN_RES = (
        r'class=["\']user-signout',
        r'<a[^>]+\bhref=["\']/sign_out',
        r'Log\s+[Oo]ut\s*<',
    )

    def _real_initialize(self):
        # Login state is tracked per host: one extractor instance may be asked
        # to handle URLs from several sites, and being signed in to one of
        # them says nothing about the others.
        self._logged_in_sites = set()

    @classmethod
    def _is_logged(cls, webpage):
        """Return True if ``webpage`` contains any of the signed-in markers."""
        return any(re.search(p, webpage) for p in cls._LOGGED_IN_RES)

    def _login(self, site):
        """Sign in to ``site`` (a host name) with the configured credentials.

        Does nothing when this instance has already signed in to ``site`` or
        when no username is available. Credentials are looked up under the
        netrc machine mapped in ``_SITES``, falling back to the host itself.

        Raises ExtractorError when the site demands acceptance of a new
        privacy policy, when it reports a login error, or when the response
        does not look like a signed-in page.
        """
        if site in self._logged_in_sites:
            return

        username, password = self._get_login_info(
            netrc_machine=self._SITES.get(site, site))
        if username is None:
            return

        login_page, urlh = self._download_webpage_handle(
            'https://%s/sign_in' % site, None,
            'Downloading %s login page' % site)

        # The sign-in page may already be served in a signed-in state
        if self._is_logged(login_page):
            self._logged_in_sites.add(site)
            return

        # Final URL after any redirects; used as the Referer and as the base
        # for a relative form action
        login_url = urlh.geturl()

        login_form = self._hidden_inputs(login_page)
        login_form.update({
            'user[email]': username,
            'user[password]': password,
        })

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
            'post url', default=login_url, group='url')

        if not post_url.startswith('http'):
            post_url = urljoin(login_url, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in to %s' % site,
            data=urlencode_postdata(login_form),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': login_url,
            })

        if '>I accept the new Privacy Policy<' in response:
            raise ExtractorError(
                'Unable to login: %s asks you to accept new Privacy Policy. '
                'Go to https://%s/ and accept.' % (site, site), expected=True)

        # Successful login
        if self._is_logged(response):
            self._logged_in_sites.add(site)
            return

        message = get_element_by_class('alert', response)
        if message is not None:
            raise ExtractorError(
                'Unable to login: %s' % clean_html(message), expected=True)

        raise ExtractorError('Unable to log in')


class TeachableIE(TeachableBaseIE):
    # Extracts a single lecture page; the videos themselves are delegated to
    # WistiaIE.
    _VALID_URL = r'''(?x)
                    (?:
                        %shttps?://(?P<site_t>[^/]+)|
                        https?://(?:www\.)?(?P<site>%s)
                    )
                    /courses/[^/]+/lectures/(?P<id>\d+)
                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE

    _TESTS = [{
        'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364',
        'info_dict': {
            'id': 'untlgzk1v7',
            'ext': 'bin',
            'title': 'Overview',
            'description': 'md5:071463ff08b86c208811130ea1c2464c',
            'duration': 736.4,
            'timestamp': 1542315762,
            'upload_date': '20181115',
            'chapter': 'Welcome',
            'chapter_number': 1,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
        'only_matching': True,
    }, {
        'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939',
        'only_matching': True,
    }, {
        'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
        'only_matching': True,
    }]

    # Markup indicating that the lecture is not accessible to the current user
    _LOCKED_RES = (
        r'class=["\']lecture-contents-locked',
        r'>\s*Lecture contents locked',
        r'id=["\']lecture-locked',
        # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313
        r'class=["\'](?:inner-)?lesson-locked',
        r'>LESSON LOCKED<',
    )

    @staticmethod
    def _is_teachable(webpage):
        """Return a truthy value if ``webpage`` carries both Teachable markers.

        The page must contain the tracker auto-link snippet and a <link>
        pointing at a teachablecdn.com asset host.
        """
        return 'teachableTracker.linker:autoLink' in webpage and re.search(
            r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
            webpage)

    @staticmethod
    def _extract_url(webpage, source_url):
        """Return ``source_url`` with the ``teachable:`` prefix, or None.

        A URL is only produced when ``webpage`` passes ``_is_teachable`` and
        ``source_url`` has a ``/courses`` or ``/p`` path.
        """
        if not TeachableIE._is_teachable(webpage):
            return
        if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
            return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)

    def _real_extract(self, url):
        """Return a playlist of url_transparent Wistia entries for a lecture.

        Raises a login-required error when the page shows a locked-lecture
        marker, and ExtractorError when no Wistia URL is found otherwise.
        """
        mobj = re.match(self._VALID_URL, url)
        site = mobj.group('site') or mobj.group('site_t')
        video_id = mobj.group('id')

        self._login(site)

        # Strip the "teachable:" prefix to obtain the real page URL
        if url.startswith(self._URL_PREFIX):
            url = url[len(self._URL_PREFIX):]

        webpage = self._download_webpage(url, video_id)

        wistia_urls = WistiaIE._extract_urls(webpage)
        if not wistia_urls:
            if any(re.search(p, webpage) for p in self._LOCKED_RES):
                self.raise_login_required('Lecture contents locked')
            raise ExtractorError('Unable to find video URL')

        title = self._og_search_title(webpage, default=None)

        chapter = None
        chapter_number = None
        # The closing quote is matched right after the id so that a lecture
        # whose id merely starts with video_id (e.g. 1234 for 123) is not
        # picked up by mistake.
        section_item = self._search_regex(
            r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s["\'][^>]*>.+?</li>)' % video_id,
            webpage, 'section item', default=None, group='li')
        if section_item:
            chapter_number = int_or_none(self._search_regex(
                r'data-ss-position=["\'](\d+)', section_item, 'section id',
                default=None))
            if chapter_number is not None:
                sections = []
                for s in re.findall(
                        r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage):
                    section = strip_or_none(clean_html(s))
                    if not section:
                        # An empty title makes the position -> title mapping
                        # unreliable, so no chapter name is reported at all
                        sections = []
                        break
                    sections.append(section)
                # The position is used as a 1-based index; the lower bound
                # keeps a position of 0 from wrapping around to the last
                # section via sections[-1].
                if 1 <= chapter_number <= len(sections):
                    chapter = sections[chapter_number - 1]

        entries = [{
            '_type': 'url_transparent',
            'url': wistia_url,
            'ie_key': WistiaIE.ie_key(),
            'title': title,
            'chapter': chapter,
            'chapter_number': chapter_number,
        } for wistia_url in wistia_urls]

        return self.playlist_result(entries, video_id, title)


class TeachableCourseIE(TeachableBaseIE):
    # Extracts a course page as a playlist of lecture URLs handled by
    # TeachableIE.
    _VALID_URL = r'''(?x)
                        (?:
                            %shttps?://(?P<site_t>[^/]+)|
                            https?://(?:www\.)?(?P<site>%s)
                        )
                        /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+)
                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
    _TESTS = [{
        'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/',
        'info_dict': {
            'id': 'essential-web-developer-course',
            'title': 'The Essential Web Developer Course (Free)',
        },
        'playlist_count': 192,
    }, {
        'url': 'http://v1.upskillcourses.com/courses/119763/',
        'only_matching': True,
    }, {
        'url': 'http://v1.upskillcourses.com/courses/enrolled/119763',
        'only_matching': True,
    }, {
        'url': 'https://gns3.teachable.com/courses/enrolled/423415',
        'only_matching': True,
    }, {
        'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini',
        'only_matching': True,
    }, {
        'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match course URLs only; anything TeachableIE accepts is left to it."""
        if TeachableIE.suitable(url):
            return False
        return super(TeachableCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one TeachableIE url_result per lecture item."""
        mobj = re.match(self._VALID_URL, url)
        site = mobj.group('site') or mobj.group('site_t')
        course_id = mobj.group('id')

        self._login(site)

        prefixed = url.startswith(self._URL_PREFIX)
        if prefixed:
            url = url[len(self._URL_PREFIX):]

        webpage = self._download_webpage(url, course_id)

        url_base = 'https://%s/' % site

        entries = []

        for item_mobj in re.finditer(
                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
                webpage):
            li = item_mobj.group('li')
            # Keep only items that carry a play icon or something that looks
            # like an mm:ss duration
            if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li):
                continue
            lecture_url = self._search_regex(
                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
                'lecture url', default=None, group='url')
            if not lecture_url:
                continue
            lecture_id = self._search_regex(
                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
            title = self._html_search_regex(
                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
                'title', default=None)
            entry_url = urljoin(url_base, lecture_url)
            if prefixed:
                # Carry the prefix over so that lectures on hosts missing
                # from _SITES still match TeachableIE
                entry_url = self._URL_PREFIX + entry_url
            entries.append(
                self.url_result(
                    entry_url,
                    ie=TeachableIE.ie_key(), video_id=lecture_id,
                    video_title=clean_html(title)))

        course_title = self._html_search_regex(
            (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
             r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
            webpage, 'course title', fatal=False)

        return self.playlist_result(entries, course_id, course_title)