teamtreehouse.py (5504B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 clean_html, 9 determine_ext, 10 ExtractorError, 11 float_or_none, 12 get_element_by_class, 13 get_element_by_id, 14 parse_duration, 15 remove_end, 16 urlencode_postdata, 17 urljoin, 18 ) 19 20 21 class TeamTreeHouseIE(InfoExtractor): 22 _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)' 23 _TESTS = [{ 24 # Course 25 'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php', 26 'info_dict': { 27 'id': 'introduction-to-user-authentication-in-php', 28 'title': 'Introduction to User Authentication in PHP', 29 'description': 'md5:405d7b4287a159b27ddf30ca72b5b053', 30 }, 31 'playlist_mincount': 24, 32 }, { 33 # WorkShop 34 'url': 'https://teamtreehouse.com/library/deploying-a-react-app', 35 'info_dict': { 36 'id': 'deploying-a-react-app', 37 'title': 'Deploying a React App', 38 'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921', 39 }, 40 'playlist_mincount': 4, 41 }, { 42 # Video 43 'url': 'https://teamtreehouse.com/library/application-overview-2', 44 'info_dict': { 45 'id': 'application-overview-2', 46 'ext': 'mp4', 47 'title': 'Application Overview', 48 'description': 'md5:4b0a234385c27140a4378de5f1e15127', 49 }, 50 'expected_warnings': ['This is just a preview'], 51 }] 52 _NETRC_MACHINE = 'teamtreehouse' 53 54 def _real_initialize(self): 55 email, password = self._get_login_info() 56 if email is None: 57 return 58 59 signin_page = self._download_webpage( 60 'https://teamtreehouse.com/signin', 61 None, 'Downloading signin page') 62 data = self._form_hidden_inputs('new_user_session', signin_page) 63 data.update({ 64 'user_session[email]': email, 65 'user_session[password]': password, 66 }) 67 error_message = get_element_by_class('error-message', self._download_webpage( 68 'https://teamtreehouse.com/person_session', 69 None, 'Logging in', data=urlencode_postdata(data))) 70 if error_message: 71 raise ExtractorError(clean_html(error_message), expected=True) 72 73 def _real_extract(self, url): 74 display_id = self._match_id(url) 75 webpage = self._download_webpage(url, display_id) 76 title = self._html_search_meta(['og:title', 'twitter:title'], webpage) 77 description = self._html_search_meta( 78 ['description', 'og:description', 'twitter:description'], webpage) 79 entries = self._parse_html5_media_entries(url, webpage, display_id) 80 if entries: 81 info = entries[0] 82 83 for subtitles in info.get('subtitles', {}).values(): 84 for subtitle in subtitles: 85 subtitle['ext'] = determine_ext(subtitle['url'], 'srt') 86 87 is_preview = 'data-preview="true"' in webpage 88 if is_preview: 89 self.report_warning( 90 'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id) 91 duration = 30 92 else: 93 duration = float_or_none(self._search_regex( 94 r'data-duration="(\d+)"', webpage, 'duration'), 1000) 95 if not duration: 96 duration = parse_duration(get_element_by_id( 97 'video-duration', webpage)) 98 99 info.update({ 100 'id': display_id, 101 'title': title, 102 'description': description, 103 'duration': duration, 104 }) 105 return info 106 else: 107 def extract_urls(html, extract_info=None): 108 for path in re.findall(r'<a[^>]+href="([^"]+)"', html): 109 page_url = urljoin(url, path) 110 entry = { 111 '_type': 'url_transparent', 112 'id': self._match_id(page_url), 113 'url': page_url, 114 'id_key': self.ie_key(), 115 } 116 if extract_info: 117 entry.update(extract_info) 118 entries.append(entry) 119 120 workshop_videos = self._search_regex( 121 r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>', 122 webpage, 'workshop videos', default=None) 123 if workshop_videos: 124 extract_urls(workshop_videos) 125 else: 126 stages_path = self._search_regex( 127 r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"', 128 webpage, 'stages path') 129 if stages_path: 130 stages_page = self._download_webpage( 131 urljoin(url, stages_path), display_id, 'Downloading stages page') 132 for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1): 133 extract_urls(steps_list, { 134 'chapter': chapter, 135 'chapter_number': chapter_number, 136 }) 137 title = remove_end(title, ' Course') 138 139 return self.playlist_result( 140 entries, display_id, title, description)