stanfordoc.py (3530B)
1 from __future__ import unicode_literals 2 3 import re 4 5 from .common import InfoExtractor 6 from ..utils import ( 7 ExtractorError, 8 orderedSet, 9 unescapeHTML, 10 ) 11 12 13 class StanfordOpenClassroomIE(InfoExtractor): 14 IE_NAME = 'stanfordoc' 15 IE_DESC = 'Stanford Open ClassRoom' 16 _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 17 _TEST = { 18 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', 19 'md5': '544a9468546059d4e80d76265b0443b8', 20 'info_dict': { 21 'id': 'PracticalUnix_intro-environment', 22 'ext': 'mp4', 23 'title': 'Intro Environment', 24 } 25 } 26 27 def _real_extract(self, url): 28 mobj = re.match(self._VALID_URL, url) 29 30 if mobj.group('course') and mobj.group('video'): # A specific video 31 course = mobj.group('course') 32 video = mobj.group('video') 33 info = { 34 'id': course + '_' + video, 35 'uploader': None, 36 'upload_date': None, 37 } 38 39 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' 40 xmlUrl = baseUrl + video + '.xml' 41 mdoc = self._download_xml(xmlUrl, info['id']) 42 try: 43 info['title'] = mdoc.findall('./title')[0].text 44 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text 45 except IndexError: 46 raise ExtractorError('Invalid metadata XML file') 47 return info 48 elif mobj.group('course'): # A course page 49 course = mobj.group('course') 50 info = { 51 'id': course, 52 '_type': 'playlist', 53 'uploader': None, 54 'upload_date': None, 55 } 56 57 coursepage = self._download_webpage( 58 url, info['id'], 59 note='Downloading course info page', 60 errnote='Unable to download course info page') 61 62 info['title'] = self._html_search_regex( 63 r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) 64 65 info['description'] = self._html_search_regex( 66 r'(?s)<description>([^<]+)</description>', 67 coursepage, 'description', fatal=False) 68 69 links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) 70 info['entries'] = [self.url_result( 71 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) 72 ) for l in links] 73 return info 74 else: # Root page 75 info = { 76 'id': 'Stanford OpenClassroom', 77 '_type': 'playlist', 78 'uploader': None, 79 'upload_date': None, 80 } 81 info['title'] = info['id'] 82 83 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 84 rootpage = self._download_webpage(rootURL, info['id'], 85 errnote='Unable to download course info page') 86 87 links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) 88 info['entries'] = [self.url_result( 89 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) 90 ) for l in links] 91 return info