academicearth.py (1399B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class AcademicEarthCourseIE(InfoExtractor):
    """Playlist extractor for academicearth.org ``/playlists/<id>`` pages.

    The page is scraped for a title, an optional description and the
    lecture links; each link is wrapped with ``url_result`` and the
    collection is returned as a ``playlist`` result dict.
    """

    # The ``id`` group is the path segment right after ``/playlists/``.
    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
    IE_NAME = 'AcademicEarth:Course'
    _TEST = {
        'url': 'http://academicearth.org/playlists/laws-of-nature/',
        'info_dict': {
            'id': 'laws-of-nature',
            'title': 'Laws of Nature',
            'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Build the playlist result for a single course page.

        :param url: page URL; presumably already matched against
            ``_VALID_URL`` by the caller (verify in the base class).
        :return: dict with ``_type`` set to ``'playlist'`` plus ``id``,
            ``title``, ``description`` and ``entries``.
        """
        course_id = self._match_id(url)
        page = self._download_webpage(url, course_id)

        # The title is looked up before the description, and only the
        # description lookup is passed ``fatal=False``.
        # NOTE(review): presumably that makes a missing description
        # non-fatal (value may then be None) -- confirm in InfoExtractor.
        course_title = self._html_search_regex(
            r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', page, 'title')
        course_description = self._html_search_regex(
            r'<p class="excerpt"[^>]*?>(.*?)</p>',
            page, 'description', fatal=False)

        # One entry per "lecture-preview" list item, in page order; the
        # captured href is handed to ``url_result`` unchanged.
        lecture_entries = []
        for link in re.finditer(
                r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
                page):
            lecture_entries.append(self.url_result(link.group(1)))

        result = {'_type': 'playlist'}
        result['id'] = course_id
        result['title'] = course_title
        result['description'] = course_description
        result['entries'] = lecture_entries
        return result