techtalks.py (2529B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    get_element_by_attribute,
    clean_html,
)


class TechTalksIE(InfoExtractor):
    """Information extractor for talk pages hosted on techtalks.tv.

    A talk page always yields a presenter video; when the page also links
    to a slides stream, both are returned together as a playlist.
    """

    # The optional path segment is the human-readable slug; the trailing
    # number is the talk id.
    _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
        'info_dict': {
            'id': '57758',
            'title': 'Learning Topic Models --- Going beyond SVD',
        },
        'playlist': [
            {
                'info_dict': {
                    'id': '57758',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
            {
                'info_dict': {
                    'id': '57758-slides',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
        ],
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        'url': 'http://techtalks.tv/talks/57758',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict (or playlist) for the talk at *url*.

        Returns the presenter video's info dict when the page has no
        slides link; otherwise returns a playlist dict whose entries are
        the presenter video followed by the slides video.
        """
        talk_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, talk_id)

        # Both streams are served from the same connection URL and are
        # told apart only by their play path.
        rtmp_url = self._search_regex(
            r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
        presenter_path = self._search_regex(
            r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
            webpage, 'presenter play path')

        title_html = get_element_by_attribute('class', 'title', webpage)
        title = clean_html(title_html)

        presenter_entry = {
            'id': talk_id,
            'title': title,
            'url': rtmp_url,
            'play_path': presenter_path,
            'ext': 'flv',
        }

        slides_match = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
        if slides_match is None:
            # No slides stream on this page: a single video is the result.
            return presenter_entry

        # The slides video
        slides_entry = {
            'id': talk_id + '-slides',
            'title': title,
            'url': rtmp_url,
            'play_path': slides_match.group(1),
            'ext': 'flv',
        }
        return {
            '_type': 'playlist',
            'id': talk_id,
            'title': title,
            'entries': [presenter_entry, slides_entry],
        }