youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

techtalks.py (2529B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 
      5 from .common import InfoExtractor
      6 from ..utils import (
      7     get_element_by_attribute,
      8     clean_html,
      9 )
     10 
     11 
     12 class TechTalksIE(InfoExtractor):
     13     _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)'
     14 
     15     _TESTS = [{
     16         'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
     17         'info_dict': {
     18             'id': '57758',
     19             'title': 'Learning Topic Models --- Going beyond SVD',
     20         },
     21         'playlist': [
     22             {
     23                 'info_dict': {
     24                     'id': '57758',
     25                     'ext': 'flv',
     26                     'title': 'Learning Topic Models --- Going beyond SVD',
     27                 },
     28             },
     29             {
     30                 'info_dict': {
     31                     'id': '57758-slides',
     32                     'ext': 'flv',
     33                     'title': 'Learning Topic Models --- Going beyond SVD',
     34                 },
     35             },
     36         ],
     37         'params': {
     38             # rtmp download
     39             'skip_download': True,
     40         },
     41     }, {
     42         'url': 'http://techtalks.tv/talks/57758',
     43         'only_matching': True,
     44     }]
     45 
     46     def _real_extract(self, url):
     47         mobj = re.match(self._VALID_URL, url)
     48         talk_id = mobj.group('id')
     49         webpage = self._download_webpage(url, talk_id)
     50         rtmp_url = self._search_regex(
     51             r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
     52         play_path = self._search_regex(
     53             r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
     54             webpage, 'presenter play path')
     55         title = clean_html(get_element_by_attribute('class', 'title', webpage))
     56         video_info = {
     57             'id': talk_id,
     58             'title': title,
     59             'url': rtmp_url,
     60             'play_path': play_path,
     61             'ext': 'flv',
     62         }
     63         m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
     64         if m_slides is None:
     65             return video_info
     66         else:
     67             return {
     68                 '_type': 'playlist',
     69                 'id': talk_id,
     70                 'title': title,
     71                 'entries': [
     72                     video_info,
     73                     # The slides video
     74                     {
     75                         'id': talk_id + '-slides',
     76                         'title': title,
     77                         'url': rtmp_url,
     78                         'play_path': m_slides.group(1),
     79                         'ext': 'flv',
     80                     },
     81                 ],
     82             }