youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

viidea.py (7446B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 
      5 from .common import InfoExtractor
      6 from ..compat import (
      7     compat_HTTPError,
      8     compat_str,
      9     compat_urlparse,
     10 )
     11 from ..utils import (
     12     ExtractorError,
     13     js_to_json,
     14     parse_duration,
     15     parse_iso8601,
     16 )
     17 
     18 
     19 class ViideaIE(InfoExtractor):
     20     _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
     21             videolectures\.net|
     22             flexilearn\.viidea\.net|
     23             presentations\.ocwconsortium\.org|
     24             video\.travel-zoom\.si|
     25             video\.pomp-forum\.si|
     26             tv\.nil\.si|
     27             video\.hekovnik.com|
     28             video\.szko\.si|
     29             kpk\.viidea\.com|
     30             inside\.viidea\.net|
     31             video\.kiberpipa\.org|
     32             bvvideo\.si|
     33             kongres\.viidea\.net|
     34             edemokracija\.viidea\.com
     35         )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$'''
     36 
     37     _TESTS = [{
     38         'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
     39         'info_dict': {
     40             'id': '20171',
     41             'display_id': 'promogram_igor_mekjavic_eng',
     42             'ext': 'mp4',
     43             'title': 'Automatics, robotics and biocybernetics',
     44             'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
     45             'thumbnail': r're:http://.*\.jpg',
     46             'timestamp': 1372349289,
     47             'upload_date': '20130627',
     48             'duration': 565,
     49         },
     50         'params': {
     51             # m3u8 download
     52             'skip_download': True,
     53         },
     54     }, {
     55         # video with invalid direct format links (HTTP 403)
     56         'url': 'http://videolectures.net/russir2010_filippova_nlp/',
     57         'info_dict': {
     58             'id': '14891',
     59             'display_id': 'russir2010_filippova_nlp',
     60             'ext': 'flv',
     61             'title': 'NLP at Google',
     62             'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
     63             'thumbnail': r're:http://.*\.jpg',
     64             'timestamp': 1284375600,
     65             'upload_date': '20100913',
     66             'duration': 5352,
     67         },
     68         'params': {
     69             # rtmp download
     70             'skip_download': True,
     71         },
     72     }, {
     73         # event playlist
     74         'url': 'http://videolectures.net/deeplearning2015_montreal/',
     75         'info_dict': {
     76             'id': '23181',
     77             'title': 'Deep Learning Summer School, Montreal 2015',
     78             'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
     79             'thumbnail': r're:http://.*\.jpg',
     80             'timestamp': 1438560000,
     81         },
     82         'playlist_count': 30,
     83     }, {
     84         # multi part lecture
     85         'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
     86         'info_dict': {
     87             'id': '9737',
     88             'display_id': 'mlss09uk_bishop_ibi',
     89             'title': 'Introduction To Bayesian Inference',
     90             'thumbnail': r're:http://.*\.jpg',
     91             'timestamp': 1251622800,
     92         },
     93         'playlist': [{
     94             'info_dict': {
     95                 'id': '9737_part1',
     96                 'display_id': 'mlss09uk_bishop_ibi_part1',
     97                 'ext': 'wmv',
     98                 'title': 'Introduction To Bayesian Inference (Part 1)',
     99                 'thumbnail': r're:http://.*\.jpg',
    100                 'duration': 4622,
    101                 'timestamp': 1251622800,
    102                 'upload_date': '20090830',
    103             },
    104         }, {
    105             'info_dict': {
    106                 'id': '9737_part2',
    107                 'display_id': 'mlss09uk_bishop_ibi_part2',
    108                 'ext': 'wmv',
    109                 'title': 'Introduction To Bayesian Inference (Part 2)',
    110                 'thumbnail': r're:http://.*\.jpg',
    111                 'duration': 5641,
    112                 'timestamp': 1251622800,
    113                 'upload_date': '20090830',
    114             },
    115         }],
    116         'playlist_count': 2,
    117     }]
    118 
    119     def _real_extract(self, url):
    120         lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
    121 
    122         webpage = self._download_webpage(url, lecture_slug)
    123 
    124         cfg = self._parse_json(self._search_regex(
    125             [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function',
    126              r'cfg\s*:\s*({[^}]+})'],
    127             webpage, 'cfg'), lecture_slug, js_to_json)
    128 
    129         lecture_id = compat_str(cfg['obj_id'])
    130 
    131         base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
    132 
    133         try:
    134             lecture_data = self._download_json(
    135                 '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
    136                 lecture_id)['lecture'][0]
    137         except ExtractorError as e:
    138             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
    139                 msg = self._parse_json(
    140                     e.cause.read().decode('utf-8'), lecture_id)
    141                 raise ExtractorError(msg['detail'], expected=True)
    142             raise
    143 
    144         lecture_info = {
    145             'id': lecture_id,
    146             'display_id': lecture_slug,
    147             'title': lecture_data['title'],
    148             'timestamp': parse_iso8601(lecture_data.get('time')),
    149             'description': lecture_data.get('description_wiki'),
    150             'thumbnail': lecture_data.get('thumb'),
    151         }
    152 
    153         playlist_entries = []
    154         lecture_type = lecture_data.get('type')
    155         parts = [compat_str(video) for video in cfg.get('videos', [])]
    156         if parts:
    157             multipart = len(parts) > 1
    158 
    159             def extract_part(part_id):
    160                 smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
    161                 smil = self._download_smil(smil_url, lecture_id)
    162                 info = self._parse_smil(smil, smil_url, lecture_id)
    163                 self._sort_formats(info['formats'])
    164                 info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
    165                 info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
    166                 if multipart:
    167                     info['title'] += ' (Part %s)' % part_id
    168                 switch = smil.find('.//switch')
    169                 if switch is not None:
    170                     info['duration'] = parse_duration(switch.attrib.get('dur'))
    171                 item_info = lecture_info.copy()
    172                 item_info.update(info)
    173                 return item_info
    174 
    175             if explicit_part_id or not multipart:
    176                 result = extract_part(explicit_part_id or parts[0])
    177             else:
    178                 result = {
    179                     '_type': 'multi_video',
    180                     'entries': [extract_part(part) for part in parts],
    181                 }
    182                 result.update(lecture_info)
    183 
    184             # Immediately return explicitly requested part or non event item
    185             if explicit_part_id or lecture_type != 'evt':
    186                 return result
    187 
    188             playlist_entries.append(result)
    189 
    190         # It's probably a playlist
    191         if not parts or lecture_type == 'evt':
    192             playlist_webpage = self._download_webpage(
    193                 '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
    194             entries = [
    195                 self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
    196                 for _, video_url in re.findall(
    197                     r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
    198             playlist_entries.extend(entries)
    199 
    200         playlist = self.playlist_result(playlist_entries, lecture_id)
    201         playlist.update(lecture_info)
    202         return playlist