viidea.py (7446B)
1 from __future__ import unicode_literals 2 3 import re 4 5 from .common import InfoExtractor 6 from ..compat import ( 7 compat_HTTPError, 8 compat_str, 9 compat_urlparse, 10 ) 11 from ..utils import ( 12 ExtractorError, 13 js_to_json, 14 parse_duration, 15 parse_iso8601, 16 ) 17 18 19 class ViideaIE(InfoExtractor): 20 _VALID_URL = r'''(?x)https?://(?:www\.)?(?: 21 videolectures\.net| 22 flexilearn\.viidea\.net| 23 presentations\.ocwconsortium\.org| 24 video\.travel-zoom\.si| 25 video\.pomp-forum\.si| 26 tv\.nil\.si| 27 video\.hekovnik.com| 28 video\.szko\.si| 29 kpk\.viidea\.com| 30 inside\.viidea\.net| 31 video\.kiberpipa\.org| 32 bvvideo\.si| 33 kongres\.viidea\.net| 34 edemokracija\.viidea\.com 35 )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' 36 37 _TESTS = [{ 38 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 39 'info_dict': { 40 'id': '20171', 41 'display_id': 'promogram_igor_mekjavic_eng', 42 'ext': 'mp4', 43 'title': 'Automatics, robotics and biocybernetics', 44 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', 45 'thumbnail': r're:http://.*\.jpg', 46 'timestamp': 1372349289, 47 'upload_date': '20130627', 48 'duration': 565, 49 }, 50 'params': { 51 # m3u8 download 52 'skip_download': True, 53 }, 54 }, { 55 # video with invalid direct format links (HTTP 403) 56 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 57 'info_dict': { 58 'id': '14891', 59 'display_id': 'russir2010_filippova_nlp', 60 'ext': 'flv', 61 'title': 'NLP at Google', 62 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', 63 'thumbnail': r're:http://.*\.jpg', 64 'timestamp': 1284375600, 65 'upload_date': '20100913', 66 'duration': 5352, 67 }, 68 'params': { 69 # rtmp download 70 'skip_download': True, 71 }, 72 }, { 73 # event playlist 74 'url': 'http://videolectures.net/deeplearning2015_montreal/', 75 'info_dict': { 76 'id': '23181', 77 'title': 'Deep Learning Summer School, Montreal 2015', 78 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', 79 'thumbnail': r're:http://.*\.jpg', 80 'timestamp': 1438560000, 81 }, 82 'playlist_count': 30, 83 }, { 84 # multi part lecture 85 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', 86 'info_dict': { 87 'id': '9737', 88 'display_id': 'mlss09uk_bishop_ibi', 89 'title': 'Introduction To Bayesian Inference', 90 'thumbnail': r're:http://.*\.jpg', 91 'timestamp': 1251622800, 92 }, 93 'playlist': [{ 94 'info_dict': { 95 'id': '9737_part1', 96 'display_id': 'mlss09uk_bishop_ibi_part1', 97 'ext': 'wmv', 98 'title': 'Introduction To Bayesian Inference (Part 1)', 99 'thumbnail': r're:http://.*\.jpg', 100 'duration': 4622, 101 'timestamp': 1251622800, 102 'upload_date': '20090830', 103 }, 104 }, { 105 'info_dict': { 106 'id': '9737_part2', 107 'display_id': 'mlss09uk_bishop_ibi_part2', 108 'ext': 'wmv', 109 'title': 'Introduction To Bayesian Inference (Part 2)', 110 'thumbnail': r're:http://.*\.jpg', 111 'duration': 5641, 112 'timestamp': 1251622800, 113 'upload_date': '20090830', 114 }, 115 }], 116 'playlist_count': 2, 117 }] 118 119 def _real_extract(self, url): 120 lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() 121 122 webpage = self._download_webpage(url, lecture_slug) 123 124 cfg = self._parse_json(self._search_regex( 125 [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', 126 r'cfg\s*:\s*({[^}]+})'], 127 webpage, 'cfg'), lecture_slug, js_to_json) 128 129 lecture_id = compat_str(cfg['obj_id']) 130 131 base_url = self._proto_relative_url(cfg['livepipe'], 'http:') 132 133 try: 134 lecture_data = self._download_json( 135 '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), 136 lecture_id)['lecture'][0] 137 except ExtractorError as e: 138 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: 139 msg = self._parse_json( 140 e.cause.read().decode('utf-8'), lecture_id) 141 raise ExtractorError(msg['detail'], expected=True) 142 raise 143 144 lecture_info = { 145 'id': lecture_id, 146 'display_id': lecture_slug, 147 'title': lecture_data['title'], 148 'timestamp': parse_iso8601(lecture_data.get('time')), 149 'description': lecture_data.get('description_wiki'), 150 'thumbnail': lecture_data.get('thumb'), 151 } 152 153 playlist_entries = [] 154 lecture_type = lecture_data.get('type') 155 parts = [compat_str(video) for video in cfg.get('videos', [])] 156 if parts: 157 multipart = len(parts) > 1 158 159 def extract_part(part_id): 160 smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) 161 smil = self._download_smil(smil_url, lecture_id) 162 info = self._parse_smil(smil, smil_url, lecture_id) 163 self._sort_formats(info['formats']) 164 info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) 165 info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) 166 if multipart: 167 info['title'] += ' (Part %s)' % part_id 168 switch = smil.find('.//switch') 169 if switch is not None: 170 info['duration'] = parse_duration(switch.attrib.get('dur')) 171 item_info = lecture_info.copy() 172 item_info.update(info) 173 return item_info 174 175 if explicit_part_id or not multipart: 176 result = extract_part(explicit_part_id or parts[0]) 177 else: 178 result = { 179 '_type': 'multi_video', 180 'entries': [extract_part(part) for part in parts], 181 } 182 result.update(lecture_info) 183 184 # Immediately return explicitly requested part or non event item 185 if explicit_part_id or lecture_type != 'evt': 186 return result 187 188 playlist_entries.append(result) 189 190 # It's probably a playlist 191 if not parts or lecture_type == 'evt': 192 playlist_webpage = self._download_webpage( 193 '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) 194 entries = [ 195 self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') 196 for _, video_url in re.findall( 197 r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] 198 playlist_entries.extend(entries) 199 200 playlist = self.playlist_result(playlist_entries, lecture_id) 201 playlist.update(lecture_info) 202 return playlist