vuclip.py (2254B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    parse_duration,
    remove_end,
)


class VuClipIE(InfoExtractor):
    """Extractor for video pages served from vuclip.com / m.vuclip.com.

    The numeric ``cid`` query parameter of the page URL is used as the
    video id.
    """

    _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'

    _TEST = {
        'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247&section=recommend',
        'info_dict': {
            'id': '1129900602',
            'ext': '3gp',
            'title': 'Top 10 TV Convicts',
            'duration': 733,
        }
    }

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for its video.

        Returns a dict with the keys ``id``, ``formats``, ``title`` and
        ``duration``.

        Raises ExtractorError when the page carries a
        ``<p class="message">`` error paragraph, or when neither the
        play-button link nor any HTML5 media entry yields a video URL.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Some responses are an interstitial whose "No..." button navigates
        # to the real page; follow that link when the button is present.
        skip_ad = re.search(
            r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
        if skip_ad:
            parsed = compat_urllib_parse_urlparse(url)
            # The button target is appended to scheme://host of the original
            # URL -- presumably it is always a host-relative path; verify if
            # the site starts emitting absolute links.
            adfree_url = parsed.scheme + '://' + parsed.netloc + skip_ad.group(1)
            webpage = self._download_webpage(
                adfree_url, video_id, note='Download post-ad page')

        # Surface the site's own error text instead of failing later on a
        # missing video URL.
        error_msg = self._html_search_regex(
            r'<p class="message">(.*?)</p>', webpage, 'error message',
            default=None)
        if error_msg:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error_msg), expected=True)

        # The site alternates between two page types: one links the video
        # from a play.gif button, the other embeds HTML5 media.
        video_url = self._search_regex(
            r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif',
            webpage, 'video URL', default=None)
        if video_url:
            formats = [{
                'url': video_url,
            }]
        else:
            entries = self._parse_html5_media_entries(url, webpage, video_id)
            # Guard the lookup so an unrecognised page layout is reported as
            # an extraction failure rather than a bare IndexError.
            if not entries:
                raise ExtractorError('Unable to extract video URL')
            formats = entries[0]['formats']

        # Drop the trailing "- Vuclip" branding (via the regex) and an
        # optional " - Video" suffix from the page title.
        raw_title = self._html_search_regex(
            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title')
        title = remove_end(raw_title.strip(), ' - Video')

        # Duration is shown as "MM:SS" either in parentheses or directly
        # before a <span>; it is optional, hence fatal=False.
        duration = parse_duration(self._html_search_regex(
            r'[(>]([0-9]+:[0-9]+)(?:<span|\))', webpage, 'duration',
            fatal=False))

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'duration': duration,
        }