veehd.py (4134B)
1 from __future__ import unicode_literals 2 3 import re 4 import json 5 6 from .common import InfoExtractor 7 from ..compat import ( 8 compat_urllib_parse_unquote, 9 compat_urlparse, 10 ) 11 from ..utils import ( 12 ExtractorError, 13 clean_html, 14 get_element_by_id, 15 ) 16 17 18 class VeeHDIE(InfoExtractor): 19 _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)' 20 21 # Seems VeeHD videos have multiple copies on several servers, all of 22 # whom have different MD5 checksums, so omit md5 field in all tests 23 _TESTS = [{ 24 'url': 'http://veehd.com/video/4639434_Solar-Sinter', 25 'info_dict': { 26 'id': '4639434', 27 'ext': 'mp4', 28 'title': 'Solar Sinter', 29 'uploader_id': 'VideoEyes', 30 'description': 'md5:46a840e8692ddbaffb5f81d9885cb457', 31 }, 32 'skip': 'Video deleted', 33 }, { 34 'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling', 35 'info_dict': { 36 'id': '4905758', 37 'ext': 'mp4', 38 'title': 'Elysian Fields - Channeling', 39 'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b', 40 'uploader_id': 'spotted', 41 } 42 }, { 43 'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer', 44 'info_dict': { 45 'id': '2046729', 46 'ext': 'avi', 47 'title': '2012 (2009) DivX Trailer', 48 'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b', 49 'uploader_id': 'Movie_Trailers', 50 } 51 }] 52 53 def _real_extract(self, url): 54 video_id = self._match_id(url) 55 56 # VeeHD seems to send garbage on the first request. 57 # See https://github.com/ytdl-org/youtube-dl/issues/2102 58 self._download_webpage(url, video_id, 'Requesting webpage') 59 webpage = self._download_webpage(url, video_id) 60 61 if 'This video has been removed<' in webpage: 62 raise ExtractorError('Video %s has been removed' % video_id, expected=True) 63 64 player_path = self._search_regex( 65 r'\$\("#playeriframe"\).attr\({src : "(.+?)"', 66 webpage, 'player path') 67 player_url = compat_urlparse.urljoin(url, player_path) 68 69 self._download_webpage(player_url, video_id, 'Requesting player page') 70 player_page = self._download_webpage( 71 player_url, video_id, 'Downloading player page') 72 73 video_url = None 74 75 config_json = self._search_regex( 76 r'value=\'config=({.+?})\'', player_page, 'config json', default=None) 77 78 if config_json: 79 config = json.loads(config_json) 80 video_url = compat_urllib_parse_unquote(config['clip']['url']) 81 82 if not video_url: 83 video_url = self._html_search_regex( 84 r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"', 85 player_page, 'video url', default=None) 86 87 if not video_url: 88 iframe_src = self._search_regex( 89 r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url') 90 iframe_url = 'http://veehd.com/%s' % iframe_src 91 92 self._download_webpage(iframe_url, video_id, 'Requesting iframe page') 93 iframe_page = self._download_webpage( 94 iframe_url, video_id, 'Downloading iframe page') 95 96 video_url = self._search_regex( 97 r"file\s*:\s*'([^']+)'", iframe_page, 'video url') 98 99 title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) 100 uploader_id = self._html_search_regex( 101 r'<a href="/profile/\d+">(.+?)</a>', 102 webpage, 'uploader') 103 thumbnail = self._search_regex( 104 r'<img id="veehdpreview" src="(.+?)"', 105 webpage, 'thumbnail') 106 description = self._html_search_regex( 107 r'<td class="infodropdown".*?<div>(.*?)<ul', 108 webpage, 'description', flags=re.DOTALL) 109 110 return { 111 '_type': 'video', 112 'id': video_id, 113 'title': title, 114 'url': video_url, 115 'uploader_id': uploader_id, 116 'thumbnail': thumbnail, 117 'description': description, 118 }