sohu.py (6911B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..compat import ( 8 compat_str, 9 compat_urllib_parse_urlencode, 10 ) 11 from ..utils import ( 12 ExtractorError, 13 int_or_none, 14 try_get, 15 ) 16 17 18 class SohuIE(InfoExtractor): 19 _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' 20 21 # Sohu videos give different MD5 sums on Travis CI and my machine 22 _TESTS = [{ 23 'note': 'This video is available only in Mainland China', 24 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', 25 'info_dict': { 26 'id': '382479172', 27 'ext': 'mp4', 28 'title': 'MV:Far East Movement《The Illest》', 29 }, 30 'skip': 'On available in China', 31 }, { 32 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', 33 'info_dict': { 34 'id': '409385080', 35 'ext': 'mp4', 36 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', 37 } 38 }, { 39 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 40 'info_dict': { 41 'id': '78693464', 42 'ext': 'mp4', 43 'title': '【爱范品】第31期:MWC见不到的奇葩手机', 44 } 45 }, { 46 'note': 'Multipart video', 47 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', 48 'info_dict': { 49 'id': '78910339', 50 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 51 }, 52 'playlist': [{ 53 'info_dict': { 54 'id': '78910339_part1', 55 'ext': 'mp4', 56 'duration': 294, 57 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 58 } 59 }, { 60 'info_dict': { 61 'id': '78910339_part2', 62 'ext': 'mp4', 63 'duration': 300, 64 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 65 } 66 }, { 67 'info_dict': { 68 'id': '78910339_part3', 69 'ext': 'mp4', 70 'duration': 150, 71 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 72 } 73 }] 74 }, { 75 'note': 'Video with title containing dash', 76 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', 77 'info_dict': { 78 'id': '78932792', 79 'ext': 'mp4', 80 'title': 'youtube-dl testing video', 81 }, 82 'params': { 83 'skip_download': True 84 } 85 }] 86 87 def _real_extract(self, url): 88 89 def _fetch_data(vid_id, mytv=False): 90 if mytv: 91 base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' 92 else: 93 base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' 94 95 return self._download_json( 96 base_data_url + vid_id, video_id, 97 'Downloading JSON data for %s' % vid_id, 98 headers=self.geo_verification_headers()) 99 100 mobj = re.match(self._VALID_URL, url) 101 video_id = mobj.group('id') 102 mytv = mobj.group('mytv') is not None 103 104 webpage = self._download_webpage(url, video_id) 105 106 title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) 107 108 vid = self._html_search_regex( 109 r'var vid ?= ?["\'](\d+)["\']', 110 webpage, 'video path') 111 vid_data = _fetch_data(vid, mytv) 112 if vid_data['play'] != 1: 113 if vid_data.get('status') == 12: 114 raise ExtractorError( 115 '%s said: There\'s something wrong in the video.' % self.IE_NAME, 116 expected=True) 117 else: 118 self.raise_geo_restricted( 119 '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME) 120 121 formats_json = {} 122 for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): 123 vid_id = vid_data['data'].get('%sVid' % format_id) 124 if not vid_id: 125 continue 126 vid_id = compat_str(vid_id) 127 formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) 128 129 part_count = vid_data['data']['totalBlocks'] 130 131 playlist = [] 132 for i in range(part_count): 133 formats = [] 134 for format_id, format_data in formats_json.items(): 135 allot = format_data['allot'] 136 137 data = format_data['data'] 138 clips_url = data['clipsURL'] 139 su = data['su'] 140 141 video_url = 'newflv.sohu.ccgslb.net' 142 cdnId = None 143 retries = 0 144 145 while 'newflv.sohu.ccgslb.net' in video_url: 146 params = { 147 'prot': 9, 148 'file': clips_url[i], 149 'new': su[i], 150 'prod': 'flash', 151 'rb': 1, 152 } 153 154 if cdnId is not None: 155 params['idc'] = cdnId 156 157 download_note = 'Downloading %s video URL part %d of %d' % ( 158 format_id, i + 1, part_count) 159 160 if retries > 0: 161 download_note += ' (retry #%d)' % retries 162 part_info = self._parse_json(self._download_webpage( 163 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), 164 video_id, download_note), video_id) 165 166 video_url = part_info['url'] 167 cdnId = part_info.get('nid') 168 169 retries += 1 170 if retries > 5: 171 raise ExtractorError('Failed to get video URL') 172 173 formats.append({ 174 'url': video_url, 175 'format_id': format_id, 176 'filesize': int_or_none( 177 try_get(data, lambda x: x['clipsBytes'][i])), 178 'width': int_or_none(data.get('width')), 179 'height': int_or_none(data.get('height')), 180 'fps': int_or_none(data.get('fps')), 181 }) 182 self._sort_formats(formats) 183 184 playlist.append({ 185 'id': '%s_part%d' % (video_id, i + 1), 186 'title': title, 187 'duration': vid_data['data']['clipsDuration'][i], 188 'formats': formats, 189 }) 190 191 if len(playlist) == 1: 192 info = playlist[0] 193 info['id'] = video_id 194 else: 195 info = { 196 '_type': 'multi_video', 197 'entries': playlist, 198 'id': video_id, 199 'title': title, 200 } 201 202 return info