piksel.py (7045B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..compat import compat_str 8 from ..utils import ( 9 dict_get, 10 ExtractorError, 11 int_or_none, 12 parse_iso8601, 13 try_get, 14 unescapeHTML, 15 ) 16 17 18 class PikselIE(InfoExtractor): 19 _VALID_URL = r'''(?x)https?:// 20 (?: 21 (?: 22 player\. 23 (?: 24 olympusattelecom| 25 vibebyvista 26 )| 27 (?:api|player)\.multicastmedia| 28 (?:api-ovp|player)\.piksel 29 )\.com| 30 (?: 31 mz-edge\.stream\.co| 32 movie-s\.nhk\.or 33 )\.jp| 34 vidego\.baltimorecity\.gov 35 )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' 36 _TESTS = [ 37 { 38 'url': 'http://player.piksel.com/v/ums2867l', 39 'md5': '34e34c8d89dc2559976a6079db531e85', 40 'info_dict': { 41 'id': 'ums2867l', 42 'ext': 'mp4', 43 'title': 'GX-005 with Caption', 44 'timestamp': 1481335659, 45 'upload_date': '20161210' 46 } 47 }, 48 { 49 # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al 50 'url': 'https://player.piksel.com/v/v80kqp41', 51 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', 52 'info_dict': { 53 'id': 'v80kqp41', 54 'ext': 'mp4', 55 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', 56 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.', 57 'timestamp': 1486171129, 58 'upload_date': '20170204' 59 } 60 }, 61 { 62 # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ 63 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', 64 'only_matching': True, 65 } 66 ] 67 68 @staticmethod 69 def _extract_url(webpage): 70 mobj = re.search( 71 r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', 72 webpage) 73 if mobj: 74 return mobj.group('url') 75 76 def _call_api(self, app_token, resource, display_id, query, fatal=True): 77 response = (self._download_json( 78 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), 79 display_id, query=query, fatal=fatal) or {}).get('response') 80 failure = try_get(response, lambda x: x['failure']['reason']) 81 if failure: 82 if fatal: 83 raise ExtractorError(failure, expected=True) 84 self.report_warning(failure) 85 return response 86 87 def _real_extract(self, url): 88 ref_id, display_id = re.match(self._VALID_URL, url).groups() 89 webpage = self._download_webpage(url, display_id) 90 app_token = self._search_regex([ 91 r'clientAPI\s*:\s*"([^"]+)"', 92 r'data-de-api-key\s*=\s*"([^"]+)"' 93 ], webpage, 'app token') 94 query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} 95 program = self._call_api( 96 app_token, 'program', display_id, query)['WsProgramResponse']['program'] 97 video_id = program['uuid'] 98 video_data = program['asset'] 99 title = video_data['title'] 100 asset_type = dict_get(video_data, ['assetType', 'asset_type']) 101 102 formats = [] 103 104 def process_asset_file(asset_file): 105 if not asset_file: 106 return 107 # TODO: extract rtmp formats 108 http_url = asset_file.get('http_url') 109 if not http_url: 110 return 111 tbr = None 112 vbr = int_or_none(asset_file.get('videoBitrate'), 1024) 113 abr = int_or_none(asset_file.get('audioBitrate'), 1024) 114 if asset_type == 'video': 115 tbr = vbr + abr 116 elif asset_type == 'audio': 117 tbr = abr 118 119 format_id = ['http'] 120 if tbr: 121 format_id.append(compat_str(tbr)) 122 123 formats.append({ 124 'format_id': '-'.join(format_id), 125 'url': unescapeHTML(http_url), 126 'vbr': vbr, 127 'abr': abr, 128 'width': int_or_none(asset_file.get('videoWidth')), 129 'height': int_or_none(asset_file.get('videoHeight')), 130 'filesize': int_or_none(asset_file.get('filesize')), 131 'tbr': tbr, 132 }) 133 134 def process_asset_files(asset_files): 135 for asset_file in (asset_files or []): 136 process_asset_file(asset_file) 137 138 process_asset_files(video_data.get('assetFiles')) 139 process_asset_file(video_data.get('referenceFile')) 140 if not formats: 141 asset_id = video_data.get('assetid') or program.get('assetid') 142 if asset_id: 143 process_asset_files(try_get(self._call_api( 144 app_token, 'asset_file', display_id, { 145 'assetid': asset_id, 146 }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) 147 148 m3u8_url = dict_get(video_data, [ 149 'm3u8iPadURL', 150 'ipadM3u8Url', 151 'm3u8AndroidURL', 152 'm3u8iPhoneURL', 153 'iphoneM3u8Url']) 154 if m3u8_url: 155 formats.extend(self._extract_m3u8_formats( 156 m3u8_url, video_id, 'mp4', 'm3u8_native', 157 m3u8_id='hls', fatal=False)) 158 159 smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) 160 if smil_url: 161 transform_source = None 162 if ref_id == 'nhkworld': 163 # TODO: figure out if this is something to be fixed in urljoin, 164 # _parse_smil_formats or keep it here 165 transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') 166 formats.extend(self._extract_smil_formats( 167 re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, 168 transform_source=transform_source, fatal=False)) 169 170 self._sort_formats(formats) 171 172 subtitles = {} 173 for caption in video_data.get('captions', []): 174 caption_url = caption.get('url') 175 if caption_url: 176 subtitles.setdefault(caption.get('locale', 'en'), []).append({ 177 'url': caption_url}) 178 179 return { 180 'id': video_id, 181 'title': title, 182 'description': video_data.get('description'), 183 'thumbnail': video_data.get('thumbnailUrl'), 184 'timestamp': parse_iso8601(video_data.get('dateadd')), 185 'formats': formats, 186 'subtitles': subtitles, 187 }