openload.py (8184B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import json 5 import os 6 import subprocess 7 import tempfile 8 9 from ..compat import ( 10 compat_urlparse, 11 compat_kwargs, 12 ) 13 from ..utils import ( 14 check_executable, 15 encodeArgument, 16 ExtractorError, 17 get_exe_version, 18 is_outdated_version, 19 std_headers, 20 ) 21 22 23 def cookie_to_dict(cookie): 24 cookie_dict = { 25 'name': cookie.name, 26 'value': cookie.value, 27 } 28 if cookie.port_specified: 29 cookie_dict['port'] = cookie.port 30 if cookie.domain_specified: 31 cookie_dict['domain'] = cookie.domain 32 if cookie.path_specified: 33 cookie_dict['path'] = cookie.path 34 if cookie.expires is not None: 35 cookie_dict['expires'] = cookie.expires 36 if cookie.secure is not None: 37 cookie_dict['secure'] = cookie.secure 38 if cookie.discard is not None: 39 cookie_dict['discard'] = cookie.discard 40 try: 41 if (cookie.has_nonstandard_attr('httpOnly') 42 or cookie.has_nonstandard_attr('httponly') 43 or cookie.has_nonstandard_attr('HttpOnly')): 44 cookie_dict['httponly'] = True 45 except TypeError: 46 pass 47 return cookie_dict 48 49 50 def cookie_jar_to_list(cookie_jar): 51 return [cookie_to_dict(cookie) for cookie in cookie_jar] 52 53 54 class PhantomJSwrapper(object): 55 """PhantomJS wrapper class 56 57 This class is experimental. 58 """ 59 60 _TEMPLATE = r''' 61 phantom.onError = function(msg, trace) {{ 62 var msgStack = ['PHANTOM ERROR: ' + msg]; 63 if(trace && trace.length) {{ 64 msgStack.push('TRACE:'); 65 trace.forEach(function(t) {{ 66 msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line 67 + (t.function ? ' (in function ' + t.function +')' : '')); 68 }}); 69 }} 70 console.error(msgStack.join('\n')); 71 phantom.exit(1); 72 }}; 73 var page = require('webpage').create(); 74 var fs = require('fs'); 75 var read = {{ mode: 'r', charset: 'utf-8' }}; 76 var write = {{ mode: 'w', charset: 'utf-8' }}; 77 JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ 78 phantom.addCookie(x); 79 }}); 80 page.settings.resourceTimeout = {timeout}; 81 page.settings.userAgent = "{ua}"; 82 page.onLoadStarted = function() {{ 83 page.evaluate(function() {{ 84 delete window._phantom; 85 delete window.callPhantom; 86 }}); 87 }}; 88 var saveAndExit = function() {{ 89 fs.write("{html}", page.content, write); 90 fs.write("{cookies}", JSON.stringify(phantom.cookies), write); 91 phantom.exit(); 92 }}; 93 page.onLoadFinished = function(status) {{ 94 if(page.url === "") {{ 95 page.setContent(fs.read("{html}", read), "{url}"); 96 }} 97 else {{ 98 {jscode} 99 }} 100 }}; 101 page.open(""); 102 ''' 103 104 _TMP_FILE_NAMES = ['script', 'html', 'cookies'] 105 106 @staticmethod 107 def _version(): 108 return get_exe_version('phantomjs', version_re=r'([0-9.]+)') 109 110 def __init__(self, extractor, required_version=None, timeout=10000): 111 self._TMP_FILES = {} 112 113 self.exe = check_executable('phantomjs', ['-v']) 114 if not self.exe: 115 raise ExtractorError('PhantomJS executable not found in PATH, ' 116 'download it from http://phantomjs.org', 117 expected=True) 118 119 self.extractor = extractor 120 121 if required_version: 122 version = self._version() 123 if is_outdated_version(version, required_version): 124 self.extractor._downloader.report_warning( 125 'Your copy of PhantomJS is outdated, update it to version ' 126 '%s or newer if you encounter any errors.' % required_version) 127 128 self.options = { 129 'timeout': timeout, 130 } 131 for name in self._TMP_FILE_NAMES: 132 tmp = tempfile.NamedTemporaryFile(delete=False) 133 tmp.close() 134 self._TMP_FILES[name] = tmp 135 136 def __del__(self): 137 for name in self._TMP_FILE_NAMES: 138 try: 139 os.remove(self._TMP_FILES[name].name) 140 except (IOError, OSError, KeyError): 141 pass 142 143 def _save_cookies(self, url): 144 cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) 145 for cookie in cookies: 146 if 'path' not in cookie: 147 cookie['path'] = '/' 148 if 'domain' not in cookie: 149 cookie['domain'] = compat_urlparse.urlparse(url).netloc 150 with open(self._TMP_FILES['cookies'].name, 'wb') as f: 151 f.write(json.dumps(cookies).encode('utf-8')) 152 153 def _load_cookies(self): 154 with open(self._TMP_FILES['cookies'].name, 'rb') as f: 155 cookies = json.loads(f.read().decode('utf-8')) 156 for cookie in cookies: 157 if cookie['httponly'] is True: 158 cookie['rest'] = {'httpOnly': None} 159 if 'expiry' in cookie: 160 cookie['expire_time'] = cookie['expiry'] 161 self.extractor._set_cookie(**compat_kwargs(cookie)) 162 163 def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): 164 """ 165 Downloads webpage (if needed) and executes JS 166 167 Params: 168 url: website url 169 html: optional, html code of website 170 video_id: video id 171 note: optional, displayed when downloading webpage 172 note2: optional, displayed when executing JS 173 headers: custom http headers 174 jscode: code to be executed when page is loaded 175 176 Returns tuple with: 177 * downloaded website (after JS execution) 178 * anything you print with `console.log` (but not inside `page.execute`!) 179 180 In most cases you don't need to add any `jscode`. 181 It is executed in `page.onLoadFinished`. 182 `saveAndExit();` is mandatory, use it instead of `phantom.exit()` 183 It is possible to wait for some element on the webpage, for example: 184 var check = function() { 185 var elementFound = page.evaluate(function() { 186 return document.querySelector('#b.done') !== null; 187 }); 188 if(elementFound) 189 saveAndExit(); 190 else 191 window.setTimeout(check, 500); 192 } 193 194 page.evaluate(function(){ 195 document.querySelector('#a').click(); 196 }); 197 check(); 198 """ 199 if 'saveAndExit();' not in jscode: 200 raise ExtractorError('`saveAndExit();` not found in `jscode`') 201 if not html: 202 html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) 203 with open(self._TMP_FILES['html'].name, 'wb') as f: 204 f.write(html.encode('utf-8')) 205 206 self._save_cookies(url) 207 208 replaces = self.options 209 replaces['url'] = url 210 user_agent = headers.get('User-Agent') or std_headers['User-Agent'] 211 replaces['ua'] = user_agent.replace('"', '\\"') 212 replaces['jscode'] = jscode 213 214 for x in self._TMP_FILE_NAMES: 215 replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') 216 217 with open(self._TMP_FILES['script'].name, 'wb') as f: 218 f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) 219 220 if video_id is None: 221 self.extractor.to_screen('%s' % (note2,)) 222 else: 223 self.extractor.to_screen('%s: %s' % (video_id, note2)) 224 225 p = subprocess.Popen([ 226 self.exe, '--ssl-protocol=any', 227 self._TMP_FILES['script'].name 228 ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 229 out, err = p.communicate() 230 if p.returncode != 0: 231 raise ExtractorError( 232 'Executing JS failed\n:' + encodeArgument(err)) 233 with open(self._TMP_FILES['html'].name, 'rb') as f: 234 html = f.read().decode('utf-8') 235 236 self._load_cookies() 237 238 return (html, encodeArgument(out))