youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

jsinterp.py (9199B)


      1 from __future__ import unicode_literals
      2 
      3 import json
      4 import operator
      5 import re
      6 
      7 from .utils import (
      8     ExtractorError,
      9     remove_quotes,
     10 )
     11 
     12 _OPERATORS = [
     13     ('|', operator.or_),
     14     ('^', operator.xor),
     15     ('&', operator.and_),
     16     ('>>', operator.rshift),
     17     ('<<', operator.lshift),
     18     ('-', operator.sub),
     19     ('+', operator.add),
     20     ('%', operator.mod),
     21     ('/', operator.truediv),
     22     ('*', operator.mul),
     23 ]
     24 _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
     25 _ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
     26 
     27 _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
     28 
     29 
     30 class JSInterpreter(object):
     31     def __init__(self, code, objects=None):
     32         if objects is None:
     33             objects = {}
     34         self.code = code
     35         self._functions = {}
     36         self._objects = objects
     37 
     38     def interpret_statement(self, stmt, local_vars, allow_recursion=100):
     39         if allow_recursion < 0:
     40             raise ExtractorError('Recursion limit reached')
     41 
     42         should_abort = False
     43         stmt = stmt.lstrip()
     44         stmt_m = re.match(r'var\s', stmt)
     45         if stmt_m:
     46             expr = stmt[len(stmt_m.group(0)):]
     47         else:
     48             return_m = re.match(r'return(?:\s+|$)', stmt)
     49             if return_m:
     50                 expr = stmt[len(return_m.group(0)):]
     51                 should_abort = True
     52             else:
     53                 # Try interpreting it as an expression
     54                 expr = stmt
     55 
     56         v = self.interpret_expression(expr, local_vars, allow_recursion)
     57         return v, should_abort
     58 
     59     def interpret_expression(self, expr, local_vars, allow_recursion):
     60         expr = expr.strip()
     61         if expr == '':  # Empty expression
     62             return None
     63 
     64         if expr.startswith('('):
     65             parens_count = 0
     66             for m in re.finditer(r'[()]', expr):
     67                 if m.group(0) == '(':
     68                     parens_count += 1
     69                 else:
     70                     parens_count -= 1
     71                     if parens_count == 0:
     72                         sub_expr = expr[1:m.start()]
     73                         sub_result = self.interpret_expression(
     74                             sub_expr, local_vars, allow_recursion)
     75                         remaining_expr = expr[m.end():].strip()
     76                         if not remaining_expr:
     77                             return sub_result
     78                         else:
     79                             expr = json.dumps(sub_result) + remaining_expr
     80                         break
     81             else:
     82                 raise ExtractorError('Premature end of parens in %r' % expr)
     83 
     84         for op, opfunc in _ASSIGN_OPERATORS:
     85             m = re.match(r'''(?x)
     86                 (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
     87                 \s*%s
     88                 (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
     89             if not m:
     90                 continue
     91             right_val = self.interpret_expression(
     92                 m.group('expr'), local_vars, allow_recursion - 1)
     93 
     94             if m.groupdict().get('index'):
     95                 lvar = local_vars[m.group('out')]
     96                 idx = self.interpret_expression(
     97                     m.group('index'), local_vars, allow_recursion)
     98                 assert isinstance(idx, int)
     99                 cur = lvar[idx]
    100                 val = opfunc(cur, right_val)
    101                 lvar[idx] = val
    102                 return val
    103             else:
    104                 cur = local_vars.get(m.group('out'))
    105                 val = opfunc(cur, right_val)
    106                 local_vars[m.group('out')] = val
    107                 return val
    108 
    109         if expr.isdigit():
    110             return int(expr)
    111 
    112         var_m = re.match(
    113             r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
    114             expr)
    115         if var_m:
    116             return local_vars[var_m.group('name')]
    117 
    118         try:
    119             return json.loads(expr)
    120         except ValueError:
    121             pass
    122 
    123         m = re.match(
    124             r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
    125         if m:
    126             val = local_vars[m.group('in')]
    127             idx = self.interpret_expression(
    128                 m.group('idx'), local_vars, allow_recursion - 1)
    129             return val[idx]
    130 
    131         m = re.match(
    132             r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
    133             expr)
    134         if m:
    135             variable = m.group('var')
    136             member = remove_quotes(m.group('member') or m.group('member2'))
    137             arg_str = m.group('args')
    138 
    139             if variable in local_vars:
    140                 obj = local_vars[variable]
    141             else:
    142                 if variable not in self._objects:
    143                     self._objects[variable] = self.extract_object(variable)
    144                 obj = self._objects[variable]
    145 
    146             if arg_str is None:
    147                 # Member access
    148                 if member == 'length':
    149                     return len(obj)
    150                 return obj[member]
    151 
    152             assert expr.endswith(')')
    153             # Function call
    154             if arg_str == '':
    155                 argvals = tuple()
    156             else:
    157                 argvals = tuple([
    158                     self.interpret_expression(v, local_vars, allow_recursion)
    159                     for v in arg_str.split(',')])
    160 
    161             if member == 'split':
    162                 assert argvals == ('',)
    163                 return list(obj)
    164             if member == 'join':
    165                 assert len(argvals) == 1
    166                 return argvals[0].join(obj)
    167             if member == 'reverse':
    168                 assert len(argvals) == 0
    169                 obj.reverse()
    170                 return obj
    171             if member == 'slice':
    172                 assert len(argvals) == 1
    173                 return obj[argvals[0]:]
    174             if member == 'splice':
    175                 assert isinstance(obj, list)
    176                 index, howMany = argvals
    177                 res = []
    178                 for i in range(index, min(index + howMany, len(obj))):
    179                     res.append(obj.pop(index))
    180                 return res
    181 
    182             return obj[member](argvals)
    183 
    184         for op, opfunc in _OPERATORS:
    185             m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
    186             if not m:
    187                 continue
    188             x, abort = self.interpret_statement(
    189                 m.group('x'), local_vars, allow_recursion - 1)
    190             if abort:
    191                 raise ExtractorError(
    192                     'Premature left-side return of %s in %r' % (op, expr))
    193             y, abort = self.interpret_statement(
    194                 m.group('y'), local_vars, allow_recursion - 1)
    195             if abort:
    196                 raise ExtractorError(
    197                     'Premature right-side return of %s in %r' % (op, expr))
    198             return opfunc(x, y)
    199 
    200         m = re.match(
    201             r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
    202         if m:
    203             fname = m.group('func')
    204             argvals = tuple([
    205                 int(v) if v.isdigit() else local_vars[v]
    206                 for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple()
    207             if fname not in self._functions:
    208                 self._functions[fname] = self.extract_function(fname)
    209             return self._functions[fname](argvals)
    210 
    211         raise ExtractorError('Unsupported JS expression %r' % expr)
    212 
    213     def extract_object(self, objname):
    214         _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
    215         obj = {}
    216         obj_m = re.search(
    217             r'''(?x)
    218                 (?<!this\.)%s\s*=\s*{\s*
    219                     (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
    220                 }\s*;
    221             ''' % (re.escape(objname), _FUNC_NAME_RE),
    222             self.code)
    223         fields = obj_m.group('fields')
    224         # Currently, it only supports function definitions
    225         fields_m = re.finditer(
    226             r'''(?x)
    227                 (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
    228             ''' % _FUNC_NAME_RE,
    229             fields)
    230         for f in fields_m:
    231             argnames = f.group('args').split(',')
    232             obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
    233 
    234         return obj
    235 
    236     def extract_function(self, funcname):
    237         func_m = re.search(
    238             r'''(?x)
    239                 (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
    240                 \((?P<args>[^)]*)\)\s*
    241                 \{(?P<code>[^}]+)\}''' % (
    242                 re.escape(funcname), re.escape(funcname), re.escape(funcname)),
    243             self.code)
    244         if func_m is None:
    245             raise ExtractorError('Could not find JS function %r' % funcname)
    246         argnames = func_m.group('args').split(',')
    247 
    248         return self.build_function(argnames, func_m.group('code'))
    249 
    250     def call_function(self, funcname, *args):
    251         f = self.extract_function(funcname)
    252         return f(args)
    253 
    254     def build_function(self, argnames, code):
    255         def resf(args):
    256             local_vars = dict(zip(argnames, args))
    257             for stmt in code.split(';'):
    258                 res, abort = self.interpret_statement(stmt, local_vars)
    259                 if abort:
    260                     break
    261             return res
    262         return resf