[jsinterp] Fix regexp parsing and .replace[All] method
author dirkf <fieldhouse@gmx.net>
Sun, 5 Mar 2023 23:07:07 +0000 (23:07 +0000)
committer dirkf <fieldhouse@gmx.net>
Tue, 7 Mar 2023 01:24:21 +0000 (01:24 +0000)
 * For performance, make regexp object instantiation lazy
 * Other small performance improvements

test/test_jsinterp.py
youtube_dl/jsinterp.py

index b5962356c773ab7c979b4f835ab5a08a652353d8..5d129433d42fc25e4e72ae93dfa4a6abaafc7596 100644 (file)
@@ -139,21 +139,16 @@ class TestJSInterpreter(unittest.TestCase):
         self.assertTrue(math.isnan(jsi.call_function('x')))
 
     def test_Date(self):
-        jsi = JSInterpreter('''
-        function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; }
-        ''')
-        self.assertEqual(jsi.call_function('x'), 86000)
-
         jsi = JSInterpreter('''
         function x(dt) { return new Date(dt) - 0; }
         ''')
         self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000)
 
         # date format m/d/y
-        jsi = JSInterpreter('''
-        function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; }
-        ''')
-        self.assertEqual(jsi.call_function('x'), 86000)
+        self.assertEqual(jsi.call_function('x', '12/31/1969 18:01:26 MDT'), 86000)
+
+        # epoch 0
+        self.assertEqual(jsi.call_function('x', '1 January 1970 00:00:00 UTC'), 0)
 
     def test_call(self):
         jsi = JSInterpreter('''
@@ -445,7 +440,7 @@ class TestJSInterpreter(unittest.TestCase):
         self.assertIs(jsi.call_function('x'), None)
 
         jsi = JSInterpreter('''
-        function x() { let a=/,,[/,913,/](,)}/; return a; }
+        function x() { let a=/,,[/,913,/](,)}/; "".replace(a, ""); return a; }
         ''')
         attrs = set(('findall', 'finditer', 'flags', 'groupindex',
                      'groups', 'match', 'pattern', 'scanner',
@@ -457,6 +452,31 @@ class TestJSInterpreter(unittest.TestCase):
         ''')
         self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I)
 
+        jsi = JSInterpreter(r'''
+        function x() { let a="data-name".replace("data-", ""); return a }
+        ''')
+        self.assertEqual(jsi.call_function('x'), 'name')
+
+        jsi = JSInterpreter(r'''
+        function x() { let a="data-name".replace(new RegExp("^.+-"), ""); return a; }
+        ''')
+        self.assertEqual(jsi.call_function('x'), 'name')
+
+        jsi = JSInterpreter(r'''
+        function x() { let a="data-name".replace(/^.+-/, ""); return a; }
+        ''')
+        self.assertEqual(jsi.call_function('x'), 'name')
+
+        jsi = JSInterpreter(r'''
+        function x() { let a="data-name".replace(/a/g, "o"); return a; }
+        ''')
+        self.assertEqual(jsi.call_function('x'), 'doto-nome')
+
+        jsi = JSInterpreter(r'''
+        function x() { let a="data-name".replaceAll("a", "o"); return a; }
+        ''')
+        self.assertEqual(jsi.call_function('x'), 'doto-nome')
+
         jsi = JSInterpreter(r'''
         function x() { let a=[/[)\\]/]; return a[0]; }
         ''')
@@ -485,6 +505,12 @@ class TestJSInterpreter(unittest.TestCase):
         jsi = JSInterpreter('function x(){return 1236566549 << 5}')
         self.assertEqual(jsi.call_function('x'), 915423904)
 
+    """ # fails so far
+    def test_packed(self):
+        jsi = JSInterpreter('''function x(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''')
+        self.assertEqual(jsi.call_function('x', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("<q />").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, 
.3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME
_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|')))
+    """
+
 
 if __name__ == '__main__':
     unittest.main()
index e28670a3f3de2bcc4c093ec8164de4909b5653fa..ab7d6f9263e87157049a33a8cfc6bc08909cef8b 100644 (file)
@@ -12,9 +12,11 @@ from .utils import (
     js_to_json,
     remove_quotes,
     unified_timestamp,
+    variadic,
 )
 from .compat import (
     compat_basestring,
+    compat_chr,
     compat_collections_chain_map as ChainMap,
     compat_itertools_zip_longest as zip_longest,
     compat_str,
@@ -205,10 +207,10 @@ class JSInterpreter(object):
             super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
 
     class JS_RegExp(object):
-        _RE_FLAGS = {
+        RE_FLAGS = {
             # special knowledge: Python's re flags are bitmask values, current max 128
             # invent new bitmask values well above that for literal parsing
-            # TODO: new pattern class to execute matches with these flags
+            # TODO: execute matches with these flags (remaining: d, y)
             'd': 1024,  # Generate indices for substring matches
             'g': 2048,  # Global search
             'i': re.I,  # Case-insensitive search
@@ -218,12 +220,19 @@ class JSInterpreter(object):
             'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
         }
 
-        def __init__(self, pattern_txt, flags=''):
+        def __init__(self, pattern_txt, flags=0):
             if isinstance(flags, compat_str):
                 flags, _ = self.regex_flags(flags)
-            # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
             # First, avoid https://github.com/python/cpython/issues/74534
-            self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags)
+            self.__self = None
+            self.__pattern_txt = pattern_txt.replace('[[', r'[\[')
+            self.__flags = flags
+
+        def __instantiate(self):
+            if self.__self:
+                return
+            self.__self = re.compile(self.__pattern_txt, self.__flags)
+            # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
             for name in dir(self.__self):
                 # Only these? Obviously __class__, __init__.
                 # PyPy creates a __weakref__ attribute with value None
@@ -232,15 +241,21 @@ class JSInterpreter(object):
                     continue
                 setattr(self, name, getattr(self.__self, name))
 
+        def __getattr__(self, name):
+            self.__instantiate()  # compile on first use; copies the pattern's attributes onto self
+            if name in self.__dict__:  # hasattr(self, name) here would re-enter __getattr__ endlessly
+                return self.__dict__[name]
+            raise AttributeError(name)  # `object` defines no __getattr__ to delegate to
+
         @classmethod
         def regex_flags(cls, expr):
             flags = 0
             if not expr:
                 return flags, expr
             for idx, ch in enumerate(expr):
-                if ch not in cls._RE_FLAGS:
+                if ch not in cls.RE_FLAGS:
                     break
-                flags |= cls._RE_FLAGS[ch]
+                flags |= cls.RE_FLAGS[ch]
             return flags, expr[idx + 1:]
 
     @classmethod
@@ -265,17 +280,17 @@ class JSInterpreter(object):
         counters = dict((k, 0) for k in _MATCHING_PARENS.values())
         start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
         in_quote, escaping, skipping = None, False, 0
-        after_op, in_regex_char_group, skip_re = True, False, 0
+        after_op, in_regex_char_group = True, False
 
         for idx, char in enumerate(expr):
-            if skip_re > 0:
-                skip_re -= 1
-                continue
+            paren_delta = 0
             if not in_quote:
                 if char in _MATCHING_PARENS:
                     counters[_MATCHING_PARENS[char]] += 1
+                    paren_delta = 1
                 elif char in counters:
                     counters[char] -= 1
+                    paren_delta = -1
             if not escaping:
                 if char in _QUOTES and in_quote in (char, None):
                     if in_quote or after_op or char != '/':
@@ -283,7 +298,7 @@ class JSInterpreter(object):
                 elif in_quote == '/' and char in '[]':
                     in_regex_char_group = char == '['
             escaping = not escaping and in_quote and char == '\\'
-            after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op))
+            after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and char.isspace()))
 
             if char != delim[pos] or any(counters.values()) or in_quote:
                 pos = skipping = 0
@@ -293,7 +308,7 @@ class JSInterpreter(object):
                 continue
             elif pos == 0 and skip_delims:
                 here = expr[idx:]
-                for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]:
+                for s in variadic(skip_delims):
                     if here.startswith(s) and s:
                         skipping = len(s) - 1
                         break
@@ -316,7 +331,7 @@ class JSInterpreter(object):
         separated = list(cls._separate(expr, delim, 1))
 
         if len(separated) < 2:
-            raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals()))
+            raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals()))
         return separated[0][1:].strip(), separated[1].strip()
 
     @staticmethod
@@ -361,6 +376,20 @@ class JSInterpreter(object):
         except TypeError:
             return self._named_object(namespace, obj)
 
+    # used below
+    _VAR_RET_THROW_RE = re.compile(r'''(?x)
+        (?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+)
+        ''')
+    _COMPOUND_RE = re.compile(r'''(?x)
+        (?P<try>try)\s*\{|
+        (?P<if>if)\s*\(|
+        (?P<switch>switch)\s*\(|
+        (?P<for>for)\s*\(|
+        (?P<while>while)\s*\(
+        ''')
+    _FINALLY_RE = re.compile(r'finally\s*\{')
+    _SWITCH_RE = re.compile(r'switch\s*\(')
+
     def interpret_statement(self, stmt, local_vars, allow_recursion=100):
         if allow_recursion < 0:
             raise self.Exception('Recursion limit reached')
@@ -375,7 +404,7 @@ class JSInterpreter(object):
             if should_return:
                 return ret, should_return
 
-        m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt)
+        m = self._VAR_RET_THROW_RE.match(stmt)
         if m:
             expr = stmt[len(m.group(0)):].strip()
             if m.group('throw'):
@@ -447,13 +476,7 @@ class JSInterpreter(object):
                 for item in self._separate(inner)])
             expr = name + outer
 
-        m = re.match(r'''(?x)
-                (?P<try>try)\s*\{|
-                (?P<if>if)\s*\(|
-                (?P<switch>switch)\s*\(|
-                (?P<for>for)\s*\(|
-                (?P<while>while)\s*\(
-                ''', expr)
+        m = self._COMPOUND_RE.match(expr)
         md = m.groupdict() if m else {}
         if md.get('if'):
             cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
@@ -512,7 +535,7 @@ class JSInterpreter(object):
                     err = None
                     pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion)
 
-            m = re.match(r'finally\s*\{', expr)
+            m = self._FINALLY_RE.match(expr)
             if m:
                 sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
                 ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
@@ -531,7 +554,7 @@ class JSInterpreter(object):
             if remaining.startswith('{'):
                 body, expr = self._separate_at_paren(remaining)
             else:
-                switch_m = re.match(r'switch\s*\(', remaining)  # FIXME
+                switch_m = self._SWITCH_RE.match(remaining)  # FIXME
                 if switch_m:
                     switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:])
                     body, expr = self._separate_at_paren(remaining, '}')
@@ -735,7 +758,7 @@ class JSInterpreter(object):
                 if obj == compat_str:
                     if member == 'fromCharCode':
                         assertion(argvals, 'takes one or more arguments')
-                        return ''.join(map(chr, argvals))
+                        return ''.join(map(compat_chr, argvals))
                     raise self.Exception('Unsupported string method ' + member, expr=expr)
                 elif obj == float:
                     if member == 'pow':
@@ -808,10 +831,17 @@ class JSInterpreter(object):
                     if idx >= len(obj):
                         return None
                     return ord(obj[idx])
-                elif member == 'replace':
+                elif member in ('replace', 'replaceAll'):
                     assertion(isinstance(obj, compat_str), 'must be applied on a string')
                     assertion(len(argvals) == 2, 'takes exactly two arguments')
-                    return re.sub(argvals[0], argvals[1], obj)
+                    # TODO: argvals[1] callable, other Py vs JS edge cases
+                    if isinstance(argvals[0], self.JS_RegExp):
+                        count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1
+                        assertion(member != 'replaceAll' or count == 0,
+                                  'replaceAll must be called with a global RegExp')
+                        return argvals[0].sub(argvals[1], obj, count=count)
+                    count = ('replaceAll', 'replace').index(member)
+                    return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count)
 
                 idx = int(member) if isinstance(obj, list) else member
                 return obj[idx](argvals, allow_recursion=allow_recursion)