[Vbox7IE] Sanitise ld+json containing unexpected characters
author dirkf <fieldhouse@gmx.net>
Sat, 27 Jan 2024 18:17:09 +0000 (18:17 +0000)
committer dirkf <fieldhouse@gmx.net>
Fri, 2 Feb 2024 12:36:05 +0000 (12:36 +0000)
* based on PR #29680
* added hack to force invoking `transform_source`
* fixes #26218

youtube_dl/extractor/vbox7.py

index c504c5311ed8b61bde434d0ccd37ddf099687fe7..d114ecb07404a1688919e15a3d8db971860216ad 100644 (file)
@@ -5,6 +5,7 @@ import re
 import time
 
 from .common import InfoExtractor
+from ..compat import compat_kwargs
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -75,6 +76,27 @@ class Vbox7IE(InfoExtractor):
         if mobj:
             return mobj.group('url')
 
+    # Wraps the inherited _parse_json(); *args/**kwargs stand for (transform_source=None, fatal=True)
+    def _parse_json(self, json_string, video_id, *args, **kwargs):
+        if '"@context"' in json_string[:30]:
+            # '"@context"' within the first 30 characters: treat as ld+json (a heuristic, "the way to bet")
+            transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
+            if not transform_source:  # a transform supplied by the caller is never replaced

+                def fix_chars(src):
+                    # fix malformed ld+json: inside each "..." run, replace a raw LF or CRLF with an escaped LF (backslash + n)
+                    return re.sub(
+                        r'"[^"]+"', lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)

+                if len(args) > 0:
+                    args = (fix_chars,) + args[1:]  # fill the positional slot the caller used
+                else:
+                    kwargs['transform_source'] = fix_chars
+                    kwargs = compat_kwargs(kwargs)  # NOTE(review): presumably normalises kwarg keys for older Pythons - confirm

+        return super(Vbox7IE, self)._parse_json(
+            json_string, video_id, *args, **kwargs)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         url = 'https://vbox7.com/play:%s' % (video_id,)