[core] Handle `/../` sequences in HTTP URLs
authordirkf <fieldhouse@gmx.net>
Mon, 13 Mar 2023 19:45:54 +0000 (19:45 +0000)
committerdirkf <fieldhouse@gmx.net>
Tue, 14 Mar 2023 16:23:20 +0000 (16:23 +0000)
* use Python's RFC implementation for embedded sequences
* hack: strip unbalanced leading `../` from path, like eg Firefox

See https://github.com/yt-dlp/yt-dlp/issues/3355

youtube_dl/YoutubeDL.py

index 8e85465962e633f034c88a3c13923c330752f189..bcf7817449bfe5557e69a94e13c8842c29837064 100755 (executable)
@@ -39,6 +39,7 @@ from .compat import (
     compat_str,
     compat_tokenize_tokenize,
     compat_urllib_error,
+    compat_urllib_parse,
     compat_urllib_request,
     compat_urllib_request_DataHandler,
 )
@@ -60,6 +61,7 @@ from .utils import (
     format_bytes,
     formatSeconds,
     GeoRestrictedError,
+    HEADRequest,
     int_or_none,
     ISO3166Utils,
     locked_file,
@@ -74,6 +76,7 @@ from .utils import (
     preferredencoding,
     prepend_extension,
     process_communicate_or_kill,
+    PUTRequest,
     register_socks_protocols,
     render_table,
     replace_extension,
@@ -2297,6 +2300,27 @@ class YoutubeDL(object):
         """ Start an HTTP download """
         if isinstance(req, compat_basestring):
             req = sanitized_Request(req)
+        # an embedded /../ sequence is not automatically handled by urllib2
+        # see https://github.com/yt-dlp/yt-dlp/issues/3355
+        url = req.get_full_url()
+        parts = url.partition('/../')
+        if parts[1]:
+            url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2])
+        if url:
+            # worse, URL path may have initial /../ against RFCs: work-around
+            # by stripping such prefixes, like eg Firefox
+            parts = compat_urllib_parse.urlsplit(url)
+            path = parts.path
+            while path.startswith('/../'):
+                path = path[3:]
+            url = parts._replace(path=path).geturl()
+            # get a new Request with the munged URL
+            if url != req.get_full_url():
+                req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
+                    req.get_method(), compat_urllib_request.Request)
+                req = req_type(
+                    url, data=req.data, headers=dict(req.header_items()),
+                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
         return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):