[utils] Sanitize look-alike Unicode glyphs in non-ID filename fields when --restrict-filenames
author: dirkf <fieldhouse@gmx.net>
Tue, 11 Oct 2022 12:18:12 +0000 (12:18 +0000)
committer: GitHub <noreply@github.com>
Tue, 11 Oct 2022 12:18:12 +0000 (12:18 +0000)
Implements https://github.com/ytdl-org/youtube-dl/issues/31216#issuecomment-1236102822, which has a test.

youtube_dl/utils.py

index fea38ed32e970a8944168552d8fd0a38961da6bb..23a65a81c2a35a3037ea83d8b27345741fc251af 100644 (file)
@@ -33,6 +33,7 @@ import sys
 import tempfile
 import time
 import traceback
+import unicodedata
 import xml.etree.ElementTree
 import zlib
 
@@ -2118,6 +2119,9 @@ def sanitize_filename(s, restricted=False, is_id=False):
             return '_'
         return char
 
+    # Replace look-alike Unicode glyphs
+    if restricted and not is_id:
+        s = unicodedata.normalize('NFKC', s)
     # Handle timestamps
     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
     result = ''.join(map(replace_insane, s))