[compat] Reformat casefold.py for easier updating

author dirkf <fieldhouse@gmx.net>

Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)

committer GitHub <noreply@github.com>

Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)
author dirkf <fieldhouse@gmx.net>
Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)
committer GitHub <noreply@github.com>
Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)
diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py

index 546269a3c90700d91c6272f433cb8e96128ee86a..7e91c3811640b7479550f5a706e74e561172a9d0 100644 (file)
--- a/youtube_dl/casefold.py
+++ b/youtube_dl/casefold.py
@@ -1,8 +1,20 @@
  # coding: utf-8
  from __future__ import unicode_literals
  
-from .compat import compat_str
+from .compat import (
+    compat_str,
+    compat_chr,
+)
  
+# Below is included the text of icu/CaseFolding.txt retrieved from
+# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt
+# In case newly foldable Unicode characters are defined, paste the new version
+# of the text inside the ''' marks.
+# The text is expected to have only blank lines and lines with 1st character #,
+# all ignored, and fold definitions like this:
+# `from_hex_code; status; space_separated_to_hex_code_list; comment`
+
+_map_str = '''
  # CaseFolding-15.0.0.txt
  # Date: 2022-02-02, 23:35:35 GMT
  # © 2022 Unicode®, Inc.
@@ -65,7 +77,6 @@ from .compat import compat_str
  #  have the value C for the status field, and the code point itself for the mapping field.
  
  # =================================================================
-_map_str = '''
  0041; C; 0061; # LATIN CAPITAL LETTER A
  0042; C; 0062; # LATIN CAPITAL LETTER B
  0043; C; 0063; # LATIN CAPITAL LETTER C
@@ -1627,17 +1638,22 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
  1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO
  1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA
  '''
+
+_parse_unichr = lambda s: compat_chr(int(s, 16))
+
  _map = dict(
-    (unichr(int(from_, 16)), ''.join((unichr(int(v, 16)) for v in to_.split(' '))))
+    (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' '))))
      for from_, type_, to_, _ in (
-        l.split('; ', 3) for l in _map_str.splitlines() if l)
+        l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#')
      if type_ in ('C', 'F'))
  del _map_str
  
+
  def casefold(s):
      assert isinstance(s, compat_str)
      return ''.join((_map.get(c, c) for c in s))
  
+
  __all__ = [
      casefold
  ]
author	dirkf <fieldhouse@gmx.net>
	Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)
committer	GitHub <noreply@github.com>
	Tue, 1 Nov 2022 19:25:59 +0000 (19:25 +0000)