1 files changed, 88 insertions, 57 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index e273607b67..1be197a8f7 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
+# Copyright The GNU Toolchain Authors.
 # This file is part of the GNU C Library.
 #
 # The GNU C Library is free software; you can redistribute it and/or
@@ -28,7 +29,6 @@ It will output UTF-8 file
 '''
 
 import argparse
-import sys
 import re
 import unicode_utils
 
@@ -200,30 +200,40 @@ def write_header_charmap(outfile):
 
 def write_header_width(outfile, unicode_version):
     '''Writes the header on top of the WIDTH section to the output file'''
-    outfile.write('% Character width according to Unicode '
-                  + '{:s}.\n'.format(unicode_version))
-    outfile.write('% - Default width is 1.\n')
+    outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
+    outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
+    outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
+    outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
+    outfile.write('%   This character stands in for an intentionally omitted leading consonant\n')
+    outfile.write('%   in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
+    outfile.write('%   of visible display to ensure that the complete block has the correct width.\n')
+    outfile.write('%   (See below for more information on Hangul syllables.)\n')
+    outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
+    outfile.write('%   "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
+    outfile.write('%   One composed Hangul "syllable block" like 퓛 is made up of\n')
+    outfile.write('%   two to three individual component characters called "jamo".\n')
+    outfile.write('%   The complete block must have total width 2;\n')
+    outfile.write('%   to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
+    outfile.write('%   and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
+    outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
+    outfile.write('%   "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
+    outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
+    outfile.write('%   "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
     outfile.write('% - Double-width characters have width 2; generated from\n')
-    outfile.write('%        "grep \'^[^;]*;\\s*[WF]\' EastAsianWidth.txt"\n')
-    outfile.write('% - Non-spacing characters have width 0; '
-                  + 'generated from PropList.txt or\n')
-    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
-                  + 'UnicodeData.txt"\n')
-    outfile.write('% - Format control characters have width 0; '
-                  + 'generated from\n')
-    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
-#   Not needed covered by Cf
-#    outfile.write("% - Zero width characters have width 0; generated from\n")
-#    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
+    outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
+    outfile.write('% - Default width for all other characters is 1.\n')
     outfile.write("WIDTH\n")
 
-def process_width(outfile, ulines, elines, plines):
-    '''ulines are lines from UnicodeData.txt, elines are lines from
-    EastAsianWidth.txt containing characters with width “W” or “F”,
-    plines are lines from PropList.txt which contain characters
-    with the property “Prepended_Concatenation_Mark”.
-
+def process_width(outfile, ulines, dlines, elines, klines):
+    '''ulines are lines from UnicodeData.txt.
+    elines are lines from EastAsianWidth.txt containing characters with width
+    “W” or “F”.
+    dlines are lines from DerivedCoreProperties.txt which contain
+    characters with the property “Default_Ignorable_Code_Point”.
+    klines are lines from HangulSyllableType.txt which contain characters
+    with syllable type “V” or “T”.
     '''
+    # Wide and fullwidth characters have width 2
     width_dict = {}
     for line in elines:
         fields = line.split(";")
@@ -235,14 +245,14 @@ def process_width(outfile, ulines, elines, plines):
                          int(code_points[1], 16)+1):
             width_dict[key] = 2
 
+    # Nonspacing and enclosing marks have width 0
     for line in ulines:
         fields = line.split(";")
-        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+        if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
             width_dict[int(fields[0], 16)] = 0
 
-    for line in plines:
-        # Characters with the property “Prepended_Concatenation_Mark”
-        # should have the width 1:
+    # Conjoining vowel and trailing jamo have width 0
+    for line in klines:
         fields = line.split(";")
         if not '..' in fields[0]:
             code_points = (fields[0], fields[0])
@@ -250,21 +260,26 @@ def process_width(outfile, ulines, elines, plines):
             code_points = fields[0].split("..")
         for key in range(int(code_points[0], 16),
                          int(code_points[1], 16)+1):
-            del width_dict[key] # default width is 1
-
-    # handle special cases for compatibility
-    for key in list((0x00AD,)):
-        # https://www.cs.tut.fi/~jkorpela/shy.html
-        if key in width_dict:
-            del width_dict[key] # default width is 1
-    for key in list(range(0x1160, 0x1200)):
-        # Hangul jungseong and jongseong:
-        if key in unicode_utils.UNICODE_ATTRIBUTES:
-            width_dict[key] = 0
-    for key in list(range(0xD7B0, 0xD800)):
-        # Hangul jungseong and jongseong:
-        if key in unicode_utils.UNICODE_ATTRIBUTES:
             width_dict[key] = 0
+
+    # “Default_Ignorable_Code_Point”s have width 0
+    for line in dlines:
+        fields = line.split(";")
+        if not '..' in fields[0]:
+            code_points = (fields[0], fields[0])
+        else:
+            code_points = fields[0].split("..")
+        for key in range(int(code_points[0], 16),
+                         int(code_points[1], 16)+1):
+            width_dict[key] = 0 # overrides any width assigned above
+
+
+    # Special case: U+00AD SOFT HYPHEN
+    del width_dict[0x00AD]
+
+    # Special case: U+115F HANGUL CHOSEONG FILLER
+    width_dict[0x115F] = 2
+
     for key in list(range(0x3248, 0x3250)):
         # These are “A” which means we can decide whether to treat them
         # as “W” or “N” based on context:
@@ -302,7 +317,7 @@ def process_width(outfile, ulines, elines, plines):
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
         description='''
-        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+        Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt
         ''')
     PARSER.add_argument(
         '-u', '--unicode_data_file',
@@ -312,6 +327,13 @@ if __name__ == "__main__":
         help=('The UnicodeData.txt file to read, '
               + 'default: %(default)s'))
     PARSER.add_argument(
+        '-d', '--derived_core_properties_file',
+        nargs='?',
+        type=str,
+        default='DerivedCoreProperties.txt',
+        help=('The DerivedCoreProperties.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
         '-e', '--east_asian_with_file',
         nargs='?',
         type=str,
@@ -319,11 +341,11 @@ if __name__ == "__main__":
         help=('The EastAsianWidth.txt file to read, '
               + 'default: %(default)s'))
     PARSER.add_argument(
-        '-p', '--prop_list_file',
+        '-k', '--hangul_syllable_type_file',
         nargs='?',
         type=str,
-        default='PropList.txt',
-        help=('The PropList.txt file to read, '
+        default='HangulSyllableType.txt',
+        help=('The HangulSyllableType.txt file to read, '
               + 'default: %(default)s'))
     PARSER.add_argument(
         '--unicode_version',
@@ -336,27 +358,35 @@ if __name__ == "__main__":
     unicode_utils.fill_attributes(ARGS.unicode_data_file)
     with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
         UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
-    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
-        EAST_ASIAN_WIDTH_LINES = []
-        for LINE in EAST_ASIAN_WIDTH_FILE:
-            # If characters from EastAsianWidth.txt which are from
-            # reserved ranges (i.e. not yet assigned code points)
+    with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE:
+        DERIVED_CORE_PROPERTIES_LINES = []
+        for LINE in DERIVED_CORE_PROPERTIES_FILE:
+            # If characters which are from reserved ranges
+            # (i.e. not yet assigned code points)
             # are added to the WIDTH section of the UTF-8 file, then
             # “make check” produces “Unknown Character” errors for
             # these code points because such unassigned code points
             # are not in the CHARMAP section of the UTF-8 file.
             #
-            # Therefore, we skip all reserved code points when reading
-            # the EastAsianWidth.txt file.
-            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+            # Therefore, we skip all reserved code points.
+            if re.match(r'.*<reserved-.+>', LINE):
+                continue
+            if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
+                DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
+    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+        EAST_ASIAN_WIDTH_LINES = []
+        for LINE in EAST_ASIAN_WIDTH_FILE:
+            if re.match(r'.*<reserved-.+>', LINE):
                 continue
             if re.match(r'^[^;]*;\s*[WF]', LINE):
                 EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
-    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
-        PROP_LIST_LINES = []
-        for LINE in PROP_LIST_FILE:
-            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
-                PROP_LIST_LINES.append(LINE.strip())
+    with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE:
+        HANGUL_SYLLABLE_TYPE_LINES = []
+        for LINE in HANGUL_SYLLABLE_TYPE_FILE:
+            if re.match(r'.*<reserved-.+>', LINE):
+                continue
+            if re.match(r'^[^;]*;\s*[VT]', LINE):
+                HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
     with open('UTF-8', mode='w') as OUTFILE:
         # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
         write_header_charmap(OUTFILE)
@@ -366,6 +396,7 @@ if __name__ == "__main__":
         write_header_width(OUTFILE, ARGS.unicode_version)
         process_width(OUTFILE,
                       UNICODE_DATA_LINES,
+                      DERIVED_CORE_PROPERTIES_LINES,
                       EAST_ASIAN_WIDTH_LINES,
-                      PROP_LIST_LINES)
+                      HANGUL_SYLLABLE_TYPE_LINES)
         OUTFILE.write("END WIDTH\n")