aboutsummaryrefslogtreecommitdiff
path: root/localedata/unicode-gen/utf8_gen.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-xlocaledata/unicode-gen/utf8_gen.py145
1 files changed, 88 insertions, 57 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index e273607b67..1be197a8f7 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -1,6 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
+# Copyright The GNU Toolchain Authors.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
@@ -28,7 +29,6 @@ It will output UTF-8 file
'''
import argparse
-import sys
import re
import unicode_utils
@@ -200,30 +200,40 @@ def write_header_charmap(outfile):
def write_header_width(outfile, unicode_version):
'''Writes the header on top of the WIDTH section to the output file'''
- outfile.write('% Character width according to Unicode '
- + '{:s}.\n'.format(unicode_version))
- outfile.write('% - Default width is 1.\n')
+ outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
+ outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
+ outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
+ outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
+ outfile.write('% This character stands in for an intentionally omitted leading consonant\n')
+ outfile.write('% in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
+ outfile.write('% of visible display to ensure that the complete block has the correct width.\n')
+ outfile.write('% (See below for more information on Hangul syllables.)\n')
+ outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
+ outfile.write('% "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
+ outfile.write('% One composed Hangul "syllable block" like 퓛 is made up of\n')
+ outfile.write('% two to three individual component characters called "jamo".\n')
+ outfile.write('% The complete block must have total width 2;\n')
+ outfile.write('% to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
+ outfile.write('% and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
+ outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
+ outfile.write('% "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
+ outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
+ outfile.write('% "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
outfile.write('% - Double-width characters have width 2; generated from\n')
- outfile.write('% "grep \'^[^;]*;\\s*[WF]\' EastAsianWidth.txt"\n')
- outfile.write('% - Non-spacing characters have width 0; '
- + 'generated from PropList.txt or\n')
- outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
- + 'UnicodeData.txt"\n')
- outfile.write('% - Format control characters have width 0; '
- + 'generated from\n')
- outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
-# Not needed covered by Cf
-# outfile.write("% - Zero width characters have width 0; generated from\n")
-# outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
+ outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
+ outfile.write('% - Default width for all other characters is 1.\n')
outfile.write("WIDTH\n")
-def process_width(outfile, ulines, elines, plines):
- '''ulines are lines from UnicodeData.txt, elines are lines from
- EastAsianWidth.txt containing characters with width “W” or “F”,
- plines are lines from PropList.txt which contain characters
- with the property “Prepended_Concatenation_Mark”.
-
+def process_width(outfile, ulines, dlines, elines, klines):
+ '''ulines are lines from UnicodeData.txt.
+ elines are lines from EastAsianWidth.txt containing characters with width
+ “W” or “F”.
+ dlines are lines from DerivedCoreProperties.txt which contain
+ characters with the property “Default_Ignorable_Code_Point”.
+ klines are lines from HangulSyllableType.txt which contain characters
+ with syllable type “V” or “T”.
'''
+ # Wide and fullwidth characters have width 1
width_dict = {}
for line in elines:
fields = line.split(";")
@@ -235,14 +245,14 @@ def process_width(outfile, ulines, elines, plines):
int(code_points[1], 16)+1):
width_dict[key] = 2
+ # Nonspacing and enclosing marks have width 0
for line in ulines:
fields = line.split(";")
- if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+ if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
width_dict[int(fields[0], 16)] = 0
- for line in plines:
- # Characters with the property “Prepended_Concatenation_Mark”
- # should have the width 1:
+ # Conjoining vowel and trailing jamo have width 0
+ for line in klines:
fields = line.split(";")
if not '..' in fields[0]:
code_points = (fields[0], fields[0])
@@ -250,21 +260,26 @@ def process_width(outfile, ulines, elines, plines):
code_points = fields[0].split("..")
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
- del width_dict[key] # default width is 1
-
- # handle special cases for compatibility
- for key in list((0x00AD,)):
- # https://www.cs.tut.fi/~jkorpela/shy.html
- if key in width_dict:
- del width_dict[key] # default width is 1
- for key in list(range(0x1160, 0x1200)):
- # Hangul jungseong and jongseong:
- if key in unicode_utils.UNICODE_ATTRIBUTES:
- width_dict[key] = 0
- for key in list(range(0xD7B0, 0xD800)):
- # Hangul jungseong and jongseong:
- if key in unicode_utils.UNICODE_ATTRIBUTES:
width_dict[key] = 0
+
+ # “Default_Ignorable_Code_Point”s have width 0
+ for line in dlines:
+ fields = line.split(";")
+ if not '..' in fields[0]:
+ code_points = (fields[0], fields[0])
+ else:
+ code_points = fields[0].split("..")
+ for key in range(int(code_points[0], 16),
+ int(code_points[1], 16)+1):
+ width_dict[key] = 0 # default width is 1
+
+
+ # Special case: U+00AD SOFT HYPHEN
+ del width_dict[0x00AD]
+
+ # Special case: U+115F HANGUL CHOSEONG FILLER
+ width_dict[0x115F] = 2
+
for key in list(range(0x3248, 0x3250)):
# These are “A” which means we can decide whether to treat them
# as “W” or “N” based on context:
@@ -302,7 +317,7 @@ def process_width(outfile, ulines, elines, plines):
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
- Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+ Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
@@ -312,6 +327,13 @@ if __name__ == "__main__":
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
+ '-d', '--derived_core_properties_file',
+ nargs='?',
+ type=str,
+ default='DerivedCoreProperties.txt',
+ help=('The DerivedCoreProperties.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
'-e', '--east_asian_with_file',
nargs='?',
type=str,
@@ -319,11 +341,11 @@ if __name__ == "__main__":
help=('The EastAsianWidth.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
- '-p', '--prop_list_file',
+ '-k', '--hangul_syllable_type_file',
nargs='?',
type=str,
- default='PropList.txt',
- help=('The PropList.txt file to read, '
+ default='HangulSyllableType.txt',
+ help=('The HangulSyllableType.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'--unicode_version',
@@ -336,27 +358,35 @@ if __name__ == "__main__":
unicode_utils.fill_attributes(ARGS.unicode_data_file)
with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
- with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
- EAST_ASIAN_WIDTH_LINES = []
- for LINE in EAST_ASIAN_WIDTH_FILE:
- # If characters from EastAsianWidth.txt which are from
- # reserved ranges (i.e. not yet assigned code points)
+ with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE:
+ DERIVED_CORE_PROPERTIES_LINES = []
+ for LINE in DERIVED_CORE_PROPERTIES_FILE:
+ # If characters which are from reserved ranges
+ # (i.e. not yet assigned code points)
# are added to the WIDTH section of the UTF-8 file, then
# “make check” produces “Unknown Character” errors for
# these code points because such unassigned code points
# are not in the CHARMAP section of the UTF-8 file.
#
- # Therefore, we skip all reserved code points when reading
- # the EastAsianWidth.txt file.
- if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+ # Therefore, we skip all reserved code points.
+ if re.match(r'.*<reserved-.+>', LINE):
+ continue
+ if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
+ DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
+ with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+ EAST_ASIAN_WIDTH_LINES = []
+ for LINE in EAST_ASIAN_WIDTH_FILE:
+ if re.match(r'.*<reserved-.+>', LINE):
continue
if re.match(r'^[^;]*;\s*[WF]', LINE):
EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
- with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
- PROP_LIST_LINES = []
- for LINE in PROP_LIST_FILE:
- if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
- PROP_LIST_LINES.append(LINE.strip())
+ with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE:
+ HANGUL_SYLLABLE_TYPE_LINES = []
+ for LINE in HANGUL_SYLLABLE_TYPE_FILE:
+ if re.match(r'.*<reserved-.+>', LINE):
+ continue
+ if re.match(r'^[^;]*;\s*[VT]', LINE):
+ HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
with open('UTF-8', mode='w') as OUTFILE:
# Processing UnicodeData.txt and write CHARMAP to UTF-8 file
write_header_charmap(OUTFILE)
@@ -366,6 +396,7 @@ if __name__ == "__main__":
write_header_width(OUTFILE, ARGS.unicode_version)
process_width(OUTFILE,
UNICODE_DATA_LINES,
+ DERIVED_CORE_PROPERTIES_LINES,
EAST_ASIAN_WIDTH_LINES,
- PROP_LIST_LINES)
+ HANGUL_SYLLABLE_TYPE_LINES)
OUTFILE.write("END WIDTH\n")