diff options
| -rw-r--r-- | iconv/Makefile | 22 | ||||
| -rw-r--r-- | iconv/tst-iconv9.c | 87 | ||||
| -rw-r--r-- | localedata/C.UTF-8.in | 157 | ||||
| -rw-r--r-- | localedata/Makefile | 2 | ||||
| -rw-r--r-- | localedata/SUPPORTED | 1 | ||||
| -rw-r--r-- | localedata/locales/C | 194 | ||||
| -rw-r--r-- | posix/bug-regex1.c | 20 | ||||
| -rw-r--r-- | posix/bug-regex19.c | 22 | ||||
| -rw-r--r-- | posix/bug-regex4.c | 25 | ||||
| -rw-r--r-- | posix/bug-regex6.c | 2 | ||||
| -rw-r--r-- | posix/transbug.c | 22 | ||||
| -rw-r--r-- | posix/tst-fnmatch.input | 549 | ||||
| -rw-r--r-- | posix/tst-regcomp-truncated.c | 1 | ||||
| -rw-r--r-- | posix/tst-regex.c | 25 |
14 files changed, 1104 insertions, 25 deletions
diff --git a/iconv/Makefile b/iconv/Makefile index 07d77c9eca..9993f2d3f3 100644 --- a/iconv/Makefile +++ b/iconv/Makefile @@ -43,8 +43,19 @@ CFLAGS-charmap.c += -DCHARMAP_PATH='"$(i18ndir)/charmaps"' \ CFLAGS-linereader.c += -DNO_TRANSLITERATION CFLAGS-simple-hash.c += -I../locale -tests = tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \ - tst-iconv7 tst-iconv8 tst-iconv-mt tst-iconv-opt +tests = \ + tst-iconv1 \ + tst-iconv2 \ + tst-iconv3 \ + tst-iconv4 \ + tst-iconv5 \ + tst-iconv6 \ + tst-iconv7 \ + tst-iconv8 \ + tst-iconv9 \ + tst-iconv-mt \ + tst-iconv-opt \ + # tests others = iconv_prog iconvconfig install-others-programs = $(inst_bindir)/iconv @@ -83,10 +94,15 @@ endif include ../Rules ifeq ($(run-built-tests),yes) -LOCALES := en_US.UTF-8 +# We have to generate locales (list sorted alphabetically) +LOCALES := \ + C.UTF-8 \ + en_US.UTF-8 \ + # LOCALES include ../gen-locales.mk $(objpfx)tst-iconv-opt.out: $(gen-locales) +$(objpfx)tst-iconv9.out: $(gen-locales) endif $(inst_bindir)/iconv: $(objpfx)iconv_prog $(+force) diff --git a/iconv/tst-iconv9.c b/iconv/tst-iconv9.c new file mode 100644 index 0000000000..78a5324279 --- /dev/null +++ b/iconv/tst-iconv9.c @@ -0,0 +1,87 @@ +/* Verify that using C.UTF-8 works. + + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <iconv.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <support/support.h> +#include <support/check.h> + +/* This test does two things: + (1) Verify that we have likely included translit_combining in C.UTF-8. + (2) Verify default_missing is '?' as expected. */ + +/* ISO-8859-1 encoding of "für". */ +char iso88591_in[] = { 0x66, 0xfc, 0x72, 0x0 }; +/* ASCII transliteration is "fur" with C.UTF-8 translit_combining. */ +char ascii_exp[] = { 0x66, 0x75, 0x72, 0x0 }; + +/* First 3-byte UTF-8 code point. */ +char utf8_in[] = { 0xe0, 0xa0, 0x80, 0x0 }; +/* There is no ASCII transliteration for SAMARITAN LETTER ALAF + so we get default_missing used which is '?'. */ +char default_missing_exp[] = { 0x3f, 0x0 }; + +static int +do_test (void) +{ + char ascii_out[5]; + iconv_t cd; + char *inbuf; + char *outbuf; + size_t inbytes; + size_t outbytes; + size_t n; + + /* The C.UTF-8 locale should include translit_combining, which provides + the transliteration for "LATIN SMALL LETTER U WITH DIAERESIS" which + is not provided by locale/C-translit.h.in. */ + xsetlocale (LC_ALL, "C.UTF-8"); + + /* From ISO-8859-1 to ASCII. */ + cd = iconv_open ("ASCII//TRANSLIT,IGNORE", "ISO-8859-1"); + TEST_VERIFY (cd != (iconv_t) -1); + inbuf = iso88591_in; + inbytes = 3; + outbuf = ascii_out; + outbytes = 3; + n = iconv (cd, &inbuf, &inbytes, &outbuf, &outbytes); + TEST_VERIFY (n != -1); + *outbuf = '\0'; + TEST_COMPARE_BLOB (ascii_out, 3, ascii_exp, 3); + TEST_VERIFY (iconv_close (cd) == 0); + + /* From UTF-8 to ASCII. 
*/ + cd = iconv_open ("ASCII//TRANSLIT,IGNORE", "UTF-8"); + TEST_VERIFY (cd != (iconv_t) -1); + inbuf = utf8_in; + inbytes = 3; + outbuf = ascii_out; + outbytes = 3; + n = iconv (cd, &inbuf, &inbytes, &outbuf, &outbytes); + TEST_VERIFY (n != -1); + *outbuf = '\0'; + TEST_COMPARE_BLOB (ascii_out, 1, default_missing_exp, 1); + TEST_VERIFY (iconv_close (cd) == 0); + + return 0; +} + +#include <support/test-driver.c> diff --git a/localedata/C.UTF-8.in b/localedata/C.UTF-8.in new file mode 100644 index 0000000000..c31dcc2aa0 --- /dev/null +++ b/localedata/C.UTF-8.in @@ -0,0 +1,157 @@ + ; <U1> + ; <U2> + ; <U3> + ; <U4> + ; <U5> + ; <U6> + ; <U7> + ; <U8> + ; <UE> + ; <UF> + ; <U10> + ; <U11> + ; <U12> + ; <U13> + ; <U14> + ; <U15> + ; <U16> + ; <U17> + ; <U18> + ; <U19> + ; <U1A> + ; <U1B> + ; <U1C> + ; <U1D> + ; <U1E> + ; <U1F> +! ; <U21> +" ; <U22> +# ; <U23> +$ ; <U24> +% ; <U25> +& ; <U26> +' ; <U27> +) ; <U29> +* ; <U2A> ++ ; <U2B> +, ; <U2C> +- ; <U2D> +. ; <U2E> +/ ; <U2F> +0 ; <U30> +1 ; <U31> +2 ; <U32> +3 ; <U33> +4 ; <U34> +5 ; <U35> +6 ; <U36> +7 ; <U37> +8 ; <U38> +9 ; <U39> +< ; <U3C> += ; <U3D> +> ; <U3E> +? 
; <U3F> +@ ; <U40> +A ; <U41> +B ; <U42> +C ; <U43> +D ; <U44> +E ; <U45> +F ; <U46> +G ; <U47> +H ; <U48> +I ; <U49> +J ; <U4A> +K ; <U4B> +L ; <U4C> +M ; <U4D> +N ; <U4E> +O ; <U4F> +P ; <U50> +Q ; <U51> +R ; <U52> +S ; <U53> +T ; <U54> +U ; <U55> +V ; <U56> +W ; <U57> +X ; <U58> +Y ; <U59> +Z ; <U5A> +[ ; <U5B> +\ ; <U5C> +] ; <U5D> +^ ; <U5E> +_ ; <U5F> +` ; <U60> +a ; <U61> +b ; <U62> +c ; <U63> +d ; <U64> +e ; <U65> +f ; <U66> +g ; <U67> +h ; <U68> +i ; <U69> +j ; <U6A> +k ; <U6B> +l ; <U6C> +m ; <U6D> +n ; <U6E> +o ; <U6F> +p ; <U70> +q ; <U71> +r ; <U72> +s ; <U73> +t ; <U74> +u ; <U75> +v ; <U76> +w ; <U77> +x ; <U78> +y ; <U79> +z ; <U7A> +{ ; <U7B> +| ; <U7C> +} ; <U7D> +~ ; <U7E> + ; <U7F> + ; <U80> +ÿ ; <UFF> +Ā ; <U100> + ; <UFFF> +က ; <U1000> +� ; <UFFFD> + ; <UFFFF> +𐀀 ; <U10000> + ; <U1FFFF> +𠀀 ; <U20000> + ; <U2FFFF> +𰀀 ; <U30000> + ; <U3FFFE> + ; <U40000> + ; <U4FFFF> + ; <U50000> + ; <U5FFFF> + ; <U60000> + ; <U6FFFF> + ; <U70000> + ; <U7FFFF> + ; <U80000> + ; <U8FFFF> + ; <U90000> + ; <U9FFFF> + ; <UA0000> + ; <UAFFFF> + ; <UB0000> + ; <UBFFFF> + ; <UC0001> + ; <UCFFCC> + ; <UD000E> + ; <UDFFFF> + ; <UE0001> + ; <UEFFFF> + ; <UF0001> + ; <UFFFFF> + ; <U100001> + ; <U10FFFF> diff --git a/localedata/Makefile b/localedata/Makefile index f585e0dd41..66a269641b 100644 --- a/localedata/Makefile +++ b/localedata/Makefile @@ -47,6 +47,7 @@ test-input := \ bg_BG.UTF-8 \ br_FR.UTF-8 \ bs_BA.UTF-8 \ + C.UTF-8 \ ckb_IQ.UTF-8 \ cmn_TW.UTF-8 \ crh_UA.UTF-8 \ @@ -206,6 +207,7 @@ LOCALES := \ bg_BG.UTF-8 \ br_FR.UTF-8 \ bs_BA.UTF-8 \ + C.UTF-8 \ ckb_IQ.UTF-8 \ cmn_TW.UTF-8 \ crh_UA.UTF-8 \ diff --git a/localedata/SUPPORTED b/localedata/SUPPORTED index 1ee5b5e8c8..d768aa4795 100644 --- a/localedata/SUPPORTED +++ b/localedata/SUPPORTED @@ -79,6 +79,7 @@ brx_IN/UTF-8 \ bs_BA.UTF-8/UTF-8 \ bs_BA/ISO-8859-2 \ byn_ER/UTF-8 \ +C.UTF-8/UTF-8 \ ca_AD.UTF-8/UTF-8 \ ca_AD/ISO-8859-15 \ ca_ES.UTF-8/UTF-8 \ diff --git a/localedata/locales/C b/localedata/locales/C new file 
mode 100644 index 0000000000..651691c724 --- /dev/null +++ b/localedata/locales/C @@ -0,0 +1,194 @@ +escape_char / +comment_char % +% Locale for C locale in UTF-8 + +LC_IDENTIFICATION +title "C locale" +source "" +address "" +contact "" +email "bug-glibc-locales@gnu.org" +tel "" +fax "" +language "" +territory "" +revision "2.0" +date "2020-06-28" +category "i18n:2012";LC_IDENTIFICATION +category "i18n:2012";LC_CTYPE +category "i18n:2012";LC_COLLATE +category "i18n:2012";LC_TIME +category "i18n:2012";LC_NUMERIC +category "i18n:2012";LC_MONETARY +category "i18n:2012";LC_MESSAGES +category "i18n:2012";LC_PAPER +category "i18n:2012";LC_NAME +category "i18n:2012";LC_ADDRESS +category "i18n:2012";LC_TELEPHONE +category "i18n:2012";LC_MEASUREMENT +END LC_IDENTIFICATION + +LC_CTYPE +% Include only the i18n character type classes without any of the +% transliteration that i18n uses by default. +copy "i18n_ctype" + +% Include the neutral transliterations. The builtin C and +% POSIX locales have +1600 transliterations that are built into +% the locales, and these are a superset of those. +translit_start +include "translit_neutral";"" +% We must use '?' for default_missing because the transliteration +% framework includes it directly into the output and so it must +% be compatible with ASCII if that is the target character set. +default_missing <U003F> +translit_end + +% Include the transliterations that can convert combined characters. +% These are generally expected by users. +translit_start +include "translit_combining";"" +translit_end + +END LC_CTYPE + +LC_COLLATE +% The keyword 'strcmp_collation' in any part of any LC_COLLATE +% immediately discards all collation information and causes the +% locale to use strcmp for collation comparison. This is exactly +% what is needed for C (ASCII) or C.UTF-8. 
+strcmp_collation +END LC_COLLATE + +LC_MONETARY + +% This is the 14652 i18n fdcc-set definition for the LC_MONETARY +% category (except for the int_curr_symbol and currency_symbol, they are +% empty in the 14652 i18n fdcc-set definition and also empty in +% glibc/locale/C-monetary.c.). +int_curr_symbol "" +currency_symbol "" +mon_decimal_point "." +mon_thousands_sep "" +mon_grouping -1 +positive_sign "" +negative_sign "-" +int_frac_digits -1 +frac_digits -1 +p_cs_precedes -1 +int_p_sep_by_space -1 +p_sep_by_space -1 +n_cs_precedes -1 +int_n_sep_by_space -1 +n_sep_by_space -1 +p_sign_posn -1 +n_sign_posn -1 +% +END LC_MONETARY + +LC_NUMERIC +% This is the POSIX Locale definition for +% the LC_NUMERIC category. +% +decimal_point "." +thousands_sep "" +grouping -1 +END LC_NUMERIC + +LC_TIME +% This is the POSIX Locale definition for the LC_TIME category with the +% exception that time is per ISO 8601 and 24-hour. +% +% Abbreviated weekday names (%a) +abday "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat" + +% Full weekday names (%A) +day "Sunday";"Monday";"Tuesday";"Wednesday";"Thursday";/ + "Friday";"Saturday" + +% Abbreviated month names (%b) +abmon "Jan";"Feb";"Mar";"Apr";"May";"Jun";"Jul";"Aug";"Sep";/ + "Oct";"Nov";"Dec" + +% Full month names (%B) +mon "January";"February";"March";"April";"May";"June";"July";/ + "August";"September";"October";"November";"December" + +% Week description, consists of three fields: +% 1. Number of days in a week. +% 2. Gregorian date that is a first weekday (19971130 for Sunday, 19971201 for Monday). +% 3. The weekday number to be contained in the first week of the year. +% +% ISO 8601 conforming applications should use the values 7, 19971201 (a +% Monday), and 4 (Thursday), respectively. 
+week 7;19971201;4 +first_weekday 1 +first_workday 2 + +% Appropriate date and time representation (%c) +d_t_fmt "%a %b %e %H:%M:%S %Y" + +% Appropriate date representation (%x) +d_fmt "%m/%d/%y" + +% Appropriate time representation (%X) +t_fmt "%H:%M:%S" + +% Appropriate AM/PM time representation (%r) +t_fmt_ampm "%I:%M:%S %p" + +% Equivalent of AM/PM (%p) +am_pm "AM";"PM" + +% Appropriate date representation (date(1)) "%a %b %e %H:%M:%S %Z %Y" +date_fmt "%a %b %e %H:%M:%S %Z %Y" +END LC_TIME + +LC_MESSAGES +% This is the POSIX Locale definition for +% the LC_MESSAGES category. +% +yesexpr "^[yY]" +noexpr "^[nN]" +yesstr "Yes" +nostr "No" +END LC_MESSAGES + +LC_PAPER +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_PAPER category. +% (A4 paper, this is also used in the built in C/POSIX +% locale in glibc/locale/C-paper.c) +height 297 +width 210 +END LC_PAPER + +LC_NAME +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_NAME category. +% (also used in the built in C/POSIX locale in glibc/locale/C-name.c) +name_fmt "%p%t%g%t%m%t%f" +END LC_NAME + +LC_ADDRESS +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_ADDRESS category. +% (also used in the built in C/POSIX locale in glibc/locale/C-address.c) +postal_fmt "%a%N%f%N%d%N%b%N%s %h %e %r%N%C-%z %T%N%c%N" +END LC_ADDRESS + +LC_TELEPHONE +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_TELEPHONE category. +% "+%c %a %l" +tel_int_fmt "+%c %a %l" +% (also used in the built in C/POSIX locale in glibc/locale/C-telephone.c) +END LC_TELEPHONE + +LC_MEASUREMENT +% This is the ISO/IEC 14652 "i18n" definition for +% the LC_MEASUREMENT category. 
+% (same as in the built in C/POSIX locale in glibc/locale/C-measurement.c) +%metric +measurement 1 +END LC_MEASUREMENT diff --git a/posix/bug-regex1.c b/posix/bug-regex1.c index 38eb543951..85da8cc7ca 100644 --- a/posix/bug-regex1.c +++ b/posix/bug-regex1.c @@ -41,6 +41,26 @@ main (void) puts (" -> OK"); } + puts ("in C.UTF-8 locale"); + setlocale (LC_ALL, "C.UTF-8"); + s = re_compile_pattern ("[an]*n", 7, &regex); + if (s != NULL) + { + puts ("re_compile_pattern return non-NULL value"); + result = 1; + } + else + { + match = re_match (&regex, "an", 2, 0, &regs); + if (match != 2) + { + printf ("re_match returned %d, expected 2\n", match); + result = 1; + } + else + puts (" -> OK"); + } + puts ("in de_DE.ISO-8859-1 locale"); setlocale (LC_ALL, "de_DE.ISO-8859-1"); s = re_compile_pattern ("[an]*n", 7, &regex); diff --git a/posix/bug-regex19.c b/posix/bug-regex19.c index b3fee0a730..e00ff60a14 100644 --- a/posix/bug-regex19.c +++ b/posix/bug-regex19.c @@ -25,6 +25,7 @@ #include <string.h> #include <locale.h> #include <libc-diag.h> +#include <support/support.h> #define BRE RE_SYNTAX_POSIX_BASIC #define ERE RE_SYNTAX_POSIX_EXTENDED @@ -407,8 +408,8 @@ do_mb_tests (const struct test_s *test) return 0; } -int -main (void) +static int +do_test (void) { size_t i; int ret = 0; @@ -417,20 +418,17 @@ main (void) for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) { - if (setlocale (LC_ALL, "de_DE.ISO-8859-1") == NULL) - { - puts ("setlocale de_DE.ISO-8859-1 failed"); - ret = 1; - } + xsetlocale (LC_ALL, "de_DE.ISO-8859-1"); ret |= do_one_test (&tests[i], ""); - if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL) - { - puts ("setlocale de_DE.UTF-8 failed"); - ret = 1; - } + xsetlocale (LC_ALL, "de_DE.UTF-8"); + ret |= do_one_test (&tests[i], "UTF-8 "); + ret |= do_mb_tests (&tests[i]); + xsetlocale (LC_ALL, "C.UTF-8"); ret |= do_one_test (&tests[i], "UTF-8 "); ret |= do_mb_tests (&tests[i]); } return ret; } + +#include <support/test-driver.c> diff --git a/posix/bug-regex4.c 
b/posix/bug-regex4.c index 8d5ae11567..6475833c52 100644 --- a/posix/bug-regex4.c +++ b/posix/bug-regex4.c @@ -32,6 +32,7 @@ main (void) memset (&regex, '\0', sizeof (regex)); + printf ("INFO: Checking C.\n"); setlocale (LC_ALL, "C"); s = re_compile_pattern ("ab[cde]", 7, &regex); @@ -55,5 +56,29 @@ main (void) puts (" -> OK"); } + printf ("INFO: Checking C.UTF-8.\n"); + setlocale (LC_ALL, "C.UTF-8"); + + s = re_compile_pattern ("ab[cde]", 7, &regex); + if (s != NULL) + { + puts ("re_compile_pattern returned non-NULL value"); + result = 1; + } + else + { + match[0] = re_search_2 (&regex, "xyabez", 6, "", 0, 1, 5, NULL, 6); + match[1] = re_search_2 (&regex, NULL, 0, "abc", 3, 0, 3, NULL, 3); + match[2] = re_search_2 (&regex, "xya", 3, "bd", 2, 2, 3, NULL, 5); + if (match[0] != 2 || match[1] != 0 || match[2] != 2) + { + printf ("re_search_2 returned %d,%d,%d, expected 2,0,2\n", + match[0], match[1], match[2]); + result = 1; + } + else + puts (" -> OK"); + } + return result; } diff --git a/posix/bug-regex6.c b/posix/bug-regex6.c index 2bdf2126a4..0929b69b83 100644 --- a/posix/bug-regex6.c +++ b/posix/bug-regex6.c @@ -30,7 +30,7 @@ main (int argc, char *argv[]) regex_t re; regmatch_t mat[10]; int i, j, ret = 0; - const char *locales[] = { "C", "de_DE.UTF-8" }; + const char *locales[] = { "C", "C.UTF-8", "de_DE.UTF-8" }; const char *string = "http://www.regex.com/pattern/matching.html#intro"; regmatch_t expect[10] = { { 0, 48 }, { 0, 5 }, { 0, 4 }, { 5, 20 }, { 7, 20 }, { 20, 42 }, diff --git a/posix/transbug.c b/posix/transbug.c index d0983b4d44..71632b7976 100644 --- a/posix/transbug.c +++ b/posix/transbug.c @@ -116,14 +116,30 @@ do_test (void) static const char lower[] = "[[:lower:]]+"; static const char upper[] = "[[:upper:]]+"; struct re_registers regs[4]; + int result; +#define CHECK(exp) \ + if (exp) { puts (#exp); result = 1; } + + printf ("INFO: Checking C.\n"); setlocale (LC_ALL, "C"); (void) re_set_syntax (RE_SYNTAX_GNU_AWK); - int result; -#define CHECK(exp) \ - if (exp) { 
puts (#exp); result = 1; } + result = run_test (lower, regs); + result |= run_test (upper, &regs[2]); + if (! result) + { + CHECK (regs[0].start[0] != regs[2].start[0]); + CHECK (regs[0].end[0] != regs[2].end[0]); + CHECK (regs[1].start[0] != regs[3].start[0]); + CHECK (regs[1].end[0] != regs[3].end[0]); + } + + printf ("INFO: Checking C.UTF-8.\n"); + setlocale (LC_ALL, "C.UTF-8"); + + (void) re_set_syntax (RE_SYNTAX_GNU_AWK); result = run_test (lower, regs); result |= run_test (upper, &regs[2]); diff --git a/posix/tst-fnmatch.input b/posix/tst-fnmatch.input index 67aac5aada..6ff5318032 100644 --- a/posix/tst-fnmatch.input +++ b/posix/tst-fnmatch.input @@ -472,6 +472,397 @@ C "\\" "[Z-\\]]" 0 C "]" "[Z-\\]]" 0 C "-" "[Z-\\]]" NOMATCH +# B.6 004(C) +C.UTF-8 "!#%+,-./01234567889" "!#%+,-./01234567889" 0 +C.UTF-8 ":;=@ABCDEFGHIJKLMNO" ":;=@ABCDEFGHIJKLMNO" 0 +C.UTF-8 "PQRSTUV |
