From bd9f1e60ac55143c507c767ba070ab99a5760baa Mon Sep 17 00:00:00 2001 From: Reini Urban Date: Wed, 13 Sep 2017 10:09:03 +0200 Subject: [PATCH 1/2] towupper/towlower: Update to Unicode 10.0 and sort taken from safeclib and cross-checked with the perl unicode tables. sort the tables and exit when found. O(n) -> O(n/2) --- src/ctype/towctrans.c | 213 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 163 insertions(+), 50 deletions(-) diff --git src/ctype/towctrans.c src/ctype/towctrans.c index cf13a86..4745487 100644 --- src/ctype/towctrans.c +++ src/ctype/towctrans.c @@ -1,16 +1,21 @@ #include #include +#include #include "libc.h" #define CASEMAP(u1,u2,l) { (u1), (l)-(u1), (u2)-(u1)+1 } #define CASELACE(u1,u2) CASEMAP((u1),(u2),(u1)+1) +/* Unicode 10.0 */ + +/* must be sorted */ static const struct { unsigned short upper; signed char lower; unsigned char len; } casemaps[] = { - CASEMAP(0xc0,0xde,0xe0), + CASEMAP (0x00c0,0xd6,0xe0), + CASEMAP (0x00d8,0xde,0xf8), CASELACE(0x0100,0x012e), CASELACE(0x0132,0x0136), @@ -18,11 +23,21 @@ static const struct { CASELACE(0x014a,0x0176), CASELACE(0x0179,0x017d), - CASELACE(0x370,0x372), - CASEMAP(0x391,0x3a1,0x3b1), - CASEMAP(0x3a3,0x3ab,0x3c3), - CASEMAP(0x400,0x40f,0x450), - CASEMAP(0x410,0x42f,0x430), + CASELACE(0x01a0,0x1a4), + CASELACE(0x01b3,0x1b5), + CASELACE(0x01cd,0x1db), + CASELACE(0x01de,0x1ee), + CASELACE(0x01f8,0x21e), + CASELACE(0x0222,0x232), + CASELACE(0x0246,0x24e), + + CASELACE(0x0370,0x372), + CASEMAP (0x0388,0x38a,0x3ad), + CASEMAP (0x0393,0x39f,0x3b3), + CASEMAP (0x03a7,0x3ab,0x3c7), + CASELACE(0x03d8,0x3ee), + CASEMAP (0x0400,0x40f,0x450), + CASEMAP (0x0410,0x42f,0x430), CASELACE(0x460,0x480), CASELACE(0x48a,0x4be), @@ -80,17 +95,40 @@ static const struct { CASELACE(0xa77e,0xa786), CASELACE(0xa790,0xa792), + CASELACE(0xa796,0xa79e), CASELACE(0xa7a0,0xa7a8), + CASELACE(0xa7b4,0xa7b6), /* Unicode 8 */ + CASEMAP(0xff21,0xff3a,0xff41), { 0,0,0 } }; +/* must be sorted */ +static const struct { + unsigned int upper; + int lower; + unsigned short len; +} casemapsl[] = { + CASEMAP(0x13a0,0x13ef,0xab70), /* CHEROKEE reverse */ + CASEMAP(0xab70,0xabbf,0x13a0), /* CHEROKEE */ + CASEMAP(0x10400,0x10427,0x10428), + CASEMAP(0x104b0,0x104d3,0x104d8), /* Unicode 9 */ + CASEMAP(0x10c80,0x10cb2,0x10cc0), /* Unicode 8 */ + CASEMAP(0x118a0,0x118bf,0x118c0), /* Unicode 7 */ + CASEMAP(0x1e900,0x1e921,0x1e922), /* Unicode 9 */ + { 0,0,0 } +}; + +/* must now be sorted */ static const unsigned short pairs[][2] = { + /* upper - lower */ { 'I', 0x0131 }, { 'S', 0x017f }, + { 0x00b5, 0x03bc }, { 0x0130, 'i' }, { 0x0178, 0x00ff }, + { 0x017f, 0x73 }, { 0x0181, 0x0253 }, { 0x0182, 0x0183 }, { 0x0184, 0x0185 }, @@ -111,6 +149,7 @@ static const unsigned short pairs[][2] = { { 0x019c, 0x026f }, { 0x019d, 0x0272 }, { 0x019f, 0x0275 }, + /*CASELACE(0x01a0,0x01a4),*/ { 0x01a6, 0x0280 }, { 0x01a7, 0x01a8 }, { 0x01a9, 0x0283 }, @@ -119,38 +158,108 @@ static const unsigned short pairs[][2] = { { 0x01af, 0x01b0 }, { 0x01b1, 0x028a }, { 0x01b2, 0x028b }, + { 0x01b3, 0x01b4 }, + { 0x01b5, 0x01b6 }, { 0x01b7, 0x0292 }, { 0x01b8, 0x01b9 }, { 0x01bc, 0x01bd }, { 0x01c4, 0x01c6 }, - { 0x01c4, 0x01c5 }, + /*{ 0x01c4, 0x01c5 },*/ { 0x01c5, 0x01c6 }, { 0x01c7, 0x01c9 }, - { 0x01c7, 0x01c8 }, + /*{ 0x01c7, 0x01c8 },*/ { 0x01c8, 0x01c9 }, { 0x01ca, 0x01cc }, - { 0x01ca, 0x01cb }, + /*{ 0x01ca, 0x01cb },*/ + /*CASELACE(0x01cb,0x01db),*/ { 0x01cb, 0x01cc }, + { 0x01f1, 0x01f3 }, - { 0x01f1, 0x01f2 }, + /*{ 0x01f1, 0x01f2 },*/ { 0x01f2, 0x01f3 }, { 0x01f4, 0x01f5 }, { 0x01f6, 0x0195 }, { 0x01f7, 0x01bf }, + /*CASELACE(0x01f8,0x021e),*/ { 0x0220, 0x019e }, - { 0x0386, 0x03ac }, - { 0x0388, 0x03ad }, - { 0x0389, 0x03ae }, - { 0x038a, 0x03af }, + /*CASELACE(0x0222,0x0232),*/ + { 0x023a, 0x2c65 }, + { 0x023b, 0x23c }, + { 0x023d, 0x19a }, + { 0x023e, 0x2c66 }, + { 0x0241, 0x242 }, + { 0x0243, 0x180 }, + { 0x0244, 0x289 }, + { 0x0245, 0x28c }, + + { 0x0345, 0x3b9 }, + { 0x0376, 0x377 }, /* bogus greek 'symbol' */ + { 0x037f, 0x3f3 }, + { 0x0386, 0x3ac }, { 0x038c, 0x03cc }, { 0x038e, 0x03cd }, { 0x038f, 0x03ce }, - { 0x0399, 0x0345 }, - { 0x0399, 0x1fbe }, - { 0x03a3, 0x03c2 }, + { 0x0391, 0x3b1 }, + { 0x0392, 0x3b2 }, + { 0x0392, 0x3d0 }, /* reverse */ + /*CASEMAP (0x0393,0x39f,0x3b3),*/ + { 0x0395, 0x3f5 }, /* reverse */ + { 0x0398, 0x3d1 }, + { 0x0399, 0x1fbe },/* reverse */ + { 0x039a, 0x3f0 }, /* reverse */ + { 0x03a0, 0x3c0 }, + { 0x03a0, 0x3d6 }, /* reverse */ + { 0x03a1, 0x3c1 }, + { 0x03a1, 0x3f1 }, /* reverse */ + { 0x03a3, 0x3c3 }, + { 0x03a3, 0x3c2 }, /* reverse */ + { 0x03a4, 0x3c4 }, + { 0x03a5, 0x3c5 }, + { 0x03a6, 0x3c6 }, + { 0x03a6, 0x3d5 }, /* reverse */ + /*CASEMAP(0x0391,0x3a1,0x3b1),*/ + { 0x03c2, 0x3c3 }, + { 0x03cf, 0x3d7 }, + { 0x03d0, 0x3b2 }, + { 0x03d1, 0x3b8 }, + { 0x03d5, 0x3c6 }, + { 0x03d6, 0x3c0 }, + /*CASELACE(0x03d8,0x3ee),*/ + /*CASEMAP(0x03da,0x3ee,0x3db),*/ + { 0x03f0, 0x03ba }, + { 0x03f1, 0x03c1 }, + { 0x03f4, 0x03b8 }, + { 0x03f5, 0x03b5 }, { 0x03f7, 0x03f8 }, + { 0x03f9, 0x03f2 }, { 0x03fa, 0x03fb }, + { 0x03fd, 0x037b }, + { 0x03fe, 0x037c }, + { 0x03ff, 0x037d }, + /*CASEMAP(0x0400,0x40f,0x450), + CASEMAP(0x0410,0x42f,0x430),*/ + { 0x412, 0x1c80 }, /* reverse */ + { 0x414, 0x1c81 }, /* reverse */ + { 0x41e, 0x1c82 }, /* reverse */ + { 0x421, 0x1c83 }, /* reverse */ + { 0x422, 0x1c84 }, /* reverse */ + { 0x422, 0x1c85 }, /* reverse */ + { 0x42a, 0x1c86 }, /* reverse */ + { 0x462, 0x463 }, + { 0x462, 0x1c87 }, /* reverse */ + + { 0x04c0, 0x04cf}, + /*CASELACE(0x04c1,0x4cd),*/ + { 0x0528, 0x0529}, + { 0x052a, 0x052b}, + { 0x052c, 0x052d}, + { 0x052e, 0x052f}, + + { 0x10c7, 0x2d27 }, + { 0x10cd, 0x2d2d }, + { 0x1e60, 0x1e9b }, + { 0x1e9b, 0x1e61 }, { 0x1e9e, 0xdf }, { 0x1f59, 0x1f51 }, @@ -158,25 +267,11 @@ static const unsigned short pairs[][2] = { { 0x1f5d, 0x1f55 }, { 0x1f5f, 0x1f57 }, { 0x1fbc, 0x1fb3 }, + { 0x1fbe, 0x3b9 }, { 0x1fcc, 0x1fc3 }, { 0x1fec, 0x1fe5 }, { 0x1ffc, 0x1ff3 }, - { 0x23a, 0x2c65 }, - { 0x23b, 0x23c }, - { 0x23d, 0x19a }, - { 0x23e, 0x2c66 }, - { 0x241, 0x242 }, - { 0x243, 0x180 }, - { 0x244, 0x289 }, - { 0x245, 0x28c }, - { 0x3f4, 0x3b8 }, - { 0x3f9, 0x3f2 }, - { 0x3fd, 0x37b }, - { 0x3fe, 0x37c }, - { 0x3ff, 0x37d }, - { 0x4c0, 0x4cf }, - { 0x2126, 0x3c9 }, { 0x212a, 'k' }, { 0x212b, 0xe5 }, @@ -196,25 +291,25 @@ static const unsigned short pairs[][2] = { { 0x2c7f, 0x240 }, { 0x2cf2, 0x2cf3 }, + { 0xa64a, 0xa64b }, + { 0xa64a, 0x1c88 }, /* reverse */ + { 0xa77d, 0x1d79 }, { 0xa78b, 0xa78c }, { 0xa78d, 0x265 }, { 0xa7aa, 0x266 }, - { 0x10c7, 0x2d27 }, - { 0x10cd, 0x2d2d }, + { 0xa7ab, 0x25c }, /* Unicode 7.0 */ + { 0xa7ac, 0x261 }, /* Unicode 7.0 */ + { 0xa7ad, 0x26c }, /* Unicode 7.0 */ + { 0xa7ae, 0x26a }, /* Unicode 9.0 */ + { 0xa7b0, 0x29e }, /* Unicode 7.0 */ + { 0xa7b1, 0x287 }, /* Unicode 7.0 */ + { 0xa7b2, 0x29d }, /* Unicode 7.0 */ + { 0xa7b3, 0xab53 }, /* Unicode 8.0 */ + { 0xa7b4, 0xa7b5 }, /* Unicode 8.0 */ - /* bogus greek 'symbol' letters */ - { 0x376, 0x377 }, - { 0x39c, 0xb5 }, - { 0x392, 0x3d0 }, - { 0x398, 0x3d1 }, - { 0x3a6, 0x3d5 }, - { 0x3a0, 0x3d6 }, - { 0x39a, 0x3f0 }, - { 0x3a1, 0x3f1 }, - { 0x395, 0x3f5 }, - { 0x3cf, 0x3d7 }, + { 0xa7b6, 0xa7b7 }, /* Unicode 8.0 */ { 0,0 } }; @@ -229,29 +324,47 @@ static wchar_t __towcase(wchar_t wc, int lower) if (!iswalpha(wc) || (unsigned)wc - 0x0600 <= 0x0fff-0x0600 || (unsigned)wc - 0x2e00 <= 0xa63f-0x2e00 - || (unsigned)wc - 0xa800 <= 0xfeff-0xa800) + || (unsigned)wc - 0xa800 <= 0xab69-0xa800 + || (unsigned)wc - 0xabc0 <= 0xfeff-0xabc0) return wc; /* special case because the diff between upper/lower is too big */ - if (lower && (unsigned)wc - 0x10a0 < 0x2e) + if (lower && (unsigned)wc - 0x10a0 < 0x2e) { if (wc>0x10c5 && wc != 0x10c7 && wc != 0x10cd) return wc; else return wc + 0x2d00 - 0x10a0; - if (!lower && (unsigned)wc - 0x2d00 < 0x26) + } + if (!lower && (unsigned)wc - 0x2d00 < 0x26) { if (wc>0x2d25 && wc != 0x2d27 && wc != 0x2d2d) return wc; else return wc + 0x10a0 - 0x2d00; + } for (i=0; casemaps[i].len; i++) { int base = casemaps[i].upper + (lmask & casemaps[i].lower); + assert(i>0 ? casemaps[i].upper >= casemaps[i-1].upper : 1); if ((unsigned)wc-base < casemaps[i].len) { if (casemaps[i].lower == 1) return wc + lower - ((wc-casemaps[i].upper)&1); return wc + lmul*casemaps[i].lower; } + if (lower && casemaps[i].upper > wc) + break; } for (i=0; pairs[i][1-lower]; i++) { + assert(i>0 ? pairs[i][0] >= pairs[i-1][0] : 1); if (pairs[i][1-lower] == wc) return pairs[i][lower]; + if (lower && pairs[i][0] > wc) + break; + } + for (i=0; casemapsl[i].len; i++) { + unsigned long base = casemapsl[i].upper + (lmask & casemapsl[i].lower); + assert(i>0 ? casemapsl[i].upper >= casemapsl[i-1].upper : 1); + if ((unsigned)wc-base < casemapsl[i].len) { + if (casemapsl[i].lower == 1) + return wc + lower - ((wc-casemapsl[i].upper)&1); + return wc + lmul*casemapsl[i].lower; + } + if (lower && casemaps[i].upper > wc) + break; } - if ((unsigned)wc - (0x10428 - 0x28*lower) < 0x28) - return wc - 0x28 + 0x50*lower; return wc; } -- 2.8.4 (Apple Git-73)