Home | History | Annotate | Download | only in unicode
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package unicode_test
      6 
      7 import (
      8 	"flag"
      9 	"fmt"
     10 	"runtime"
     11 	"sort"
     12 	"testing"
     13 	. "unicode"
     14 )
     15 
     16 var upperTest = []rune{
     17 	0x41,
     18 	0xc0,
     19 	0xd8,
     20 	0x100,
     21 	0x139,
     22 	0x14a,
     23 	0x178,
     24 	0x181,
     25 	0x376,
     26 	0x3cf,
     27 	0x13bd,
     28 	0x1f2a,
     29 	0x2102,
     30 	0x2c00,
     31 	0x2c10,
     32 	0x2c20,
     33 	0xa650,
     34 	0xa722,
     35 	0xff3a,
     36 	0x10400,
     37 	0x1d400,
     38 	0x1d7ca,
     39 }
     40 
     41 var notupperTest = []rune{
     42 	0x40,
     43 	0x5b,
     44 	0x61,
     45 	0x185,
     46 	0x1b0,
     47 	0x377,
     48 	0x387,
     49 	0x2150,
     50 	0xab7d,
     51 	0xffff,
     52 	0x10000,
     53 }
     54 
     55 var letterTest = []rune{
     56 	0x41,
     57 	0x61,
     58 	0xaa,
     59 	0xba,
     60 	0xc8,
     61 	0xdb,
     62 	0xf9,
     63 	0x2ec,
     64 	0x535,
     65 	0x620,
     66 	0x6e6,
     67 	0x93d,
     68 	0xa15,
     69 	0xb99,
     70 	0xdc0,
     71 	0xedd,
     72 	0x1000,
     73 	0x1200,
     74 	0x1312,
     75 	0x1401,
     76 	0x2c00,
     77 	0xa800,
     78 	0xf900,
     79 	0xfa30,
     80 	0xffda,
     81 	0xffdc,
     82 	0x10000,
     83 	0x10300,
     84 	0x10400,
     85 	0x20000,
     86 	0x2f800,
     87 	0x2fa1d,
     88 }
     89 
     90 var notletterTest = []rune{
     91 	0x20,
     92 	0x35,
     93 	0x375,
     94 	0x619,
     95 	0x700,
     96 	0x1885,
     97 	0xfffe,
     98 	0x1ffff,
     99 	0x10ffff,
    100 }
    101 
    102 // Contains all the special cased Latin-1 chars.
    103 var spaceTest = []rune{
    104 	0x09,
    105 	0x0a,
    106 	0x0b,
    107 	0x0c,
    108 	0x0d,
    109 	0x20,
    110 	0x85,
    111 	0xA0,
    112 	0x2000,
    113 	0x3000,
    114 }
    115 
    116 type caseT struct {
    117 	cas     int
    118 	in, out rune
    119 }
    120 
    121 var caseTest = []caseT{
    122 	// errors
    123 	{-1, '\n', 0xFFFD},
    124 	{UpperCase, -1, -1},
    125 	{UpperCase, 1 << 30, 1 << 30},
    126 
    127 	// ASCII (special-cased so test carefully)
    128 	{UpperCase, '\n', '\n'},
    129 	{UpperCase, 'a', 'A'},
    130 	{UpperCase, 'A', 'A'},
    131 	{UpperCase, '7', '7'},
    132 	{LowerCase, '\n', '\n'},
    133 	{LowerCase, 'a', 'a'},
    134 	{LowerCase, 'A', 'a'},
    135 	{LowerCase, '7', '7'},
    136 	{TitleCase, '\n', '\n'},
    137 	{TitleCase, 'a', 'A'},
    138 	{TitleCase, 'A', 'A'},
    139 	{TitleCase, '7', '7'},
    140 
    141 	// Latin-1: easy to read the tests!
    142 	{UpperCase, 0x80, 0x80},
    143 	{UpperCase, '', ''},
    144 	{UpperCase, '', ''},
    145 	{LowerCase, 0x80, 0x80},
    146 	{LowerCase, '', ''},
    147 	{LowerCase, '', ''},
    148 	{TitleCase, 0x80, 0x80},
    149 	{TitleCase, '', ''},
    150 	{TitleCase, '', ''},
    151 
    152 	// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
    153 	{UpperCase, 0x0131, 'I'},
    154 	{LowerCase, 0x0131, 0x0131},
    155 	{TitleCase, 0x0131, 'I'},
    156 
    157 	// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
    158 	{UpperCase, 0x0133, 0x0132},
    159 	{LowerCase, 0x0133, 0x0133},
    160 	{TitleCase, 0x0133, 0x0132},
    161 
    162 	// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
    163 	{UpperCase, 0x212A, 0x212A},
    164 	{LowerCase, 0x212A, 'k'},
    165 	{TitleCase, 0x212A, 0x212A},
    166 
    167 	// From an UpperLower sequence
    168 	// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
    169 	{UpperCase, 0xA640, 0xA640},
    170 	{LowerCase, 0xA640, 0xA641},
    171 	{TitleCase, 0xA640, 0xA640},
    172 	// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
    173 	{UpperCase, 0xA641, 0xA640},
    174 	{LowerCase, 0xA641, 0xA641},
    175 	{TitleCase, 0xA641, 0xA640},
    176 	// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
    177 	{UpperCase, 0xA64E, 0xA64E},
    178 	{LowerCase, 0xA64E, 0xA64F},
    179 	{TitleCase, 0xA64E, 0xA64E},
    180 	// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
    181 	{UpperCase, 0xA65F, 0xA65E},
    182 	{LowerCase, 0xA65F, 0xA65F},
    183 	{TitleCase, 0xA65F, 0xA65E},
    184 
    185 	// From another UpperLower sequence
    186 	// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
    187 	{UpperCase, 0x0139, 0x0139},
    188 	{LowerCase, 0x0139, 0x013A},
    189 	{TitleCase, 0x0139, 0x0139},
    190 	// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
    191 	{UpperCase, 0x013f, 0x013f},
    192 	{LowerCase, 0x013f, 0x0140},
    193 	{TitleCase, 0x013f, 0x013f},
    194 	// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
    195 	{UpperCase, 0x0148, 0x0147},
    196 	{LowerCase, 0x0148, 0x0148},
    197 	{TitleCase, 0x0148, 0x0147},
    198 
    199 	// Lowercase lower than uppercase.
    200 	// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
    201 	{UpperCase, 0xab78, 0x13a8},
    202 	{LowerCase, 0xab78, 0xab78},
    203 	{TitleCase, 0xab78, 0x13a8},
    204 	{UpperCase, 0x13a8, 0x13a8},
    205 	{LowerCase, 0x13a8, 0xab78},
    206 	{TitleCase, 0x13a8, 0x13a8},
    207 
    208 	// Last block in the 5.1.0 table
    209 	// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
    210 	{UpperCase, 0x10400, 0x10400},
    211 	{LowerCase, 0x10400, 0x10428},
    212 	{TitleCase, 0x10400, 0x10400},
    213 	// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
    214 	{UpperCase, 0x10427, 0x10427},
    215 	{LowerCase, 0x10427, 0x1044F},
    216 	{TitleCase, 0x10427, 0x10427},
    217 	// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
    218 	{UpperCase, 0x10428, 0x10400},
    219 	{LowerCase, 0x10428, 0x10428},
    220 	{TitleCase, 0x10428, 0x10400},
    221 	// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
    222 	{UpperCase, 0x1044F, 0x10427},
    223 	{LowerCase, 0x1044F, 0x1044F},
    224 	{TitleCase, 0x1044F, 0x10427},
    225 
    226 	// First one not in the 5.1.0 table
    227 	// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
    228 	{UpperCase, 0x10450, 0x10450},
    229 	{LowerCase, 0x10450, 0x10450},
    230 	{TitleCase, 0x10450, 0x10450},
    231 
    232 	// Non-letters with case.
    233 	{LowerCase, 0x2161, 0x2171},
    234 	{UpperCase, 0x0345, 0x0399},
    235 }
    236 
    237 func TestIsLetter(t *testing.T) {
    238 	for _, r := range upperTest {
    239 		if !IsLetter(r) {
    240 			t.Errorf("IsLetter(U+%04X) = false, want true", r)
    241 		}
    242 	}
    243 	for _, r := range letterTest {
    244 		if !IsLetter(r) {
    245 			t.Errorf("IsLetter(U+%04X) = false, want true", r)
    246 		}
    247 	}
    248 	for _, r := range notletterTest {
    249 		if IsLetter(r) {
    250 			t.Errorf("IsLetter(U+%04X) = true, want false", r)
    251 		}
    252 	}
    253 }
    254 
    255 func TestIsUpper(t *testing.T) {
    256 	for _, r := range upperTest {
    257 		if !IsUpper(r) {
    258 			t.Errorf("IsUpper(U+%04X) = false, want true", r)
    259 		}
    260 	}
    261 	for _, r := range notupperTest {
    262 		if IsUpper(r) {
    263 			t.Errorf("IsUpper(U+%04X) = true, want false", r)
    264 		}
    265 	}
    266 	for _, r := range notletterTest {
    267 		if IsUpper(r) {
    268 			t.Errorf("IsUpper(U+%04X) = true, want false", r)
    269 		}
    270 	}
    271 }
    272 
    273 func caseString(c int) string {
    274 	switch c {
    275 	case UpperCase:
    276 		return "UpperCase"
    277 	case LowerCase:
    278 		return "LowerCase"
    279 	case TitleCase:
    280 		return "TitleCase"
    281 	}
    282 	return "ErrorCase"
    283 }
    284 
    285 func TestTo(t *testing.T) {
    286 	for _, c := range caseTest {
    287 		r := To(c.cas, c.in)
    288 		if c.out != r {
    289 			t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out)
    290 		}
    291 	}
    292 }
    293 
    294 func TestToUpperCase(t *testing.T) {
    295 	for _, c := range caseTest {
    296 		if c.cas != UpperCase {
    297 			continue
    298 		}
    299 		r := ToUpper(c.in)
    300 		if c.out != r {
    301 			t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
    302 		}
    303 	}
    304 }
    305 
    306 func TestToLowerCase(t *testing.T) {
    307 	for _, c := range caseTest {
    308 		if c.cas != LowerCase {
    309 			continue
    310 		}
    311 		r := ToLower(c.in)
    312 		if c.out != r {
    313 			t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
    314 		}
    315 	}
    316 }
    317 
    318 func TestToTitleCase(t *testing.T) {
    319 	for _, c := range caseTest {
    320 		if c.cas != TitleCase {
    321 			continue
    322 		}
    323 		r := ToTitle(c.in)
    324 		if c.out != r {
    325 			t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
    326 		}
    327 	}
    328 }
    329 
    330 func TestIsSpace(t *testing.T) {
    331 	for _, c := range spaceTest {
    332 		if !IsSpace(c) {
    333 			t.Errorf("IsSpace(U+%04X) = false; want true", c)
    334 		}
    335 	}
    336 	for _, c := range letterTest {
    337 		if IsSpace(c) {
    338 			t.Errorf("IsSpace(U+%04X) = true; want false", c)
    339 		}
    340 	}
    341 }
    342 
    343 // Check that the optimizations for IsLetter etc. agree with the tables.
    344 // We only need to check the Latin-1 range.
    345 func TestLetterOptimizations(t *testing.T) {
    346 	for i := rune(0); i <= MaxLatin1; i++ {
    347 		if Is(Letter, i) != IsLetter(i) {
    348 			t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", i)
    349 		}
    350 		if Is(Upper, i) != IsUpper(i) {
    351 			t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", i)
    352 		}
    353 		if Is(Lower, i) != IsLower(i) {
    354 			t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", i)
    355 		}
    356 		if Is(Title, i) != IsTitle(i) {
    357 			t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", i)
    358 		}
    359 		if Is(White_Space, i) != IsSpace(i) {
    360 			t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", i)
    361 		}
    362 		if To(UpperCase, i) != ToUpper(i) {
    363 			t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", i)
    364 		}
    365 		if To(LowerCase, i) != ToLower(i) {
    366 			t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", i)
    367 		}
    368 		if To(TitleCase, i) != ToTitle(i) {
    369 			t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", i)
    370 		}
    371 	}
    372 }
    373 
    374 func TestTurkishCase(t *testing.T) {
    375 	lower := []rune("abcdefghijklmnoprstuvyz")
    376 	upper := []rune("ABCDEFGHIJKLMNOPRSTUVYZ")
    377 	for i, l := range lower {
    378 		u := upper[i]
    379 		if TurkishCase.ToLower(l) != l {
    380 			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l)
    381 		}
    382 		if TurkishCase.ToUpper(u) != u {
    383 			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u)
    384 		}
    385 		if TurkishCase.ToUpper(l) != u {
    386 			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u)
    387 		}
    388 		if TurkishCase.ToLower(u) != l {
    389 			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l)
    390 		}
    391 		if TurkishCase.ToTitle(u) != u {
    392 			t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u)
    393 		}
    394 		if TurkishCase.ToTitle(l) != u {
    395 			t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u)
    396 		}
    397 	}
    398 }
    399 
    400 var simpleFoldTests = []string{
    401 	// SimpleFold(x) returns the next equivalent rune > x or wraps
    402 	// around to smaller values.
    403 
    404 	// Easy cases.
    405 	"Aa",
    406 	"",
    407 
    408 	// ASCII special cases.
    409 	"Kk",
    410 	"Ss",
    411 
    412 	// Non-ASCII special cases.
    413 	"",
    414 	"",
    415 
    416 	// Extra special cases: has lower/upper but no case fold.
    417 	"",
    418 	"",
    419 
    420 	// Upper comes before lower (Cherokee).
    421 	"\u13b0\uab80",
    422 }
    423 
    424 func TestSimpleFold(t *testing.T) {
    425 	for _, tt := range simpleFoldTests {
    426 		cycle := []rune(tt)
    427 		r := cycle[len(cycle)-1]
    428 		for _, out := range cycle {
    429 			if r := SimpleFold(r); r != out {
    430 				t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out)
    431 			}
    432 			r = out
    433 		}
    434 	}
    435 
    436 	if r := SimpleFold(-42); r != -42 {
    437 		t.Errorf("SimpleFold(-42) = %v, want -42", r)
    438 	}
    439 }
    440 
    441 // Running 'go test -calibrate' runs the calibration to find a plausible
    442 // cutoff point for linear search of a range list vs. binary search.
    443 // We create a fake table and then time how long it takes to do a
    444 // sequence of searches within that table, for all possible inputs
    445 // relative to the ranges (something before all, in each, between each, after all).
    446 // This assumes that all possible runes are equally likely.
    447 // In practice most runes are ASCII so this is a conservative estimate
    448 // of an effective cutoff value. In practice we could probably set it higher
    449 // than what this function recommends.
    450 
    451 var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
    452 
    453 func TestCalibrate(t *testing.T) {
    454 	if !*calibrate {
    455 		return
    456 	}
    457 
    458 	if runtime.GOARCH == "amd64" {
    459 		fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
    460 	}
    461 
    462 	// Find the point where binary search wins by more than 10%.
    463 	// The 10% bias gives linear search an edge when they're close,
    464 	// because on predominantly ASCII inputs linear search is even
    465 	// better than our benchmarks measure.
    466 	n := sort.Search(64, func(n int) bool {
    467 		tab := fakeTable(n)
    468 		blinear := func(b *testing.B) {
    469 			tab := tab
    470 			max := n*5 + 20
    471 			for i := 0; i < b.N; i++ {
    472 				for j := 0; j <= max; j++ {
    473 					linear(tab, uint16(j))
    474 				}
    475 			}
    476 		}
    477 		bbinary := func(b *testing.B) {
    478 			tab := tab
    479 			max := n*5 + 20
    480 			for i := 0; i < b.N; i++ {
    481 				for j := 0; j <= max; j++ {
    482 					binary(tab, uint16(j))
    483 				}
    484 			}
    485 		}
    486 		bmlinear := testing.Benchmark(blinear)
    487 		bmbinary := testing.Benchmark(bbinary)
    488 		fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
    489 		return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
    490 	})
    491 	fmt.Printf("calibration: linear cutoff = %d\n", n)
    492 }
    493 
    494 func fakeTable(n int) []Range16 {
    495 	var r16 []Range16
    496 	for i := 0; i < n; i++ {
    497 		r16 = append(r16, Range16{uint16(i*5 + 10), uint16(i*5 + 12), 1})
    498 	}
    499 	return r16
    500 }
    501 
    502 func linear(ranges []Range16, r uint16) bool {
    503 	for i := range ranges {
    504 		range_ := &ranges[i]
    505 		if r < range_.Lo {
    506 			return false
    507 		}
    508 		if r <= range_.Hi {
    509 			return (r-range_.Lo)%range_.Stride == 0
    510 		}
    511 	}
    512 	return false
    513 }
    514 
    515 func binary(ranges []Range16, r uint16) bool {
    516 	// binary search over ranges
    517 	lo := 0
    518 	hi := len(ranges)
    519 	for lo < hi {
    520 		m := lo + (hi-lo)/2
    521 		range_ := &ranges[m]
    522 		if range_.Lo <= r && r <= range_.Hi {
    523 			return (r-range_.Lo)%range_.Stride == 0
    524 		}
    525 		if r < range_.Lo {
    526 			hi = m
    527 		} else {
    528 			lo = m + 1
    529 		}
    530 	}
    531 	return false
    532 }
    533 
    534 func TestLatinOffset(t *testing.T) {
    535 	var maps = []map[string]*RangeTable{
    536 		Categories,
    537 		FoldCategory,
    538 		FoldScript,
    539 		Properties,
    540 		Scripts,
    541 	}
    542 	for _, m := range maps {
    543 		for name, tab := range m {
    544 			i := 0
    545 			for i < len(tab.R16) && tab.R16[i].Hi <= MaxLatin1 {
    546 				i++
    547 			}
    548 			if tab.LatinOffset != i {
    549 				t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, i)
    550 			}
    551 		}
    552 	}
    553 }
    554