Home | History | Annotate | Download | only in utf8
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package utf8_test
      6 
      7 import (
      8 	"bytes"
      9 	"testing"
     10 	"unicode"
     11 	. "unicode/utf8"
     12 )
     13 
     14 // Validate the constants redefined from unicode.
     15 func init() {
     16 	if MaxRune != unicode.MaxRune {
     17 		panic("utf8.MaxRune is wrong")
     18 	}
     19 	if RuneError != unicode.ReplacementChar {
     20 		panic("utf8.RuneError is wrong")
     21 	}
     22 }
     23 
     24 // Validate the constants redefined from unicode.
     25 func TestConstants(t *testing.T) {
     26 	if MaxRune != unicode.MaxRune {
     27 		t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
     28 	}
     29 	if RuneError != unicode.ReplacementChar {
     30 		t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar)
     31 	}
     32 }
     33 
// Utf8Map pairs a rune with the byte sequence the tests treat as its
// UTF-8 encoding.
type Utf8Map struct {
	r   rune   // code point
	str string // expected encoding of r, one byte per string element
}
     38 
// utf8map lists valid runes together with their expected encodings. The
// encode, decode, FullRune and sequencing tests all iterate over it.
var utf8map = []Utf8Map{
	// One-byte encodings.
	{0x0000, "\x00"},
	{0x0001, "\x01"},
	{0x007e, "\x7e"},
	{0x007f, "\x7f"},
	// Two-byte encodings (note that 0x0400 is listed after 0x07ff).
	{0x0080, "\xc2\x80"},
	{0x0081, "\xc2\x81"},
	{0x00bf, "\xc2\xbf"},
	{0x00c0, "\xc3\x80"},
	{0x00c1, "\xc3\x81"},
	{0x00c8, "\xc3\x88"},
	{0x00d0, "\xc3\x90"},
	{0x00e0, "\xc3\xa0"},
	{0x00f0, "\xc3\xb0"},
	{0x00f8, "\xc3\xb8"},
	{0x00ff, "\xc3\xbf"},
	{0x0100, "\xc4\x80"},
	{0x07ff, "\xdf\xbf"},
	{0x0400, "\xd0\x80"},
	// Three-byte encodings.
	{0x0800, "\xe0\xa0\x80"},
	{0x0801, "\xe0\xa0\x81"},
	{0x1000, "\xe1\x80\x80"},
	{0xd000, "\xed\x80\x80"},
	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
	{0xfffe, "\xef\xbf\xbe"},
	{0xffff, "\xef\xbf\xbf"},
	// Four-byte encodings.
	{0x10000, "\xf0\x90\x80\x80"},
	{0x10001, "\xf0\x90\x80\x81"},
	{0x40000, "\xf1\x80\x80\x80"},
	{0x10fffe, "\xf4\x8f\xbf\xbe"},
	{0x10ffff, "\xf4\x8f\xbf\xbf"},
	// The replacement character itself, encoded as three bytes.
	{0xFFFD, "\xef\xbf\xbd"},
}
     73 
// surrogateMap holds the byte sequences for the first and last surrogate
// code points. TestDecodeSurrogateRune expects each of them to decode as
// (RuneError, 1) rather than as the rune in the r field.
var surrogateMap = []Utf8Map{
	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}
     78 
     79 var testStrings = []string{
     80 	"",
     81 	"abcd",
     82 	"",
     83 	"abi",
     84 	"abiabiabi",
     85 	"\x80\x80\x80\x80",
     86 }
     87 
     88 func TestFullRune(t *testing.T) {
     89 	for _, m := range utf8map {
     90 		b := []byte(m.str)
     91 		if !FullRune(b) {
     92 			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
     93 		}
     94 		s := m.str
     95 		if !FullRuneInString(s) {
     96 			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
     97 		}
     98 		b1 := b[0 : len(b)-1]
     99 		if FullRune(b1) {
    100 			t.Errorf("FullRune(%q) = true, want false", b1)
    101 		}
    102 		s1 := string(b1)
    103 		if FullRuneInString(s1) {
    104 			t.Errorf("FullRune(%q) = true, want false", s1)
    105 		}
    106 	}
    107 	for _, s := range []string{"\xc0", "\xc1"} {
    108 		b := []byte(s)
    109 		if !FullRune(b) {
    110 			t.Errorf("FullRune(%q) = false, want true", s)
    111 		}
    112 		if !FullRuneInString(s) {
    113 			t.Errorf("FullRuneInString(%q) = false, want true", s)
    114 		}
    115 	}
    116 }
    117 
    118 func TestEncodeRune(t *testing.T) {
    119 	for _, m := range utf8map {
    120 		b := []byte(m.str)
    121 		var buf [10]byte
    122 		n := EncodeRune(buf[0:], m.r)
    123 		b1 := buf[0:n]
    124 		if !bytes.Equal(b, b1) {
    125 			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
    126 		}
    127 	}
    128 }
    129 
// TestDecodeRune runs DecodeRune and DecodeRuneInString over every entry
// of utf8map in four situations: the exact encoding, the encoding followed
// by extra bytes, the encoding with its last byte missing, and the
// encoding with one byte corrupted.
func TestDecodeRune(t *testing.T) {
	for _, m := range utf8map {
		// The exact encoding must decode to the rune and consume every byte.
		b := []byte(m.str)
		r, size := DecodeRune(b)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
		}
		s := m.str
		r, size = DecodeRuneInString(s)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
		}

		// Trailing bytes after the encoding must not change the result.
		// For the byte-slice form, b is resliced to its full capacity to
		// expose any spare bytes the conversion left behind.
		// NOTE(review): whether cap(b) exceeds len(b) depends on the
		// runtime's allocation; when it does not, this repeats the check above.
		r, size = DecodeRune(b[0:cap(b)])
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
		}
		s = m.str + "\x00"
		r, size = DecodeRuneInString(s)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
		}

		// make sure missing bytes fail
		// Truncated input must yield RuneError. The expected size is 1,
		// except for one-byte encodings, where truncation leaves empty
		// input and the expected size is 0.
		wantsize := 1
		if wantsize >= len(b) {
			wantsize = 0
		}
		r, size = DecodeRune(b[0 : len(b)-1])
		if r != RuneError || size != wantsize {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
		}
		s = m.str[0 : len(m.str)-1]
		r, size = DecodeRuneInString(s)
		if r != RuneError || size != wantsize {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
		}

		// make sure bad sequences fail
		// Corrupt b in place: a one-byte encoding becomes the lone byte
		// 0x80, and a longer encoding gets 0x7F as its final byte. This
		// must stay last in the loop body because b no longer matches m.str.
		if len(b) == 1 {
			b[0] = 0x80
		} else {
			b[len(b)-1] = 0x7F
		}
		r, size = DecodeRune(b)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
		}
		s = string(b)
		r, size = DecodeRuneInString(s)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
		}

	}
}
    187 
    188 func TestDecodeSurrogateRune(t *testing.T) {
    189 	for _, m := range surrogateMap {
    190 		b := []byte(m.str)
    191 		r, size := DecodeRune(b)
    192 		if r != RuneError || size != 1 {
    193 			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
    194 		}
    195 		s := m.str
    196 		r, size = DecodeRuneInString(s)
    197 		if r != RuneError || size != 1 {
    198 			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
    199 		}
    200 	}
    201 }
    202 
    203 // Check that DecodeRune and DecodeLastRune correspond to
    204 // the equivalent range loop.
    205 func TestSequencing(t *testing.T) {
    206 	for _, ts := range testStrings {
    207 		for _, m := range utf8map {
    208 			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
    209 				testSequence(t, s)
    210 			}
    211 		}
    212 	}
    213 }
    214 
    215 // Check that a range loop and a []int conversion visit the same runes.
    216 // Not really a test of this package, but the assumption is used here and
    217 // it's good to verify
    218 func TestIntConversion(t *testing.T) {
    219 	for _, ts := range testStrings {
    220 		runes := []rune(ts)
    221 		if RuneCountInString(ts) != len(runes) {
    222 			t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
    223 			break
    224 		}
    225 		i := 0
    226 		for _, r := range ts {
    227 			if r != runes[i] {
    228 				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
    229 			}
    230 			i++
    231 		}
    232 	}
    233 }
    234 
// invalidSequenceTests lists four-byte inputs whose first rune is expected
// to decode as RuneError by DecodeRune, DecodeRuneInString and a range
// loop (see TestDecodeInvalidSequence).
// NOTE(review): the group labels below (xx, s1..s7) presumably name the
// lead-byte classes used inside the utf8 package - confirm against utf8.go.
var invalidSequenceTests = []string{
	"\xed\xa0\x80\x80", // surrogate min
	"\xed\xbf\xbf\x80", // surrogate max

	// xx
	"\x91\x80\x80\x80",

	// s1
	"\xC2\x7F\x80\x80",
	"\xC2\xC0\x80\x80",
	"\xDF\x7F\x80\x80",
	"\xDF\xC0\x80\x80",

	// s2
	"\xE0\x9F\xBF\x80",
	"\xE0\xA0\x7F\x80",
	"\xE0\xBF\xC0\x80",
	"\xE0\xC0\x80\x80",

	// s3
	"\xE1\x7F\xBF\x80",
	"\xE1\x80\x7F\x80",
	"\xE1\xBF\xC0\x80",
	"\xE1\xC0\x80\x80",

	// s4
	"\xED\x7F\xBF\x80",
	"\xED\x80\x7F\x80",
	"\xED\x9F\xC0\x80",
	"\xED\xA0\x80\x80",

	// s5
	"\xF0\x8F\xBF\xBF",
	"\xF0\x90\x7F\xBF",
	"\xF0\x90\x80\x7F",
	"\xF0\xBF\xBF\xC0",
	"\xF0\xBF\xC0\x80",
	"\xF0\xC0\x80\x80",

	// s6
	"\xF1\x7F\xBF\xBF",
	"\xF1\x80\x7F\xBF",
	"\xF1\x80\x80\x7F",
	"\xF1\xBF\xBF\xC0",
	"\xF1\xBF\xC0\x80",
	"\xF1\xC0\x80\x80",

	// s7
	"\xF4\x7F\xBF\xBF",
	"\xF4\x80\x7F\xBF",
	"\xF4\x80\x80\x7F",
	"\xF4\x8F\xBF\xC0",
	"\xF4\x8F\xC0\x80",
	"\xF4\x90\x80\x80",
}
    290 
    291 func runtimeDecodeRune(s string) rune {
    292 	for _, r := range s {
    293 		return r
    294 	}
    295 	return -1
    296 }
    297 
    298 func TestDecodeInvalidSequence(t *testing.T) {
    299 	for _, s := range invalidSequenceTests {
    300 		r1, _ := DecodeRune([]byte(s))
    301 		if want := RuneError; r1 != want {
    302 			t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
    303 			return
    304 		}
    305 		r2, _ := DecodeRuneInString(s)
    306 		if want := RuneError; r2 != want {
    307 			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
    308 			return
    309 		}
    310 		if r1 != r2 {
    311 			t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
    312 			return
    313 		}
    314 		r3 := runtimeDecodeRune(s)
    315 		if r2 != r3 {
    316 			t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
    317 			return
    318 		}
    319 	}
    320 }
    321 
    322 func testSequence(t *testing.T, s string) {
    323 	type info struct {
    324 		index int
    325 		r     rune
    326 	}
    327 	index := make([]info, len(s))
    328 	b := []byte(s)
    329 	si := 0
    330 	j := 0
    331 	for i, r := range s {
    332 		if si != i {
    333 			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
    334 			return
    335 		}
    336 		index[j] = info{i, r}
    337 		j++
    338 		r1, size1 := DecodeRune(b[i:])
    339 		if r != r1 {
    340 			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
    341 			return
    342 		}
    343 		r2, size2 := DecodeRuneInString(s[i:])
    344 		if r != r2 {
    345 			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
    346 			return
    347 		}
    348 		if size1 != size2 {
    349 			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
    350 			return
    351 		}
    352 		si += size1
    353 	}
    354 	j--
    355 	for si = len(s); si > 0; {
    356 		r1, size1 := DecodeLastRune(b[0:si])
    357 		r2, size2 := DecodeLastRuneInString(s[0:si])
    358 		if size1 != size2 {
    359 			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
    360 			return
    361 		}
    362 		if r1 != index[j].r {
    363 			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
    364 			return
    365 		}
    366 		if r2 != index[j].r {
    367 			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
    368 			return
    369 		}
    370 		si -= size1
    371 		if si != index[j].index {
    372 			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
    373 			return
    374 		}
    375 		j--
    376 	}
    377 	if si != 0 {
    378 		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
    379 	}
    380 }
    381 
    382 // Check that negative runes encode as U+FFFD.
    383 func TestNegativeRune(t *testing.T) {
    384 	errorbuf := make([]byte, UTFMax)
    385 	errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)]
    386 	buf := make([]byte, UTFMax)
    387 	buf = buf[0:EncodeRune(buf, -1)]
    388 	if !bytes.Equal(buf, errorbuf) {
    389 		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf)
    390 	}
    391 }
    392 
    393 type RuneCountTest struct {
    394 	in  string
    395 	out int
    396 }
    397 
    398 var runecounttests = []RuneCountTest{
    399 	{"abcd", 4},
    400 	{"", 3},
    401 	{"1,2,3,4", 7},
    402 	{"\xe2\x00", 2},
    403 	{"\xe2\x80", 2},
    404 	{"a\xe2\x80", 3},
    405 }
    406 
    407 func TestRuneCount(t *testing.T) {
    408 	for _, tt := range runecounttests {
    409 		if out := RuneCountInString(tt.in); out != tt.out {
    410 			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
    411 		}
    412 		if out := RuneCount([]byte(tt.in)); out != tt.out {
    413 			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
    414 		}
    415 	}
    416 }
    417 
    418 type RuneLenTest struct {
    419 	r    rune
    420 	size int
    421 }
    422 
    423 var runelentests = []RuneLenTest{
    424 	{0, 1},
    425 	{'e', 1},
    426 	{'', 2},
    427 	{'', 3},
    428 	{RuneError, 3},
    429 	{MaxRune, 4},
    430 	{0xD800, -1},
    431 	{0xDFFF, -1},
    432 	{MaxRune + 1, -1},
    433 	{-1, -1},
    434 }
    435 
    436 func TestRuneLen(t *testing.T) {
    437 	for _, tt := range runelentests {
    438 		if size := RuneLen(tt.r); size != tt.size {
    439 			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
    440 		}
    441 	}
    442 }
    443 
    444 type ValidTest struct {
    445 	in  string
    446 	out bool
    447 }
    448 
    449 var validTests = []ValidTest{
    450 	{"", true},
    451 	{"a", true},
    452 	{"abc", true},
    453 	{"", true},
    454 	{"", true},
    455 	{"-", true},
    456 	{"", true},
    457 	{"aa\xe2", false},
    458 	{string([]byte{66, 250}), false},
    459 	{string([]byte{66, 250, 67}), false},
    460 	{"a\uFFFDb", true},
    461 	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
    462 	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
    463 	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
    464 	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
    465 	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
    466 	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
    467 	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
    468 }
    469 
    470 func TestValid(t *testing.T) {
    471 	for _, tt := range validTests {
    472 		if Valid([]byte(tt.in)) != tt.out {
    473 			t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
    474 		}
    475 		if ValidString(tt.in) != tt.out {
    476 			t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
    477 		}
    478 	}
    479 }
    480 
    481 type ValidRuneTest struct {
    482 	r  rune
    483 	ok bool
    484 }
    485 
    486 var validrunetests = []ValidRuneTest{
    487 	{0, true},
    488 	{'e', true},
    489 	{'', true},
    490 	{'', true},
    491 	{RuneError, true},
    492 	{MaxRune, true},
    493 	{0xD7FF, true},
    494 	{0xD800, false},
    495 	{0xDFFF, false},
    496 	{0xE000, true},
    497 	{MaxRune + 1, false},
    498 	{-1, false},
    499 }
    500 
    501 func TestValidRune(t *testing.T) {
    502 	for _, tt := range validrunetests {
    503 		if ok := ValidRune(tt.r); ok != tt.ok {
    504 			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
    505 		}
    506 	}
    507 }
    508 
    509 func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
    510 	s := []byte("0123456789")
    511 	for i := 0; i < b.N; i++ {
    512 		RuneCount(s)
    513 	}
    514 }
    515 
    516 func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
    517 	s := []byte("")
    518 	for i := 0; i < b.N; i++ {
    519 		RuneCount(s)
    520 	}
    521 }
    522 
    523 func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
    524 	for i := 0; i < b.N; i++ {
    525 		RuneCountInString("0123456789")
    526 	}
    527 }
    528 
    529 func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
    530 	for i := 0; i < b.N; i++ {
    531 		RuneCountInString("")
    532 	}
    533 }
    534 
    535 func BenchmarkValidTenASCIIChars(b *testing.B) {
    536 	s := []byte("0123456789")
    537 	for i := 0; i < b.N; i++ {
    538 		Valid(s)
    539 	}
    540 }
    541 
    542 func BenchmarkValidTenJapaneseChars(b *testing.B) {
    543 	s := []byte("")
    544 	for i := 0; i < b.N; i++ {
    545 		Valid(s)
    546 	}
    547 }
    548 
    549 func BenchmarkValidStringTenASCIIChars(b *testing.B) {
    550 	for i := 0; i < b.N; i++ {
    551 		ValidString("0123456789")
    552 	}
    553 }
    554 
    555 func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
    556 	for i := 0; i < b.N; i++ {
    557 		ValidString("")
    558 	}
    559 }
    560 
    561 func BenchmarkEncodeASCIIRune(b *testing.B) {
    562 	buf := make([]byte, UTFMax)
    563 	for i := 0; i < b.N; i++ {
    564 		EncodeRune(buf, 'a')
    565 	}
    566 }
    567 
    568 func BenchmarkEncodeJapaneseRune(b *testing.B) {
    569 	buf := make([]byte, UTFMax)
    570 	for i := 0; i < b.N; i++ {
    571 		EncodeRune(buf, '')
    572 	}
    573 }
    574 
    575 func BenchmarkDecodeASCIIRune(b *testing.B) {
    576 	a := []byte{'a'}
    577 	for i := 0; i < b.N; i++ {
    578 		DecodeRune(a)
    579 	}
    580 }
    581 
    582 func BenchmarkDecodeJapaneseRune(b *testing.B) {
    583 	nihon := []byte("")
    584 	for i := 0; i < b.N; i++ {
    585 		DecodeRune(nihon)
    586 	}
    587 }
    588 
    589 func BenchmarkFullASCIIRune(b *testing.B) {
    590 	a := []byte{'a'}
    591 	for i := 0; i < b.N; i++ {
    592 		FullRune(a)
    593 	}
    594 }
    595 
    596 func BenchmarkFullJapaneseRune(b *testing.B) {
    597 	nihon := []byte("")
    598 	for i := 0; i < b.N; i++ {
    599 		FullRune(nihon)
    600 	}
    601 }
    602