Home | History | Annotate | Download | only in utf8
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package utf8_test
      6 
      7 import (
      8 	"bytes"
      9 	"testing"
     10 	"unicode"
     11 	. "unicode/utf8"
     12 )
     13 
     14 // Validate the constants redefined from unicode.
     15 func init() {
     16 	if MaxRune != unicode.MaxRune {
     17 		panic("utf8.MaxRune is wrong")
     18 	}
     19 	if RuneError != unicode.ReplacementChar {
     20 		panic("utf8.RuneError is wrong")
     21 	}
     22 }
     23 
     24 // Validate the constants redefined from unicode.
     25 func TestConstants(t *testing.T) {
     26 	if MaxRune != unicode.MaxRune {
     27 		t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
     28 	}
     29 	if RuneError != unicode.ReplacementChar {
     30 		t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar)
     31 	}
     32 }
     33 
     34 type Utf8Map struct {
     35 	r   rune
     36 	str string
     37 }
     38 
     39 var utf8map = []Utf8Map{
     40 	{0x0000, "\x00"},
     41 	{0x0001, "\x01"},
     42 	{0x007e, "\x7e"},
     43 	{0x007f, "\x7f"},
     44 	{0x0080, "\xc2\x80"},
     45 	{0x0081, "\xc2\x81"},
     46 	{0x00bf, "\xc2\xbf"},
     47 	{0x00c0, "\xc3\x80"},
     48 	{0x00c1, "\xc3\x81"},
     49 	{0x00c8, "\xc3\x88"},
     50 	{0x00d0, "\xc3\x90"},
     51 	{0x00e0, "\xc3\xa0"},
     52 	{0x00f0, "\xc3\xb0"},
     53 	{0x00f8, "\xc3\xb8"},
     54 	{0x00ff, "\xc3\xbf"},
     55 	{0x0100, "\xc4\x80"},
     56 	{0x07ff, "\xdf\xbf"},
     57 	{0x0800, "\xe0\xa0\x80"},
     58 	{0x0801, "\xe0\xa0\x81"},
     59 	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
     60 	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
     61 	{0xfffe, "\xef\xbf\xbe"},
     62 	{0xffff, "\xef\xbf\xbf"},
     63 	{0x10000, "\xf0\x90\x80\x80"},
     64 	{0x10001, "\xf0\x90\x80\x81"},
     65 	{0x10fffe, "\xf4\x8f\xbf\xbe"},
     66 	{0x10ffff, "\xf4\x8f\xbf\xbf"},
     67 	{0xFFFD, "\xef\xbf\xbd"},
     68 }
     69 
     70 var surrogateMap = []Utf8Map{
     71 	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
     72 	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
     73 }
     74 
     75 var testStrings = []string{
     76 	"",
     77 	"abcd",
     78 	"",
     79 	"abi",
     80 	"abiabiabi",
     81 	"\x80\x80\x80\x80",
     82 }
     83 
     84 func TestFullRune(t *testing.T) {
     85 	for _, m := range utf8map {
     86 		b := []byte(m.str)
     87 		if !FullRune(b) {
     88 			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
     89 		}
     90 		s := m.str
     91 		if !FullRuneInString(s) {
     92 			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
     93 		}
     94 		b1 := b[0 : len(b)-1]
     95 		if FullRune(b1) {
     96 			t.Errorf("FullRune(%q) = true, want false", b1)
     97 		}
     98 		s1 := string(b1)
     99 		if FullRuneInString(s1) {
    100 			t.Errorf("FullRune(%q) = true, want false", s1)
    101 		}
    102 	}
    103 }
    104 
    105 func TestEncodeRune(t *testing.T) {
    106 	for _, m := range utf8map {
    107 		b := []byte(m.str)
    108 		var buf [10]byte
    109 		n := EncodeRune(buf[0:], m.r)
    110 		b1 := buf[0:n]
    111 		if !bytes.Equal(b, b1) {
    112 			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
    113 		}
    114 	}
    115 }
    116 
    117 func TestDecodeRune(t *testing.T) {
    118 	for _, m := range utf8map {
    119 		b := []byte(m.str)
    120 		r, size := DecodeRune(b)
    121 		if r != m.r || size != len(b) {
    122 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
    123 		}
    124 		s := m.str
    125 		r, size = DecodeRuneInString(s)
    126 		if r != m.r || size != len(b) {
    127 			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
    128 		}
    129 
    130 		// there's an extra byte that bytes left behind - make sure trailing byte works
    131 		r, size = DecodeRune(b[0:cap(b)])
    132 		if r != m.r || size != len(b) {
    133 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
    134 		}
    135 		s = m.str + "\x00"
    136 		r, size = DecodeRuneInString(s)
    137 		if r != m.r || size != len(b) {
    138 			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
    139 		}
    140 
    141 		// make sure missing bytes fail
    142 		wantsize := 1
    143 		if wantsize >= len(b) {
    144 			wantsize = 0
    145 		}
    146 		r, size = DecodeRune(b[0 : len(b)-1])
    147 		if r != RuneError || size != wantsize {
    148 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
    149 		}
    150 		s = m.str[0 : len(m.str)-1]
    151 		r, size = DecodeRuneInString(s)
    152 		if r != RuneError || size != wantsize {
    153 			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
    154 		}
    155 
    156 		// make sure bad sequences fail
    157 		if len(b) == 1 {
    158 			b[0] = 0x80
    159 		} else {
    160 			b[len(b)-1] = 0x7F
    161 		}
    162 		r, size = DecodeRune(b)
    163 		if r != RuneError || size != 1 {
    164 			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
    165 		}
    166 		s = string(b)
    167 		r, size = DecodeRuneInString(s)
    168 		if r != RuneError || size != 1 {
    169 			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
    170 		}
    171 
    172 	}
    173 }
    174 
    175 func TestDecodeSurrogateRune(t *testing.T) {
    176 	for _, m := range surrogateMap {
    177 		b := []byte(m.str)
    178 		r, size := DecodeRune(b)
    179 		if r != RuneError || size != 1 {
    180 			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
    181 		}
    182 		s := m.str
    183 		r, size = DecodeRuneInString(s)
    184 		if r != RuneError || size != 1 {
    185 			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
    186 		}
    187 	}
    188 }
    189 
    190 // Check that DecodeRune and DecodeLastRune correspond to
    191 // the equivalent range loop.
    192 func TestSequencing(t *testing.T) {
    193 	for _, ts := range testStrings {
    194 		for _, m := range utf8map {
    195 			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
    196 				testSequence(t, s)
    197 			}
    198 		}
    199 	}
    200 }
    201 
    202 // Check that a range loop and a []int conversion visit the same runes.
    203 // Not really a test of this package, but the assumption is used here and
    204 // it's good to verify
    205 func TestIntConversion(t *testing.T) {
    206 	for _, ts := range testStrings {
    207 		runes := []rune(ts)
    208 		if RuneCountInString(ts) != len(runes) {
    209 			t.Errorf("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
    210 			break
    211 		}
    212 		i := 0
    213 		for _, r := range ts {
    214 			if r != runes[i] {
    215 				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
    216 			}
    217 			i++
    218 		}
    219 	}
    220 }
    221 
    222 func testSequence(t *testing.T, s string) {
    223 	type info struct {
    224 		index int
    225 		r     rune
    226 	}
    227 	index := make([]info, len(s))
    228 	b := []byte(s)
    229 	si := 0
    230 	j := 0
    231 	for i, r := range s {
    232 		if si != i {
    233 			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
    234 			return
    235 		}
    236 		index[j] = info{i, r}
    237 		j++
    238 		r1, size1 := DecodeRune(b[i:])
    239 		if r != r1 {
    240 			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
    241 			return
    242 		}
    243 		r2, size2 := DecodeRuneInString(s[i:])
    244 		if r != r2 {
    245 			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
    246 			return
    247 		}
    248 		if size1 != size2 {
    249 			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
    250 			return
    251 		}
    252 		si += size1
    253 	}
    254 	j--
    255 	for si = len(s); si > 0; {
    256 		r1, size1 := DecodeLastRune(b[0:si])
    257 		r2, size2 := DecodeLastRuneInString(s[0:si])
    258 		if size1 != size2 {
    259 			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
    260 			return
    261 		}
    262 		if r1 != index[j].r {
    263 			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
    264 			return
    265 		}
    266 		if r2 != index[j].r {
    267 			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
    268 			return
    269 		}
    270 		si -= size1
    271 		if si != index[j].index {
    272 			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
    273 			return
    274 		}
    275 		j--
    276 	}
    277 	if si != 0 {
    278 		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
    279 	}
    280 }
    281 
    282 // Check that negative runes encode as U+FFFD.
    283 func TestNegativeRune(t *testing.T) {
    284 	errorbuf := make([]byte, UTFMax)
    285 	errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)]
    286 	buf := make([]byte, UTFMax)
    287 	buf = buf[0:EncodeRune(buf, -1)]
    288 	if !bytes.Equal(buf, errorbuf) {
    289 		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf)
    290 	}
    291 }
    292 
    293 type RuneCountTest struct {
    294 	in  string
    295 	out int
    296 }
    297 
    298 var runecounttests = []RuneCountTest{
    299 	{"abcd", 4},
    300 	{"", 3},
    301 	{"1,2,3,4", 7},
    302 	{"\xe2\x00", 2},
    303 }
    304 
    305 func TestRuneCount(t *testing.T) {
    306 	for _, tt := range runecounttests {
    307 		if out := RuneCountInString(tt.in); out != tt.out {
    308 			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
    309 		}
    310 		if out := RuneCount([]byte(tt.in)); out != tt.out {
    311 			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
    312 		}
    313 	}
    314 }
    315 
    316 type RuneLenTest struct {
    317 	r    rune
    318 	size int
    319 }
    320 
    321 var runelentests = []RuneLenTest{
    322 	{0, 1},
    323 	{'e', 1},
    324 	{'', 2},
    325 	{'', 3},
    326 	{RuneError, 3},
    327 	{MaxRune, 4},
    328 	{0xD800, -1},
    329 	{0xDFFF, -1},
    330 	{MaxRune + 1, -1},
    331 	{-1, -1},
    332 }
    333 
    334 func TestRuneLen(t *testing.T) {
    335 	for _, tt := range runelentests {
    336 		if size := RuneLen(tt.r); size != tt.size {
    337 			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
    338 		}
    339 	}
    340 }
    341 
    342 type ValidTest struct {
    343 	in  string
    344 	out bool
    345 }
    346 
    347 var validTests = []ValidTest{
    348 	{"", true},
    349 	{"a", true},
    350 	{"abc", true},
    351 	{"", true},
    352 	{"", true},
    353 	{"-", true},
    354 	{"", true},
    355 	{string([]byte{66, 250}), false},
    356 	{string([]byte{66, 250, 67}), false},
    357 	{"a\uFFFDb", true},
    358 	{string("\xF4\x8F\xBF\xBF"), true},      // U+10FFFF
    359 	{string("\xF4\x90\x80\x80"), false},     // U+10FFFF+1; out of range
    360 	{string("\xF7\xBF\xBF\xBF"), false},     // 0x1FFFFF; out of range
    361 	{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
    362 	{string("\xc0\x80"), false},             // U+0000 encoded in two bytes: incorrect
    363 	{string("\xed\xa0\x80"), false},         // U+D800 high surrogate (sic)
    364 	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
    365 }
    366 
    367 func TestValid(t *testing.T) {
    368 	for _, tt := range validTests {
    369 		if Valid([]byte(tt.in)) != tt.out {
    370 			t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
    371 		}
    372 		if ValidString(tt.in) != tt.out {
    373 			t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
    374 		}
    375 	}
    376 }
    377 
    378 type ValidRuneTest struct {
    379 	r  rune
    380 	ok bool
    381 }
    382 
    383 var validrunetests = []ValidRuneTest{
    384 	{0, true},
    385 	{'e', true},
    386 	{'', true},
    387 	{'', true},
    388 	{RuneError, true},
    389 	{MaxRune, true},
    390 	{0xD7FF, true},
    391 	{0xD800, false},
    392 	{0xDFFF, false},
    393 	{0xE000, true},
    394 	{MaxRune + 1, false},
    395 	{-1, false},
    396 }
    397 
    398 func TestValidRune(t *testing.T) {
    399 	for _, tt := range validrunetests {
    400 		if ok := ValidRune(tt.r); ok != tt.ok {
    401 			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
    402 		}
    403 	}
    404 }
    405 
    406 func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
    407 	for i := 0; i < b.N; i++ {
    408 		RuneCountInString("0123456789")
    409 	}
    410 }
    411 
    412 func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
    413 	for i := 0; i < b.N; i++ {
    414 		RuneCountInString("")
    415 	}
    416 }
    417 
    418 func BenchmarkEncodeASCIIRune(b *testing.B) {
    419 	buf := make([]byte, UTFMax)
    420 	for i := 0; i < b.N; i++ {
    421 		EncodeRune(buf, 'a')
    422 	}
    423 }
    424 
    425 func BenchmarkEncodeJapaneseRune(b *testing.B) {
    426 	buf := make([]byte, UTFMax)
    427 	for i := 0; i < b.N; i++ {
    428 		EncodeRune(buf, '')
    429 	}
    430 }
    431 
    432 func BenchmarkDecodeASCIIRune(b *testing.B) {
    433 	a := []byte{'a'}
    434 	for i := 0; i < b.N; i++ {
    435 		DecodeRune(a)
    436 	}
    437 }
    438 
    439 func BenchmarkDecodeJapaneseRune(b *testing.B) {
    440 	nihon := []byte("")
    441 	for i := 0; i < b.N; i++ {
    442 		DecodeRune(nihon)
    443 	}
    444 }
    445