Home | History | Annotate | Download | only in utf8
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package utf8 implements functions and constants to support text encoded in
      6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
      7 package utf8
      8 
      9 // The conditions RuneError==unicode.ReplacementChar and
     10 // MaxRune==unicode.MaxRune are verified in the tests.
     11 // Defining them locally avoids this package depending on package unicode.
     12 
     13 // Numbers fundamental to the encoding.
     14 const (
     15 	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
     16 	RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
     17 	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
     18 	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
     19 )
     20 
     21 // Code points in the surrogate range are not valid for UTF-8.
     22 const (
     23 	surrogateMin = 0xD800
     24 	surrogateMax = 0xDFFF
     25 )
     26 
     27 const (
     28 	t1 = 0x00 // 0000 0000
     29 	tx = 0x80 // 1000 0000
     30 	t2 = 0xC0 // 1100 0000
     31 	t3 = 0xE0 // 1110 0000
     32 	t4 = 0xF0 // 1111 0000
     33 	t5 = 0xF8 // 1111 1000
     34 
     35 	maskx = 0x3F // 0011 1111
     36 	mask2 = 0x1F // 0001 1111
     37 	mask3 = 0x0F // 0000 1111
     38 	mask4 = 0x07 // 0000 0111
     39 
     40 	rune1Max = 1<<7 - 1
     41 	rune2Max = 1<<11 - 1
     42 	rune3Max = 1<<16 - 1
     43 
     44 	// The default lowest and highest continuation byte.
     45 	locb = 0x80 // 1000 0000
     46 	hicb = 0xBF // 1011 1111
     47 
     48 	// These names of these constants are chosen to give nice alignment in the
     49 	// table below. The first nibble is an index into acceptRanges or F for
     50 	// special one-byte cases. The second nibble is the Rune length or the
     51 	// Status for the special one-byte case.
     52 	xx = 0xF1 // invalid: size 1
     53 	as = 0xF0 // ASCII: size 1
     54 	s1 = 0x02 // accept 0, size 2
     55 	s2 = 0x13 // accept 1, size 3
     56 	s3 = 0x03 // accept 0, size 3
     57 	s4 = 0x23 // accept 2, size 3
     58 	s5 = 0x34 // accept 3, size 4
     59 	s6 = 0x04 // accept 0, size 4
     60 	s7 = 0x44 // accept 4, size 4
     61 )
     62 
     63 // first is information about the first byte in a UTF-8 sequence.
     64 var first = [256]uint8{
     65 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
     66 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
     67 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
     68 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
     69 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
     70 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
     71 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
     72 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
     73 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
     74 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
     75 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
     76 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
     77 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
     78 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
     79 	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
     80 	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
     81 	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
     82 	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
     83 }
     84 
     85 // acceptRange gives the range of valid values for the second byte in a UTF-8
     86 // sequence.
     87 type acceptRange struct {
     88 	lo uint8 // lowest value for second byte.
     89 	hi uint8 // highest value for second byte.
     90 }
     91 
     92 var acceptRanges = [...]acceptRange{
     93 	0: {locb, hicb},
     94 	1: {0xA0, hicb},
     95 	2: {locb, 0x9F},
     96 	3: {0x90, hicb},
     97 	4: {locb, 0x8F},
     98 }
     99 
    100 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
    101 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
    102 func FullRune(p []byte) bool {
    103 	n := len(p)
    104 	if n == 0 {
    105 		return false
    106 	}
    107 	x := first[p[0]]
    108 	if n >= int(x&7) {
    109 		return true // ASCII, invalid or valid.
    110 	}
    111 	// Must be short or invalid.
    112 	accept := acceptRanges[x>>4]
    113 	if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
    114 		return true
    115 	} else if n > 2 && (p[2] < locb || hicb < p[2]) {
    116 		return true
    117 	}
    118 	return false
    119 }
    120 
    121 // FullRuneInString is like FullRune but its input is a string.
    122 func FullRuneInString(s string) bool {
    123 	n := len(s)
    124 	if n == 0 {
    125 		return false
    126 	}
    127 	x := first[s[0]]
    128 	if n >= int(x&7) {
    129 		return true // ASCII, invalid, or valid.
    130 	}
    131 	// Must be short or invalid.
    132 	accept := acceptRanges[x>>4]
    133 	if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
    134 		return true
    135 	} else if n > 2 && (s[2] < locb || hicb < s[2]) {
    136 		return true
    137 	}
    138 	return false
    139 }
    140 
    141 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
    142 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    143 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    144 // results for correct, non-empty UTF-8.
    145 //
    146 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    147 // out of range, or is not the shortest possible UTF-8 encoding for the
    148 // value. No other validation is performed.
    149 func DecodeRune(p []byte) (r rune, size int) {
    150 	n := len(p)
    151 	if n < 1 {
    152 		return RuneError, 0
    153 	}
    154 	p0 := p[0]
    155 	x := first[p0]
    156 	if x >= as {
    157 		// The following code simulates an additional check for x == xx and
    158 		// handling the ASCII and invalid cases accordingly. This mask-and-or
    159 		// approach prevents an additional branch.
    160 		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
    161 		return rune(p[0])&^mask | RuneError&mask, 1
    162 	}
    163 	sz := x & 7
    164 	accept := acceptRanges[x>>4]
    165 	if n < int(sz) {
    166 		return RuneError, 1
    167 	}
    168 	b1 := p[1]
    169 	if b1 < accept.lo || accept.hi < b1 {
    170 		return RuneError, 1
    171 	}
    172 	if sz == 2 {
    173 		return rune(p0&mask2)<<6 | rune(b1&maskx), 2
    174 	}
    175 	b2 := p[2]
    176 	if b2 < locb || hicb < b2 {
    177 		return RuneError, 1
    178 	}
    179 	if sz == 3 {
    180 		return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
    181 	}
    182 	b3 := p[3]
    183 	if b3 < locb || hicb < b3 {
    184 		return RuneError, 1
    185 	}
    186 	return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
    187 }
    188 
    189 // DecodeRuneInString is like DecodeRune but its input is a string. If s is
    190 // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
    191 // returns (RuneError, 1). Both are impossible results for correct, non-empty
    192 // UTF-8.
    193 //
    194 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    195 // out of range, or is not the shortest possible UTF-8 encoding for the
    196 // value. No other validation is performed.
    197 func DecodeRuneInString(s string) (r rune, size int) {
    198 	n := len(s)
    199 	if n < 1 {
    200 		return RuneError, 0
    201 	}
    202 	s0 := s[0]
    203 	x := first[s0]
    204 	if x >= as {
    205 		// The following code simulates an additional check for x == xx and
    206 		// handling the ASCII and invalid cases accordingly. This mask-and-or
    207 		// approach prevents an additional branch.
    208 		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
    209 		return rune(s[0])&^mask | RuneError&mask, 1
    210 	}
    211 	sz := x & 7
    212 	accept := acceptRanges[x>>4]
    213 	if n < int(sz) {
    214 		return RuneError, 1
    215 	}
    216 	s1 := s[1]
    217 	if s1 < accept.lo || accept.hi < s1 {
    218 		return RuneError, 1
    219 	}
    220 	if sz == 2 {
    221 		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
    222 	}
    223 	s2 := s[2]
    224 	if s2 < locb || hicb < s2 {
    225 		return RuneError, 1
    226 	}
    227 	if sz == 3 {
    228 		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
    229 	}
    230 	s3 := s[3]
    231 	if s3 < locb || hicb < s3 {
    232 		return RuneError, 1
    233 	}
    234 	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
    235 }
    236 
    237 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
    238 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    239 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    240 // results for correct, non-empty UTF-8.
    241 //
    242 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    243 // out of range, or is not the shortest possible UTF-8 encoding for the
    244 // value. No other validation is performed.
    245 func DecodeLastRune(p []byte) (r rune, size int) {
    246 	end := len(p)
    247 	if end == 0 {
    248 		return RuneError, 0
    249 	}
    250 	start := end - 1
    251 	r = rune(p[start])
    252 	if r < RuneSelf {
    253 		return r, 1
    254 	}
    255 	// guard against O(n^2) behavior when traversing
    256 	// backwards through strings with long sequences of
    257 	// invalid UTF-8.
    258 	lim := end - UTFMax
    259 	if lim < 0 {
    260 		lim = 0
    261 	}
    262 	for start--; start >= lim; start-- {
    263 		if RuneStart(p[start]) {
    264 			break
    265 		}
    266 	}
    267 	if start < 0 {
    268 		start = 0
    269 	}
    270 	r, size = DecodeRune(p[start:end])
    271 	if start+size != end {
    272 		return RuneError, 1
    273 	}
    274 	return r, size
    275 }
    276 
    277 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
    278 // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
    279 // it returns (RuneError, 1). Both are impossible results for correct,
    280 // non-empty UTF-8.
    281 //
    282 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    283 // out of range, or is not the shortest possible UTF-8 encoding for the
    284 // value. No other validation is performed.
    285 func DecodeLastRuneInString(s string) (r rune, size int) {
    286 	end := len(s)
    287 	if end == 0 {
    288 		return RuneError, 0
    289 	}
    290 	start := end - 1
    291 	r = rune(s[start])
    292 	if r < RuneSelf {
    293 		return r, 1
    294 	}
    295 	// guard against O(n^2) behavior when traversing
    296 	// backwards through strings with long sequences of
    297 	// invalid UTF-8.
    298 	lim := end - UTFMax
    299 	if lim < 0 {
    300 		lim = 0
    301 	}
    302 	for start--; start >= lim; start-- {
    303 		if RuneStart(s[start]) {
    304 			break
    305 		}
    306 	}
    307 	if start < 0 {
    308 		start = 0
    309 	}
    310 	r, size = DecodeRuneInString(s[start:end])
    311 	if start+size != end {
    312 		return RuneError, 1
    313 	}
    314 	return r, size
    315 }
    316 
    317 // RuneLen returns the number of bytes required to encode the rune.
    318 // It returns -1 if the rune is not a valid value to encode in UTF-8.
    319 func RuneLen(r rune) int {
    320 	switch {
    321 	case r < 0:
    322 		return -1
    323 	case r <= rune1Max:
    324 		return 1
    325 	case r <= rune2Max:
    326 		return 2
    327 	case surrogateMin <= r && r <= surrogateMax:
    328 		return -1
    329 	case r <= rune3Max:
    330 		return 3
    331 	case r <= MaxRune:
    332 		return 4
    333 	}
    334 	return -1
    335 }
    336 
    337 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
    338 // It returns the number of bytes written.
    339 func EncodeRune(p []byte, r rune) int {
    340 	// Negative values are erroneous. Making it unsigned addresses the problem.
    341 	switch i := uint32(r); {
    342 	case i <= rune1Max:
    343 		p[0] = byte(r)
    344 		return 1
    345 	case i <= rune2Max:
    346 		_ = p[1] // eliminate bounds checks
    347 		p[0] = t2 | byte(r>>6)
    348 		p[1] = tx | byte(r)&maskx
    349 		return 2
    350 	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
    351 		r = RuneError
    352 		fallthrough
    353 	case i <= rune3Max:
    354 		_ = p[2] // eliminate bounds checks
    355 		p[0] = t3 | byte(r>>12)
    356 		p[1] = tx | byte(r>>6)&maskx
    357 		p[2] = tx | byte(r)&maskx
    358 		return 3
    359 	default:
    360 		_ = p[3] // eliminate bounds checks
    361 		p[0] = t4 | byte(r>>18)
    362 		p[1] = tx | byte(r>>12)&maskx
    363 		p[2] = tx | byte(r>>6)&maskx
    364 		p[3] = tx | byte(r)&maskx
    365 		return 4
    366 	}
    367 }
    368 
    369 // RuneCount returns the number of runes in p. Erroneous and short
    370 // encodings are treated as single runes of width 1 byte.
    371 func RuneCount(p []byte) int {
    372 	np := len(p)
    373 	var n int
    374 	for i := 0; i < np; {
    375 		n++
    376 		c := p[i]
    377 		if c < RuneSelf {
    378 			// ASCII fast path
    379 			i++
    380 			continue
    381 		}
    382 		x := first[c]
    383 		if x == xx {
    384 			i++ // invalid.
    385 			continue
    386 		}
    387 		size := int(x & 7)
    388 		if i+size > np {
    389 			i++ // Short or invalid.
    390 			continue
    391 		}
    392 		accept := acceptRanges[x>>4]
    393 		if c := p[i+1]; c < accept.lo || accept.hi < c {
    394 			size = 1
    395 		} else if size == 2 {
    396 		} else if c := p[i+2]; c < locb || hicb < c {
    397 			size = 1
    398 		} else if size == 3 {
    399 		} else if c := p[i+3]; c < locb || hicb < c {
    400 			size = 1
    401 		}
    402 		i += size
    403 	}
    404 	return n
    405 }
    406 
    407 // RuneCountInString is like RuneCount but its input is a string.
    408 func RuneCountInString(s string) (n int) {
    409 	ns := len(s)
    410 	for i := 0; i < ns; n++ {
    411 		c := s[i]
    412 		if c < RuneSelf {
    413 			// ASCII fast path
    414 			i++
    415 			continue
    416 		}
    417 		x := first[c]
    418 		if x == xx {
    419 			i++ // invalid.
    420 			continue
    421 		}
    422 		size := int(x & 7)
    423 		if i+size > ns {
    424 			i++ // Short or invalid.
    425 			continue
    426 		}
    427 		accept := acceptRanges[x>>4]
    428 		if c := s[i+1]; c < accept.lo || accept.hi < c {
    429 			size = 1
    430 		} else if size == 2 {
    431 		} else if c := s[i+2]; c < locb || hicb < c {
    432 			size = 1
    433 		} else if size == 3 {
    434 		} else if c := s[i+3]; c < locb || hicb < c {
    435 			size = 1
    436 		}
    437 		i += size
    438 	}
    439 	return n
    440 }
    441 
    442 // RuneStart reports whether the byte could be the first byte of an encoded,
    443 // possibly invalid rune. Second and subsequent bytes always have the top two
    444 // bits set to 10.
    445 func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
    446 
    447 // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
    448 func Valid(p []byte) bool {
    449 	n := len(p)
    450 	for i := 0; i < n; {
    451 		pi := p[i]
    452 		if pi < RuneSelf {
    453 			i++
    454 			continue
    455 		}
    456 		x := first[pi]
    457 		if x == xx {
    458 			return false // Illegal starter byte.
    459 		}
    460 		size := int(x & 7)
    461 		if i+size > n {
    462 			return false // Short or invalid.
    463 		}
    464 		accept := acceptRanges[x>>4]
    465 		if c := p[i+1]; c < accept.lo || accept.hi < c {
    466 			return false
    467 		} else if size == 2 {
    468 		} else if c := p[i+2]; c < locb || hicb < c {
    469 			return false
    470 		} else if size == 3 {
    471 		} else if c := p[i+3]; c < locb || hicb < c {
    472 			return false
    473 		}
    474 		i += size
    475 	}
    476 	return true
    477 }
    478 
    479 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
    480 func ValidString(s string) bool {
    481 	n := len(s)
    482 	for i := 0; i < n; {
    483 		si := s[i]
    484 		if si < RuneSelf {
    485 			i++
    486 			continue
    487 		}
    488 		x := first[si]
    489 		if x == xx {
    490 			return false // Illegal starter byte.
    491 		}
    492 		size := int(x & 7)
    493 		if i+size > n {
    494 			return false // Short or invalid.
    495 		}
    496 		accept := acceptRanges[x>>4]
    497 		if c := s[i+1]; c < accept.lo || accept.hi < c {
    498 			return false
    499 		} else if size == 2 {
    500 		} else if c := s[i+2]; c < locb || hicb < c {
    501 			return false
    502 		} else if size == 3 {
    503 		} else if c := s[i+3]; c < locb || hicb < c {
    504 			return false
    505 		}
    506 		i += size
    507 	}
    508 	return true
    509 }
    510 
    511 // ValidRune reports whether r can be legally encoded as UTF-8.
    512 // Code points that are out of range or a surrogate half are illegal.
    513 func ValidRune(r rune) bool {
    514 	switch {
    515 	case 0 <= r && r < surrogateMin:
    516 		return true
    517 	case surrogateMax < r && r <= MaxRune:
    518 		return true
    519 	}
    520 	return false
    521 }
    522