Home | History | Annotate | Download | only in utf8
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package utf8 implements functions and constants to support text encoded in
      6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
      7 package utf8
      8 
      9 // The conditions RuneError==unicode.ReplacementChar and
     10 // MaxRune==unicode.MaxRune are verified in the tests.
     11 // Defining them locally avoids this package depending on package unicode.
     12 
     13 // Numbers fundamental to the encoding.
     14 const (
     15 	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
     16 	RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
     17 	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
     18 	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
     19 )
     20 
     21 // Code points in the surrogate range are not valid for UTF-8.
     22 const (
     23 	surrogateMin = 0xD800
     24 	surrogateMax = 0xDFFF
     25 )
     26 
     27 const (
     28 	t1 = 0x00 // 0000 0000
     29 	tx = 0x80 // 1000 0000
     30 	t2 = 0xC0 // 1100 0000
     31 	t3 = 0xE0 // 1110 0000
     32 	t4 = 0xF0 // 1111 0000
     33 	t5 = 0xF8 // 1111 1000
     34 
     35 	maskx = 0x3F // 0011 1111
     36 	mask2 = 0x1F // 0001 1111
     37 	mask3 = 0x0F // 0000 1111
     38 	mask4 = 0x07 // 0000 0111
     39 
     40 	rune1Max = 1<<7 - 1
     41 	rune2Max = 1<<11 - 1
     42 	rune3Max = 1<<16 - 1
     43 
     44 	// The default lowest and highest continuation byte.
     45 	locb = 0x80 // 1000 0000
     46 	hicb = 0xBF // 1011 1111
     47 
     48 	// These names of these constants are chosen to give nice alignment in the
     49 	// table below. The first nibble is an index into acceptRanges or F for
     50 	// special one-byte cases. The second nibble is the Rune length or the
     51 	// Status for the special one-byte case.
     52 	xx = 0xF1 // invalid: size 1
     53 	as = 0xF0 // ASCII: size 1
     54 	s1 = 0x02 // accept 0, size 2
     55 	s2 = 0x13 // accept 1, size 3
     56 	s3 = 0x03 // accept 0, size 3
     57 	s4 = 0x23 // accept 2, size 3
     58 	s5 = 0x34 // accept 3, size 4
     59 	s6 = 0x04 // accept 0, size 4
     60 	s7 = 0x44 // accept 4, size 4
     61 )
     62 
     63 // first is information about the first byte in a UTF-8 sequence.
     64 var first = [256]uint8{
     65 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
     66 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
     67 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
     68 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
     69 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
     70 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
     71 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
     72 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
     73 	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
     74 	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
     75 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
     76 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
     77 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
     78 	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
     79 	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
     80 	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
     81 	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
     82 	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
     83 }
     84 
     85 // acceptRange gives the range of valid values for the second byte in a UTF-8
     86 // sequence.
     87 type acceptRange struct {
     88 	lo uint8 // lowest value for second byte.
     89 	hi uint8 // highest value for second byte.
     90 }
     91 
     92 var acceptRanges = [...]acceptRange{
     93 	0: {locb, hicb},
     94 	1: {0xA0, hicb},
     95 	2: {locb, 0x9F},
     96 	3: {0x90, hicb},
     97 	4: {locb, 0x8F},
     98 }
     99 
    100 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
    101 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
    102 func FullRune(p []byte) bool {
    103 	n := len(p)
    104 	if n == 0 {
    105 		return false
    106 	}
    107 	x := first[p[0]]
    108 	if n >= int(x&7) {
    109 		return true // ASCII, invalid or valid.
    110 	}
    111 	// Must be short or invalid.
    112 	accept := acceptRanges[x>>4]
    113 	if n > 1 {
    114 		if c := p[1]; c < accept.lo || accept.hi < c {
    115 			return true
    116 		} else if n > 2 && (p[2] < locb || hicb < p[2]) {
    117 			return true
    118 		}
    119 	}
    120 	return false
    121 }
    122 
    123 // FullRuneInString is like FullRune but its input is a string.
    124 func FullRuneInString(s string) bool {
    125 	n := len(s)
    126 	if n == 0 {
    127 		return false
    128 	}
    129 	x := first[s[0]]
    130 	if n >= int(x&7) {
    131 		return true // ASCII, invalid, or valid.
    132 	}
    133 	// Must be short or invalid.
    134 	accept := acceptRanges[x>>4]
    135 	if n > 1 {
    136 		if c := s[1]; c < accept.lo || accept.hi < c {
    137 			return true
    138 		} else if n > 2 && (s[2] < locb || hicb < s[2]) {
    139 			return true
    140 		}
    141 	}
    142 	return false
    143 }
    144 
    145 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
    146 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    147 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    148 // results for correct, non-empty UTF-8.
    149 //
    150 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    151 // out of range, or is not the shortest possible UTF-8 encoding for the
    152 // value. No other validation is performed.
    153 func DecodeRune(p []byte) (r rune, size int) {
    154 	n := len(p)
    155 	if n < 1 {
    156 		return RuneError, 0
    157 	}
    158 	p0 := p[0]
    159 	x := first[p0]
    160 	if x >= as {
    161 		// The following code simulates an additional check for x == xx and
    162 		// handling the ASCII and invalid cases accordingly. This mask-and-or
    163 		// approach prevents an additional branch.
    164 		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
    165 		return rune(p[0])&^mask | RuneError&mask, 1
    166 	}
    167 	sz := x & 7
    168 	accept := acceptRanges[x>>4]
    169 	if n < int(sz) {
    170 		return RuneError, 1
    171 	}
    172 	b1 := p[1]
    173 	if b1 < accept.lo || accept.hi < b1 {
    174 		return RuneError, 1
    175 	}
    176 	if sz == 2 {
    177 		return rune(p0&mask2)<<6 | rune(b1&maskx), 2
    178 	}
    179 	b2 := p[2]
    180 	if b2 < locb || hicb < b2 {
    181 		return RuneError, 1
    182 	}
    183 	if sz == 3 {
    184 		return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
    185 	}
    186 	b3 := p[3]
    187 	if b3 < locb || hicb < b3 {
    188 		return RuneError, 1
    189 	}
    190 	return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
    191 }
    192 
    193 // DecodeRuneInString is like DecodeRune but its input is a string. If s is
    194 // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
    195 // returns (RuneError, 1). Both are impossible results for correct, non-empty
    196 // UTF-8.
    197 //
    198 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    199 // out of range, or is not the shortest possible UTF-8 encoding for the
    200 // value. No other validation is performed.
    201 func DecodeRuneInString(s string) (r rune, size int) {
    202 	n := len(s)
    203 	if n < 1 {
    204 		return RuneError, 0
    205 	}
    206 	s0 := s[0]
    207 	x := first[s0]
    208 	if x >= as {
    209 		// The following code simulates an additional check for x == xx and
    210 		// handling the ASCII and invalid cases accordingly. This mask-and-or
    211 		// approach prevents an additional branch.
    212 		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
    213 		return rune(s[0])&^mask | RuneError&mask, 1
    214 	}
    215 	sz := x & 7
    216 	accept := acceptRanges[x>>4]
    217 	if n < int(sz) {
    218 		return RuneError, 1
    219 	}
    220 	s1 := s[1]
    221 	if s1 < accept.lo || accept.hi < s1 {
    222 		return RuneError, 1
    223 	}
    224 	if sz == 2 {
    225 		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
    226 	}
    227 	s2 := s[2]
    228 	if s2 < locb || hicb < s2 {
    229 		return RuneError, 1
    230 	}
    231 	if sz == 3 {
    232 		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
    233 	}
    234 	s3 := s[3]
    235 	if s3 < locb || hicb < s3 {
    236 		return RuneError, 1
    237 	}
    238 	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
    239 }
    240 
    241 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
    242 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    243 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    244 // results for correct, non-empty UTF-8.
    245 //
    246 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    247 // out of range, or is not the shortest possible UTF-8 encoding for the
    248 // value. No other validation is performed.
    249 func DecodeLastRune(p []byte) (r rune, size int) {
    250 	end := len(p)
    251 	if end == 0 {
    252 		return RuneError, 0
    253 	}
    254 	start := end - 1
    255 	r = rune(p[start])
    256 	if r < RuneSelf {
    257 		return r, 1
    258 	}
    259 	// guard against O(n^2) behavior when traversing
    260 	// backwards through strings with long sequences of
    261 	// invalid UTF-8.
    262 	lim := end - UTFMax
    263 	if lim < 0 {
    264 		lim = 0
    265 	}
    266 	for start--; start >= lim; start-- {
    267 		if RuneStart(p[start]) {
    268 			break
    269 		}
    270 	}
    271 	if start < 0 {
    272 		start = 0
    273 	}
    274 	r, size = DecodeRune(p[start:end])
    275 	if start+size != end {
    276 		return RuneError, 1
    277 	}
    278 	return r, size
    279 }
    280 
    281 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
    282 // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
    283 // it returns (RuneError, 1). Both are impossible results for correct,
    284 // non-empty UTF-8.
    285 //
    286 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    287 // out of range, or is not the shortest possible UTF-8 encoding for the
    288 // value. No other validation is performed.
    289 func DecodeLastRuneInString(s string) (r rune, size int) {
    290 	end := len(s)
    291 	if end == 0 {
    292 		return RuneError, 0
    293 	}
    294 	start := end - 1
    295 	r = rune(s[start])
    296 	if r < RuneSelf {
    297 		return r, 1
    298 	}
    299 	// guard against O(n^2) behavior when traversing
    300 	// backwards through strings with long sequences of
    301 	// invalid UTF-8.
    302 	lim := end - UTFMax
    303 	if lim < 0 {
    304 		lim = 0
    305 	}
    306 	for start--; start >= lim; start-- {
    307 		if RuneStart(s[start]) {
    308 			break
    309 		}
    310 	}
    311 	if start < 0 {
    312 		start = 0
    313 	}
    314 	r, size = DecodeRuneInString(s[start:end])
    315 	if start+size != end {
    316 		return RuneError, 1
    317 	}
    318 	return r, size
    319 }
    320 
    321 // RuneLen returns the number of bytes required to encode the rune.
    322 // It returns -1 if the rune is not a valid value to encode in UTF-8.
    323 func RuneLen(r rune) int {
    324 	switch {
    325 	case r < 0:
    326 		return -1
    327 	case r <= rune1Max:
    328 		return 1
    329 	case r <= rune2Max:
    330 		return 2
    331 	case surrogateMin <= r && r <= surrogateMax:
    332 		return -1
    333 	case r <= rune3Max:
    334 		return 3
    335 	case r <= MaxRune:
    336 		return 4
    337 	}
    338 	return -1
    339 }
    340 
    341 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
    342 // It returns the number of bytes written.
    343 func EncodeRune(p []byte, r rune) int {
    344 	// Negative values are erroneous. Making it unsigned addresses the problem.
    345 	switch i := uint32(r); {
    346 	case i <= rune1Max:
    347 		p[0] = byte(r)
    348 		return 1
    349 	case i <= rune2Max:
    350 		_ = p[1] // eliminate bounds checks
    351 		p[0] = t2 | byte(r>>6)
    352 		p[1] = tx | byte(r)&maskx
    353 		return 2
    354 	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
    355 		r = RuneError
    356 		fallthrough
    357 	case i <= rune3Max:
    358 		_ = p[2] // eliminate bounds checks
    359 		p[0] = t3 | byte(r>>12)
    360 		p[1] = tx | byte(r>>6)&maskx
    361 		p[2] = tx | byte(r)&maskx
    362 		return 3
    363 	default:
    364 		_ = p[3] // eliminate bounds checks
    365 		p[0] = t4 | byte(r>>18)
    366 		p[1] = tx | byte(r>>12)&maskx
    367 		p[2] = tx | byte(r>>6)&maskx
    368 		p[3] = tx | byte(r)&maskx
    369 		return 4
    370 	}
    371 }
    372 
    373 // RuneCount returns the number of runes in p. Erroneous and short
    374 // encodings are treated as single runes of width 1 byte.
    375 func RuneCount(p []byte) int {
    376 	np := len(p)
    377 	var n int
    378 	for i := 0; i < np; {
    379 		n++
    380 		c := p[i]
    381 		if c < RuneSelf {
    382 			// ASCII fast path
    383 			i++
    384 			continue
    385 		}
    386 		x := first[c]
    387 		if x == xx {
    388 			i++ // invalid.
    389 			continue
    390 		}
    391 		size := int(x & 7)
    392 		if i+size > np {
    393 			i++ // Short or invalid.
    394 			continue
    395 		}
    396 		accept := acceptRanges[x>>4]
    397 		if c := p[i+1]; c < accept.lo || accept.hi < c {
    398 			size = 1
    399 		} else if size == 2 {
    400 		} else if c := p[i+2]; c < locb || hicb < c {
    401 			size = 1
    402 		} else if size == 3 {
    403 		} else if c := p[i+3]; c < locb || hicb < c {
    404 			size = 1
    405 		}
    406 		i += size
    407 	}
    408 	return n
    409 }
    410 
    411 // RuneCountInString is like RuneCount but its input is a string.
    412 func RuneCountInString(s string) (n int) {
    413 	ns := len(s)
    414 	for i := 0; i < ns; n++ {
    415 		c := s[i]
    416 		if c < RuneSelf {
    417 			// ASCII fast path
    418 			i++
    419 			continue
    420 		}
    421 		x := first[c]
    422 		if x == xx {
    423 			i++ // invalid.
    424 			continue
    425 		}
    426 		size := int(x & 7)
    427 		if i+size > ns {
    428 			i++ // Short or invalid.
    429 			continue
    430 		}
    431 		accept := acceptRanges[x>>4]
    432 		if c := s[i+1]; c < accept.lo || accept.hi < c {
    433 			size = 1
    434 		} else if size == 2 {
    435 		} else if c := s[i+2]; c < locb || hicb < c {
    436 			size = 1
    437 		} else if size == 3 {
    438 		} else if c := s[i+3]; c < locb || hicb < c {
    439 			size = 1
    440 		}
    441 		i += size
    442 	}
    443 	return n
    444 }
    445 
    446 // RuneStart reports whether the byte could be the first byte of an encoded,
    447 // possibly invalid rune. Second and subsequent bytes always have the top two
    448 // bits set to 10.
    449 func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
    450 
    451 // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
    452 func Valid(p []byte) bool {
    453 	n := len(p)
    454 	for i := 0; i < n; {
    455 		pi := p[i]
    456 		if pi < RuneSelf {
    457 			i++
    458 			continue
    459 		}
    460 		x := first[pi]
    461 		if x == xx {
    462 			return false // Illegal starter byte.
    463 		}
    464 		size := int(x & 7)
    465 		if i+size > n {
    466 			return false // Short or invalid.
    467 		}
    468 		accept := acceptRanges[x>>4]
    469 		if c := p[i+1]; c < accept.lo || accept.hi < c {
    470 			return false
    471 		} else if size == 2 {
    472 		} else if c := p[i+2]; c < locb || hicb < c {
    473 			return false
    474 		} else if size == 3 {
    475 		} else if c := p[i+3]; c < locb || hicb < c {
    476 			return false
    477 		}
    478 		i += size
    479 	}
    480 	return true
    481 }
    482 
    483 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
    484 func ValidString(s string) bool {
    485 	n := len(s)
    486 	for i := 0; i < n; {
    487 		si := s[i]
    488 		if si < RuneSelf {
    489 			i++
    490 			continue
    491 		}
    492 		x := first[si]
    493 		if x == xx {
    494 			return false // Illegal starter byte.
    495 		}
    496 		size := int(x & 7)
    497 		if i+size > n {
    498 			return false // Short or invalid.
    499 		}
    500 		accept := acceptRanges[x>>4]
    501 		if c := s[i+1]; c < accept.lo || accept.hi < c {
    502 			return false
    503 		} else if size == 2 {
    504 		} else if c := s[i+2]; c < locb || hicb < c {
    505 			return false
    506 		} else if size == 3 {
    507 		} else if c := s[i+3]; c < locb || hicb < c {
    508 			return false
    509 		}
    510 		i += size
    511 	}
    512 	return true
    513 }
    514 
    515 // ValidRune reports whether r can be legally encoded as UTF-8.
    516 // Code points that are out of range or a surrogate half are illegal.
    517 func ValidRune(r rune) bool {
    518 	switch {
    519 	case 0 <= r && r < surrogateMin:
    520 		return true
    521 	case surrogateMax < r && r <= MaxRune:
    522 		return true
    523 	}
    524 	return false
    525 }
    526