Home | History | Annotate | Download | only in utf8
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package utf8 implements functions and constants to support text encoded in
      6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
      7 package utf8
      8 
      9 // The conditions RuneError==unicode.ReplacementChar and
     10 // MaxRune==unicode.MaxRune are verified in the tests.
     11 // Defining them locally avoids this package depending on package unicode.
     12 
// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
)
     20 
// Code points in the surrogate range are not valid for UTF-8.
// Both bounds are inclusive.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)
     26 
// Bit patterns used to recognize and build UTF-8 bytes, plus the largest
// rune values that fit in 1-, 2- and 3-byte encodings.
const (
	// Tag bits. tx marks a continuation byte; t2, t3 and t4 are the tags of
	// the leading byte of a 2-, 3- and 4-byte sequence. The decoders reject
	// leading bytes at or above t5.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Masks selecting the payload bits of a byte: maskx for continuation
	// bytes; mask2, mask3 and mask4 for the leading byte of a 2-, 3- and
	// 4-byte sequence.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest rune encoded in 1, 2 and 3 bytes respectively.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1
)
     44 
     45 func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
     46 	n := len(p)
     47 	if n < 1 {
     48 		return RuneError, 0, true
     49 	}
     50 	c0 := p[0]
     51 
     52 	// 1-byte, 7-bit sequence?
     53 	if c0 < tx {
     54 		return rune(c0), 1, false
     55 	}
     56 
     57 	// unexpected continuation byte?
     58 	if c0 < t2 {
     59 		return RuneError, 1, false
     60 	}
     61 
     62 	// need first continuation byte
     63 	if n < 2 {
     64 		return RuneError, 1, true
     65 	}
     66 	c1 := p[1]
     67 	if c1 < tx || t2 <= c1 {
     68 		return RuneError, 1, false
     69 	}
     70 
     71 	// 2-byte, 11-bit sequence?
     72 	if c0 < t3 {
     73 		r = rune(c0&mask2)<<6 | rune(c1&maskx)
     74 		if r <= rune1Max {
     75 			return RuneError, 1, false
     76 		}
     77 		return r, 2, false
     78 	}
     79 
     80 	// need second continuation byte
     81 	if n < 3 {
     82 		return RuneError, 1, true
     83 	}
     84 	c2 := p[2]
     85 	if c2 < tx || t2 <= c2 {
     86 		return RuneError, 1, false
     87 	}
     88 
     89 	// 3-byte, 16-bit sequence?
     90 	if c0 < t4 {
     91 		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
     92 		if r <= rune2Max {
     93 			return RuneError, 1, false
     94 		}
     95 		if surrogateMin <= r && r <= surrogateMax {
     96 			return RuneError, 1, false
     97 		}
     98 		return r, 3, false
     99 	}
    100 
    101 	// need third continuation byte
    102 	if n < 4 {
    103 		return RuneError, 1, true
    104 	}
    105 	c3 := p[3]
    106 	if c3 < tx || t2 <= c3 {
    107 		return RuneError, 1, false
    108 	}
    109 
    110 	// 4-byte, 21-bit sequence?
    111 	if c0 < t5 {
    112 		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
    113 		if r <= rune3Max || MaxRune < r {
    114 			return RuneError, 1, false
    115 		}
    116 		return r, 4, false
    117 	}
    118 
    119 	// error
    120 	return RuneError, 1, false
    121 }
    122 
    123 func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
    124 	n := len(s)
    125 	if n < 1 {
    126 		return RuneError, 0, true
    127 	}
    128 	c0 := s[0]
    129 
    130 	// 1-byte, 7-bit sequence?
    131 	if c0 < tx {
    132 		return rune(c0), 1, false
    133 	}
    134 
    135 	// unexpected continuation byte?
    136 	if c0 < t2 {
    137 		return RuneError, 1, false
    138 	}
    139 
    140 	// need first continuation byte
    141 	if n < 2 {
    142 		return RuneError, 1, true
    143 	}
    144 	c1 := s[1]
    145 	if c1 < tx || t2 <= c1 {
    146 		return RuneError, 1, false
    147 	}
    148 
    149 	// 2-byte, 11-bit sequence?
    150 	if c0 < t3 {
    151 		r = rune(c0&mask2)<<6 | rune(c1&maskx)
    152 		if r <= rune1Max {
    153 			return RuneError, 1, false
    154 		}
    155 		return r, 2, false
    156 	}
    157 
    158 	// need second continuation byte
    159 	if n < 3 {
    160 		return RuneError, 1, true
    161 	}
    162 	c2 := s[2]
    163 	if c2 < tx || t2 <= c2 {
    164 		return RuneError, 1, false
    165 	}
    166 
    167 	// 3-byte, 16-bit sequence?
    168 	if c0 < t4 {
    169 		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
    170 		if r <= rune2Max {
    171 			return RuneError, 1, false
    172 		}
    173 		if surrogateMin <= r && r <= surrogateMax {
    174 			return RuneError, 1, false
    175 		}
    176 		return r, 3, false
    177 	}
    178 
    179 	// need third continuation byte
    180 	if n < 4 {
    181 		return RuneError, 1, true
    182 	}
    183 	c3 := s[3]
    184 	if c3 < tx || t2 <= c3 {
    185 		return RuneError, 1, false
    186 	}
    187 
    188 	// 4-byte, 21-bit sequence?
    189 	if c0 < t5 {
    190 		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
    191 		if r <= rune3Max || MaxRune < r {
    192 			return RuneError, 1, false
    193 		}
    194 		return r, 4, false
    195 	}
    196 
    197 	// error
    198 	return RuneError, 1, false
    199 }
    200 
    201 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
    202 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
    203 func FullRune(p []byte) bool {
    204 	_, _, short := decodeRuneInternal(p)
    205 	return !short
    206 }
    207 
    208 // FullRuneInString is like FullRune but its input is a string.
    209 func FullRuneInString(s string) bool {
    210 	_, _, short := decodeRuneInStringInternal(s)
    211 	return !short
    212 }
    213 
    214 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
    215 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    216 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    217 // results for correct UTF-8.
    218 //
    219 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    220 // out of range, or is not the shortest possible UTF-8 encoding for the
    221 // value. No other validation is performed.
    222 func DecodeRune(p []byte) (r rune, size int) {
    223 	r, size, _ = decodeRuneInternal(p)
    224 	return
    225 }
    226 
    227 // DecodeRuneInString is like DecodeRune but its input is a string. If s is
    228 // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
    229 // returns (RuneError, 1). Both are impossible results for correct UTF-8.
    230 //
    231 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    232 // out of range, or is not the shortest possible UTF-8 encoding for the
    233 // value. No other validation is performed.
    234 func DecodeRuneInString(s string) (r rune, size int) {
    235 	r, size, _ = decodeRuneInStringInternal(s)
    236 	return
    237 }
    238 
    239 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
    240 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
    241 // the encoding is invalid, it returns (RuneError, 1). Both are impossible
    242 // results for correct UTF-8.
    243 //
    244 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    245 // out of range, or is not the shortest possible UTF-8 encoding for the
    246 // value. No other validation is performed.
    247 func DecodeLastRune(p []byte) (r rune, size int) {
    248 	end := len(p)
    249 	if end == 0 {
    250 		return RuneError, 0
    251 	}
    252 	start := end - 1
    253 	r = rune(p[start])
    254 	if r < RuneSelf {
    255 		return r, 1
    256 	}
    257 	// guard against O(n^2) behavior when traversing
    258 	// backwards through strings with long sequences of
    259 	// invalid UTF-8.
    260 	lim := end - UTFMax
    261 	if lim < 0 {
    262 		lim = 0
    263 	}
    264 	for start--; start >= lim; start-- {
    265 		if RuneStart(p[start]) {
    266 			break
    267 		}
    268 	}
    269 	if start < 0 {
    270 		start = 0
    271 	}
    272 	r, size = DecodeRune(p[start:end])
    273 	if start+size != end {
    274 		return RuneError, 1
    275 	}
    276 	return r, size
    277 }
    278 
    279 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
    280 // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
    281 // it returns (RuneError, 1). Both are impossible results for correct UTF-8.
    282 //
    283 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
    284 // out of range, or is not the shortest possible UTF-8 encoding for the
    285 // value. No other validation is performed.
    286 func DecodeLastRuneInString(s string) (r rune, size int) {
    287 	end := len(s)
    288 	if end == 0 {
    289 		return RuneError, 0
    290 	}
    291 	start := end - 1
    292 	r = rune(s[start])
    293 	if r < RuneSelf {
    294 		return r, 1
    295 	}
    296 	// guard against O(n^2) behavior when traversing
    297 	// backwards through strings with long sequences of
    298 	// invalid UTF-8.
    299 	lim := end - UTFMax
    300 	if lim < 0 {
    301 		lim = 0
    302 	}
    303 	for start--; start >= lim; start-- {
    304 		if RuneStart(s[start]) {
    305 			break
    306 		}
    307 	}
    308 	if start < 0 {
    309 		start = 0
    310 	}
    311 	r, size = DecodeRuneInString(s[start:end])
    312 	if start+size != end {
    313 		return RuneError, 1
    314 	}
    315 	return r, size
    316 }
    317 
    318 // RuneLen returns the number of bytes required to encode the rune.
    319 // It returns -1 if the rune is not a valid value to encode in UTF-8.
    320 func RuneLen(r rune) int {
    321 	switch {
    322 	case r < 0:
    323 		return -1
    324 	case r <= rune1Max:
    325 		return 1
    326 	case r <= rune2Max:
    327 		return 2
    328 	case surrogateMin <= r && r <= surrogateMax:
    329 		return -1
    330 	case r <= rune3Max:
    331 		return 3
    332 	case r <= MaxRune:
    333 		return 4
    334 	}
    335 	return -1
    336 }
    337 
    338 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
    339 // It returns the number of bytes written.
    340 func EncodeRune(p []byte, r rune) int {
    341 	// Negative values are erroneous.  Making it unsigned addresses the problem.
    342 	switch i := uint32(r); {
    343 	case i <= rune1Max:
    344 		p[0] = byte(r)
    345 		return 1
    346 	case i <= rune2Max:
    347 		p[0] = t2 | byte(r>>6)
    348 		p[1] = tx | byte(r)&maskx
    349 		return 2
    350 	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
    351 		r = RuneError
    352 		fallthrough
    353 	case i <= rune3Max:
    354 		p[0] = t3 | byte(r>>12)
    355 		p[1] = tx | byte(r>>6)&maskx
    356 		p[2] = tx | byte(r)&maskx
    357 		return 3
    358 	default:
    359 		p[0] = t4 | byte(r>>18)
    360 		p[1] = tx | byte(r>>12)&maskx
    361 		p[2] = tx | byte(r>>6)&maskx
    362 		p[3] = tx | byte(r)&maskx
    363 		return 4
    364 	}
    365 }
    366 
    367 // RuneCount returns the number of runes in p.  Erroneous and short
    368 // encodings are treated as single runes of width 1 byte.
    369 func RuneCount(p []byte) int {
    370 	i := 0
    371 	var n int
    372 	for n = 0; i < len(p); n++ {
    373 		if p[i] < RuneSelf {
    374 			i++
    375 		} else {
    376 			_, size := DecodeRune(p[i:])
    377 			i += size
    378 		}
    379 	}
    380 	return n
    381 }
    382 
    383 // RuneCountInString is like RuneCount but its input is a string.
    384 func RuneCountInString(s string) (n int) {
    385 	for range s {
    386 		n++
    387 	}
    388 	return
    389 }
    390 
    391 // RuneStart reports whether the byte could be the first byte of
    392 // an encoded rune.  Second and subsequent bytes always have the top
    393 // two bits set to 10.
    394 func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
    395 
    396 // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
    397 func Valid(p []byte) bool {
    398 	i := 0
    399 	for i < len(p) {
    400 		if p[i] < RuneSelf {
    401 			i++
    402 		} else {
    403 			_, size := DecodeRune(p[i:])
    404 			if size == 1 {
    405 				// All valid runes of size 1 (those
    406 				// below RuneSelf) were handled above.
    407 				// This must be a RuneError.
    408 				return false
    409 			}
    410 			i += size
    411 		}
    412 	}
    413 	return true
    414 }
    415 
    416 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
    417 func ValidString(s string) bool {
    418 	for i, r := range s {
    419 		if r == RuneError {
    420 			// The RuneError value can be an error
    421 			// sentinel value (if it's size 1) or the same
    422 			// value encoded properly. Decode it to see if
    423 			// it's the 1 byte sentinel value.
    424 			_, size := DecodeRuneInString(s[i:])
    425 			if size == 1 {
    426 				return false
    427 			}
    428 		}
    429 	}
    430 	return true
    431 }
    432 
    433 // ValidRune reports whether r can be legally encoded as UTF-8.
    434 // Code points that are out of range or a surrogate half are illegal.
    435 func ValidRune(r rune) bool {
    436 	switch {
    437 	case r < 0:
    438 		return false
    439 	case surrogateMin <= r && r <= surrogateMax:
    440 		return false
    441 	case r > MaxRune:
    442 		return false
    443 	}
    444 	return true
    445 }
    446