1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package utf8 implements functions and constants to support text encoded in 6 // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. 7 package utf8 8 9 // The conditions RuneError==unicode.ReplacementChar and 10 // MaxRune==unicode.MaxRune are verified in the tests. 11 // Defining them locally avoids this package depending on package unicode. 12 13 // Numbers fundamental to the encoding. 14 const ( 15 RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character" 16 RuneSelf = 0x80 // characters below Runeself are represented as themselves in a single byte. 17 MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. 18 UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character. 19 ) 20 21 // Code points in the surrogate range are not valid for UTF-8. 22 const ( 23 surrogateMin = 0xD800 24 surrogateMax = 0xDFFF 25 ) 26 27 const ( 28 t1 = 0x00 // 0000 0000 29 tx = 0x80 // 1000 0000 30 t2 = 0xC0 // 1100 0000 31 t3 = 0xE0 // 1110 0000 32 t4 = 0xF0 // 1111 0000 33 t5 = 0xF8 // 1111 1000 34 35 maskx = 0x3F // 0011 1111 36 mask2 = 0x1F // 0001 1111 37 mask3 = 0x0F // 0000 1111 38 mask4 = 0x07 // 0000 0111 39 40 rune1Max = 1<<7 - 1 41 rune2Max = 1<<11 - 1 42 rune3Max = 1<<16 - 1 43 ) 44 45 func decodeRuneInternal(p []byte) (r rune, size int, short bool) { 46 n := len(p) 47 if n < 1 { 48 return RuneError, 0, true 49 } 50 c0 := p[0] 51 52 // 1-byte, 7-bit sequence? 53 if c0 < tx { 54 return rune(c0), 1, false 55 } 56 57 // unexpected continuation byte? 58 if c0 < t2 { 59 return RuneError, 1, false 60 } 61 62 // need first continuation byte 63 if n < 2 { 64 return RuneError, 1, true 65 } 66 c1 := p[1] 67 if c1 < tx || t2 <= c1 { 68 return RuneError, 1, false 69 } 70 71 // 2-byte, 11-bit sequence? 72 if c0 < t3 { 73 r = rune(c0&mask2)<<6 | rune(c1&maskx) 74 if r <= rune1Max { 75 return RuneError, 1, false 76 } 77 return r, 2, false 78 } 79 80 // need second continuation byte 81 if n < 3 { 82 return RuneError, 1, true 83 } 84 c2 := p[2] 85 if c2 < tx || t2 <= c2 { 86 return RuneError, 1, false 87 } 88 89 // 3-byte, 16-bit sequence? 90 if c0 < t4 { 91 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 92 if r <= rune2Max { 93 return RuneError, 1, false 94 } 95 if surrogateMin <= r && r <= surrogateMax { 96 return RuneError, 1, false 97 } 98 return r, 3, false 99 } 100 101 // need third continuation byte 102 if n < 4 { 103 return RuneError, 1, true 104 } 105 c3 := p[3] 106 if c3 < tx || t2 <= c3 { 107 return RuneError, 1, false 108 } 109 110 // 4-byte, 21-bit sequence? 111 if c0 < t5 { 112 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 113 if r <= rune3Max || MaxRune < r { 114 return RuneError, 1, false 115 } 116 return r, 4, false 117 } 118 119 // error 120 return RuneError, 1, false 121 } 122 123 func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { 124 n := len(s) 125 if n < 1 { 126 return RuneError, 0, true 127 } 128 c0 := s[0] 129 130 // 1-byte, 7-bit sequence? 131 if c0 < tx { 132 return rune(c0), 1, false 133 } 134 135 // unexpected continuation byte? 136 if c0 < t2 { 137 return RuneError, 1, false 138 } 139 140 // need first continuation byte 141 if n < 2 { 142 return RuneError, 1, true 143 } 144 c1 := s[1] 145 if c1 < tx || t2 <= c1 { 146 return RuneError, 1, false 147 } 148 149 // 2-byte, 11-bit sequence? 150 if c0 < t3 { 151 r = rune(c0&mask2)<<6 | rune(c1&maskx) 152 if r <= rune1Max { 153 return RuneError, 1, false 154 } 155 return r, 2, false 156 } 157 158 // need second continuation byte 159 if n < 3 { 160 return RuneError, 1, true 161 } 162 c2 := s[2] 163 if c2 < tx || t2 <= c2 { 164 return RuneError, 1, false 165 } 166 167 // 3-byte, 16-bit sequence? 168 if c0 < t4 { 169 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) 170 if r <= rune2Max { 171 return RuneError, 1, false 172 } 173 if surrogateMin <= r && r <= surrogateMax { 174 return RuneError, 1, false 175 } 176 return r, 3, false 177 } 178 179 // need third continuation byte 180 if n < 4 { 181 return RuneError, 1, true 182 } 183 c3 := s[3] 184 if c3 < tx || t2 <= c3 { 185 return RuneError, 1, false 186 } 187 188 // 4-byte, 21-bit sequence? 189 if c0 < t5 { 190 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) 191 if r <= rune3Max || MaxRune < r { 192 return RuneError, 1, false 193 } 194 return r, 4, false 195 } 196 197 // error 198 return RuneError, 1, false 199 } 200 201 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. 202 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. 203 func FullRune(p []byte) bool { 204 _, _, short := decodeRuneInternal(p) 205 return !short 206 } 207 208 // FullRuneInString is like FullRune but its input is a string. 209 func FullRuneInString(s string) bool { 210 _, _, short := decodeRuneInStringInternal(s) 211 return !short 212 } 213 214 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and 215 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if 216 // the encoding is invalid, it returns (RuneError, 1). Both are impossible 217 // results for correct UTF-8. 218 // 219 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 220 // out of range, or is not the shortest possible UTF-8 encoding for the 221 // value. No other validation is performed. 222 func DecodeRune(p []byte) (r rune, size int) { 223 r, size, _ = decodeRuneInternal(p) 224 return 225 } 226 227 // DecodeRuneInString is like DecodeRune but its input is a string. If s is 228 // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it 229 // returns (RuneError, 1). Both are impossible results for correct UTF-8. 230 // 231 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 232 // out of range, or is not the shortest possible UTF-8 encoding for the 233 // value. No other validation is performed. 234 func DecodeRuneInString(s string) (r rune, size int) { 235 r, size, _ = decodeRuneInStringInternal(s) 236 return 237 } 238 239 // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and 240 // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if 241 // the encoding is invalid, it returns (RuneError, 1). Both are impossible 242 // results for correct UTF-8. 243 // 244 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 245 // out of range, or is not the shortest possible UTF-8 encoding for the 246 // value. No other validation is performed. 247 func DecodeLastRune(p []byte) (r rune, size int) { 248 end := len(p) 249 if end == 0 { 250 return RuneError, 0 251 } 252 start := end - 1 253 r = rune(p[start]) 254 if r < RuneSelf { 255 return r, 1 256 } 257 // guard against O(n^2) behavior when traversing 258 // backwards through strings with long sequences of 259 // invalid UTF-8. 260 lim := end - UTFMax 261 if lim < 0 { 262 lim = 0 263 } 264 for start--; start >= lim; start-- { 265 if RuneStart(p[start]) { 266 break 267 } 268 } 269 if start < 0 { 270 start = 0 271 } 272 r, size = DecodeRune(p[start:end]) 273 if start+size != end { 274 return RuneError, 1 275 } 276 return r, size 277 } 278 279 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If 280 // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, 281 // it returns (RuneError, 1). Both are impossible results for correct UTF-8. 282 // 283 // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is 284 // out of range, or is not the shortest possible UTF-8 encoding for the 285 // value. No other validation is performed. 286 func DecodeLastRuneInString(s string) (r rune, size int) { 287 end := len(s) 288 if end == 0 { 289 return RuneError, 0 290 } 291 start := end - 1 292 r = rune(s[start]) 293 if r < RuneSelf { 294 return r, 1 295 } 296 // guard against O(n^2) behavior when traversing 297 // backwards through strings with long sequences of 298 // invalid UTF-8. 299 lim := end - UTFMax 300 if lim < 0 { 301 lim = 0 302 } 303 for start--; start >= lim; start-- { 304 if RuneStart(s[start]) { 305 break 306 } 307 } 308 if start < 0 { 309 start = 0 310 } 311 r, size = DecodeRuneInString(s[start:end]) 312 if start+size != end { 313 return RuneError, 1 314 } 315 return r, size 316 } 317 318 // RuneLen returns the number of bytes required to encode the rune. 319 // It returns -1 if the rune is not a valid value to encode in UTF-8. 320 func RuneLen(r rune) int { 321 switch { 322 case r < 0: 323 return -1 324 case r <= rune1Max: 325 return 1 326 case r <= rune2Max: 327 return 2 328 case surrogateMin <= r && r <= surrogateMax: 329 return -1 330 case r <= rune3Max: 331 return 3 332 case r <= MaxRune: 333 return 4 334 } 335 return -1 336 } 337 338 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. 339 // It returns the number of bytes written. 340 func EncodeRune(p []byte, r rune) int { 341 // Negative values are erroneous. Making it unsigned addresses the problem. 342 switch i := uint32(r); { 343 case i <= rune1Max: 344 p[0] = byte(r) 345 return 1 346 case i <= rune2Max: 347 p[0] = t2 | byte(r>>6) 348 p[1] = tx | byte(r)&maskx 349 return 2 350 case i > MaxRune, surrogateMin <= i && i <= surrogateMax: 351 r = RuneError 352 fallthrough 353 case i <= rune3Max: 354 p[0] = t3 | byte(r>>12) 355 p[1] = tx | byte(r>>6)&maskx 356 p[2] = tx | byte(r)&maskx 357 return 3 358 default: 359 p[0] = t4 | byte(r>>18) 360 p[1] = tx | byte(r>>12)&maskx 361 p[2] = tx | byte(r>>6)&maskx 362 p[3] = tx | byte(r)&maskx 363 return 4 364 } 365 } 366 367 // RuneCount returns the number of runes in p. Erroneous and short 368 // encodings are treated as single runes of width 1 byte. 369 func RuneCount(p []byte) int { 370 i := 0 371 var n int 372 for n = 0; i < len(p); n++ { 373 if p[i] < RuneSelf { 374 i++ 375 } else { 376 _, size := DecodeRune(p[i:]) 377 i += size 378 } 379 } 380 return n 381 } 382 383 // RuneCountInString is like RuneCount but its input is a string. 384 func RuneCountInString(s string) (n int) { 385 for range s { 386 n++ 387 } 388 return 389 } 390 391 // RuneStart reports whether the byte could be the first byte of 392 // an encoded rune. Second and subsequent bytes always have the top 393 // two bits set to 10. 394 func RuneStart(b byte) bool { return b&0xC0 != 0x80 } 395 396 // Valid reports whether p consists entirely of valid UTF-8-encoded runes. 397 func Valid(p []byte) bool { 398 i := 0 399 for i < len(p) { 400 if p[i] < RuneSelf { 401 i++ 402 } else { 403 _, size := DecodeRune(p[i:]) 404 if size == 1 { 405 // All valid runes of size 1 (those 406 // below RuneSelf) were handled above. 407 // This must be a RuneError. 408 return false 409 } 410 i += size 411 } 412 } 413 return true 414 } 415 416 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes. 417 func ValidString(s string) bool { 418 for i, r := range s { 419 if r == RuneError { 420 // The RuneError value can be an error 421 // sentinel value (if it's size 1) or the same 422 // value encoded properly. Decode it to see if 423 // it's the 1 byte sentinel value. 424 _, size := DecodeRuneInString(s[i:]) 425 if size == 1 { 426 return false 427 } 428 } 429 } 430 return true 431 } 432 433 // ValidRune reports whether r can be legally encoded as UTF-8. 434 // Code points that are out of range or a surrogate half are illegal. 435 func ValidRune(r rune) bool { 436 switch { 437 case r < 0: 438 return false 439 case surrogateMin <= r && r <= surrogateMax: 440 return false 441 case r > MaxRune: 442 return false 443 } 444 return true 445 } 446