1 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT. 2 3 // Copyright 2011 The Go Authors. All rights reserved. 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file. 6 7 package norm 8 9 import ( 10 "fmt" 11 "unicode/utf8" 12 ) 13 14 // MaxSegmentSize is the maximum size of a byte buffer needed to consider any 15 // sequence of starter and non-starter runes for the purpose of normalization. 16 const MaxSegmentSize = maxByteBufferSize 17 18 // An Iter iterates over a string or byte slice, while normalizing it 19 // to a given Form. 20 type Iter struct { 21 rb reorderBuffer 22 buf [maxByteBufferSize]byte 23 info Properties // first character saved from previous iteration 24 next iterFunc // implementation of next depends on form 25 asciiF iterFunc 26 27 p int // current position in input source 28 multiSeg []byte // remainder of multi-segment decomposition 29 } 30 31 type iterFunc func(*Iter) []byte 32 33 // Init initializes i to iterate over src after normalizing it to Form f. 34 func (i *Iter) Init(f Form, src []byte) { 35 i.p = 0 36 if len(src) == 0 { 37 i.setDone() 38 i.rb.nsrc = 0 39 return 40 } 41 i.multiSeg = nil 42 i.rb.init(f, src) 43 i.next = i.rb.f.nextMain 44 i.asciiF = nextASCIIBytes 45 i.info = i.rb.f.info(i.rb.src, i.p) 46 i.rb.ss.first(i.info) 47 } 48 49 // InitString initializes i to iterate over src after normalizing it to Form f. 50 func (i *Iter) InitString(f Form, src string) { 51 i.p = 0 52 if len(src) == 0 { 53 i.setDone() 54 i.rb.nsrc = 0 55 return 56 } 57 i.multiSeg = nil 58 i.rb.initString(f, src) 59 i.next = i.rb.f.nextMain 60 i.asciiF = nextASCIIString 61 i.info = i.rb.f.info(i.rb.src, i.p) 62 i.rb.ss.first(i.info) 63 } 64 65 // Seek sets the segment to be returned by the next call to Next to start 66 // at position p. It is the responsibility of the caller to set p to the 67 // start of a segment. 
68 func (i *Iter) Seek(offset int64, whence int) (int64, error) { 69 var abs int64 70 switch whence { 71 case 0: 72 abs = offset 73 case 1: 74 abs = int64(i.p) + offset 75 case 2: 76 abs = int64(i.rb.nsrc) + offset 77 default: 78 return 0, fmt.Errorf("norm: invalid whence") 79 } 80 if abs < 0 { 81 return 0, fmt.Errorf("norm: negative position") 82 } 83 if int(abs) >= i.rb.nsrc { 84 i.setDone() 85 return int64(i.p), nil 86 } 87 i.p = int(abs) 88 i.multiSeg = nil 89 i.next = i.rb.f.nextMain 90 i.info = i.rb.f.info(i.rb.src, i.p) 91 i.rb.ss.first(i.info) 92 return abs, nil 93 } 94 95 // returnSlice returns a slice of the underlying input type as a byte slice. 96 // If the underlying is of type []byte, it will simply return a slice. 97 // If the underlying is of type string, it will copy the slice to the buffer 98 // and return that. 99 func (i *Iter) returnSlice(a, b int) []byte { 100 if i.rb.src.bytes == nil { 101 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] 102 } 103 return i.rb.src.bytes[a:b] 104 } 105 106 // Pos returns the byte position at which the next call to Next will commence processing. 107 func (i *Iter) Pos() int { 108 return i.p 109 } 110 111 func (i *Iter) setDone() { 112 i.next = nextDone 113 i.p = i.rb.nsrc 114 } 115 116 // Done returns true if there is no more input to process. 117 func (i *Iter) Done() bool { 118 return i.p >= i.rb.nsrc 119 } 120 121 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. 122 // For any input a and b for which f(a) == f(b), subsequent calls 123 // to Next will return the same segments. 124 // Modifying runes are grouped together with the preceding starter, if such a starter exists. 125 // Although not guaranteed, n will typically be the smallest possible n. 
126 func (i *Iter) Next() []byte { 127 return i.next(i) 128 } 129 130 func nextASCIIBytes(i *Iter) []byte { 131 p := i.p + 1 132 if p >= i.rb.nsrc { 133 i.setDone() 134 return i.rb.src.bytes[i.p:p] 135 } 136 if i.rb.src.bytes[p] < utf8.RuneSelf { 137 p0 := i.p 138 i.p = p 139 return i.rb.src.bytes[p0:p] 140 } 141 i.info = i.rb.f.info(i.rb.src, i.p) 142 i.next = i.rb.f.nextMain 143 return i.next(i) 144 } 145 146 func nextASCIIString(i *Iter) []byte { 147 p := i.p + 1 148 if p >= i.rb.nsrc { 149 i.buf[0] = i.rb.src.str[i.p] 150 i.setDone() 151 return i.buf[:1] 152 } 153 if i.rb.src.str[p] < utf8.RuneSelf { 154 i.buf[0] = i.rb.src.str[i.p] 155 i.p = p 156 return i.buf[:1] 157 } 158 i.info = i.rb.f.info(i.rb.src, i.p) 159 i.next = i.rb.f.nextMain 160 return i.next(i) 161 } 162 163 func nextHangul(i *Iter) []byte { 164 p := i.p 165 next := p + hangulUTF8Size 166 if next >= i.rb.nsrc { 167 i.setDone() 168 } else if i.rb.src.hangul(next) == 0 { 169 i.rb.ss.next(i.info) 170 i.info = i.rb.f.info(i.rb.src, i.p) 171 i.next = i.rb.f.nextMain 172 return i.next(i) 173 } 174 i.p = next 175 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] 176 } 177 178 func nextDone(i *Iter) []byte { 179 return nil 180 } 181 182 // nextMulti is used for iterating over multi-segment decompositions 183 // for decomposing normal forms. 184 func nextMulti(i *Iter) []byte { 185 j := 0 186 d := i.multiSeg 187 // skip first rune 188 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { 189 } 190 for j < len(d) { 191 info := i.rb.f.info(input{bytes: d}, j) 192 if info.BoundaryBefore() { 193 i.multiSeg = d[j:] 194 return d[:j] 195 } 196 j += int(info.size) 197 } 198 // treat last segment as normal decomposition 199 i.next = i.rb.f.nextMain 200 return i.next(i) 201 } 202 203 // nextMultiNorm is used for iterating over multi-segment decompositions 204 // for composing normal forms. 
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			// A new segment starts at j: compose and flush what has been
			// collected so far, then seed the reorder buffer with the rune
			// at j so the next call continues from there.
			i.rb.compose()
			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
			i.rb.insertUnsafe(input{bytes: d}, j, info)
			i.multiSeg = d[j+int(info.size):]
			return seg
		}
		i.rb.insertUnsafe(input{bytes: d}, j, info)
		j += int(info.size)
	}
	// The decomposition is exhausted: continue with the regular input.
	i.multiSeg = nil
	i.next = nextComposed
	return doNormComposed(i)
}

// nextDecomposed is the implementation of Next for forms NFD and NFKD.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the number of output bytes accounted for so far. Input bytes in
	// [inCopyStart, i.p) still have to be copied to i.buf[outCopyStart:].
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte. Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII too: switch to the ASCII fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy. In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment. Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// Combining classes are out of order: reordering is needed.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: use the dedicated path.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune is passed through unchanged, as long
			// as it still fits in the buffer.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: return the input range directly.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}

// doNormDecomposed inserts runes from the input, starting with the one at
// i.p, into the reorder buffer. It stops when the input ends, when the next
// rune has ccc == 0, or when the stream-safe counter reports an overflow, and
// then returns the flushed contents of the reorder buffer in i.buf.
func doNormDecomposed(i *Iter) []byte {
	for {
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
		if s := i.rb.ss.next(i.info); s == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	return i.buf[:i.rb.flushCopy(i.buf[:])]
}

// nextCGJDecompose is installed as the next implementation after a
// stream-safe overflow in a decomposing form. It resets the stream-safe
// counter, calls insertCGJ on the reorder buffer, switches back to
// nextDecomposed and continues with the pending rune via doNormDecomposed.
func nextCGJDecompose(i *Iter) []byte {
	i.rb.ss = 0
	i.rb.insertCGJ()
	i.next = nextDecomposed
	i.rb.ss.first(i.info)
	buf := doNormDecomposed(i)
	return buf
}

// nextComposed is the implementation of Next for forms NFC and NFKC.
func nextComposed(i *Iter) []byte {
	// Fast path: as long as every rune passes isYesC and combining classes
	// are in order, the segment is returned as a plain slice of the input.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// Next byte is ASCII: switch to the ASCII fast path.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition
		// and iterate over the remainder with nextMultiNorm.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}

// doNormComposed inserts the runes following the current one into the
// reorder buffer until the input ends, a starter is reported, or the
// stream-safe counter overflows. It then composes the buffer and returns its
// flushed contents in i.buf.
func doNormComposed(i *Iter) []byte {
	// First rune should already be inserted.
	for {
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if s := i.rb.ss.next(i.info); s == ssStarter {
			break
		} else if s == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	}
	i.rb.compose()
	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
	return seg
}

// nextCGJCompose is installed as the next implementation after a stream-safe
// overflow in a composing form. It resets the stream-safe counter, calls
// insertCGJ on the reorder buffer, switches back to nextComposed and
// continues with the pending rune via doNormComposed.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}