Home | History | Annotate | Download | only in norm
      1 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
      2 
      3 // Copyright 2011 The Go Authors. All rights reserved.
      4 // Use of this source code is governed by a BSD-style
      5 // license that can be found in the LICENSE file.
      6 
      7 package norm
      8 
      9 import (
     10 	"fmt"
     11 	"unicode/utf8"
     12 )
     13 
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is an exported alias for maxByteBufferSize, which is also the length of
// the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
     17 
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter is prepared with Init or InitString and then advanced with Next
// until Done reports true. Segments returned by Next may be backed by the
// Iter's internal buffer, so they are only valid until the following call.
type Iter struct {
	// rb holds the input source (rb.src), its length (rb.nsrc), the form
	// (rb.f) and the stream-safe state (rb.ss), and is used to reorder and
	// compose runes when the fast paths do not apply.
	rb     reorderBuffer
	// buf is the scratch buffer in which output segments are assembled.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	// asciiF is the ASCII fast path matching the input type: nextASCIIBytes
	// after Init, nextASCIIString after InitString.
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
     30 
// iterFunc is the signature shared by all implementations of Next. The
// implementation currently in effect is stored in Iter.next and is swapped
// as the iterator moves between states (main form loop, ASCII fast path,
// Hangul, multi-segment decompositions, CGJ insertion, done).
type iterFunc func(*Iter) []byte
     32 
     33 // Init initializes i to iterate over src after normalizing it to Form f.
     34 func (i *Iter) Init(f Form, src []byte) {
     35 	i.p = 0
     36 	if len(src) == 0 {
     37 		i.setDone()
     38 		i.rb.nsrc = 0
     39 		return
     40 	}
     41 	i.multiSeg = nil
     42 	i.rb.init(f, src)
     43 	i.next = i.rb.f.nextMain
     44 	i.asciiF = nextASCIIBytes
     45 	i.info = i.rb.f.info(i.rb.src, i.p)
     46 	i.rb.ss.first(i.info)
     47 }
     48 
     49 // InitString initializes i to iterate over src after normalizing it to Form f.
     50 func (i *Iter) InitString(f Form, src string) {
     51 	i.p = 0
     52 	if len(src) == 0 {
     53 		i.setDone()
     54 		i.rb.nsrc = 0
     55 		return
     56 	}
     57 	i.multiSeg = nil
     58 	i.rb.initString(f, src)
     59 	i.next = i.rb.f.nextMain
     60 	i.asciiF = nextASCIIString
     61 	i.info = i.rb.f.info(i.rb.src, i.p)
     62 	i.rb.ss.first(i.info)
     63 }
     64 
     65 // Seek sets the segment to be returned by the next call to Next to start
     66 // at position p.  It is the responsibility of the caller to set p to the
     67 // start of a segment.
     68 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
     69 	var abs int64
     70 	switch whence {
     71 	case 0:
     72 		abs = offset
     73 	case 1:
     74 		abs = int64(i.p) + offset
     75 	case 2:
     76 		abs = int64(i.rb.nsrc) + offset
     77 	default:
     78 		return 0, fmt.Errorf("norm: invalid whence")
     79 	}
     80 	if abs < 0 {
     81 		return 0, fmt.Errorf("norm: negative position")
     82 	}
     83 	if int(abs) >= i.rb.nsrc {
     84 		i.setDone()
     85 		return int64(i.p), nil
     86 	}
     87 	i.p = int(abs)
     88 	i.multiSeg = nil
     89 	i.next = i.rb.f.nextMain
     90 	i.info = i.rb.f.info(i.rb.src, i.p)
     91 	i.rb.ss.first(i.info)
     92 	return abs, nil
     93 }
     94 
     95 // returnSlice returns a slice of the underlying input type as a byte slice.
     96 // If the underlying is of type []byte, it will simply return a slice.
     97 // If the underlying is of type string, it will copy the slice to the buffer
     98 // and return that.
     99 func (i *Iter) returnSlice(a, b int) []byte {
    100 	if i.rb.src.bytes == nil {
    101 		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
    102 	}
    103 	return i.rb.src.bytes[a:b]
    104 }
    105 
    106 // Pos returns the byte position at which the next call to Next will commence processing.
    107 func (i *Iter) Pos() int {
    108 	return i.p
    109 }
    110 
    111 func (i *Iter) setDone() {
    112 	i.next = nextDone
    113 	i.p = i.rb.nsrc
    114 }
    115 
    116 // Done returns true if there is no more input to process.
    117 func (i *Iter) Done() bool {
    118 	return i.p >= i.rb.nsrc
    119 }
    120 
    121 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
    122 // For any input a and b for which f(a) == f(b), subsequent calls
    123 // to Next will return the same segments.
    124 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
    125 // Although not guaranteed, n will typically be the smallest possible n.
    126 func (i *Iter) Next() []byte {
    127 	return i.next(i)
    128 }
    129 
    130 func nextASCIIBytes(i *Iter) []byte {
    131 	p := i.p + 1
    132 	if p >= i.rb.nsrc {
    133 		i.setDone()
    134 		return i.rb.src.bytes[i.p:p]
    135 	}
    136 	if i.rb.src.bytes[p] < utf8.RuneSelf {
    137 		p0 := i.p
    138 		i.p = p
    139 		return i.rb.src.bytes[p0:p]
    140 	}
    141 	i.info = i.rb.f.info(i.rb.src, i.p)
    142 	i.next = i.rb.f.nextMain
    143 	return i.next(i)
    144 }
    145 
    146 func nextASCIIString(i *Iter) []byte {
    147 	p := i.p + 1
    148 	if p >= i.rb.nsrc {
    149 		i.buf[0] = i.rb.src.str[i.p]
    150 		i.setDone()
    151 		return i.buf[:1]
    152 	}
    153 	if i.rb.src.str[p] < utf8.RuneSelf {
    154 		i.buf[0] = i.rb.src.str[i.p]
    155 		i.p = p
    156 		return i.buf[:1]
    157 	}
    158 	i.info = i.rb.f.info(i.rb.src, i.p)
    159 	i.next = i.rb.f.nextMain
    160 	return i.next(i)
    161 }
    162 
    163 func nextHangul(i *Iter) []byte {
    164 	p := i.p
    165 	next := p + hangulUTF8Size
    166 	if next >= i.rb.nsrc {
    167 		i.setDone()
    168 	} else if i.rb.src.hangul(next) == 0 {
    169 		i.rb.ss.next(i.info)
    170 		i.info = i.rb.f.info(i.rb.src, i.p)
    171 		i.next = i.rb.f.nextMain
    172 		return i.next(i)
    173 	}
    174 	i.p = next
    175 	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
    176 }
    177 
    178 func nextDone(i *Iter) []byte {
    179 	return nil
    180 }
    181 
    182 // nextMulti is used for iterating over multi-segment decompositions
    183 // for decomposing normal forms.
    184 func nextMulti(i *Iter) []byte {
    185 	j := 0
    186 	d := i.multiSeg
    187 	// skip first rune
    188 	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
    189 	}
    190 	for j < len(d) {
    191 		info := i.rb.f.info(input{bytes: d}, j)
    192 		if info.BoundaryBefore() {
    193 			i.multiSeg = d[j:]
    194 			return d[:j]
    195 		}
    196 		j += int(info.size)
    197 	}
    198 	// treat last segment as normal decomposition
    199 	i.next = i.rb.f.nextMain
    200 	return i.next(i)
    201 }
    202 
    203 // nextMultiNorm is used for iterating over multi-segment decompositions
    204 // for composing normal forms.
    205 func nextMultiNorm(i *Iter) []byte {
    206 	j := 0
    207 	d := i.multiSeg
    208 	for j < len(d) {
    209 		info := i.rb.f.info(input{bytes: d}, j)
    210 		if info.BoundaryBefore() {
    211 			i.rb.compose()
    212 			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
    213 			i.rb.insertUnsafe(input{bytes: d}, j, info)
    214 			i.multiSeg = d[j+int(info.size):]
    215 			return seg
    216 		}
    217 		i.rb.insertUnsafe(input{bytes: d}, j, info)
    218 		j += int(info.size)
    219 	}
    220 	i.multiSeg = nil
    221 	i.next = nextComposed
    222 	return doNormComposed(i)
    223 }
    224 
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It advances i.p over the input, using i.info for the rune at i.p, and
// produces the returned segment in one of three ways:
//   - input that needs no rewriting is returned through returnSlice;
//   - decompositions and decomposed Hangul are assembled in i.buf, with outp
//     tracking the number of output bytes accounted for so far;
//   - when a rune's ccc is lower than the tccc of the rune before it, the
//     output gathered so far is moved into the reorder buffer and
//     doNormDecomposed finishes the segment (label doNorm).
//
// inCopyStart and outCopyStart describe a run of input bytes that still has
// to be copied verbatim: input bytes [inCopyStart, i.p) belong at
// i.buf[outCopyStart:].
func nextDecomposed(i *Iter) (next []byte) {
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// The next byte is ASCII as well: switch to the ASCII fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			// The byte stays part of the pending verbatim run.
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				// Schedule nextCGJDecompose for the following call, then end
				// the segment exactly as for ssStarter.
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				// Nothing precedes d in this segment, so d is returned as is.
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// hangul reports non-zero for the next position too: let
				// nextHangul handle the following call.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition and not Hangul: the rune is kept verbatim,
			// provided it still fits in i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	// outCopyStart is only advanced after output has been written to i.buf;
	// while it is still 0 the segment is returned straight from the input.
	if outCopyStart == 0 {
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
    341 
    342 func doNormDecomposed(i *Iter) []byte {
    343 	for {
    344 		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    345 		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
    346 			i.setDone()
    347 			break
    348 		}
    349 		i.info = i.rb.f.info(i.rb.src, i.p)
    350 		if i.info.ccc == 0 {
    351 			break
    352 		}
    353 		if s := i.rb.ss.next(i.info); s == ssOverflow {
    354 			i.next = nextCGJDecompose
    355 			break
    356 		}
    357 	}
    358 	// new segment or too many combining characters: exit normalization
    359 	return i.buf[:i.rb.flushCopy(i.buf[:])]
    360 }
    361 
    362 func nextCGJDecompose(i *Iter) []byte {
    363 	i.rb.ss = 0
    364 	i.rb.insertCGJ()
    365 	i.next = nextDecomposed
    366 	i.rb.ss.first(i.info)
    367 	buf := doNormDecomposed(i)
    368 	return buf
    369 }
    370 
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// Fast path: while every rune satisfies isYesC, runes are only counted (outp)
// and skipped (i.p), and the segment is returned as a slice of the input via
// returnSlice. The scan ends at the end of the input, before an ASCII byte,
// when the stream-safe state reports a starter or an overflow, or when the
// segment would no longer fit in i.buf.
//
// Slow path (label doNorm): taken when a rune is not isYesC or when a rune's
// ccc is lower than the tccc of the rune before it. The iterator rewinds to
// the start of the segment and rebuilds it through the reorder buffer, using
// nextMultiNorm for multi-segment decompositions and doNormComposed otherwise.
func nextComposed(i *Iter) []byte {
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// The next byte is ASCII: end the segment here and let the
			// ASCII fast path handle the following call.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Insert the first rune of the decomposition and let nextMultiNorm
		// walk the remainder, which is kept in i.multiSeg.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	// NOTE(review): ss.first was already called with the same i.info a few
	// lines above; this second call looks redundant — confirm before removing.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
    427 
    428 func doNormComposed(i *Iter) []byte {
    429 	// First rune should already be inserted.
    430 	for {
    431 		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
    432 			i.setDone()
    433 			break
    434 		}
    435 		i.info = i.rb.f.info(i.rb.src, i.p)
    436 		if s := i.rb.ss.next(i.info); s == ssStarter {
    437 			break
    438 		} else if s == ssOverflow {
    439 			i.next = nextCGJCompose
    440 			break
    441 		}
    442 		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    443 	}
    444 	i.rb.compose()
    445 	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
    446 	return seg
    447 }
    448 
    449 func nextCGJCompose(i *Iter) []byte {
    450 	i.rb.ss = 0 // instead of first
    451 	i.rb.insertCGJ()
    452 	i.next = nextComposed
    453 	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
    454 	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
    455 	// If we ever change that, insert a check here.
    456 	i.rb.ss.first(i.info)
    457 	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    458 	return doNormComposed(i)
    459 }
    460