Home | History | Annotate | Download | only in norm
      1 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
      2 
      3 // Copyright 2011 The Go Authors. All rights reserved.
      4 // Use of this source code is governed by a BSD-style
      5 // license that can be found in the LICENSE file.
      6 
      7 // Note: the file data_test.go that is generated should not be checked in.
      8 
      9 // Package norm contains types and functions for normalizing Unicode strings.
     10 package norm // import "golang_org/x/text/unicode/norm"
     11 
     12 import (
     13 	"unicode/utf8"
     14 
     15 	"golang_org/x/text/transform"
     16 )
     17 
     18 // A Form denotes a canonical representation of Unicode code points.
     19 // The Unicode-defined normalization and equivalence forms are:
     20 //
     21 //   NFC   Unicode Normalization Form C
     22 //   NFD   Unicode Normalization Form D
     23 //   NFKC  Unicode Normalization Form KC
     24 //   NFKD  Unicode Normalization Form KD
     25 //
     26 // For a Form f, this documentation uses the notation f(x) to mean
     27 // the bytes or string x converted to the given form.
     28 // A position n in x is called a boundary if conversion to the form can
     29 // proceed independently on both sides:
     30 //   f(x) == append(f(x[0:n]), f(x[n:])...)
     31 //
     32 // References: http://unicode.org/reports/tr15/ and
     33 // http://unicode.org/notes/tn5/.
     34 type Form int
     35 
     36 const (
     37 	NFC Form = iota
     38 	NFD
     39 	NFKC
     40 	NFKD
     41 )
     42 
     43 // Bytes returns f(b). May return b if f(b) = b.
     44 func (f Form) Bytes(b []byte) []byte {
     45 	src := inputBytes(b)
     46 	ft := formTable[f]
     47 	n, ok := ft.quickSpan(src, 0, len(b), true)
     48 	if ok {
     49 		return b
     50 	}
     51 	out := make([]byte, n, len(b))
     52 	copy(out, b[0:n])
     53 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
     54 	return doAppendInner(&rb, n)
     55 }
     56 
     57 // String returns f(s).
     58 func (f Form) String(s string) string {
     59 	src := inputString(s)
     60 	ft := formTable[f]
     61 	n, ok := ft.quickSpan(src, 0, len(s), true)
     62 	if ok {
     63 		return s
     64 	}
     65 	out := make([]byte, n, len(s))
     66 	copy(out, s[0:n])
     67 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
     68 	return string(doAppendInner(&rb, n))
     69 }
     70 
     71 // IsNormal returns true if b == f(b).
     72 func (f Form) IsNormal(b []byte) bool {
     73 	src := inputBytes(b)
     74 	ft := formTable[f]
     75 	bp, ok := ft.quickSpan(src, 0, len(b), true)
     76 	if ok {
     77 		return true
     78 	}
     79 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
     80 	rb.setFlusher(nil, cmpNormalBytes)
     81 	for bp < len(b) {
     82 		rb.out = b[bp:]
     83 		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
     84 			return false
     85 		}
     86 		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
     87 	}
     88 	return true
     89 }
     90 
     91 func cmpNormalBytes(rb *reorderBuffer) bool {
     92 	b := rb.out
     93 	for i := 0; i < rb.nrune; i++ {
     94 		info := rb.rune[i]
     95 		if int(info.size) > len(b) {
     96 			return false
     97 		}
     98 		p := info.pos
     99 		pe := p + info.size
    100 		for ; p < pe; p++ {
    101 			if b[0] != rb.byte[p] {
    102 				return false
    103 			}
    104 			b = b[1:]
    105 		}
    106 	}
    107 	return true
    108 }
    109 
    110 // IsNormalString returns true if s == f(s).
    111 func (f Form) IsNormalString(s string) bool {
    112 	src := inputString(s)
    113 	ft := formTable[f]
    114 	bp, ok := ft.quickSpan(src, 0, len(s), true)
    115 	if ok {
    116 		return true
    117 	}
    118 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
    119 	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
    120 		for i := 0; i < rb.nrune; i++ {
    121 			info := rb.rune[i]
    122 			if bp+int(info.size) > len(s) {
    123 				return false
    124 			}
    125 			p := info.pos
    126 			pe := p + info.size
    127 			for ; p < pe; p++ {
    128 				if s[bp] != rb.byte[p] {
    129 					return false
    130 				}
    131 				bp++
    132 			}
    133 		}
    134 		return true
    135 	})
    136 	for bp < len(s) {
    137 		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    138 			return false
    139 		}
    140 		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
    141 	}
    142 	return true
    143 }
    144 
    145 // patchTail fixes a case where a rune may be incorrectly normalized
    146 // if it is followed by illegal continuation bytes. It returns the
    147 // patched buffer and whether the decomposition is still in progress.
    148 func patchTail(rb *reorderBuffer) bool {
    149 	info, p := lastRuneStart(&rb.f, rb.out)
    150 	if p == -1 || info.size == 0 {
    151 		return true
    152 	}
    153 	end := p + int(info.size)
    154 	extra := len(rb.out) - end
    155 	if extra > 0 {
    156 		// Potentially allocating memory. However, this only
    157 		// happens with ill-formed UTF-8.
    158 		x := make([]byte, 0)
    159 		x = append(x, rb.out[len(rb.out)-extra:]...)
    160 		rb.out = rb.out[:end]
    161 		decomposeToLastBoundary(rb)
    162 		rb.doFlush()
    163 		rb.out = append(rb.out, x...)
    164 		return false
    165 	}
    166 	buf := rb.out[p:]
    167 	rb.out = rb.out[:p]
    168 	decomposeToLastBoundary(rb)
    169 	if s := rb.ss.next(info); s == ssStarter {
    170 		rb.doFlush()
    171 		rb.ss.first(info)
    172 	} else if s == ssOverflow {
    173 		rb.doFlush()
    174 		rb.insertCGJ()
    175 		rb.ss = 0
    176 	}
    177 	rb.insertUnsafe(inputBytes(buf), 0, info)
    178 	return true
    179 }
    180 
    181 func appendQuick(rb *reorderBuffer, i int) int {
    182 	if rb.nsrc == i {
    183 		return i
    184 	}
    185 	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
    186 	rb.out = rb.src.appendSlice(rb.out, i, end)
    187 	return end
    188 }
    189 
    190 // Append returns f(append(out, b...)).
    191 // The buffer out must be nil, empty, or equal to f(out).
    192 func (f Form) Append(out []byte, src ...byte) []byte {
    193 	return f.doAppend(out, inputBytes(src), len(src))
    194 }
    195 
    196 func (f Form) doAppend(out []byte, src input, n int) []byte {
    197 	if n == 0 {
    198 		return out
    199 	}
    200 	ft := formTable[f]
    201 	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
    202 	if len(out) == 0 {
    203 		p, _ := ft.quickSpan(src, 0, n, true)
    204 		out = src.appendSlice(out, 0, p)
    205 		if p == n {
    206 			return out
    207 		}
    208 		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
    209 		return doAppendInner(&rb, p)
    210 	}
    211 	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
    212 	return doAppend(&rb, out, 0)
    213 }
    214 
    215 func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
    216 	rb.setFlusher(out, appendFlush)
    217 	src, n := rb.src, rb.nsrc
    218 	doMerge := len(out) > 0
    219 	if q := src.skipContinuationBytes(p); q > p {
    220 		// Move leading non-starters to destination.
    221 		rb.out = src.appendSlice(rb.out, p, q)
    222 		p = q
    223 		doMerge = patchTail(rb)
    224 	}
    225 	fd := &rb.f
    226 	if doMerge {
    227 		var info Properties
    228 		if p < n {
    229 			info = fd.info(src, p)
    230 			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
    231 				if p == 0 {
    232 					decomposeToLastBoundary(rb)
    233 				}
    234 				p = decomposeSegment(rb, p, true)
    235 			}
    236 		}
    237 		if info.size == 0 {
    238 			rb.doFlush()
    239 			// Append incomplete UTF-8 encoding.
    240 			return src.appendSlice(rb.out, p, n)
    241 		}
    242 		if rb.nrune > 0 {
    243 			return doAppendInner(rb, p)
    244 		}
    245 	}
    246 	p = appendQuick(rb, p)
    247 	return doAppendInner(rb, p)
    248 }
    249 
    250 func doAppendInner(rb *reorderBuffer, p int) []byte {
    251 	for n := rb.nsrc; p < n; {
    252 		p = decomposeSegment(rb, p, true)
    253 		p = appendQuick(rb, p)
    254 	}
    255 	return rb.out
    256 }
    257 
    258 // AppendString returns f(append(out, []byte(s))).
    259 // The buffer out must be nil, empty, or equal to f(out).
    260 func (f Form) AppendString(out []byte, src string) []byte {
    261 	return f.doAppend(out, inputString(src), len(src))
    262 }
    263 
    264 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
    265 // It is not guaranteed to return the largest such n.
    266 func (f Form) QuickSpan(b []byte) int {
    267 	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
    268 	return n
    269 }
    270 
    271 // Span implements transform.SpanningTransformer. It returns a boundary n such
    272 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
    273 func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
    274 	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
    275 	if n < len(b) {
    276 		if !ok {
    277 			err = transform.ErrEndOfSpan
    278 		} else {
    279 			err = transform.ErrShortSrc
    280 		}
    281 	}
    282 	return n, err
    283 }
    284 
    285 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
    286 // It is not guaranteed to return the largest such n.
    287 func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
    288 	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
    289 	if n < len(s) {
    290 		if !ok {
    291 			err = transform.ErrEndOfSpan
    292 		} else {
    293 			err = transform.ErrShortSrc
    294 		}
    295 	}
    296 	return n, err
    297 }
    298 
    299 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
    300 // whether any non-normalized parts were found. If atEOF is false, n will
    301 // not point past the last segment if this segment might be become
    302 // non-normalized by appending other runes.
    303 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
    304 	var lastCC uint8
    305 	ss := streamSafe(0)
    306 	lastSegStart := i
    307 	for n = end; i < n; {
    308 		if j := src.skipASCII(i, n); i != j {
    309 			i = j
    310 			lastSegStart = i - 1
    311 			lastCC = 0
    312 			ss = 0
    313 			continue
    314 		}
    315 		info := f.info(src, i)
    316 		if info.size == 0 {
    317 			if atEOF {
    318 				// include incomplete runes
    319 				return n, true
    320 			}
    321 			return lastSegStart, true
    322 		}
    323 		// This block needs to be before the next, because it is possible to
    324 		// have an overflow for runes that are starters (e.g. with U+FF9E).
    325 		switch ss.next(info) {
    326 		case ssStarter:
    327 			lastSegStart = i
    328 		case ssOverflow:
    329 			return lastSegStart, false
    330 		case ssSuccess:
    331 			if lastCC > info.ccc {
    332 				return lastSegStart, false
    333 			}
    334 		}
    335 		if f.composing {
    336 			if !info.isYesC() {
    337 				break
    338 			}
    339 		} else {
    340 			if !info.isYesD() {
    341 				break
    342 			}
    343 		}
    344 		lastCC = info.ccc
    345 		i += int(info.size)
    346 	}
    347 	if i == n {
    348 		if !atEOF {
    349 			n = lastSegStart
    350 		}
    351 		return n, true
    352 	}
    353 	return lastSegStart, false
    354 }
    355 
    356 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
    357 // It is not guaranteed to return the largest such n.
    358 func (f Form) QuickSpanString(s string) int {
    359 	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
    360 	return n
    361 }
    362 
    363 // FirstBoundary returns the position i of the first boundary in b
    364 // or -1 if b contains no boundary.
    365 func (f Form) FirstBoundary(b []byte) int {
    366 	return f.firstBoundary(inputBytes(b), len(b))
    367 }
    368 
    369 func (f Form) firstBoundary(src input, nsrc int) int {
    370 	i := src.skipContinuationBytes(0)
    371 	if i >= nsrc {
    372 		return -1
    373 	}
    374 	fd := formTable[f]
    375 	ss := streamSafe(0)
    376 	// We should call ss.first here, but we can't as the first rune is
    377 	// skipped already. This means FirstBoundary can't really determine
    378 	// CGJ insertion points correctly. Luckily it doesn't have to.
    379 	for {
    380 		info := fd.info(src, i)
    381 		if info.size == 0 {
    382 			return -1
    383 		}
    384 		if s := ss.next(info); s != ssSuccess {
    385 			return i
    386 		}
    387 		i += int(info.size)
    388 		if i >= nsrc {
    389 			if !info.BoundaryAfter() && !ss.isMax() {
    390 				return -1
    391 			}
    392 			return nsrc
    393 		}
    394 	}
    395 }
    396 
    397 // FirstBoundaryInString returns the position i of the first boundary in s
    398 // or -1 if s contains no boundary.
    399 func (f Form) FirstBoundaryInString(s string) int {
    400 	return f.firstBoundary(inputString(s), len(s))
    401 }
    402 
    403 // NextBoundary reports the index of the boundary between the first and next
    404 // segment in b or -1 if atEOF is false and there are not enough bytes to
    405 // determine this boundary.
    406 func (f Form) NextBoundary(b []byte, atEOF bool) int {
    407 	return f.nextBoundary(inputBytes(b), len(b), atEOF)
    408 }
    409 
    410 // NextBoundaryInString reports the index of the boundary between the first and
    411 // next segment in b or -1 if atEOF is false and there are not enough bytes to
    412 // determine this boundary.
    413 func (f Form) NextBoundaryInString(s string, atEOF bool) int {
    414 	return f.nextBoundary(inputString(s), len(s), atEOF)
    415 }
    416 
    417 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
    418 	if nsrc == 0 {
    419 		if atEOF {
    420 			return 0
    421 		}
    422 		return -1
    423 	}
    424 	fd := formTable[f]
    425 	info := fd.info(src, 0)
    426 	if info.size == 0 {
    427 		if atEOF {
    428 			return 1
    429 		}
    430 		return -1
    431 	}
    432 	ss := streamSafe(0)
    433 	ss.first(info)
    434 
    435 	for i := int(info.size); i < nsrc; i += int(info.size) {
    436 		info = fd.info(src, i)
    437 		if info.size == 0 {
    438 			if atEOF {
    439 				return i
    440 			}
    441 			return -1
    442 		}
    443 		// TODO: Using streamSafe to determine the boundary isn't the same as
    444 		// using BoundaryBefore. Determine which should be used.
    445 		if s := ss.next(info); s != ssSuccess {
    446 			return i
    447 		}
    448 	}
    449 	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
    450 		return -1
    451 	}
    452 	return nsrc
    453 }
    454 
    455 // LastBoundary returns the position i of the last boundary in b
    456 // or -1 if b contains no boundary.
    457 func (f Form) LastBoundary(b []byte) int {
    458 	return lastBoundary(formTable[f], b)
    459 }
    460 
    461 func lastBoundary(fd *formInfo, b []byte) int {
    462 	i := len(b)
    463 	info, p := lastRuneStart(fd, b)
    464 	if p == -1 {
    465 		return -1
    466 	}
    467 	if info.size == 0 { // ends with incomplete rune
    468 		if p == 0 { // starts with incomplete rune
    469 			return -1
    470 		}
    471 		i = p
    472 		info, p = lastRuneStart(fd, b[:i])
    473 		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
    474 			return i
    475 		}
    476 	}
    477 	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
    478 		return i
    479 	}
    480 	if info.BoundaryAfter() {
    481 		return i
    482 	}
    483 	ss := streamSafe(0)
    484 	v := ss.backwards(info)
    485 	for i = p; i >= 0 && v != ssStarter; i = p {
    486 		info, p = lastRuneStart(fd, b[:i])
    487 		if v = ss.backwards(info); v == ssOverflow {
    488 			break
    489 		}
    490 		if p+int(info.size) != i {
    491 			if p == -1 { // no boundary found
    492 				return -1
    493 			}
    494 			return i // boundary after an illegal UTF-8 encoding
    495 		}
    496 	}
    497 	return i
    498 }
    499 
    500 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
    501 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
    502 // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
    503 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
    504 	// Force one character to be consumed.
    505 	info := rb.f.info(rb.src, sp)
    506 	if info.size == 0 {
    507 		return 0
    508 	}
    509 	if s := rb.ss.next(info); s == ssStarter {
    510 		// TODO: this could be removed if we don't support merging.
    511 		if rb.nrune > 0 {
    512 			goto end
    513 		}
    514 	} else if s == ssOverflow {
    515 		rb.insertCGJ()
    516 		goto end
    517 	}
    518 	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
    519 		return int(err)
    520 	}
    521 	for {
    522 		sp += int(info.size)
    523 		if sp >= rb.nsrc {
    524 			if !atEOF && !info.BoundaryAfter() {
    525 				return int(iShortSrc)
    526 			}
    527 			break
    528 		}
    529 		info = rb.f.info(rb.src, sp)
    530 		if info.size == 0 {
    531 			if !atEOF {
    532 				return int(iShortSrc)
    533 			}
    534 			break
    535 		}
    536 		if s := rb.ss.next(info); s == ssStarter {
    537 			break
    538 		} else if s == ssOverflow {
    539 			rb.insertCGJ()
    540 			break
    541 		}
    542 		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
    543 			return int(err)
    544 		}
    545 	}
    546 end:
    547 	if !rb.doFlush() {
    548 		return int(iShortDst)
    549 	}
    550 	return sp
    551 }
    552 
    553 // lastRuneStart returns the runeInfo and position of the last
    554 // rune in buf or the zero runeInfo and -1 if no rune was found.
    555 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
    556 	p := len(buf) - 1
    557 	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
    558 	}
    559 	if p < 0 {
    560 		return Properties{}, -1
    561 	}
    562 	return fd.info(inputBytes(buf), p), p
    563 }
    564 
    565 // decomposeToLastBoundary finds an open segment at the end of the buffer
    566 // and scans it into rb. Returns the buffer minus the last segment.
    567 func decomposeToLastBoundary(rb *reorderBuffer) {
    568 	fd := &rb.f
    569 	info, i := lastRuneStart(fd, rb.out)
    570 	if int(info.size) != len(rb.out)-i {
    571 		// illegal trailing continuation bytes
    572 		return
    573 	}
    574 	if info.BoundaryAfter() {
    575 		return
    576 	}
    577 	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
    578 	padd := 0
    579 	ss := streamSafe(0)
    580 	p := len(rb.out)
    581 	for {
    582 		add[padd] = info
    583 		v := ss.backwards(info)
    584 		if v == ssOverflow {
    585 			// Note that if we have an overflow, it the string we are appending to
    586 			// is not correctly normalized. In this case the behavior is undefined.
    587 			break
    588 		}
    589 		padd++
    590 		p -= int(info.size)
    591 		if v == ssStarter || p < 0 {
    592 			break
    593 		}
    594 		info, i = lastRuneStart(fd, rb.out[:p])
    595 		if int(info.size) != p-i {
    596 			break
    597 		}
    598 	}
    599 	rb.ss = ss
    600 	// Copy bytes for insertion as we may need to overwrite rb.out.
    601 	var buf [maxBufferSize * utf8.UTFMax]byte
    602 	cp := buf[:copy(buf[:], rb.out[p:])]
    603 	rb.out = rb.out[:p]
    604 	for padd--; padd >= 0; padd-- {
    605 		info = add[padd]
    606 		rb.insertUnsafe(inputBytes(cp), 0, info)
    607 		cp = cp[info.size:]
    608 	}
    609 }
    610