Home | History | Annotate | Download | only in norm
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package norm
      6 
      7 import "unicode/utf8"
      8 
      9 const (
     10 	maxNonStarters = 30
     11 	// The maximum number of characters needed for a buffer is
     12 	// maxNonStarters + 1 for the starter + 1 for the GCJ
     13 	maxBufferSize    = maxNonStarters + 2
     14 	maxNFCExpansion  = 3  // NFC(0x1D160)
     15 	maxNFKCExpansion = 18 // NFKC(0xFDFA)
     16 
     17 	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
     18 )
     19 
     20 // ssState is used for reporting the segment state after inserting a rune.
     21 // It is returned by streamSafe.next.
     22 type ssState int
     23 
     24 const (
     25 	// Indicates a rune was successfully added to the segment.
     26 	ssSuccess ssState = iota
     27 	// Indicates a rune starts a new segment and should not be added.
     28 	ssStarter
     29 	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
     30 	ssOverflow
     31 )
     32 
     33 // streamSafe implements the policy of when a CGJ should be inserted.
     34 type streamSafe uint8
     35 
     36 // mkStreamSafe is a shorthand for declaring a streamSafe var and calling
     37 // first on it.
     38 func mkStreamSafe(p Properties) streamSafe {
     39 	return streamSafe(p.nTrailingNonStarters())
     40 }
     41 
     42 // first inserts the first rune of a segment.
     43 func (ss *streamSafe) first(p Properties) {
     44 	if *ss != 0 {
     45 		panic("!= 0")
     46 	}
     47 	*ss = streamSafe(p.nTrailingNonStarters())
     48 }
     49 
     50 // insert returns a ssState value to indicate whether a rune represented by p
     51 // can be inserted.
     52 func (ss *streamSafe) next(p Properties) ssState {
     53 	if *ss > maxNonStarters {
     54 		panic("streamSafe was not reset")
     55 	}
     56 	n := p.nLeadingNonStarters()
     57 	if *ss += streamSafe(n); *ss > maxNonStarters {
     58 		*ss = 0
     59 		return ssOverflow
     60 	}
     61 	// The Stream-Safe Text Processing prescribes that the counting can stop
     62 	// as soon as a starter is encountered. However, there are some starters,
     63 	// like Jamo V and T, that can combine with other runes, leaving their
     64 	// successive non-starters appended to the previous, possibly causing an
     65 	// overflow. We will therefore consider any rune with a non-zero nLead to
     66 	// be a non-starter. Note that it always hold that if nLead > 0 then
     67 	// nLead == nTrail.
     68 	if n == 0 {
     69 		*ss = 0
     70 		return ssStarter
     71 	}
     72 	return ssSuccess
     73 }
     74 
     75 // backwards is used for checking for overflow and segment starts
     76 // when traversing a string backwards. Users do not need to call first
     77 // for the first rune. The state of the streamSafe retains the count of
     78 // the non-starters loaded.
     79 func (ss *streamSafe) backwards(p Properties) ssState {
     80 	if *ss > maxNonStarters {
     81 		panic("streamSafe was not reset")
     82 	}
     83 	c := *ss + streamSafe(p.nTrailingNonStarters())
     84 	if c > maxNonStarters {
     85 		return ssOverflow
     86 	}
     87 	*ss = c
     88 	if p.nLeadingNonStarters() == 0 {
     89 		return ssStarter
     90 	}
     91 	return ssSuccess
     92 }
     93 
     94 func (ss streamSafe) isMax() bool {
     95 	return ss == maxNonStarters
     96 }
     97 
     98 // GraphemeJoiner is inserted after maxNonStarters non-starter runes.
     99 const GraphemeJoiner = "\u034F"
    100 
    101 // reorderBuffer is used to normalize a single segment.  Characters inserted with
    102 // insert are decomposed and reordered based on CCC. The compose method can
    103 // be used to recombine characters.  Note that the byte buffer does not hold
    104 // the UTF-8 characters in order.  Only the rune array is maintained in sorted
    105 // order. flush writes the resulting segment to a byte array.
    106 type reorderBuffer struct {
    107 	rune  [maxBufferSize]Properties // Per character info.
    108 	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
    109 	nbyte uint8                     // Number or bytes.
    110 	ss    streamSafe                // For limiting length of non-starter sequence.
    111 	nrune int                       // Number of runeInfos.
    112 	f     formInfo
    113 
    114 	src      input
    115 	nsrc     int
    116 	tmpBytes input
    117 
    118 	out    []byte
    119 	flushF func(*reorderBuffer) bool
    120 }
    121 
    122 func (rb *reorderBuffer) init(f Form, src []byte) {
    123 	rb.f = *formTable[f]
    124 	rb.src.setBytes(src)
    125 	rb.nsrc = len(src)
    126 	rb.ss = 0
    127 }
    128 
    129 func (rb *reorderBuffer) initString(f Form, src string) {
    130 	rb.f = *formTable[f]
    131 	rb.src.setString(src)
    132 	rb.nsrc = len(src)
    133 	rb.ss = 0
    134 }
    135 
    136 func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
    137 	rb.out = out
    138 	rb.flushF = f
    139 }
    140 
    141 // reset discards all characters from the buffer.
    142 func (rb *reorderBuffer) reset() {
    143 	rb.nrune = 0
    144 	rb.nbyte = 0
    145 	rb.ss = 0
    146 }
    147 
    148 func (rb *reorderBuffer) doFlush() bool {
    149 	if rb.f.composing {
    150 		rb.compose()
    151 	}
    152 	res := rb.flushF(rb)
    153 	rb.reset()
    154 	return res
    155 }
    156 
    157 // appendFlush appends the normalized segment to rb.out.
    158 func appendFlush(rb *reorderBuffer) bool {
    159 	for i := 0; i < rb.nrune; i++ {
    160 		start := rb.rune[i].pos
    161 		end := start + rb.rune[i].size
    162 		rb.out = append(rb.out, rb.byte[start:end]...)
    163 	}
    164 	return true
    165 }
    166 
    167 // flush appends the normalized segment to out and resets rb.
    168 func (rb *reorderBuffer) flush(out []byte) []byte {
    169 	for i := 0; i < rb.nrune; i++ {
    170 		start := rb.rune[i].pos
    171 		end := start + rb.rune[i].size
    172 		out = append(out, rb.byte[start:end]...)
    173 	}
    174 	rb.reset()
    175 	return out
    176 }
    177 
    178 // flushCopy copies the normalized segment to buf and resets rb.
    179 // It returns the number of bytes written to buf.
    180 func (rb *reorderBuffer) flushCopy(buf []byte) int {
    181 	p := 0
    182 	for i := 0; i < rb.nrune; i++ {
    183 		runep := rb.rune[i]
    184 		p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
    185 	}
    186 	rb.reset()
    187 	return p
    188 }
    189 
    190 // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
    191 // It returns false if the buffer is not large enough to hold the rune.
    192 // It is used internally by insert and insertString only.
    193 func (rb *reorderBuffer) insertOrdered(info Properties) {
    194 	n := rb.nrune
    195 	b := rb.rune[:]
    196 	cc := info.ccc
    197 	if cc > 0 {
    198 		// Find insertion position + move elements to make room.
    199 		for ; n > 0; n-- {
    200 			if b[n-1].ccc <= cc {
    201 				break
    202 			}
    203 			b[n] = b[n-1]
    204 		}
    205 	}
    206 	rb.nrune += 1
    207 	pos := uint8(rb.nbyte)
    208 	rb.nbyte += utf8.UTFMax
    209 	info.pos = pos
    210 	b[n] = info
    211 }
    212 
    213 // insertErr is an error code returned by insert. Using this type instead
    214 // of error improves performance up to 20% for many of the benchmarks.
    215 type insertErr int
    216 
    217 const (
    218 	iSuccess insertErr = -iota
    219 	iShortDst
    220 	iShortSrc
    221 )
    222 
    223 // insertFlush inserts the given rune in the buffer ordered by CCC.
    224 // If a decomposition with multiple segments are encountered, they leading
    225 // ones are flushed.
    226 // It returns a non-zero error code if the rune was not inserted.
    227 func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
    228 	if rune := src.hangul(i); rune != 0 {
    229 		rb.decomposeHangul(rune)
    230 		return iSuccess
    231 	}
    232 	if info.hasDecomposition() {
    233 		return rb.insertDecomposed(info.Decomposition())
    234 	}
    235 	rb.insertSingle(src, i, info)
    236 	return iSuccess
    237 }
    238 
    239 // insertUnsafe inserts the given rune in the buffer ordered by CCC.
    240 // It is assumed there is sufficient space to hold the runes. It is the
    241 // responsibility of the caller to ensure this. This can be done by checking
    242 // the state returned by the streamSafe type.
    243 func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
    244 	if rune := src.hangul(i); rune != 0 {
    245 		rb.decomposeHangul(rune)
    246 	}
    247 	if info.hasDecomposition() {
    248 		// TODO: inline.
    249 		rb.insertDecomposed(info.Decomposition())
    250 	} else {
    251 		rb.insertSingle(src, i, info)
    252 	}
    253 }
    254 
    255 // insertDecomposed inserts an entry in to the reorderBuffer for each rune
    256 // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
    257 // It flushes the buffer on each new segment start.
    258 func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
    259 	rb.tmpBytes.setBytes(dcomp)
    260 	for i := 0; i < len(dcomp); {
    261 		info := rb.f.info(rb.tmpBytes, i)
    262 		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
    263 			return iShortDst
    264 		}
    265 		i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
    266 		rb.insertOrdered(info)
    267 	}
    268 	return iSuccess
    269 }
    270 
    271 // insertSingle inserts an entry in the reorderBuffer for the rune at
    272 // position i. info is the runeInfo for the rune at position i.
    273 func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
    274 	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
    275 	rb.insertOrdered(info)
    276 }
    277 
    278 // insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
    279 func (rb *reorderBuffer) insertCGJ() {
    280 	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
    281 }
    282 
    283 // appendRune inserts a rune at the end of the buffer. It is used for Hangul.
    284 func (rb *reorderBuffer) appendRune(r rune) {
    285 	bn := rb.nbyte
    286 	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
    287 	rb.nbyte += utf8.UTFMax
    288 	rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
    289 	rb.nrune++
    290 }
    291 
    292 // assignRune sets a rune at position pos. It is used for Hangul and recomposition.
    293 func (rb *reorderBuffer) assignRune(pos int, r rune) {
    294 	bn := rb.rune[pos].pos
    295 	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
    296 	rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
    297 }
    298 
    299 // runeAt returns the rune at position n. It is used for Hangul and recomposition.
    300 func (rb *reorderBuffer) runeAt(n int) rune {
    301 	inf := rb.rune[n]
    302 	r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
    303 	return r
    304 }
    305 
    306 // bytesAt returns the UTF-8 encoding of the rune at position n.
    307 // It is used for Hangul and recomposition.
    308 func (rb *reorderBuffer) bytesAt(n int) []byte {
    309 	inf := rb.rune[n]
    310 	return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
    311 }
    312 
    313 // For Hangul we combine algorithmically, instead of using tables.
    314 const (
    315 	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
    316 	hangulBase0 = 0xEA
    317 	hangulBase1 = 0xB0
    318 	hangulBase2 = 0x80
    319 
    320 	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
    321 	hangulEnd0 = 0xED
    322 	hangulEnd1 = 0x9E
    323 	hangulEnd2 = 0xA4
    324 
    325 	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
    326 	jamoLBase0 = 0xE1
    327 	jamoLBase1 = 0x84
    328 	jamoLEnd   = 0x1113
    329 	jamoVBase  = 0x1161
    330 	jamoVEnd   = 0x1176
    331 	jamoTBase  = 0x11A7
    332 	jamoTEnd   = 0x11C3
    333 
    334 	jamoTCount   = 28
    335 	jamoVCount   = 21
    336 	jamoVTCount  = 21 * 28
    337 	jamoLVTCount = 19 * 21 * 28
    338 )
    339 
    340 const hangulUTF8Size = 3
    341 
    342 func isHangul(b []byte) bool {
    343 	if len(b) < hangulUTF8Size {
    344 		return false
    345 	}
    346 	b0 := b[0]
    347 	if b0 < hangulBase0 {
    348 		return false
    349 	}
    350 	b1 := b[1]
    351 	switch {
    352 	case b0 == hangulBase0:
    353 		return b1 >= hangulBase1
    354 	case b0 < hangulEnd0:
    355 		return true
    356 	case b0 > hangulEnd0:
    357 		return false
    358 	case b1 < hangulEnd1:
    359 		return true
    360 	}
    361 	return b1 == hangulEnd1 && b[2] < hangulEnd2
    362 }
    363 
    364 func isHangulString(b string) bool {
    365 	if len(b) < hangulUTF8Size {
    366 		return false
    367 	}
    368 	b0 := b[0]
    369 	if b0 < hangulBase0 {
    370 		return false
    371 	}
    372 	b1 := b[1]
    373 	switch {
    374 	case b0 == hangulBase0:
    375 		return b1 >= hangulBase1
    376 	case b0 < hangulEnd0:
    377 		return true
    378 	case b0 > hangulEnd0:
    379 		return false
    380 	case b1 < hangulEnd1:
    381 		return true
    382 	}
    383 	return b1 == hangulEnd1 && b[2] < hangulEnd2
    384 }
    385 
    386 // Caller must ensure len(b) >= 2.
    387 func isJamoVT(b []byte) bool {
    388 	// True if (rune & 0xff00) == jamoLBase
    389 	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
    390 }
    391 
    392 func isHangulWithoutJamoT(b []byte) bool {
    393 	c, _ := utf8.DecodeRune(b)
    394 	c -= hangulBase
    395 	return c < jamoLVTCount && c%jamoTCount == 0
    396 }
    397 
    398 // decomposeHangul writes the decomposed Hangul to buf and returns the number
    399 // of bytes written.  len(buf) should be at least 9.
    400 func decomposeHangul(buf []byte, r rune) int {
    401 	const JamoUTF8Len = 3
    402 	r -= hangulBase
    403 	x := r % jamoTCount
    404 	r /= jamoTCount
    405 	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
    406 	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
    407 	if x != 0 {
    408 		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
    409 		return 3 * JamoUTF8Len
    410 	}
    411 	return 2 * JamoUTF8Len
    412 }
    413 
    414 // decomposeHangul algorithmically decomposes a Hangul rune into
    415 // its Jamo components.
    416 // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
    417 func (rb *reorderBuffer) decomposeHangul(r rune) {
    418 	r -= hangulBase
    419 	x := r % jamoTCount
    420 	r /= jamoTCount
    421 	rb.appendRune(jamoLBase + r/jamoVCount)
    422 	rb.appendRune(jamoVBase + r%jamoVCount)
    423 	if x != 0 {
    424 		rb.appendRune(jamoTBase + x)
    425 	}
    426 }
    427 
    428 // combineHangul algorithmically combines Jamo character components into Hangul.
    429 // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
    430 func (rb *reorderBuffer) combineHangul(s, i, k int) {
    431 	b := rb.rune[:]
    432 	bn := rb.nrune
    433 	for ; i < bn; i++ {
    434 		cccB := b[k-1].ccc
    435 		cccC := b[i].ccc
    436 		if cccB == 0 {
    437 			s = k - 1
    438 		}
    439 		if s != k-1 && cccB >= cccC {
    440 			// b[i] is blocked by greater-equal cccX below it
    441 			b[k] = b[i]
    442 			k++
    443 		} else {
    444 			l := rb.runeAt(s) // also used to compare to hangulBase
    445 			v := rb.runeAt(i) // also used to compare to jamoT
    446 			switch {
    447 			case jamoLBase <= l && l < jamoLEnd &&
    448 				jamoVBase <= v && v < jamoVEnd:
    449 				// 11xx plus 116x to LV
    450 				rb.assignRune(s, hangulBase+
    451 					(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
    452 			case hangulBase <= l && l < hangulEnd &&
    453 				jamoTBase < v && v < jamoTEnd &&
    454 				((l-hangulBase)%jamoTCount) == 0:
    455 				// ACxx plus 11Ax to LVT
    456 				rb.assignRune(s, l+v-jamoTBase)
    457 			default:
    458 				b[k] = b[i]
    459 				k++
    460 			}
    461 		}
    462 	}
    463 	rb.nrune = k
    464 }
    465 
    466 // compose recombines the runes in the buffer.
    467 // It should only be used to recompose a single segment, as it will not
    468 // handle alternations between Hangul and non-Hangul characters correctly.
    469 func (rb *reorderBuffer) compose() {
    470 	// UAX #15, section X5 , including Corrigendum #5
    471 	// "In any character sequence beginning with starter S, a character C is
    472 	//  blocked from S if and only if there is some character B between S
    473 	//  and C, and either B is a starter or it has the same or higher
    474 	//  combining class as C."
    475 	bn := rb.nrune
    476 	if bn == 0 {
    477 		return
    478 	}
    479 	k := 1
    480 	b := rb.rune[:]
    481 	for s, i := 0, 1; i < bn; i++ {
    482 		if isJamoVT(rb.bytesAt(i)) {
    483 			// Redo from start in Hangul mode. Necessary to support
    484 			// U+320E..U+321E in NFKC mode.
    485 			rb.combineHangul(s, i, k)
    486 			return
    487 		}
    488 		ii := b[i]
    489 		// We can only use combineForward as a filter if we later
    490 		// get the info for the combined character. This is more
    491 		// expensive than using the filter. Using combinesBackward()
    492 		// is safe.
    493 		if ii.combinesBackward() {
    494 			cccB := b[k-1].ccc
    495 			cccC := ii.ccc
    496 			blocked := false // b[i] blocked by starter or greater or equal CCC?
    497 			if cccB == 0 {
    498 				s = k - 1
    499 			} else {
    500 				blocked = s != k-1 && cccB >= cccC
    501 			}
    502 			if !blocked {
    503 				combined := combine(rb.runeAt(s), rb.runeAt(i))
    504 				if combined != 0 {
    505 					rb.assignRune(s, combined)
    506 					continue
    507 				}
    508 			}
    509 		}
    510 		b[k] = b[i]
    511 		k++
    512 	}
    513 	rb.nrune = k
    514 }
    515