Home | History | Annotate | Download | only in norm
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package norm
      6 
      7 // This file contains Form-specific logic and wrappers for data in tables.go.
      8 
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie.  Each trie maps
// a rune to a uint16. The values take two forms.  For v >= 0x8000:
//   bits
//   15:    1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//    <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero.  The value of v determines which ccc are appended
// to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
     30 
// Bit masks applied to qcInfo values and to the header byte that starts each
// entry in the decomposition byte array.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant (low six) bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte (low six bits)
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte (top two bits)
)
     36 
// Properties provides access to normalization properties of a rune.
// Values are produced by compInfo from a trie value and a UTF-8 size.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // offset of the decomposition header in decomps; 0 means no decomposition
}
     47 
// functions dispatchable per form

// lookupFunc is the signature shared by lookupInfoNFC and lookupInfoNFKC: it
// returns the Properties built from the trie value that input b reports for
// position i.
type lookupFunc func(b input, i int) Properties
     50 
// formInfo holds Form-specific functions and tables.
// One instance per Form is created by init and stored in formTable.
type formInfo struct {
	form                     Form       // the Form this entry describes
	composing, compatibility bool       // form type: composing for NFC/NFKC, compatibility for NFKC/NFKD
	info                     lookupFunc // lookupInfoNFKC for compatibility forms, lookupInfoNFC otherwise
	nextMain                 iterFunc   // nextComposed for composing forms, nextDecomposed otherwise
}
     58 
     59 var formTable []*formInfo
     60 
     61 func init() {
     62 	formTable = make([]*formInfo, 4)
     63 
     64 	for i := range formTable {
     65 		f := &formInfo{}
     66 		formTable[i] = f
     67 		f.form = Form(i)
     68 		if Form(i) == NFKD || Form(i) == NFKC {
     69 			f.compatibility = true
     70 			f.info = lookupInfoNFKC
     71 		} else {
     72 			f.info = lookupInfoNFC
     73 		}
     74 		f.nextMain = nextDecomposed
     75 		if Form(i) == NFC || Form(i) == NFKC {
     76 			f.nextMain = nextComposed
     77 			f.composing = true
     78 		}
     79 	}
     80 }
     81 
     82 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
     83 // unexpected behavior for the user.  For example, in NFD, there is a boundary
     84 // after 'a'.  However, 'a' might combine with modifiers, so from the application's
     85 // perspective it is not a good boundary. We will therefore always use the
     86 // boundaries for the combining variants.
     87 
     88 // BoundaryBefore returns true if this rune starts a new segment and
     89 // cannot combine with any rune on the left.
     90 func (p Properties) BoundaryBefore() bool {
     91 	if p.ccc == 0 && !p.combinesBackward() {
     92 		return true
     93 	}
     94 	// We assume that the CCC of the first character in a decomposition
     95 	// is always non-zero if different from info.ccc and that we can return
     96 	// false at this point. This is verified by maketables.
     97 	return false
     98 }
     99 
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it reports true only when
// isInert does.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
    106 
// We pack quick check data in 6 bits:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization. (isInert additionally requires a zero ccc.)
type qcInfo uint8
    116 
    117 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
    118 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
    119 
    120 func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
    121 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
    122 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
    123 
    124 func (p Properties) isInert() bool {
    125 	return p.flags&qcInfoMask == 0 && p.ccc == 0
    126 }
    127 
    128 func (p Properties) multiSegment() bool {
    129 	return p.index >= firstMulti && p.index < endMulti
    130 }
    131 
    132 func (p Properties) nLeadingNonStarters() uint8 {
    133 	return p.nLead
    134 }
    135 
    136 func (p Properties) nTrailingNonStarters() uint8 {
    137 	return uint8(p.flags & 0x03)
    138 }
    139 
    140 // Decomposition returns the decomposition for the underlying rune
    141 // or nil if there is none.
    142 func (p Properties) Decomposition() []byte {
    143 	// TODO: create the decomposition for Hangul?
    144 	if p.index == 0 {
    145 		return nil
    146 	}
    147 	i := p.index
    148 	n := decomps[i] & headerLenMask
    149 	i++
    150 	return decomps[i : i+uint16(n)]
    151 }
    152 
// Size returns the length of UTF-8 encoding of the rune.
// It is the stored size field widened to int.
func (p Properties) Size() int {
	return int(p.size)
}
    157 
    158 // CCC returns the canonical combining class of the underlying rune.
    159 func (p Properties) CCC() uint8 {
    160 	if p.index >= firstCCCZeroExcept {
    161 		return 0
    162 	}
    163 	return ccc[p.ccc]
    164 }
    165 
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	// p.ccc is an index into the ccc table, not the class value itself.
	return ccc[p.ccc]
}
    171 
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	// p.tccc is an index into the ccc table, not the class value itself.
	return ccc[p.tccc]
}
    177 
    178 // Recomposition
    179 // We use 32-bit keys instead of 64-bit for the two codepoint keys.
    180 // This clips off the bits of three entries, but we know this will not
    181 // result in a collision. In the unlikely event that changes to
    182 // UnicodeData.txt introduce collisions, the compiler will catch it.
    183 // Note that the recomposition map for NFC and NFKC are identical.
    184 
    185 // combine returns the combined rune or 0 if it doesn't exist.
    186 func combine(a, b rune) rune {
    187 	key := uint32(uint16(a))<<16 + uint32(uint16(b))
    188 	return recompMap[key]
    189 }
    190 
    191 func lookupInfoNFC(b input, i int) Properties {
    192 	v, sz := b.charinfoNFC(i)
    193 	return compInfo(v, sz)
    194 }
    195 
    196 func lookupInfoNFKC(b input, i int) Properties {
    197 	v, sz := b.charinfoNFKC(i)
    198 	return compInfo(v, sz)
    199 }
    200 
    201 // Properties returns properties for the first rune in s.
    202 func (f Form) Properties(s []byte) Properties {
    203 	if f == NFC || f == NFD {
    204 		return compInfo(nfcData.lookup(s))
    205 	}
    206 	return compInfo(nfkcData.lookup(s))
    207 }
    208 
    209 // PropertiesString returns properties for the first rune in s.
    210 func (f Form) PropertiesString(s string) Properties {
    211 	if f == NFC || f == NFD {
    212 		return compInfo(nfcData.lookupString(s))
    213 	}
    214 	return compInfo(nfkcData.lookupString(s))
    215 }
    216 
    217 // compInfo converts the information contained in v and sz
    218 // to a Properties.  See the comment at the top of the file
    219 // for more information on the format.
    220 func compInfo(v uint16, sz int) Properties {
    221 	if v == 0 {
    222 		return Properties{size: uint8(sz)}
    223 	} else if v >= 0x8000 {
    224 		p := Properties{
    225 			size:  uint8(sz),
    226 			ccc:   uint8(v),
    227 			tccc:  uint8(v),
    228 			flags: qcInfo(v >> 8),
    229 		}
    230 		if p.ccc > 0 || p.combinesBackward() {
    231 			p.nLead = uint8(p.flags & 0x3)
    232 		}
    233 		return p
    234 	}
    235 	// has decomposition
    236 	h := decomps[v]
    237 	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
    238 	p := Properties{size: uint8(sz), flags: f, index: v}
    239 	if v >= firstCCC {
    240 		v += uint16(h&headerLenMask) + 1
    241 		c := decomps[v]
    242 		p.tccc = c >> 2
    243 		p.flags |= qcInfo(c & 0x3)
    244 		if v >= firstLeadingCCC {
    245 			p.nLead = c & 0x3
    246 			if v >= firstStarterWithNLead {
    247 				// We were tricked. Remove the decomposition.
    248 				p.flags &= 0x03
    249 				p.index = 0
    250 				return p
    251 			}
    252 			p.ccc = decomps[v+1]
    253 		}
    254 	}
    255 	return p
    256 }
    257