// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

// This file contains Form-specific logic and wrappers for data in tables.go.

// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
//   bits
//   15:    1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
// (compInfo unpacks these as qcInfo(v >> 8) and uint8(v) respectively.)
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//    <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.

const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)

// Properties provides access to normalization properties of a rune.
38 type Properties struct { 39 pos uint8 // start position in reorderBuffer; used in composition.go 40 size uint8 // length of UTF-8 encoding of this rune 41 ccc uint8 // leading canonical combining class (ccc if not decomposition) 42 tccc uint8 // trailing canonical combining class (ccc if not decomposition) 43 nLead uint8 // number of leading non-starters. 44 flags qcInfo // quick check flags 45 index uint16 46 } 47 48 // functions dispatchable per form 49 type lookupFunc func(b input, i int) Properties 50 51 // formInfo holds Form-specific functions and tables. 52 type formInfo struct { 53 form Form 54 composing, compatibility bool // form type 55 info lookupFunc 56 nextMain iterFunc 57 } 58 59 var formTable []*formInfo 60 61 func init() { 62 formTable = make([]*formInfo, 4) 63 64 for i := range formTable { 65 f := &formInfo{} 66 formTable[i] = f 67 f.form = Form(i) 68 if Form(i) == NFKD || Form(i) == NFKC { 69 f.compatibility = true 70 f.info = lookupInfoNFKC 71 } else { 72 f.info = lookupInfoNFC 73 } 74 f.nextMain = nextDecomposed 75 if Form(i) == NFC || Form(i) == NFKC { 76 f.nextMain = nextComposed 77 f.composing = true 78 } 79 } 80 } 81 82 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid 83 // unexpected behavior for the user. For example, in NFD, there is a boundary 84 // after 'a'. However, 'a' might combine with modifiers, so from the application's 85 // perspective it is not a good boundary. We will therefore always use the 86 // boundaries for the combining variants. 87 88 // BoundaryBefore returns true if this rune starts a new segment and 89 // cannot combine with any rune on the left. 90 func (p Properties) BoundaryBefore() bool { 91 if p.ccc == 0 && !p.combinesBackward() { 92 return true 93 } 94 // We assume that the CCC of the first character in a decomposition 95 // is always non-zero if different from info.ccc and that we can return 96 // false at this point. This is verified by maketables. 
97 return false 98 } 99 100 // BoundaryAfter returns true if runes cannot combine with or otherwise 101 // interact with this or previous runes. 102 func (p Properties) BoundaryAfter() bool { 103 // TODO: loosen these conditions. 104 return p.isInert() 105 } 106 107 // We pack quick check data in 4 bits: 108 // 5: Combines forward (0 == false, 1 == true) 109 // 4..3: NFC_QC Yes(00), No (10), or Maybe (11) 110 // 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. 111 // 1..0: Number of trailing non-starters. 112 // 113 // When all 4 bits are zero, the character is inert, meaning it is never 114 // influenced by normalization. 115 type qcInfo uint8 116 117 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } 118 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } 119 120 func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } 121 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe 122 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD 123 124 func (p Properties) isInert() bool { 125 return p.flags&qcInfoMask == 0 && p.ccc == 0 126 } 127 128 func (p Properties) multiSegment() bool { 129 return p.index >= firstMulti && p.index < endMulti 130 } 131 132 func (p Properties) nLeadingNonStarters() uint8 { 133 return p.nLead 134 } 135 136 func (p Properties) nTrailingNonStarters() uint8 { 137 return uint8(p.flags & 0x03) 138 } 139 140 // Decomposition returns the decomposition for the underlying rune 141 // or nil if there is none. 142 func (p Properties) Decomposition() []byte { 143 // TODO: create the decomposition for Hangul? 144 if p.index == 0 { 145 return nil 146 } 147 i := p.index 148 n := decomps[i] & headerLenMask 149 i++ 150 return decomps[i : i+uint16(n)] 151 } 152 153 // Size returns the length of UTF-8 encoding of the rune. 
154 func (p Properties) Size() int { 155 return int(p.size) 156 } 157 158 // CCC returns the canonical combining class of the underlying rune. 159 func (p Properties) CCC() uint8 { 160 if p.index >= firstCCCZeroExcept { 161 return 0 162 } 163 return ccc[p.ccc] 164 } 165 166 // LeadCCC returns the CCC of the first rune in the decomposition. 167 // If there is no decomposition, LeadCCC equals CCC. 168 func (p Properties) LeadCCC() uint8 { 169 return ccc[p.ccc] 170 } 171 172 // TrailCCC returns the CCC of the last rune in the decomposition. 173 // If there is no decomposition, TrailCCC equals CCC. 174 func (p Properties) TrailCCC() uint8 { 175 return ccc[p.tccc] 176 } 177 178 // Recomposition 179 // We use 32-bit keys instead of 64-bit for the two codepoint keys. 180 // This clips off the bits of three entries, but we know this will not 181 // result in a collision. In the unlikely event that changes to 182 // UnicodeData.txt introduce collisions, the compiler will catch it. 183 // Note that the recomposition map for NFC and NFKC are identical. 184 185 // combine returns the combined rune or 0 if it doesn't exist. 186 func combine(a, b rune) rune { 187 key := uint32(uint16(a))<<16 + uint32(uint16(b)) 188 return recompMap[key] 189 } 190 191 func lookupInfoNFC(b input, i int) Properties { 192 v, sz := b.charinfoNFC(i) 193 return compInfo(v, sz) 194 } 195 196 func lookupInfoNFKC(b input, i int) Properties { 197 v, sz := b.charinfoNFKC(i) 198 return compInfo(v, sz) 199 } 200 201 // Properties returns properties for the first rune in s. 202 func (f Form) Properties(s []byte) Properties { 203 if f == NFC || f == NFD { 204 return compInfo(nfcData.lookup(s)) 205 } 206 return compInfo(nfkcData.lookup(s)) 207 } 208 209 // PropertiesString returns properties for the first rune in s. 
210 func (f Form) PropertiesString(s string) Properties { 211 if f == NFC || f == NFD { 212 return compInfo(nfcData.lookupString(s)) 213 } 214 return compInfo(nfkcData.lookupString(s)) 215 } 216 217 // compInfo converts the information contained in v and sz 218 // to a Properties. See the comment at the top of the file 219 // for more information on the format. 220 func compInfo(v uint16, sz int) Properties { 221 if v == 0 { 222 return Properties{size: uint8(sz)} 223 } else if v >= 0x8000 { 224 p := Properties{ 225 size: uint8(sz), 226 ccc: uint8(v), 227 tccc: uint8(v), 228 flags: qcInfo(v >> 8), 229 } 230 if p.ccc > 0 || p.combinesBackward() { 231 p.nLead = uint8(p.flags & 0x3) 232 } 233 return p 234 } 235 // has decomposition 236 h := decomps[v] 237 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 238 p := Properties{size: uint8(sz), flags: f, index: v} 239 if v >= firstCCC { 240 v += uint16(h&headerLenMask) + 1 241 c := decomps[v] 242 p.tccc = c >> 2 243 p.flags |= qcInfo(c & 0x3) 244 if v >= firstLeadingCCC { 245 p.nLead = c & 0x3 246 if v >= firstStarterWithNLead { 247 // We were tricked. Remove the decomposition. 248 p.flags &= 0x03 249 p.index = 0 250 return p 251 } 252 p.ccc = decomps[v+1] 253 } 254 } 255 return p 256 } 257