Home | History | Annotate | Download | only in unicode
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package unicode provides data and functions to test some properties of
      6 // Unicode code points.
      7 package unicode
      8 
      9 // Tables are regenerated each time we update the Unicode version.
     10 //go:generate go run maketables.go -tables=all -output tables.go
     11 
     12 const (
     13 	MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
     14 	ReplacementChar = '\uFFFD'     // Represents invalid code points.
     15 	MaxASCII        = '\u007F'     // maximum ASCII value.
     16 	MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
     17 )
     18 
     19 // RangeTable defines a set of Unicode code points by listing the ranges of
     20 // code points within the set. The ranges are listed in two slices
     21 // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
     22 // The two slices must be in sorted order and non-overlapping.
     23 // Also, R32 should contain only values >= 0x10000 (1<<16).
     24 type RangeTable struct {
     25 	R16         []Range16
     26 	R32         []Range32
     27 	LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
     28 }
     29 
     30 // Range16 represents of a range of 16-bit Unicode code points.  The range runs from Lo to Hi
     31 // inclusive and has the specified stride.
     32 type Range16 struct {
     33 	Lo     uint16
     34 	Hi     uint16
     35 	Stride uint16
     36 }
     37 
     38 // Range32 represents of a range of Unicode code points and is used when one or
     39 // more of the values will not fit in 16 bits.  The range runs from Lo to Hi
     40 // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
     41 type Range32 struct {
     42 	Lo     uint32
     43 	Hi     uint32
     44 	Stride uint32
     45 }
     46 
     47 // CaseRange represents a range of Unicode code points for simple (one
     48 // code point to one code point) case conversion.
     49 // The range runs from Lo to Hi inclusive, with a fixed stride of 1.  Deltas
     50 // are the number to add to the code point to reach the code point for a
     51 // different case for that character.  They may be negative.  If zero, it
     52 // means the character is in the corresponding case. There is a special
     53 // case representing sequences of alternating corresponding Upper and Lower
     54 // pairs.  It appears with a fixed Delta of
     55 //	{UpperLower, UpperLower, UpperLower}
     56 // The constant UpperLower has an otherwise impossible delta value.
     57 type CaseRange struct {
     58 	Lo    uint32
     59 	Hi    uint32
     60 	Delta d
     61 }
     62 
     63 // SpecialCase represents language-specific case mappings such as Turkish.
     64 // Methods of SpecialCase customize (by overriding) the standard mappings.
     65 type SpecialCase []CaseRange
     66 
     67 // BUG(r): There is no mechanism for full case folding, that is, for
     68 // characters that involve multiple runes in the input or output.
     69 
     70 // Indices into the Delta arrays inside CaseRanges for case mapping.
     71 const (
     72 	UpperCase = iota
     73 	LowerCase
     74 	TitleCase
     75 	MaxCase
     76 )
     77 
     78 type d [MaxCase]rune // to make the CaseRanges text shorter
     79 
     80 // If the Delta field of a CaseRange is UpperLower, it means
     81 // this CaseRange represents a sequence of the form (say)
     82 // Upper Lower Upper Lower.
     83 const (
     84 	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
     85 )
     86 
     87 // linearMax is the maximum size table for linear search for non-Latin1 rune.
     88 // Derived by running 'go test -calibrate'.
     89 const linearMax = 18
     90 
     91 // is16 reports whether r is in the sorted slice of 16-bit ranges.
     92 func is16(ranges []Range16, r uint16) bool {
     93 	if len(ranges) <= linearMax || r <= MaxLatin1 {
     94 		for i := range ranges {
     95 			range_ := &ranges[i]
     96 			if r < range_.Lo {
     97 				return false
     98 			}
     99 			if r <= range_.Hi {
    100 				return (r-range_.Lo)%range_.Stride == 0
    101 			}
    102 		}
    103 		return false
    104 	}
    105 
    106 	// binary search over ranges
    107 	lo := 0
    108 	hi := len(ranges)
    109 	for lo < hi {
    110 		m := lo + (hi-lo)/2
    111 		range_ := &ranges[m]
    112 		if range_.Lo <= r && r <= range_.Hi {
    113 			return (r-range_.Lo)%range_.Stride == 0
    114 		}
    115 		if r < range_.Lo {
    116 			hi = m
    117 		} else {
    118 			lo = m + 1
    119 		}
    120 	}
    121 	return false
    122 }
    123 
    124 // is32 reports whether r is in the sorted slice of 32-bit ranges.
    125 func is32(ranges []Range32, r uint32) bool {
    126 	if len(ranges) <= linearMax {
    127 		for i := range ranges {
    128 			range_ := &ranges[i]
    129 			if r < range_.Lo {
    130 				return false
    131 			}
    132 			if r <= range_.Hi {
    133 				return (r-range_.Lo)%range_.Stride == 0
    134 			}
    135 		}
    136 		return false
    137 	}
    138 
    139 	// binary search over ranges
    140 	lo := 0
    141 	hi := len(ranges)
    142 	for lo < hi {
    143 		m := lo + (hi-lo)/2
    144 		range_ := ranges[m]
    145 		if range_.Lo <= r && r <= range_.Hi {
    146 			return (r-range_.Lo)%range_.Stride == 0
    147 		}
    148 		if r < range_.Lo {
    149 			hi = m
    150 		} else {
    151 			lo = m + 1
    152 		}
    153 	}
    154 	return false
    155 }
    156 
    157 // Is reports whether the rune is in the specified table of ranges.
    158 func Is(rangeTab *RangeTable, r rune) bool {
    159 	r16 := rangeTab.R16
    160 	if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
    161 		return is16(r16, uint16(r))
    162 	}
    163 	r32 := rangeTab.R32
    164 	if len(r32) > 0 && r >= rune(r32[0].Lo) {
    165 		return is32(r32, uint32(r))
    166 	}
    167 	return false
    168 }
    169 
    170 func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
    171 	r16 := rangeTab.R16
    172 	if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
    173 		return is16(r16[off:], uint16(r))
    174 	}
    175 	r32 := rangeTab.R32
    176 	if len(r32) > 0 && r >= rune(r32[0].Lo) {
    177 		return is32(r32, uint32(r))
    178 	}
    179 	return false
    180 }
    181 
    182 // IsUpper reports whether the rune is an upper case letter.
    183 func IsUpper(r rune) bool {
    184 	// See comment in IsGraphic.
    185 	if uint32(r) <= MaxLatin1 {
    186 		return properties[uint8(r)]&pLmask == pLu
    187 	}
    188 	return isExcludingLatin(Upper, r)
    189 }
    190 
    191 // IsLower reports whether the rune is a lower case letter.
    192 func IsLower(r rune) bool {
    193 	// See comment in IsGraphic.
    194 	if uint32(r) <= MaxLatin1 {
    195 		return properties[uint8(r)]&pLmask == pLl
    196 	}
    197 	return isExcludingLatin(Lower, r)
    198 }
    199 
    200 // IsTitle reports whether the rune is a title case letter.
    201 func IsTitle(r rune) bool {
    202 	if r <= MaxLatin1 {
    203 		return false
    204 	}
    205 	return isExcludingLatin(Title, r)
    206 }
    207 
    208 // to maps the rune using the specified case mapping.
    209 func to(_case int, r rune, caseRange []CaseRange) rune {
    210 	if _case < 0 || MaxCase <= _case {
    211 		return ReplacementChar // as reasonable an error as any
    212 	}
    213 	// binary search over ranges
    214 	lo := 0
    215 	hi := len(caseRange)
    216 	for lo < hi {
    217 		m := lo + (hi-lo)/2
    218 		cr := caseRange[m]
    219 		if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
    220 			delta := rune(cr.Delta[_case])
    221 			if delta > MaxRune {
    222 				// In an Upper-Lower sequence, which always starts with
    223 				// an UpperCase letter, the real deltas always look like:
    224 				//	{0, 1, 0}    UpperCase (Lower is next)
    225 				//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
    226 				// The characters at even offsets from the beginning of the
    227 				// sequence are upper case; the ones at odd offsets are lower.
    228 				// The correct mapping can be done by clearing or setting the low
    229 				// bit in the sequence offset.
    230 				// The constants UpperCase and TitleCase are even while LowerCase
    231 				// is odd so we take the low bit from _case.
    232 				return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1))
    233 			}
    234 			return r + delta
    235 		}
    236 		if r < rune(cr.Lo) {
    237 			hi = m
    238 		} else {
    239 			lo = m + 1
    240 		}
    241 	}
    242 	return r
    243 }
    244 
    245 // To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
    246 func To(_case int, r rune) rune {
    247 	return to(_case, r, CaseRanges)
    248 }
    249 
    250 // ToUpper maps the rune to upper case.
    251 func ToUpper(r rune) rune {
    252 	if r <= MaxASCII {
    253 		if 'a' <= r && r <= 'z' {
    254 			r -= 'a' - 'A'
    255 		}
    256 		return r
    257 	}
    258 	return To(UpperCase, r)
    259 }
    260 
    261 // ToLower maps the rune to lower case.
    262 func ToLower(r rune) rune {
    263 	if r <= MaxASCII {
    264 		if 'A' <= r && r <= 'Z' {
    265 			r += 'a' - 'A'
    266 		}
    267 		return r
    268 	}
    269 	return To(LowerCase, r)
    270 }
    271 
    272 // ToTitle maps the rune to title case.
    273 func ToTitle(r rune) rune {
    274 	if r <= MaxASCII {
    275 		if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
    276 			r -= 'a' - 'A'
    277 		}
    278 		return r
    279 	}
    280 	return To(TitleCase, r)
    281 }
    282 
    283 // ToUpper maps the rune to upper case giving priority to the special mapping.
    284 func (special SpecialCase) ToUpper(r rune) rune {
    285 	r1 := to(UpperCase, r, []CaseRange(special))
    286 	if r1 == r {
    287 		r1 = ToUpper(r)
    288 	}
    289 	return r1
    290 }
    291 
    292 // ToTitle maps the rune to title case giving priority to the special mapping.
    293 func (special SpecialCase) ToTitle(r rune) rune {
    294 	r1 := to(TitleCase, r, []CaseRange(special))
    295 	if r1 == r {
    296 		r1 = ToTitle(r)
    297 	}
    298 	return r1
    299 }
    300 
    301 // ToLower maps the rune to lower case giving priority to the special mapping.
    302 func (special SpecialCase) ToLower(r rune) rune {
    303 	r1 := to(LowerCase, r, []CaseRange(special))
    304 	if r1 == r {
    305 		r1 = ToLower(r)
    306 	}
    307 	return r1
    308 }
    309 
    310 // caseOrbit is defined in tables.go as []foldPair.  Right now all the
    311 // entries fit in uint16, so use uint16.  If that changes, compilation
    312 // will fail (the constants in the composite literal will not fit in uint16)
    313 // and the types here can change to uint32.
    314 type foldPair struct {
    315 	From uint16
    316 	To   uint16
    317 }
    318 
    319 // SimpleFold iterates over Unicode code points equivalent under
    320 // the Unicode-defined simple case folding.  Among the code points
    321 // equivalent to rune (including rune itself), SimpleFold returns the
    322 // smallest rune > r if one exists, or else the smallest rune >= 0.
    323 //
    324 // For example:
    325 //	SimpleFold('A') = 'a'
    326 //	SimpleFold('a') = 'A'
    327 //
    328 //	SimpleFold('K') = 'k'
    329 //	SimpleFold('k') = '\u212A' (Kelvin symbol, )
    330 //	SimpleFold('\u212A') = 'K'
    331 //
    332 //	SimpleFold('1') = '1'
    333 //
    334 func SimpleFold(r rune) rune {
    335 	// Consult caseOrbit table for special cases.
    336 	lo := 0
    337 	hi := len(caseOrbit)
    338 	for lo < hi {
    339 		m := lo + (hi-lo)/2
    340 		if rune(caseOrbit[m].From) < r {
    341 			lo = m + 1
    342 		} else {
    343 			hi = m
    344 		}
    345 	}
    346 	if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
    347 		return rune(caseOrbit[lo].To)
    348 	}
    349 
    350 	// No folding specified.  This is a one- or two-element
    351 	// equivalence class containing rune and ToLower(rune)
    352 	// and ToUpper(rune) if they are different from rune.
    353 	if l := ToLower(r); l != r {
    354 		return l
    355 	}
    356 	return ToUpper(r)
    357 }
    358