Home | History | Annotate | Download | only in unicode
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package unicode
      6 
      7 // Bit masks for each code point under U+0100, for fast lookup.
      8 const (
      9 	pC     = 1 << iota // a control character.
     10 	pP                 // a punctuation character.
     11 	pN                 // a numeral.
     12 	pS                 // a symbolic character.
     13 	pZ                 // a spacing character.
     14 	pLu                // an upper-case letter.
     15 	pLl                // a lower-case letter.
     16 	pp                 // a printable character according to Go's definition.
     17 	pg     = pp | pZ   // a graphical character according to the Unicode definition.
     18 	pLo    = pLl | pLu // a letter that is neither upper nor lower case.
     19 	pLmask = pLo
     20 )
     21 
     22 // GraphicRanges defines the set of graphic characters according to Unicode.
     23 var GraphicRanges = []*RangeTable{
     24 	L, M, N, P, S, Zs,
     25 }
     26 
     27 // PrintRanges defines the set of printable characters according to Go.
     28 // ASCII space, U+0020, is handled separately.
     29 var PrintRanges = []*RangeTable{
     30 	L, M, N, P, S,
     31 }
     32 
     33 // IsGraphic reports whether the rune is defined as a Graphic by Unicode.
     34 // Such characters include letters, marks, numbers, punctuation, symbols, and
     35 // spaces, from categories L, M, N, P, S, Zs.
     36 func IsGraphic(r rune) bool {
     37 	// We convert to uint32 to avoid the extra test for negative,
     38 	// and in the index we convert to uint8 to avoid the range check.
     39 	if uint32(r) <= MaxLatin1 {
     40 		return properties[uint8(r)]&pg != 0
     41 	}
     42 	return In(r, GraphicRanges...)
     43 }
     44 
     45 // IsPrint reports whether the rune is defined as printable by Go. Such
     46 // characters include letters, marks, numbers, punctuation, symbols, and the
     47 // ASCII space character, from categories L, M, N, P, S and the ASCII space
     48 // character. This categorization is the same as IsGraphic except that the
     49 // only spacing character is ASCII space, U+0020.
     50 func IsPrint(r rune) bool {
     51 	if uint32(r) <= MaxLatin1 {
     52 		return properties[uint8(r)]&pp != 0
     53 	}
     54 	return In(r, PrintRanges...)
     55 }
     56 
     57 // IsOneOf reports whether the rune is a member of one of the ranges.
     58 // The function "In" provides a nicer signature and should be used in preference to IsOneOf.
     59 func IsOneOf(ranges []*RangeTable, r rune) bool {
     60 	for _, inside := range ranges {
     61 		if Is(inside, r) {
     62 			return true
     63 		}
     64 	}
     65 	return false
     66 }
     67 
     68 // In reports whether the rune is a member of one of the ranges.
     69 func In(r rune, ranges ...*RangeTable) bool {
     70 	for _, inside := range ranges {
     71 		if Is(inside, r) {
     72 			return true
     73 		}
     74 	}
     75 	return false
     76 }
     77 
     78 // IsControl reports whether the rune is a control character.
     79 // The C (Other) Unicode category includes more code points
     80 // such as surrogates; use Is(C, r) to test for them.
     81 func IsControl(r rune) bool {
     82 	if uint32(r) <= MaxLatin1 {
     83 		return properties[uint8(r)]&pC != 0
     84 	}
     85 	// All control characters are < MaxLatin1.
     86 	return false
     87 }
     88 
     89 // IsLetter reports whether the rune is a letter (category L).
     90 func IsLetter(r rune) bool {
     91 	if uint32(r) <= MaxLatin1 {
     92 		return properties[uint8(r)]&(pLmask) != 0
     93 	}
     94 	return isExcludingLatin(Letter, r)
     95 }
     96 
     97 // IsMark reports whether the rune is a mark character (category M).
     98 func IsMark(r rune) bool {
     99 	// There are no mark characters in Latin-1.
    100 	return isExcludingLatin(Mark, r)
    101 }
    102 
    103 // IsNumber reports whether the rune is a number (category N).
    104 func IsNumber(r rune) bool {
    105 	if uint32(r) <= MaxLatin1 {
    106 		return properties[uint8(r)]&pN != 0
    107 	}
    108 	return isExcludingLatin(Number, r)
    109 }
    110 
    111 // IsPunct reports whether the rune is a Unicode punctuation character
    112 // (category P).
    113 func IsPunct(r rune) bool {
    114 	if uint32(r) <= MaxLatin1 {
    115 		return properties[uint8(r)]&pP != 0
    116 	}
    117 	return Is(Punct, r)
    118 }
    119 
    120 // IsSpace reports whether the rune is a space character as defined
    121 // by Unicode's White Space property; in the Latin-1 space
    122 // this is
    123 //	'\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
    124 // Other definitions of spacing characters are set by category
    125 // Z and property Pattern_White_Space.
    126 func IsSpace(r rune) bool {
    127 	// This property isn't the same as Z; special-case it.
    128 	if uint32(r) <= MaxLatin1 {
    129 		switch r {
    130 		case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
    131 			return true
    132 		}
    133 		return false
    134 	}
    135 	return isExcludingLatin(White_Space, r)
    136 }
    137 
    138 // IsSymbol reports whether the rune is a symbolic character.
    139 func IsSymbol(r rune) bool {
    140 	if uint32(r) <= MaxLatin1 {
    141 		return properties[uint8(r)]&pS != 0
    142 	}
    143 	return isExcludingLatin(Symbol, r)
    144 }
    145