1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Unicode case folding tables. 6 7 // The Unicode case folding tables encode the mapping from one Unicode point 8 // to the next largest Unicode point with equivalent folding. The largest 9 // point wraps back to the first. For example, the tables map: 10 // 11 // 'A' -> 'a' 12 // 'a' -> 'A' 13 // 14 // 'K' -> 'k' 15 // 'k' -> '' (Kelvin symbol) 16 // '' -> 'K' 17 // 18 // Like everything Unicode, these tables are big. If we represent the table 19 // as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB. 20 // Most table entries look like the ones around them: 21 // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 22 // Instead of listing all the pairs explicitly, we make a list of ranges 23 // and deltas, so that the table entries for 'A' through 'Z' can be represented 24 // as a single entry { 'A', 'Z', +32 }. 25 // 26 // In addition to blocks that map to each other (A-Z mapping to a-z) 27 // there are blocks of pairs that individually map to each other 28 // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 29 // For those, the special delta value EvenOdd marks even/odd pairs 30 // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 31 // 32 // In this form, the table has 274 entries, about 3kB. If we were to split 33 // the table into one for 16-bit codes and an overflow table for larger ones, 34 // we could get it down to about 1.5kB, but that's not worth the complexity. 35 // 36 // The grouped form also allows for efficient fold range calculations 37 // rather than looping one character at a time. 38 39 #ifndef RE2_UNICODE_CASEFOLD_H__ 40 #define RE2_UNICODE_CASEFOLD_H__ 41 42 #include "util/util.h" 43 44 namespace re2 { 45 46 enum { 47 EvenOdd = 1, 48 OddEven = -1, 49 EvenOddSkip = 1<<30, 50 OddEvenSkip, 51 }; 52 53 struct CaseFold { 54 uint32 lo; 55 uint32 hi; 56 int32 delta; 57 }; 58 59 extern CaseFold unicode_casefold[]; 60 extern int num_unicode_casefold; 61 62 extern CaseFold unicode_tolower[]; 63 extern int num_unicode_tolower; 64 65 // Returns the CaseFold* in the tables that contains rune. 66 // If rune is not in the tables, returns the first CaseFold* after rune. 67 // If rune is larger than any value in the tables, returns NULL. 68 extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune); 69 70 // Returns the result of applying the fold f to the rune r. 71 extern Rune ApplyFold(CaseFold *f, Rune r); 72 73 } // namespace re2 74 75 #endif // RE2_UNICODE_CASEFOLD_H__ 76