Home | History | Annotate | Download | only in re2
      1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Unicode case folding tables.
      6 
      7 // The Unicode case folding tables encode the mapping from one Unicode point
      8 // to the next largest Unicode point with equivalent folding.  The largest
      9 // point wraps back to the first.  For example, the tables map:
     10 //
     11 //     'A' -> 'a'
     12 //     'a' -> 'A'
     13 //
     14 //     'K' -> 'k'
     15 //     'k' -> ''  (Kelvin symbol)
     16 //     '' -> 'K'
     17 //
     18 // Like everything Unicode, these tables are big.  If we represent the table
     19 // as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
     20 // Most table entries look like the ones around them:
     21 // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
     22 // Instead of listing all the pairs explicitly, we make a list of ranges
     23 // and deltas, so that the table entries for 'A' through 'Z' can be represented
     24 // as a single entry { 'A', 'Z', +32 }.
     25 //
     26 // In addition to blocks that map to each other (A-Z mapping to a-z)
     27 // there are blocks of pairs that individually map to each other
     28 // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
     29 // For those, the special delta value EvenOdd marks even/odd pairs
     30 // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
     31 //
     32 // In this form, the table has 274 entries, about 3kB.  If we were to split
     33 // the table into one for 16-bit codes and an overflow table for larger ones,
     34 // we could get it down to about 1.5kB, but that's not worth the complexity.
     35 //
     36 // The grouped form also allows for efficient fold range calculations
     37 // rather than looping one character at a time.
     38 
     39 #ifndef RE2_UNICODE_CASEFOLD_H__
     40 #define RE2_UNICODE_CASEFOLD_H__
     41 
     42 #include "util/util.h"
     43 
     44 namespace re2 {
     45 
     46 enum {
     47   EvenOdd = 1,
     48   OddEven = -1,
     49   EvenOddSkip = 1<<30,
     50   OddEvenSkip,
     51 };
     52 
     53 struct CaseFold {
     54   uint32 lo;
     55   uint32 hi;
     56   int32 delta;
     57 };
     58 
     59 extern CaseFold unicode_casefold[];
     60 extern int num_unicode_casefold;
     61 
     62 extern CaseFold unicode_tolower[];
     63 extern int num_unicode_tolower;
     64 
     65 // Returns the CaseFold* in the tables that contains rune.
     66 // If rune is not in the tables, returns the first CaseFold* after rune.
     67 // If rune is larger than any value in the tables, returns NULL.
     68 extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
     69 
     70 // Returns the result of applying the fold f to the rune r.
     71 extern Rune ApplyFold(CaseFold *f, Rune r);
     72 
     73 }  // namespace re2
     74 
     75 #endif  // RE2_UNICODE_CASEFOLD_H__
     76