Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 /*
      3 *******************************************************************************
      4 *   Copyright (C) 2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *******************************************************************************
      7 *   created on: 2011feb25
      8 *   created by: Markus W. Scherer
      9 */
     10 
     11 package android.icu.impl;
     12 
     13 /**
     14  * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
     15  * Hardcodes these properties, does not load data, does not depend on other ICU classes.
     16  * <p>
     17  * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
     18  * and both properties only include BMP code points (no supplementary ones).
     19  * Pattern_Syntax includes some unassigned code points.
     20  * <p>
     21  * [:Pattern_White_Space:] =
     22  *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
     23  * <p>
     24  * [:Pattern_Syntax:] =
     25  *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
     26  *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
     27  *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
     28  *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
     29  *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
     30  * @author mscherer
     31  * @hide Only a subset of ICU is exposed in Android
     32  */
     33 public final class PatternProps {
     34     /**
     35      * @return true if c is a Pattern_Syntax code point.
     36      */
     37     public static boolean isSyntax(int c) {
     38         if(c<0) {
     39             return false;
     40         } else if(c<=0xff) {
     41             return latin1[c]==3;
     42         } else if(c<0x2010) {
     43             return false;
     44         } else if(c<=0x3030) {
     45             int bits=syntax2000[index2000[(c-0x2000)>>5]];
     46             return ((bits>>(c&0x1f))&1)!=0;
     47         } else if(0xfd3e<=c && c<=0xfe46) {
     48             return c<=0xfd3f || 0xfe45<=c;
     49         } else {
     50             return false;
     51         }
     52     }
     53 
     54     /**
     55      * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     56      */
     57     public static boolean isSyntaxOrWhiteSpace(int c) {
     58         if(c<0) {
     59             return false;
     60         } else if(c<=0xff) {
     61             return latin1[c]!=0;
     62         } else if(c<0x200e) {
     63             return false;
     64         } else if(c<=0x3030) {
     65             int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
     66             return ((bits>>(c&0x1f))&1)!=0;
     67         } else if(0xfd3e<=c && c<=0xfe46) {
     68             return c<=0xfd3f || 0xfe45<=c;
     69         } else {
     70             return false;
     71         }
     72     }
     73 
     74     /**
     75      * @return true if c is a Pattern_White_Space character.
     76      */
     77     public static boolean isWhiteSpace(int c) {
     78         if(c<0) {
     79             return false;
     80         } else if(c<=0xff) {
     81             return latin1[c]==5;
     82         } else if(0x200e<=c && c<=0x2029) {
     83             return c<=0x200f || 0x2028<=c;
     84         } else {
     85             return false;
     86         }
     87     }
     88 
     89     /**
     90      * Skips over Pattern_White_Space starting at index i of the CharSequence.
     91      * @return The smallest index at or after i with a non-white space character.
     92      */
     93     public static int skipWhiteSpace(CharSequence s, int i) {
     94         while(i<s.length() && isWhiteSpace(s.charAt(i))) {
     95             ++i;
     96         }
     97         return i;
     98     }
     99 
    100     /**
    101      * @return s except with leading and trailing Pattern_White_Space removed.
    102      */
    103     public static String trimWhiteSpace(String s) {
    104         if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
    105             return s;
    106         }
    107         int start=0;
    108         int limit=s.length();
    109         while(start<limit && isWhiteSpace(s.charAt(start))) {
    110             ++start;
    111         }
    112         if(start<limit) {
    113             // There is non-white space at start; we will not move limit below that,
    114             // so we need not test start<limit in the loop.
    115             while(isWhiteSpace(s.charAt(limit-1))) {
    116                 --limit;
    117             }
    118         }
    119         return s.substring(start, limit);
    120     }
    121 
    122     /**
    123      * Tests whether the CharSequence contains a "pattern identifier", that is,
    124      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    125      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s.
    126      */
    127     public static boolean isIdentifier(CharSequence s) {
    128         int limit=s.length();
    129         if(limit==0) {
    130             return false;
    131         }
    132         int start=0;
    133         do {
    134             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    135                 return false;
    136             }
    137         } while(start<limit);
    138         return true;
    139     }
    140 
    141     /**
    142      * Tests whether the CharSequence contains a "pattern identifier", that is,
    143      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    144      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters
    145      *         in s between start and (exclusive) limit.
    146      */
    147     public static boolean isIdentifier(CharSequence s, int start, int limit) {
    148         if(start>=limit) {
    149             return false;
    150         }
    151         do {
    152             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    153                 return false;
    154             }
    155         } while(start<limit);
    156         return true;
    157     }
    158 
    159     /**
    160      * Skips over a "pattern identifier" starting at index i of the CharSequence.
    161      * @return The smallest index at or after i with
    162      *         a Pattern_White_Space or Pattern_Syntax character.
    163      */
    164     public static int skipIdentifier(CharSequence s, int i) {
    165         while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
    166             ++i;
    167         }
    168         return i;
    169     }
    170 
    171     /*
    172      * One byte per Latin-1 character.
    173      * Bit 0 is set if either Pattern property is true,
    174      * bit 1 if Pattern_Syntax is true,
    175      * bit 2 if Pattern_White_Space is true.
    176      * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
    177      */
    178     private static final byte latin1[]=new byte[] {  // 256
    179         // WS: 9..D
    180         0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    181         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    182         // WS: 20  Syntax: 21..2F
    183         5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    184         // Syntax: 3A..40
    185         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    186         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    187         // Syntax: 5B..5E
    188         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    189         // Syntax: 60
    190         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    191         // Syntax: 7B..7E
    192         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    193         // WS: 85
    194         0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    195         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    196         // Syntax: A1..A7, A9, AB, AC, AE
    197         0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    198         // Syntax: B0, B1, B6, BB, BF
    199         3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    200         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    201         // Syntax: D7
    202         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    203         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    204         // Syntax: F7
    205         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    206     };
    207 
    208     /*
    209      * One byte per 32 characters from U+2000..U+303F indexing into
    210      * a small table of 32-bit data words.
    211      * The first two data words are all-zeros and all-ones.
    212      */
    213     private static final byte index2000[]=new byte[] {  // 130
    214         2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    215         0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    216         1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    217         1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    218         1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    219         1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    220         1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    221         1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    222         1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    223         1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    224         1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    225         1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    226         0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    227         0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    228         1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    229         0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    230         8, 9  // 3000..303F
    231     };
    232 
    233     /*
    234      * One 32-bit integer per 32 characters. Ranges of all-false and all-true
    235      * are mapped to the first two values, other ranges map to appropriate bit patterns.
    236      */
    237     private static final int syntax2000[]=new int[] {
    238         0,
    239         -1,
    240         0xffff0000,  // 2: 2010..201F
    241         0x7fff00ff,  // 3: 2020..2027, 2030..203E
    242         0x7feffffe,  // 4: 2041..2053, 2055..205E
    243         0xffff0000,  // 5: 2190..219F
    244         0x003fffff,  // 6: 2760..2775
    245         0xfff00000,  // 7: 2794..279F
    246         0xffffff0e,  // 8: 3001..3003, 3008..301F
    247         0x00010001   // 9: 3020, 3030
    248     };
    249 
    250     /*
    251      * Same as syntax2000, but with additional bits set for the
    252      * Pattern_White_Space characters 200E 200F 2028 2029.
    253      */
    254     private static final int syntaxOrWhiteSpace2000[]=new int[] {
    255         0,
    256         -1,
    257         0xffffc000,  // 2: 200E..201F
    258         0x7fff03ff,  // 3: 2020..2029, 2030..203E
    259         0x7feffffe,  // 4: 2041..2053, 2055..205E
    260         0xffff0000,  // 5: 2190..219F
    261         0x003fffff,  // 6: 2760..2775
    262         0xfff00000,  // 7: 2794..279F
    263         0xffffff0e,  // 8: 3001..3003, 3008..301F
    264         0x00010001   // 9: 3020, 3030
    265     };
    266 }
    267