Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 *   Copyright (C) 2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 *   created on: 2011feb25
     10 *   created by: Markus W. Scherer
     11 */
     12 
     13 package android.icu.impl;
     14 
     15 /**
     16  * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
     17  * Hardcodes these properties, does not load data, does not depend on other ICU classes.
     18  * <p>
     19  * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
     20  * and both properties only include BMP code points (no supplementary ones).
     21  * Pattern_Syntax includes some unassigned code points.
     22  * <p>
     23  * [:Pattern_White_Space:] =
     24  *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
     25  * <p>
     26  * [:Pattern_Syntax:] =
     27  *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
     28  *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
     29  *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
     30  *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
     31  *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
     32  * @author mscherer
     33  * @hide Only a subset of ICU is exposed in Android
     34  */
     35 public final class PatternProps {
     36     /**
     37      * @return true if c is a Pattern_Syntax code point.
     38      */
     39     public static boolean isSyntax(int c) {
     40         if(c<0) {
     41             return false;
     42         } else if(c<=0xff) {
     43             return latin1[c]==3;
     44         } else if(c<0x2010) {
     45             return false;
     46         } else if(c<=0x3030) {
     47             int bits=syntax2000[index2000[(c-0x2000)>>5]];
     48             return ((bits>>(c&0x1f))&1)!=0;
     49         } else if(0xfd3e<=c && c<=0xfe46) {
     50             return c<=0xfd3f || 0xfe45<=c;
     51         } else {
     52             return false;
     53         }
     54     }
     55 
     56     /**
     57      * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     58      */
     59     public static boolean isSyntaxOrWhiteSpace(int c) {
     60         if(c<0) {
     61             return false;
     62         } else if(c<=0xff) {
     63             return latin1[c]!=0;
     64         } else if(c<0x200e) {
     65             return false;
     66         } else if(c<=0x3030) {
     67             int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
     68             return ((bits>>(c&0x1f))&1)!=0;
     69         } else if(0xfd3e<=c && c<=0xfe46) {
     70             return c<=0xfd3f || 0xfe45<=c;
     71         } else {
     72             return false;
     73         }
     74     }
     75 
     76     /**
     77      * @return true if c is a Pattern_White_Space character.
     78      */
     79     public static boolean isWhiteSpace(int c) {
     80         if(c<0) {
     81             return false;
     82         } else if(c<=0xff) {
     83             return latin1[c]==5;
     84         } else if(0x200e<=c && c<=0x2029) {
     85             return c<=0x200f || 0x2028<=c;
     86         } else {
     87             return false;
     88         }
     89     }
     90 
     91     /**
     92      * Skips over Pattern_White_Space starting at index i of the CharSequence.
     93      * @return The smallest index at or after i with a non-white space character.
     94      */
     95     public static int skipWhiteSpace(CharSequence s, int i) {
     96         while(i<s.length() && isWhiteSpace(s.charAt(i))) {
     97             ++i;
     98         }
     99         return i;
    100     }
    101 
    102     /**
    103      * @return s except with leading and trailing Pattern_White_Space removed.
    104      */
    105     public static String trimWhiteSpace(String s) {
    106         if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
    107             return s;
    108         }
    109         int start=0;
    110         int limit=s.length();
    111         while(start<limit && isWhiteSpace(s.charAt(start))) {
    112             ++start;
    113         }
    114         if(start<limit) {
    115             // There is non-white space at start; we will not move limit below that,
    116             // so we need not test start<limit in the loop.
    117             while(isWhiteSpace(s.charAt(limit-1))) {
    118                 --limit;
    119             }
    120         }
    121         return s.substring(start, limit);
    122     }
    123 
    124     /**
    125      * Tests whether the CharSequence contains a "pattern identifier", that is,
    126      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    127      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s.
    128      */
    129     public static boolean isIdentifier(CharSequence s) {
    130         int limit=s.length();
    131         if(limit==0) {
    132             return false;
    133         }
    134         int start=0;
    135         do {
    136             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    137                 return false;
    138             }
    139         } while(start<limit);
    140         return true;
    141     }
    142 
    143     /**
    144      * Tests whether the CharSequence contains a "pattern identifier", that is,
    145      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    146      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters
    147      *         in s between start and (exclusive) limit.
    148      */
    149     public static boolean isIdentifier(CharSequence s, int start, int limit) {
    150         if(start>=limit) {
    151             return false;
    152         }
    153         do {
    154             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    155                 return false;
    156             }
    157         } while(start<limit);
    158         return true;
    159     }
    160 
    161     /**
    162      * Skips over a "pattern identifier" starting at index i of the CharSequence.
    163      * @return The smallest index at or after i with
    164      *         a Pattern_White_Space or Pattern_Syntax character.
    165      */
    166     public static int skipIdentifier(CharSequence s, int i) {
    167         while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
    168             ++i;
    169         }
    170         return i;
    171     }
    172 
    173     /*
    174      * One byte per Latin-1 character.
    175      * Bit 0 is set if either Pattern property is true,
    176      * bit 1 if Pattern_Syntax is true,
    177      * bit 2 if Pattern_White_Space is true.
    178      * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
    179      */
    180     private static final byte latin1[]=new byte[] {  // 256
    181         // WS: 9..D
    182         0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    183         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    184         // WS: 20  Syntax: 21..2F
    185         5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    186         // Syntax: 3A..40
    187         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    188         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    189         // Syntax: 5B..5E
    190         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    191         // Syntax: 60
    192         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    193         // Syntax: 7B..7E
    194         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    195         // WS: 85
    196         0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    197         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    198         // Syntax: A1..A7, A9, AB, AC, AE
    199         0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    200         // Syntax: B0, B1, B6, BB, BF
    201         3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    202         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    203         // Syntax: D7
    204         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    205         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    206         // Syntax: F7
    207         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    208     };
    209 
    210     /*
    211      * One byte per 32 characters from U+2000..U+303F indexing into
    212      * a small table of 32-bit data words.
    213      * The first two data words are all-zeros and all-ones.
    214      */
    215     private static final byte index2000[]=new byte[] {  // 130
    216         2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    217         0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    218         1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    219         1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    220         1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    221         1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    222         1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    223         1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    224         1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    225         1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    226         1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    227         1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    228         0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    229         0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    230         1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    231         0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    232         8, 9  // 3000..303F
    233     };
    234 
    235     /*
    236      * One 32-bit integer per 32 characters. Ranges of all-false and all-true
    237      * are mapped to the first two values, other ranges map to appropriate bit patterns.
    238      */
    239     private static final int syntax2000[]=new int[] {
    240         0,
    241         -1,
    242         0xffff0000,  // 2: 2010..201F
    243         0x7fff00ff,  // 3: 2020..2027, 2030..203E
    244         0x7feffffe,  // 4: 2041..2053, 2055..205E
    245         0xffff0000,  // 5: 2190..219F
    246         0x003fffff,  // 6: 2760..2775
    247         0xfff00000,  // 7: 2794..279F
    248         0xffffff0e,  // 8: 3001..3003, 3008..301F
    249         0x00010001   // 9: 3020, 3030
    250     };
    251 
    252     /*
    253      * Same as syntax2000, but with additional bits set for the
    254      * Pattern_White_Space characters 200E 200F 2028 2029.
    255      */
    256     private static final int syntaxOrWhiteSpace2000[]=new int[] {
    257         0,
    258         -1,
    259         0xffffc000,  // 2: 200E..201F
    260         0x7fff03ff,  // 3: 2020..2029, 2030..203E
    261         0x7feffffe,  // 4: 2041..2053, 2055..205E
    262         0xffff0000,  // 5: 2190..219F
    263         0x003fffff,  // 6: 2760..2775
    264         0xfff00000,  // 7: 2794..279F
    265         0xffffff0e,  // 8: 3001..3003, 3008..301F
    266         0x00010001   // 9: 3020, 3030
    267     };
    268 }
    269