/*
*******************************************************************************
*   Copyright (C) 2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   created on: 2011feb25
*   created by: Markus W. Scherer
*/
      9 
     10 package com.ibm.icu.impl;
     11 
     12 /**
     13  * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
     14  * Hardcodes these properties, does not load data, does not depend on other ICU classes.
     15  * <p>
     16  * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
     17  * and both properties only include BMP code points (no supplementary ones).
     18  * Pattern_Syntax includes some unassigned code points.
     19  * <p>
     20  * [:Pattern_White_Space:] =
     21  *   [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
     22  * <p>
     23  * [:Pattern_Syntax:] =
     24  *   [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
     25  *    \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
     26  *    \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
     27  *    \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
     28  *    \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
     29  * @author mscherer
     30  */
     31 public final class PatternProps {
     32     /**
     33      * @return true if c is a Pattern_Syntax code point.
     34      */
     35     public static boolean isSyntax(int c) {
     36         if(c<0) {
     37             return false;
     38         } else if(c<=0xff) {
     39             return latin1[c]==3;
     40         } else if(c<0x2010) {
     41             return false;
     42         } else if(c<=0x3030) {
     43             int bits=syntax2000[index2000[(c-0x2000)>>5]];
     44             return ((bits>>(c&0x1f))&1)!=0;
     45         } else if(0xfd3e<=c && c<=0xfe46) {
     46             return c<=0xfd3f || 0xfe45<=c;
     47         } else {
     48             return false;
     49         }
     50     }
     51 
     52     /**
     53      * @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
     54      */
     55     public static boolean isSyntaxOrWhiteSpace(int c) {
     56         if(c<0) {
     57             return false;
     58         } else if(c<=0xff) {
     59             return latin1[c]!=0;
     60         } else if(c<0x200e) {
     61             return false;
     62         } else if(c<=0x3030) {
     63             int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
     64             return ((bits>>(c&0x1f))&1)!=0;
     65         } else if(0xfd3e<=c && c<=0xfe46) {
     66             return c<=0xfd3f || 0xfe45<=c;
     67         } else {
     68             return false;
     69         }
     70     }
     71 
     72     /**
     73      * @return true if c is a Pattern_White_Space character.
     74      */
     75     public static boolean isWhiteSpace(int c) {
     76         if(c<0) {
     77             return false;
     78         } else if(c<=0xff) {
     79             return latin1[c]==5;
     80         } else if(0x200e<=c && c<=0x2029) {
     81             return c<=0x200f || 0x2028<=c;
     82         } else {
     83             return false;
     84         }
     85     }
     86 
     87     /**
     88      * Skips over Pattern_White_Space starting at index i of the CharSequence.
     89      * @return The smallest index at or after i with a non-white space character.
     90      */
     91     public static int skipWhiteSpace(CharSequence s, int i) {
     92         while(i<s.length() && isWhiteSpace(s.charAt(i))) {
     93             ++i;
     94         }
     95         return i;
     96     }
     97 
     98     /**
     99      * @return s except with leading and trailing Pattern_White_Space removed.
    100      */
    101     public static String trimWhiteSpace(String s) {
    102         if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) {
    103             return s;
    104         }
    105         int start=0;
    106         int limit=s.length();
    107         while(start<limit && isWhiteSpace(s.charAt(start))) {
    108             ++start;
    109         }
    110         if(start<limit) {
    111             // There is non-white space at start; we will not move limit below that,
    112             // so we need not test start<limit in the loop.
    113             while(isWhiteSpace(s.charAt(limit-1))) {
    114                 --limit;
    115             }
    116         }
    117         return s.substring(start, limit);
    118     }
    119 
    120     /**
    121      * Tests whether the CharSequence contains a "pattern identifier", that is,
    122      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    123      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s.
    124      */
    125     public static boolean isIdentifier(CharSequence s) {
    126         int limit=s.length();
    127         if(limit==0) {
    128             return false;
    129         }
    130         int start=0;
    131         do {
    132             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    133                 return false;
    134             }
    135         } while(start<limit);
    136         return true;
    137     }
    138 
    139     /**
    140      * Tests whether the CharSequence contains a "pattern identifier", that is,
    141      * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
    142      * @return true if there are no Pattern_White_Space or Pattern_Syntax characters
    143      *         in s between start and (exclusive) limit.
    144      */
    145     public static boolean isIdentifier(CharSequence s, int start, int limit) {
    146         if(start>=limit) {
    147             return false;
    148         }
    149         do {
    150             if(isSyntaxOrWhiteSpace(s.charAt(start++))) {
    151                 return false;
    152             }
    153         } while(start<limit);
    154         return true;
    155     }
    156 
    157     /**
    158      * Skips over a "pattern identifier" starting at index i of the CharSequence.
    159      * @return The smallest index at or after i with
    160      *         a Pattern_White_Space or Pattern_Syntax character.
    161      */
    162     public static int skipIdentifier(CharSequence s, int i) {
    163         while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) {
    164             ++i;
    165         }
    166         return i;
    167     }
    168 
    169     /*
    170      * One byte per Latin-1 character.
    171      * Bit 0 is set if either Pattern property is true,
    172      * bit 1 if Pattern_Syntax is true,
    173      * bit 2 if Pattern_White_Space is true.
    174      * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
    175      */
    176     private static final byte latin1[]=new byte[] {  // 256
    177         // WS: 9..D
    178         0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
    179         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    180         // WS: 20  Syntax: 21..2F
    181         5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    182         // Syntax: 3A..40
    183         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
    184         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    185         // Syntax: 5B..5E
    186         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    187         // Syntax: 60
    188         3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    189         // Syntax: 7B..7E
    190         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    191         // WS: 85
    192         0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    193         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    194         // Syntax: A1..A7, A9, AB, AC, AE
    195         0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
    196         // Syntax: B0, B1, B6, BB, BF
    197         3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
    198         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    199         // Syntax: D7
    200         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
    201         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    202         // Syntax: F7
    203         0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
    204     };
    205 
    206     /*
    207      * One byte per 32 characters from U+2000..U+303F indexing into
    208      * a small table of 32-bit data words.
    209      * The first two data words are all-zeros and all-ones.
    210      */
    211     private static final byte index2000[]=new byte[] {  // 130
    212         2, 3, 4, 0, 0, 0, 0, 0,  // 20xx
    213         0, 0, 0, 0, 5, 1, 1, 1,  // 21xx
    214         1, 1, 1, 1, 1, 1, 1, 1,  // 22xx
    215         1, 1, 1, 1, 1, 1, 1, 1,  // 23xx
    216         1, 1, 1, 0, 0, 0, 0, 0,  // 24xx
    217         1, 1, 1, 1, 1, 1, 1, 1,  // 25xx
    218         1, 1, 1, 1, 1, 1, 1, 1,  // 26xx
    219         1, 1, 1, 6, 7, 1, 1, 1,  // 27xx
    220         1, 1, 1, 1, 1, 1, 1, 1,  // 28xx
    221         1, 1, 1, 1, 1, 1, 1, 1,  // 29xx
    222         1, 1, 1, 1, 1, 1, 1, 1,  // 2Axx
    223         1, 1, 1, 1, 1, 1, 1, 1,  // 2Bxx
    224         0, 0, 0, 0, 0, 0, 0, 0,  // 2Cxx
    225         0, 0, 0, 0, 0, 0, 0, 0,  // 2Dxx
    226         1, 1, 1, 1, 0, 0, 0, 0,  // 2Exx
    227         0, 0, 0, 0, 0, 0, 0, 0,  // 2Fxx
    228         8, 9  // 3000..303F
    229     };
    230 
    231     /*
    232      * One 32-bit integer per 32 characters. Ranges of all-false and all-true
    233      * are mapped to the first two values, other ranges map to appropriate bit patterns.
    234      */
    235     private static final int syntax2000[]=new int[] {
    236         0,
    237         -1,
    238         0xffff0000,  // 2: 2010..201F
    239         0x7fff00ff,  // 3: 2020..2027, 2030..203E
    240         0x7feffffe,  // 4: 2041..2053, 2055..205E
    241         0xffff0000,  // 5: 2190..219F
    242         0x003fffff,  // 6: 2760..2775
    243         0xfff00000,  // 7: 2794..279F
    244         0xffffff0e,  // 8: 3001..3003, 3008..301F
    245         0x00010001   // 9: 3020, 3030
    246     };
    247 
    248     /*
    249      * Same as syntax2000, but with additional bits set for the
    250      * Pattern_White_Space characters 200E 200F 2028 2029.
    251      */
    252     private static final int syntaxOrWhiteSpace2000[]=new int[] {
    253         0,
    254         -1,
    255         0xffffc000,  // 2: 200E..201F
    256         0x7fff03ff,  // 3: 2020..2029, 2030..203E
    257         0x7feffffe,  // 4: 2041..2053, 2055..205E
    258         0xffff0000,  // 5: 2190..219F
    259         0x003fffff,  // 6: 2760..2775
    260         0xfff00000,  // 7: 2794..279F
    261         0xffffff0e,  // 8: 3001..3003, 3008..301F
    262         0x00010001   // 9: 3020, 3030
    263     };
    264 }
    265