Home | History | Annotate | Download | only in utils
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements. See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership. The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the  "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *     http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 
     19 package org.apache.xml.utils;
     20 
     21 import java.util.Arrays;
     22 
     23 
     24 /**
     25  * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.utls.XMLChar
     26  *
     27  * This class defines the basic properties of characters in XML 1.1. The data
     28  * in this class can be used to verify that a character is a valid
     29  * XML 1.1 character or if the character is a space, name start, or name
     30  * character.
     31  * <p>
     32  * A series of convenience methods are supplied to ease the burden
     33  * of the developer.  Using the character as an index into the <code>XML11CHARS</code>
     34  * array and applying the appropriate mask flag (e.g.
     35  * <code>MASK_VALID</code>), yields the same results as calling the
     36  * convenience methods. There is one exception: check the comments
     37  * for the <code>isValid</code> method for details.
     38  *
     39  * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
     40  */
     41 public class XML11Char {
     42 
     43     //
     44     // Constants
     45     //
     46 
     47     /** Character flags for XML 1.1. */
     48     private static final byte XML11CHARS [] = new byte [1 << 16];
     49 
     50     /** XML 1.1 Valid character mask. */
     51     public static final int MASK_XML11_VALID = 0x01;
     52 
     53     /** XML 1.1 Space character mask. */
     54     public static final int MASK_XML11_SPACE = 0x02;
     55 
     56     /** XML 1.1 Name start character mask. */
     57     public static final int MASK_XML11_NAME_START = 0x04;
     58 
     59     /** XML 1.1 Name character mask. */
     60     public static final int MASK_XML11_NAME = 0x08;
     61 
     62     /** XML 1.1 control character mask */
     63     public static final int MASK_XML11_CONTROL = 0x10;
     64 
     65     /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
     66     public static final int MASK_XML11_CONTENT = 0x20;
     67 
     68     /** XML namespaces 1.1 NCNameStart */
     69     public static final int MASK_XML11_NCNAME_START = 0x40;
     70 
     71     /** XML namespaces 1.1 NCName */
     72     public static final int MASK_XML11_NCNAME = 0x80;
     73 
     74     /** XML 1.1 content for internal entities (valid - "special" chars) */
     75     public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;
     76 
     77     //
     78     // Static initialization
     79     //
     80 
     81     static {
     82 
     83         // Initializing the Character Flag Array
     84         // Code generated by: XML11CharGenerator.
     85 
     86         Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
     87         XML11CHARS[9] = 35;
     88         XML11CHARS[10] = 3;
     89         Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
     90         XML11CHARS[13] = 3;
     91         Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
     92         XML11CHARS[32] = 35;
     93         Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
     94         XML11CHARS[38] = 1;
     95         Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
     96         Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
     97         XML11CHARS[47] = 33;
     98         Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
     99         XML11CHARS[58] = 45;
    100         XML11CHARS[59] = 33;
    101         XML11CHARS[60] = 1;
    102         Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
    103         Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
    104         Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
    105         XML11CHARS[93] = 1;
    106         XML11CHARS[94] = 33;
    107         XML11CHARS[95] = -19;
    108         XML11CHARS[96] = 33;
    109         Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
    110         Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
    111         Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
    112         XML11CHARS[133] = 35;
    113         Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
    114         Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
    115         XML11CHARS[183] = -87;
    116         Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
    117         Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
    118         XML11CHARS[215] = 33;
    119         Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
    120         XML11CHARS[247] = 33;
    121         Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
    122         Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
    123         Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
    124         XML11CHARS[894] = 33;
    125         Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
    126         Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
    127         Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
    128         Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
    129         XML11CHARS[8232] = 35;
    130         Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
    131         Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
    132         Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
    133         Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
    134         Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
    135         Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
    136         Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
    137         Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
    138         Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
    139         Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
    140         Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
    141         Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19
    142 
    143     } // <clinit>()
    144 
    145     //
    146     // Public static methods
    147     //
    148 
    149     /**
    150      * Returns true if the specified character is a space character
    151      * as amdended in the XML 1.1 specification.
    152      *
    153      * @param c The character to check.
    154      */
    155     public static boolean isXML11Space(int c) {
    156         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_SPACE) != 0);
    157     } // isXML11Space(int):boolean
    158 
    159     /**
    160      * Returns true if the specified character is valid. This method
    161      * also checks the surrogate character range from 0x10000 to 0x10FFFF.
    162      * <p>
    163      * If the program chooses to apply the mask directly to the
    164      * <code>XML11CHARS</code> array, then they are responsible for checking
    165      * the surrogate character range.
    166      *
    167      * @param c The character to check.
    168      */
    169     public static boolean isXML11Valid(int c) {
    170         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_VALID) != 0)
    171                 || (0x10000 <= c && c <= 0x10FFFF);
    172     } // isXML11Valid(int):boolean
    173 
    174     /**
    175      * Returns true if the specified character is invalid.
    176      *
    177      * @param c The character to check.
    178      */
    179     public static boolean isXML11Invalid(int c) {
    180         return !isXML11Valid(c);
    181     } // isXML11Invalid(int):boolean
    182 
    183     /**
    184      * Returns true if the specified character is valid and permitted outside
    185      * of a character reference.
    186      * That is, this method will return false for the same set as
    187      * isXML11Valid, except it also reports false for "control characters".
    188      *
    189      * @param c The character to check.
    190      */
    191     public static boolean isXML11ValidLiteral(int c) {
    192         return ((c < 0x10000 && ((XML11CHARS[c] & MASK_XML11_VALID) != 0 && (XML11CHARS[c] & MASK_XML11_CONTROL) == 0))
    193             || (0x10000 <= c && c <= 0x10FFFF));
    194     } // isXML11ValidLiteral(int):boolean
    195 
    196     /**
    197      * Returns true if the specified character can be considered
    198      * content in an external parsed entity.
    199      *
    200      * @param c The character to check.
    201      */
    202     public static boolean isXML11Content(int c) {
    203         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT) != 0) ||
    204                (0x10000 <= c && c <= 0x10FFFF);
    205     } // isXML11Content(int):boolean
    206 
    207     /**
    208      * Returns true if the specified character can be considered
    209      * content in an internal parsed entity.
    210      *
    211      * @param c The character to check.
    212      */
    213     public static boolean isXML11InternalEntityContent(int c) {
    214         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT_INTERNAL) != 0) ||
    215                (0x10000 <= c && c <= 0x10FFFF);
    216     } // isXML11InternalEntityContent(int):boolean
    217 
    218     /**
    219      * Returns true if the specified character is a valid name start
    220      * character as defined by production [4] in the XML 1.1
    221      * specification.
    222      *
    223      * @param c The character to check.
    224      */
    225     public static boolean isXML11NameStart(int c) {
    226         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME_START) != 0)
    227             || (0x10000 <= c && c < 0xF0000);
    228     } // isXML11NameStart(int):boolean
    229 
    230     /**
    231      * Returns true if the specified character is a valid name
    232      * character as defined by production [4a] in the XML 1.1
    233      * specification.
    234      *
    235      * @param c The character to check.
    236      */
    237     public static boolean isXML11Name(int c) {
    238         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME) != 0)
    239             || (c >= 0x10000 && c < 0xF0000);
    240     } // isXML11Name(int):boolean
    241 
    242     /**
    243      * Returns true if the specified character is a valid NCName start
    244      * character as defined by production [4] in Namespaces in XML
    245      * 1.1 recommendation.
    246      *
    247      * @param c The character to check.
    248      */
    249     public static boolean isXML11NCNameStart(int c) {
    250         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME_START) != 0)
    251             || (0x10000 <= c && c < 0xF0000);
    252     } // isXML11NCNameStart(int):boolean
    253 
    254     /**
    255      * Returns true if the specified character is a valid NCName
    256      * character as defined by production [5] in Namespaces in XML
    257      * 1.1 recommendation.
    258      *
    259      * @param c The character to check.
    260      */
    261     public static boolean isXML11NCName(int c) {
    262         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME) != 0)
    263             || (0x10000 <= c && c < 0xF0000);
    264     } // isXML11NCName(int):boolean
    265 
    266     /**
    267      * Returns whether the given character is a valid
    268      * high surrogate for a name character. This includes
    269      * all high surrogates for characters [0x10000-0xEFFFF].
    270      * In other words everything excluding planes 15 and 16.
    271      *
    272      * @param c The character to check.
    273      */
    274     public static boolean isXML11NameHighSurrogate(int c) {
    275         return (0xD800 <= c && c <= 0xDB7F);
    276     }
    277 
    278     /*
    279      * [5] Name ::= NameStartChar NameChar*
    280      */
    281     /**
    282      * Check to see if a string is a valid Name according to [5]
    283      * in the XML 1.1 Recommendation
    284      *
    285      * @param name string to check
    286      * @return true if name is a valid Name
    287      */
    288     public static boolean isXML11ValidName(String name) {
    289         int length = name.length();
    290         if (length == 0)
    291             return false;
    292         int i = 1;
    293         char ch = name.charAt(0);
    294         if( !isXML11NameStart(ch) ) {
    295             if ( length > 1 && isXML11NameHighSurrogate(ch) ) {
    296                 char ch2 = name.charAt(1);
    297                 if ( !XMLChar.isLowSurrogate(ch2) ||
    298                      !isXML11NameStart(XMLChar.supplemental(ch, ch2)) ) {
    299                     return false;
    300                 }
    301                 i = 2;
    302             }
    303             else {
    304                 return false;
    305             }
    306         }
    307         while (i < length) {
    308             ch = name.charAt(i);
    309             if ( !isXML11Name(ch) ) {
    310                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    311                     char ch2 = name.charAt(i);
    312                     if ( !XMLChar.isLowSurrogate(ch2) ||
    313                          !isXML11Name(XMLChar.supplemental(ch, ch2)) ) {
    314                         return false;
    315                     }
    316                 }
    317                 else {
    318                     return false;
    319                 }
    320             }
    321             ++i;
    322         }
    323         return true;
    324     } // isXML11ValidName(String):boolean
    325 
    326 
    327     /*
    328      * from the namespace 1.1 rec
    329      * [4] NCName ::= NCNameStartChar NCNameChar*
    330      */
    331     /**
    332      * Check to see if a string is a valid NCName according to [4]
    333      * from the XML Namespaces 1.1 Recommendation
    334      *
    335      * @param ncName string to check
    336      * @return true if name is a valid NCName
    337      */
    338     public static boolean isXML11ValidNCName(String ncName) {
    339         int length = ncName.length();
    340         if (length == 0)
    341             return false;
    342         int i = 1;
    343         char ch = ncName.charAt(0);
    344         if( !isXML11NCNameStart(ch) ) {
    345             if ( length > 1 && isXML11NameHighSurrogate(ch) ) {
    346                 char ch2 = ncName.charAt(1);
    347                 if ( !XMLChar.isLowSurrogate(ch2) ||
    348                      !isXML11NCNameStart(XMLChar.supplemental(ch, ch2)) ) {
    349                     return false;
    350                 }
    351                 i = 2;
    352             }
    353             else {
    354                 return false;
    355             }
    356         }
    357         while (i < length) {
    358             ch = ncName.charAt(i);
    359             if ( !isXML11NCName(ch) ) {
    360                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    361                     char ch2 = ncName.charAt(i);
    362                     if ( !XMLChar.isLowSurrogate(ch2) ||
    363                          !isXML11NCName(XMLChar.supplemental(ch, ch2)) ) {
    364                         return false;
    365                     }
    366                 }
    367                 else {
    368                     return false;
    369                 }
    370             }
    371             ++i;
    372         }
    373         return true;
    374     } // isXML11ValidNCName(String):boolean
    375 
    376     /*
    377      * [7] Nmtoken ::= (NameChar)+
    378      */
    379     /**
    380      * Check to see if a string is a valid Nmtoken according to [7]
    381      * in the XML 1.1 Recommendation
    382      *
    383      * @param nmtoken string to check
    384      * @return true if nmtoken is a valid Nmtoken
    385      */
    386     public static boolean isXML11ValidNmtoken(String nmtoken) {
    387         int length = nmtoken.length();
    388         if (length == 0)
    389             return false;
    390         for (int i = 0; i < length; ++i ) {
    391             char ch = nmtoken.charAt(i);
    392             if( !isXML11Name(ch) ) {
    393                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    394                     char ch2 = nmtoken.charAt(i);
    395                     if ( !XMLChar.isLowSurrogate(ch2) ||
    396                          !isXML11Name(XMLChar.supplemental(ch, ch2)) ) {
    397                         return false;
    398                     }
    399                 }
    400                 else {
    401                     return false;
    402                 }
    403             }
    404         }
    405         return true;
    406     } // isXML11ValidName(String):boolean
    407 
    408     /**
    409       * Simple check to determine if qname is legal. If it returns false
    410       * then <param>str</param> is illegal; if it returns true then
    411       * <param>str</param> is legal.
    412       */
    413      public static boolean isXML11ValidQName(String str) {
    414 
    415         final int colon = str.indexOf(':');
    416 
    417         if (colon == 0 || colon == str.length() - 1) {
    418             return false;
    419         }
    420 
    421         if (colon > 0) {
    422             final String prefix = str.substring(0,colon);
    423             final String localPart = str.substring(colon+1);
    424             return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
    425         }
    426         else {
    427             return isXML11ValidNCName(str);
    428         }
    429      }
    430 
    431 } // class XML11Char
    432 
    433