Home | History | Annotate | Download | only in utils
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements. See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership. The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the  "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *     http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 
     19 package org.apache.xml.serializer.utils;
     20 
     21 import java.util.Arrays;
     22 
     23 /**
      24  * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
     25  *
     26  * This class defines the basic properties of characters in XML 1.1. The data
     27  * in this class can be used to verify that a character is a valid
     28  * XML 1.1 character or if the character is a space, name start, or name
     29  * character.
     30  * <p>
     31  * A series of convenience methods are supplied to ease the burden
     32  * of the developer.  Using the character as an index into the <code>XML11CHARS</code>
     33  * array and applying the appropriate mask flag (e.g.
      34  * <code>MASK_XML11_VALID</code>), yields the same results as calling the
      35  * convenience methods. There is one exception: check the comments
      36  * for the <code>isXML11Valid</code> method for details.
     37  *
     38  * @author Glenn Marcy, IBM
     39  * @author Andy Clark, IBM
     40  * @author Arnaud  Le Hors, IBM
     41  * @author Neil Graham, IBM
     42  * @author Michael Glavassevich, IBM
     43  *
     44  * @version $Id: $
     45  */
     46 public class XML11Char {
     47 
     48     //
     49     // Constants
     50     //
     51 
     52     /** Character flags for XML 1.1. */
     53     private static final byte XML11CHARS [] = new byte [1 << 16];
     54 
     55     /** XML 1.1 Valid character mask. */
     56     public static final int MASK_XML11_VALID = 0x01;
     57 
     58     /** XML 1.1 Space character mask. */
     59     public static final int MASK_XML11_SPACE = 0x02;
     60 
     61     /** XML 1.1 Name start character mask. */
     62     public static final int MASK_XML11_NAME_START = 0x04;
     63 
     64     /** XML 1.1 Name character mask. */
     65     public static final int MASK_XML11_NAME = 0x08;
     66 
     67     /** XML 1.1 control character mask */
     68     public static final int MASK_XML11_CONTROL = 0x10;
     69 
     70     /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
     71     public static final int MASK_XML11_CONTENT = 0x20;
     72 
     73     /** XML namespaces 1.1 NCNameStart */
     74     public static final int MASK_XML11_NCNAME_START = 0x40;
     75 
     76     /** XML namespaces 1.1 NCName */
     77     public static final int MASK_XML11_NCNAME = 0x80;
     78 
     79     /** XML 1.1 content for internal entities (valid - "special" chars) */
     80     public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;
     81 
     82     //
     83     // Static initialization
     84     //
     85 
     86     static {
     87 
     88         // Initializing the Character Flag Array
     89         // Code generated by: XML11CharGenerator.
     90 
     91         Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
     92         XML11CHARS[9] = 35;
     93         XML11CHARS[10] = 3;
     94         Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
     95         XML11CHARS[13] = 3;
     96         Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
     97         XML11CHARS[32] = 35;
     98         Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
     99         XML11CHARS[38] = 1;
    100         Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
    101         Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
    102         XML11CHARS[47] = 33;
    103         Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
    104         XML11CHARS[58] = 45;
    105         XML11CHARS[59] = 33;
    106         XML11CHARS[60] = 1;
    107         Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
    108         Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
    109         Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
    110         XML11CHARS[93] = 1;
    111         XML11CHARS[94] = 33;
    112         XML11CHARS[95] = -19;
    113         XML11CHARS[96] = 33;
    114         Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
    115         Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
    116         Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
    117         XML11CHARS[133] = 35;
    118         Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
    119         Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
    120         XML11CHARS[183] = -87;
    121         Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
    122         Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
    123         XML11CHARS[215] = 33;
    124         Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
    125         XML11CHARS[247] = 33;
    126         Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
    127         Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
    128         Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
    129         XML11CHARS[894] = 33;
    130         Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
    131         Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
    132         Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
    133         Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
    134         XML11CHARS[8232] = 35;
    135         Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
    136         Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
    137         Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
    138         Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
    139         Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
    140         Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
    141         Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
    142         Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
    143         Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
    144         Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
    145         Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
    146         Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19
    147 
    148     } // <clinit>()
    149 
    150     //
    151     // Public static methods
    152     //
    153 
    154     /**
    155      * Returns true if the specified character is a space character
    156      * as amdended in the XML 1.1 specification.
    157      *
    158      * @param c The character to check.
    159      */
    160     public static boolean isXML11Space(int c) {
    161         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_SPACE) != 0);
    162     } // isXML11Space(int):boolean
    163 
    164     /**
    165      * Returns true if the specified character is valid. This method
    166      * also checks the surrogate character range from 0x10000 to 0x10FFFF.
    167      * <p>
    168      * If the program chooses to apply the mask directly to the
    169      * <code>XML11CHARS</code> array, then they are responsible for checking
    170      * the surrogate character range.
    171      *
    172      * @param c The character to check.
    173      */
    174     public static boolean isXML11Valid(int c) {
    175         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_VALID) != 0)
    176                 || (0x10000 <= c && c <= 0x10FFFF);
    177     } // isXML11Valid(int):boolean
    178 
    179     /**
    180      * Returns true if the specified character is invalid.
    181      *
    182      * @param c The character to check.
    183      */
    184     public static boolean isXML11Invalid(int c) {
    185         return !isXML11Valid(c);
    186     } // isXML11Invalid(int):boolean
    187 
    188     /**
    189      * Returns true if the specified character is valid and permitted outside
    190      * of a character reference.
    191      * That is, this method will return false for the same set as
    192      * isXML11Valid, except it also reports false for "control characters".
    193      *
    194      * @param c The character to check.
    195      */
    196     public static boolean isXML11ValidLiteral(int c) {
    197         return ((c < 0x10000 && ((XML11CHARS[c] & MASK_XML11_VALID) != 0 && (XML11CHARS[c] & MASK_XML11_CONTROL) == 0))
    198             || (0x10000 <= c && c <= 0x10FFFF));
    199     } // isXML11ValidLiteral(int):boolean
    200 
    201     /**
    202      * Returns true if the specified character can be considered
    203      * content in an external parsed entity.
    204      *
    205      * @param c The character to check.
    206      */
    207     public static boolean isXML11Content(int c) {
    208         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT) != 0) ||
    209                (0x10000 <= c && c <= 0x10FFFF);
    210     } // isXML11Content(int):boolean
    211 
    212     /**
    213      * Returns true if the specified character can be considered
    214      * content in an internal parsed entity.
    215      *
    216      * @param c The character to check.
    217      */
    218     public static boolean isXML11InternalEntityContent(int c) {
    219         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT_INTERNAL) != 0) ||
    220                (0x10000 <= c && c <= 0x10FFFF);
    221     } // isXML11InternalEntityContent(int):boolean
    222 
    223     /**
    224      * Returns true if the specified character is a valid name start
    225      * character as defined by production [4] in the XML 1.1
    226      * specification.
    227      *
    228      * @param c The character to check.
    229      */
    230     public static boolean isXML11NameStart(int c) {
    231         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME_START) != 0)
    232             || (0x10000 <= c && c < 0xF0000);
    233     } // isXML11NameStart(int):boolean
    234 
    235     /**
    236      * Returns true if the specified character is a valid name
    237      * character as defined by production [4a] in the XML 1.1
    238      * specification.
    239      *
    240      * @param c The character to check.
    241      */
    242     public static boolean isXML11Name(int c) {
    243         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME) != 0)
    244             || (c >= 0x10000 && c < 0xF0000);
    245     } // isXML11Name(int):boolean
    246 
    247     /**
    248      * Returns true if the specified character is a valid NCName start
    249      * character as defined by production [4] in Namespaces in XML
    250      * 1.1 recommendation.
    251      *
    252      * @param c The character to check.
    253      */
    254     public static boolean isXML11NCNameStart(int c) {
    255         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME_START) != 0)
    256             || (0x10000 <= c && c < 0xF0000);
    257     } // isXML11NCNameStart(int):boolean
    258 
    259     /**
    260      * Returns true if the specified character is a valid NCName
    261      * character as defined by production [5] in Namespaces in XML
    262      * 1.1 recommendation.
    263      *
    264      * @param c The character to check.
    265      */
    266     public static boolean isXML11NCName(int c) {
    267         return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME) != 0)
    268             || (0x10000 <= c && c < 0xF0000);
    269     } // isXML11NCName(int):boolean
    270 
    271     /**
    272      * Returns whether the given character is a valid
    273      * high surrogate for a name character. This includes
    274      * all high surrogates for characters [0x10000-0xEFFFF].
    275      * In other words everything excluding planes 15 and 16.
    276      *
    277      * @param c The character to check.
    278      */
    279     public static boolean isXML11NameHighSurrogate(int c) {
    280         return (0xD800 <= c && c <= 0xDB7F);
    281     }
    282 
    283     /*
    284      * [5] Name ::= NameStartChar NameChar*
    285      */
    286     /**
    287      * Check to see if a string is a valid Name according to [5]
    288      * in the XML 1.1 Recommendation
    289      *
    290      * @param name string to check
    291      * @return true if name is a valid Name
    292      */
    293     public static boolean isXML11ValidName(String name) {
    294         int length = name.length();
    295         if (length == 0)
    296             return false;
    297         int i = 1;
    298         char ch = name.charAt(0);
    299         if( !isXML11NameStart(ch) ) {
    300             if ( length > 1 && isXML11NameHighSurrogate(ch) ) {
    301                 char ch2 = name.charAt(1);
    302                 if ( !XMLChar.isLowSurrogate(ch2) ||
    303                      !isXML11NameStart(XMLChar.supplemental(ch, ch2)) ) {
    304                     return false;
    305                 }
    306                 i = 2;
    307             }
    308             else {
    309                 return false;
    310             }
    311         }
    312         while (i < length) {
    313             ch = name.charAt(i);
    314             if ( !isXML11Name(ch) ) {
    315                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    316                     char ch2 = name.charAt(i);
    317                     if ( !XMLChar.isLowSurrogate(ch2) ||
    318                          !isXML11Name(XMLChar.supplemental(ch, ch2)) ) {
    319                         return false;
    320                     }
    321                 }
    322                 else {
    323                     return false;
    324                 }
    325             }
    326             ++i;
    327         }
    328         return true;
    329     } // isXML11ValidName(String):boolean
    330 
    331 
    332     /*
    333      * from the namespace 1.1 rec
    334      * [4] NCName ::= NCNameStartChar NCNameChar*
    335      */
    336     /**
    337      * Check to see if a string is a valid NCName according to [4]
    338      * from the XML Namespaces 1.1 Recommendation
    339      *
    340      * @param ncName string to check
    341      * @return true if name is a valid NCName
    342      */
    343     public static boolean isXML11ValidNCName(String ncName) {
    344         int length = ncName.length();
    345         if (length == 0)
    346             return false;
    347         int i = 1;
    348         char ch = ncName.charAt(0);
    349         if( !isXML11NCNameStart(ch) ) {
    350             if ( length > 1 && isXML11NameHighSurrogate(ch) ) {
    351                 char ch2 = ncName.charAt(1);
    352                 if ( !XMLChar.isLowSurrogate(ch2) ||
    353                      !isXML11NCNameStart(XMLChar.supplemental(ch, ch2)) ) {
    354                     return false;
    355                 }
    356                 i = 2;
    357             }
    358             else {
    359                 return false;
    360             }
    361         }
    362         while (i < length) {
    363             ch = ncName.charAt(i);
    364             if ( !isXML11NCName(ch) ) {
    365                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    366                     char ch2 = ncName.charAt(i);
    367                     if ( !XMLChar.isLowSurrogate(ch2) ||
    368                          !isXML11NCName(XMLChar.supplemental(ch, ch2)) ) {
    369                         return false;
    370                     }
    371                 }
    372                 else {
    373                     return false;
    374                 }
    375             }
    376             ++i;
    377         }
    378         return true;
    379     } // isXML11ValidNCName(String):boolean
    380 
    381     /*
    382      * [7] Nmtoken ::= (NameChar)+
    383      */
    384     /**
    385      * Check to see if a string is a valid Nmtoken according to [7]
    386      * in the XML 1.1 Recommendation
    387      *
    388      * @param nmtoken string to check
    389      * @return true if nmtoken is a valid Nmtoken
    390      */
    391     public static boolean isXML11ValidNmtoken(String nmtoken) {
    392         int length = nmtoken.length();
    393         if (length == 0)
    394             return false;
    395         for (int i = 0; i < length; ++i ) {
    396             char ch = nmtoken.charAt(i);
    397             if( !isXML11Name(ch) ) {
    398                 if ( ++i < length && isXML11NameHighSurrogate(ch) ) {
    399                     char ch2 = nmtoken.charAt(i);
    400                     if ( !XMLChar.isLowSurrogate(ch2) ||
    401                          !isXML11Name(XMLChar.supplemental(ch, ch2)) ) {
    402                         return false;
    403                     }
    404                 }
    405                 else {
    406                     return false;
    407                 }
    408             }
    409         }
    410         return true;
    411     } // isXML11ValidName(String):boolean
    412 
    413 } // class XML11Char
    414 
    415