Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (c) 1996, 2012, Oracle and/or its affiliates. All rights reserved.
      3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
      4  *
      5  * This code is free software; you can redistribute it and/or modify it
      6  * under the terms of the GNU General Public License version 2 only, as
      7  * published by the Free Software Foundation.  Oracle designates this
      8  * particular file as subject to the "Classpath" exception as provided
      9  * by Oracle in the LICENSE file that accompanied this code.
     10  *
     11  * This code is distributed in the hope that it will be useful, but WITHOUT
     12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     14  * version 2 for more details (a copy is included in the LICENSE file that
     15  * accompanied this code).
     16  *
     17  * You should have received a copy of the GNU General Public License version
     18  * 2 along with this work; if not, write to the Free Software Foundation,
     19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
     20  *
     21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
     22  * or visit www.oracle.com if you need additional information or have any
     23  * questions.
     24  */
     25 
     26 /*
     27  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
     28  * (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved
     29  *
     30  *   The original version of this source code and documentation is copyrighted
     31  * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
     32  * materials are provided under terms of a License Agreement between Taligent
     33  * and Sun. This technology is protected by multiple US and International
     34  * patents. This notice and attribution to Taligent may not be removed.
     35  *   Taligent is a registered trademark of Taligent, Inc.
     36  *
     37  */
     38 
     39 package java.text;
     40 
     41 import java.util.ArrayList;
     42 
     43 /**
     44  * Utility class for normalizing and merging patterns for collation.
     45  * Patterns are strings of the form <entry>*, where <entry> has the
     46  * form:
     47  * <pattern> := <entry>*
     48  * <entry> := <separator><chars>{"/"<extension>}
     49  * <separator> := "=", ",", ";", "<", "&"
     50  * <chars>, and <extension> are both arbitrary strings.
     51  * unquoted whitespaces are ignored.
     52  * 'xxx' can be used to quote characters
     53  * One difference from Collator is that & is used to reset to a current
     54  * point. Or, in other words, it introduces a new sequence which is to
     55  * be added to the old.
     56  * That is: "a < b < c < d" is the same as "a < b & b < c & c < d" OR
     57  * "a < b < d & b < c"
     58  * XXX: make '' be a single quote.
     59  * @see PatternEntry
     60  * @author             Mark Davis, Helena Shih
     61  */
     62 
     63 final class MergeCollation {
     64 
     65     /**
     66      * Creates from a pattern
     67      * @exception ParseException If the input pattern is incorrect.
     68      */
     69     public MergeCollation(String pattern) throws ParseException
     70     {
     71         for (int i = 0; i < statusArray.length; i++)
     72             statusArray[i] = 0;
     73         setPattern(pattern);
     74     }
     75 
     76     /**
     77      * recovers current pattern
     78      */
     79     public String getPattern() {
     80         return getPattern(true);
     81     }
     82 
     83     /**
     84      * recovers current pattern.
     85      * @param withWhiteSpace puts spacing around the entries, and \n
     86      * before & and <
     87      */
     88     public String getPattern(boolean withWhiteSpace) {
     89         StringBuffer result = new StringBuffer();
     90         PatternEntry tmp = null;
     91         ArrayList<PatternEntry> extList = null;
     92         int i;
     93         for (i = 0; i < patterns.size(); ++i) {
     94             PatternEntry entry = patterns.get(i);
     95             if (entry.extension.length() != 0) {
     96                 if (extList == null)
     97                     extList = new ArrayList<>();
     98                 extList.add(entry);
     99             } else {
    100                 if (extList != null) {
    101                     PatternEntry last = findLastWithNoExtension(i-1);
    102                     for (int j = extList.size() - 1; j >= 0 ; j--) {
    103                         tmp = extList.get(j);
    104                         tmp.addToBuffer(result, false, withWhiteSpace, last);
    105                     }
    106                     extList = null;
    107                 }
    108                 entry.addToBuffer(result, false, withWhiteSpace, null);
    109             }
    110         }
    111         if (extList != null) {
    112             PatternEntry last = findLastWithNoExtension(i-1);
    113             for (int j = extList.size() - 1; j >= 0 ; j--) {
    114                 tmp = extList.get(j);
    115                 tmp.addToBuffer(result, false, withWhiteSpace, last);
    116             }
    117             extList = null;
    118         }
    119         return result.toString();
    120     }
    121 
    122     private final PatternEntry findLastWithNoExtension(int i) {
    123         for (--i;i >= 0; --i) {
    124             PatternEntry entry = patterns.get(i);
    125             if (entry.extension.length() == 0) {
    126                 return entry;
    127             }
    128         }
    129         return null;
    130     }
    131 
    132     /**
    133      * emits the pattern for collation builder.
    134      * @return emits the string in the format understable to the collation
    135      * builder.
    136      */
    137     public String emitPattern() {
    138         return emitPattern(true);
    139     }
    140 
    141     /**
    142      * emits the pattern for collation builder.
    143      * @param withWhiteSpace puts spacing around the entries, and \n
    144      * before & and <
    145      * @return emits the string in the format understable to the collation
    146      * builder.
    147      */
    148     public String emitPattern(boolean withWhiteSpace) {
    149         StringBuffer result = new StringBuffer();
    150         for (int i = 0; i < patterns.size(); ++i)
    151         {
    152             PatternEntry entry = patterns.get(i);
    153             if (entry != null) {
    154                 entry.addToBuffer(result, true, withWhiteSpace, null);
    155             }
    156         }
    157         return result.toString();
    158     }
    159 
    160     /**
    161      * sets the pattern.
    162      */
    163     public void setPattern(String pattern) throws ParseException
    164     {
    165         patterns.clear();
    166         addPattern(pattern);
    167     }
    168 
    169     /**
    170      * adds a pattern to the current one.
    171      * @param pattern the new pattern to be added
    172      */
    173     public void addPattern(String pattern) throws ParseException
    174     {
    175         if (pattern == null)
    176             return;
    177 
    178         PatternEntry.Parser parser = new PatternEntry.Parser(pattern);
    179 
    180         PatternEntry entry = parser.next();
    181         while (entry != null) {
    182             fixEntry(entry);
    183             entry = parser.next();
    184         }
    185     }
    186 
    187     /**
    188      * gets count of separate entries
    189      * @return the size of pattern entries
    190      */
    191     public int getCount() {
    192         return patterns.size();
    193     }
    194 
    195     /**
    196      * gets count of separate entries
    197      * @param index the offset of the desired pattern entry
    198      * @return the requested pattern entry
    199      */
    200     public PatternEntry getItemAt(int index) {
    201         return patterns.get(index);
    202     }
    203 
    204     //============================================================
    205     // privates
    206     //============================================================
    207     ArrayList<PatternEntry> patterns = new ArrayList<>(); // a list of PatternEntries
    208 
    209     private transient PatternEntry saveEntry = null;
    210     private transient PatternEntry lastEntry = null;
    211 
    212     // This is really used as a local variable inside fixEntry, but we cache
    213     // it here to avoid newing it up every time the method is called.
    214     private transient StringBuffer excess = new StringBuffer();
    215 
    216     //
    217     // When building a MergeCollation, we need to do lots of searches to see
    218     // whether a given entry is already in the table.  Since we're using an
    219     // array, this would make the algorithm O(N*N).  To speed things up, we
    220     // use this bit array to remember whether the array contains any entries
    221     // starting with each Unicode character.  If not, we can avoid the search.
    222     // Using BitSet would make this easier, but it's significantly slower.
    223     //
    224     private transient byte[] statusArray = new byte[8192];
    225     private final byte BITARRAYMASK = (byte)0x1;
    226     private final int  BYTEPOWER = 3;
    227     private final int  BYTEMASK = (1 << BYTEPOWER) - 1;
    228 
    229     /*
    230       If the strength is RESET, then just change the lastEntry to
    231       be the current. (If the current is not in patterns, signal an error).
    232       If not, then remove the current entry, and add it after lastEntry
    233       (which is usually at the end).
    234       */
    235     private final void fixEntry(PatternEntry newEntry) throws ParseException
    236     {
    237         // check to see whether the new entry has the same characters as the previous
    238         // entry did (this can happen when a pattern declaring a difference between two
    239         // strings that are canonically equivalent is normalized).  If so, and the strength
    240         // is anything other than IDENTICAL or RESET, throw an exception (you can't
    241         // declare a string to be unequal to itself).       --rtg 5/24/99
    242         if (lastEntry != null && newEntry.chars.equals(lastEntry.chars)
    243                 && newEntry.extension.equals(lastEntry.extension)) {
    244             if (newEntry.strength != Collator.IDENTICAL
    245                 && newEntry.strength != PatternEntry.RESET) {
    246                     throw new ParseException("The entries " + lastEntry + " and "
    247                             + newEntry + " are adjacent in the rules, but have conflicting "
    248                             + "strengths: A character can't be unequal to itself.", -1);
    249             } else {
    250                 // otherwise, just skip this entry and behave as though you never saw it
    251                 return;
    252             }
    253         }
    254 
    255         boolean changeLastEntry = true;
    256         if (newEntry.strength != PatternEntry.RESET) {
    257             int oldIndex = -1;
    258 
    259             if ((newEntry.chars.length() == 1)) {
    260 
    261                 char c = newEntry.chars.charAt(0);
    262                 int statusIndex = c >> BYTEPOWER;
    263                 byte bitClump = statusArray[statusIndex];
    264                 byte setBit = (byte)(BITARRAYMASK << (c & BYTEMASK));
    265 
    266                 if (bitClump != 0 && (bitClump & setBit) != 0) {
    267                     oldIndex = patterns.lastIndexOf(newEntry);
    268                 } else {
    269                     // We're going to add an element that starts with this
    270                     // character, so go ahead and set its bit.
    271                     statusArray[statusIndex] = (byte)(bitClump | setBit);
    272                 }
    273             } else {
    274                 oldIndex = patterns.lastIndexOf(newEntry);
    275             }
    276             if (oldIndex != -1) {
    277                 patterns.remove(oldIndex);
    278             }
    279 
    280             excess.setLength(0);
    281             int lastIndex = findLastEntry(lastEntry, excess);
    282 
    283             if (excess.length() != 0) {
    284                 newEntry.extension = excess + newEntry.extension;
    285                 if (lastIndex != patterns.size()) {
    286                     lastEntry = saveEntry;
    287                     changeLastEntry = false;
    288                 }
    289             }
    290             if (lastIndex == patterns.size()) {
    291                 patterns.add(newEntry);
    292                 saveEntry = newEntry;
    293             } else {
    294                 patterns.add(lastIndex, newEntry);
    295             }
    296         }
    297         if (changeLastEntry) {
    298             lastEntry = newEntry;
    299         }
    300     }
    301 
    302     private final int findLastEntry(PatternEntry entry,
    303                               StringBuffer excessChars) throws ParseException
    304     {
    305         if (entry == null)
    306             return 0;
    307 
    308         if (entry.strength != PatternEntry.RESET) {
    309             // Search backwards for string that contains this one;
    310             // most likely entry is last one
    311 
    312             int oldIndex = -1;
    313             if ((entry.chars.length() == 1)) {
    314                 int index = entry.chars.charAt(0) >> BYTEPOWER;
    315                 if ((statusArray[index] &
    316                     (BITARRAYMASK << (entry.chars.charAt(0) & BYTEMASK))) != 0) {
    317                     oldIndex = patterns.lastIndexOf(entry);
    318                 }
    319             } else {
    320                 oldIndex = patterns.lastIndexOf(entry);
    321             }
    322             if ((oldIndex == -1))
    323                 throw new ParseException("couldn't find last entry: "
    324                                           + entry, oldIndex);
    325             return oldIndex + 1;
    326         } else {
    327             int i;
    328             for (i = patterns.size() - 1; i >= 0; --i) {
    329                 PatternEntry e = patterns.get(i);
    330                 if (e.chars.regionMatches(0,entry.chars,0,
    331                                               e.chars.length())) {
    332                     excessChars.append(entry.chars.substring(e.chars.length(),
    333                                                             entry.chars.length()));
    334                     break;
    335                 }
    336             }
    337             if (i == -1)
    338                 throw new ParseException("couldn't find: " + entry, i);
    339             return i + 1;
    340         }
    341     }
    342 }
    343