Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.util.ArrayList;
     13 import java.util.List;
     14 
     15 import android.icu.impl.UtilityExtensions;
     16 
     17 /**
     18  * A set of rules for a <code>RuleBasedTransliterator</code>.  This set encodes
     19  * the transliteration in one direction from one set of characters or short
     20  * strings to another.  A <code>RuleBasedTransliterator</code> consists of up to
     21  * two such sets, one for the forward direction, and one for the reverse.
     22  *
     23  * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
     24  * finding a matching rule at a given point in the text.  This is accomplished
     25  * by the <code>findMatch()</code> method.
     26  *
     27  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
     28  *
     29  * @author Alan Liu
     30  */
     31 class TransliterationRuleSet {
     32     /**
     33      * Vector of rules, in the order added.
     34      */
     35     private List<TransliterationRule> ruleVector;
     36 
     37     /**
     38      * Length of the longest preceding context
     39      */
     40     private int maxContextLength;
     41 
     42     /**
     43      * Sorted and indexed table of rules.  This is created by freeze() from
     44      * the rules in ruleVector.  rules.length >= ruleVector.size(), and the
     45      * references in rules[] are aliases of the references in ruleVector.
     46      * A single rule in ruleVector is listed one or more times in rules[].
     47      */
     48     private TransliterationRule[] rules;
     49 
     50     /**
     51      * Index table.  For text having a first character c, compute x = c&0xFF.
     52      * Now use rules[index[x]..index[x+1]-1].  This index table is created by
     53      * freeze().
     54      */
     55     private int[] index;
     56 
     57     /**
     58      * Construct a new empty rule set.
     59      */
     60     public TransliterationRuleSet() {
     61         ruleVector = new ArrayList<TransliterationRule>();
     62         maxContextLength = 0;
     63     }
     64 
     65     /**
     66      * Return the maximum context length.
     67      * @return the length of the longest preceding context.
     68      */
     69     public int getMaximumContextLength() {
     70         return maxContextLength;
     71     }
     72 
     73     /**
     74      * Add a rule to this set.  Rules are added in order, and order is
     75      * significant.
     76      * @param rule the rule to add
     77      */
     78     public void addRule(TransliterationRule rule) {
     79         ruleVector.add(rule);
     80         int len;
     81         if ((len = rule.getAnteContextLength()) > maxContextLength) {
     82             maxContextLength = len;
     83         }
     84 
     85         rules = null;
     86     }
     87 
     88     /**
     89      * Close this rule set to further additions, check it for masked rules,
     90      * and index it to optimize performance.
     91      * @exception IllegalArgumentException if some rules are masked
     92      */
     93     public void freeze() {
     94         /* Construct the rule array and index table.  We reorder the
     95          * rules by sorting them into 256 bins.  Each bin contains all
     96          * rules matching the index value for that bin.  A rule
     97          * matches an index value if string whose first key character
     98          * has a low byte equal to the index value can match the rule.
     99          *
    100          * Each bin contains zero or more rules, in the same order
    101          * they were found originally.  However, the total rules in
    102          * the bins may exceed the number in the original vector,
    103          * since rules that have a variable as their first key
    104          * character will generally fall into more than one bin.
    105          *
    106          * That is, each bin contains all rules that either have that
    107          * first index value as their first key character, or have
    108          * a set containing the index value as their first character.
    109          */
    110         int n = ruleVector.size();
    111         index = new int[257]; // [sic]
    112         List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed
    113 
    114         /* Precompute the index values.  This saves a LOT of time.
    115          */
    116         int[] indexValue = new int[n];
    117         for (int j=0; j<n; ++j) {
    118             TransliterationRule r = ruleVector.get(j);
    119             indexValue[j] = r.getIndexValue();
    120         }
    121         for (int x=0; x<256; ++x) {
    122             index[x] = v.size();
    123             for (int j=0; j<n; ++j) {
    124                 if (indexValue[j] >= 0) {
    125                     if (indexValue[j] == x) {
    126                         v.add(ruleVector.get(j));
    127                     }
    128                 } else {
    129                     // If the indexValue is < 0, then the first key character is
    130                     // a set, and we must use the more time-consuming
    131                     // matchesIndexValue check.  In practice this happens
    132                     // rarely, so we seldom tread this code path.
    133                     TransliterationRule r = ruleVector.get(j);
    134                     if (r.matchesIndexValue(x)) {
    135                         v.add(r);
    136                     }
    137                 }
    138             }
    139         }
    140         index[256] = v.size();
    141 
    142         /* Freeze things into an array.
    143          */
    144         rules = new TransliterationRule[v.size()];
    145         v.toArray(rules);
    146 
    147         StringBuilder errors = null;
    148 
    149         /* Check for masking.  This is MUCH faster than our old check,
    150          * which was each rule against each following rule, since we
    151          * only have to check for masking within each bin now.  It's
    152          * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
    153          * count, and n2 is the per-bin rule count.  But n2<<n1, so
    154          * it's a big win.
    155          */
    156         for (int x=0; x<256; ++x) {
    157             for (int j=index[x]; j<index[x+1]-1; ++j) {
    158                 TransliterationRule r1 = rules[j];
    159                 for (int k=j+1; k<index[x+1]; ++k) {
    160                     TransliterationRule r2 = rules[k];
    161                     if (r1.masks(r2)) {
    162                         if (errors == null) {
    163                             errors = new StringBuilder();
    164                         } else {
    165                             errors.append("\n");
    166                         }
    167                         errors.append("Rule " + r1 + " masks " + r2);
    168                     }
    169                 }
    170             }
    171         }
    172 
    173         if (errors != null) {
    174             throw new IllegalArgumentException(errors.toString());
    175         }
    176     }
    177 
    178     /**
    179      * Transliterate the given text with the given UTransPosition
    180      * indices.  Return TRUE if the transliteration should continue
    181      * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
    182      * Note that FALSE is only ever returned if isIncremental is TRUE.
    183      * @param text the text to be transliterated
    184      * @param pos the position indices, which will be updated
    185      * @param incremental if TRUE, assume new text may be inserted
    186      * at index.limit, and return FALSE if thre is a partial match.
    187      * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
    188      * indicating that transliteration should stop until more text
    189      * arrives.
    190      */
    191     public boolean transliterate(Replaceable text,
    192                                  Transliterator.Position pos,
    193                                  boolean incremental) {
    194         int indexByte = text.char32At(pos.start) & 0xFF;
    195         for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
    196             int m = rules[i].matchAndReplace(text, pos, incremental);
    197             switch (m) {
    198             case UnicodeMatcher.U_MATCH:
    199                 if (Transliterator.DEBUG) {
    200                     System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
    201                                        rules[i].toRule(true) + " => " +
    202                                        UtilityExtensions.formatInput(text, pos));
    203                 }
    204                 return true;
    205             case UnicodeMatcher.U_PARTIAL_MATCH:
    206                 if (Transliterator.DEBUG) {
    207                     System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
    208                                        rules[i].toRule(true) + " => " +
    209                                        UtilityExtensions.formatInput(text, pos));
    210                 }
    211                 return false;
    212                 default:
    213                     if (Transliterator.DEBUG) {
    214                         System.out.println("Rule: no match " + rules[i]);
    215                     }
    216             }
    217         }
    218         // No match or partial match from any rule
    219         pos.start += UTF16.getCharCount(text.char32At(pos.start));
    220         if (Transliterator.DEBUG) {
    221             System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
    222                                UtilityExtensions.formatInput(text, pos));
    223         }
    224         return true;
    225     }
    226 
    227     /**
    228      * Create rule strings that represents this rule set.
    229      */
    230     String toRules(boolean escapeUnprintable) {
    231         int i;
    232         int count = ruleVector.size();
    233         StringBuilder ruleSource = new StringBuilder();
    234         for (i=0; i<count; ++i) {
    235             if (i != 0) {
    236                 ruleSource.append('\n');
    237             }
    238             TransliterationRule r = ruleVector.get(i);
    239             ruleSource.append(r.toRule(escapeUnprintable));
    240         }
    241         return ruleSource.toString();
    242     }
    243 
    244     // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
    245     // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
    246     void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    247         UnicodeSet currentFilter = new UnicodeSet(filter);
    248         UnicodeSet revisiting = new UnicodeSet();
    249         int count = ruleVector.size();
    250         for (int i=0; i<count; ++i) {
    251             TransliterationRule r = ruleVector.get(i);
    252             r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
    253             currentFilter.addAll(revisiting);
    254         }
    255     }
    256 
    257 }
    258