Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.util.ArrayList;
     12 import java.util.List;
     13 
     14 import com.ibm.icu.impl.UtilityExtensions;
     15 
     16 /**
     17  * A set of rules for a <code>RuleBasedTransliterator</code>.  This set encodes
     18  * the transliteration in one direction from one set of characters or short
     19  * strings to another.  A <code>RuleBasedTransliterator</code> consists of up to
     20  * two such sets, one for the forward direction, and one for the reverse.
     21  *
     22  * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
     23  * finding a matching rule at a given point in the text.  This is accomplished
     24  * by the <code>findMatch()</code> method.
     25  *
     26  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
     27  *
     28  * @author Alan Liu
     29  */
     30 class TransliterationRuleSet {
     31     /**
     32      * Vector of rules, in the order added.
     33      */
     34     private List<TransliterationRule> ruleVector;
     35 
     36     /**
     37      * Length of the longest preceding context
     38      */
     39     private int maxContextLength;
     40 
     41     /**
     42      * Sorted and indexed table of rules.  This is created by freeze() from
     43      * the rules in ruleVector.  rules.length >= ruleVector.size(), and the
     44      * references in rules[] are aliases of the references in ruleVector.
     45      * A single rule in ruleVector is listed one or more times in rules[].
     46      */
     47     private TransliterationRule[] rules;
     48 
     49     /**
     50      * Index table.  For text having a first character c, compute x = c&0xFF.
     51      * Now use rules[index[x]..index[x+1]-1].  This index table is created by
     52      * freeze().
     53      */
     54     private int[] index;
     55 
     56     /**
     57      * Construct a new empty rule set.
     58      */
     59     public TransliterationRuleSet() {
     60         ruleVector = new ArrayList<TransliterationRule>();
     61         maxContextLength = 0;
     62     }
     63 
     64     /**
     65      * Return the maximum context length.
     66      * @return the length of the longest preceding context.
     67      */
     68     public int getMaximumContextLength() {
     69         return maxContextLength;
     70     }
     71 
     72     /**
     73      * Add a rule to this set.  Rules are added in order, and order is
     74      * significant.
     75      * @param rule the rule to add
     76      */
     77     public void addRule(TransliterationRule rule) {
     78         ruleVector.add(rule);
     79         int len;
     80         if ((len = rule.getAnteContextLength()) > maxContextLength) {
     81             maxContextLength = len;
     82         }
     83 
     84         rules = null;
     85     }
     86 
     87     /**
     88      * Close this rule set to further additions, check it for masked rules,
     89      * and index it to optimize performance.
     90      * @exception IllegalArgumentException if some rules are masked
     91      */
     92     public void freeze() {
     93         /* Construct the rule array and index table.  We reorder the
     94          * rules by sorting them into 256 bins.  Each bin contains all
     95          * rules matching the index value for that bin.  A rule
     96          * matches an index value if string whose first key character
     97          * has a low byte equal to the index value can match the rule.
     98          *
     99          * Each bin contains zero or more rules, in the same order
    100          * they were found originally.  However, the total rules in
    101          * the bins may exceed the number in the original vector,
    102          * since rules that have a variable as their first key
    103          * character will generally fall into more than one bin.
    104          *
    105          * That is, each bin contains all rules that either have that
    106          * first index value as their first key character, or have
    107          * a set containing the index value as their first character.
    108          */
    109         int n = ruleVector.size();
    110         index = new int[257]; // [sic]
    111         List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed
    112 
    113         /* Precompute the index values.  This saves a LOT of time.
    114          */
    115         int[] indexValue = new int[n];
    116         for (int j=0; j<n; ++j) {
    117             TransliterationRule r = ruleVector.get(j);
    118             indexValue[j] = r.getIndexValue();
    119         }
    120         for (int x=0; x<256; ++x) {
    121             index[x] = v.size();
    122             for (int j=0; j<n; ++j) {
    123                 if (indexValue[j] >= 0) {
    124                     if (indexValue[j] == x) {
    125                         v.add(ruleVector.get(j));
    126                     }
    127                 } else {
    128                     // If the indexValue is < 0, then the first key character is
    129                     // a set, and we must use the more time-consuming
    130                     // matchesIndexValue check.  In practice this happens
    131                     // rarely, so we seldom tread this code path.
    132                     TransliterationRule r = ruleVector.get(j);
    133                     if (r.matchesIndexValue(x)) {
    134                         v.add(r);
    135                     }
    136                 }
    137             }
    138         }
    139         index[256] = v.size();
    140 
    141         /* Freeze things into an array.
    142          */
    143         rules = new TransliterationRule[v.size()];
    144         v.toArray(rules);
    145 
    146         StringBuilder errors = null;
    147 
    148         /* Check for masking.  This is MUCH faster than our old check,
    149          * which was each rule against each following rule, since we
    150          * only have to check for masking within each bin now.  It's
    151          * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
    152          * count, and n2 is the per-bin rule count.  But n2<<n1, so
    153          * it's a big win.
    154          */
    155         for (int x=0; x<256; ++x) {
    156             for (int j=index[x]; j<index[x+1]-1; ++j) {
    157                 TransliterationRule r1 = rules[j];
    158                 for (int k=j+1; k<index[x+1]; ++k) {
    159                     TransliterationRule r2 = rules[k];
    160                     if (r1.masks(r2)) {
    161                         if (errors == null) {
    162                             errors = new StringBuilder();
    163                         } else {
    164                             errors.append("\n");
    165                         }
    166                         errors.append("Rule " + r1 + " masks " + r2);
    167                     }
    168                 }
    169             }
    170         }
    171 
    172         if (errors != null) {
    173             throw new IllegalArgumentException(errors.toString());
    174         }
    175     }
    176 
    177     /**
    178      * Transliterate the given text with the given UTransPosition
    179      * indices.  Return TRUE if the transliteration should continue
    180      * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
    181      * Note that FALSE is only ever returned if isIncremental is TRUE.
    182      * @param text the text to be transliterated
    183      * @param pos the position indices, which will be updated
    184      * @param incremental if TRUE, assume new text may be inserted
    185      * at index.limit, and return FALSE if thre is a partial match.
    186      * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
    187      * indicating that transliteration should stop until more text
    188      * arrives.
    189      */
    190     public boolean transliterate(Replaceable text,
    191                                  Transliterator.Position pos,
    192                                  boolean incremental) {
    193         int indexByte = text.char32At(pos.start) & 0xFF;
    194         for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
    195             int m = rules[i].matchAndReplace(text, pos, incremental);
    196             switch (m) {
    197             case UnicodeMatcher.U_MATCH:
    198                 if (Transliterator.DEBUG) {
    199                     System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
    200                                        rules[i].toRule(true) + " => " +
    201                                        UtilityExtensions.formatInput(text, pos));
    202                 }
    203                 return true;
    204             case UnicodeMatcher.U_PARTIAL_MATCH:
    205                 if (Transliterator.DEBUG) {
    206                     System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
    207                                        rules[i].toRule(true) + " => " +
    208                                        UtilityExtensions.formatInput(text, pos));
    209                 }
    210                 return false;
    211                 default:
    212                     if (Transliterator.DEBUG) {
    213                         System.out.println("Rule: no match " + rules[i]);
    214                     }
    215             }
    216         }
    217         // No match or partial match from any rule
    218         pos.start += UTF16.getCharCount(text.char32At(pos.start));
    219         if (Transliterator.DEBUG) {
    220             System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
    221                                UtilityExtensions.formatInput(text, pos));
    222         }
    223         return true;
    224     }
    225 
    226     /**
    227      * Create rule strings that represents this rule set.
    228      */
    229     String toRules(boolean escapeUnprintable) {
    230         int i;
    231         int count = ruleVector.size();
    232         StringBuilder ruleSource = new StringBuilder();
    233         for (i=0; i<count; ++i) {
    234             if (i != 0) {
    235                 ruleSource.append('\n');
    236             }
    237             TransliterationRule r = ruleVector.get(i);
    238             ruleSource.append(r.toRule(escapeUnprintable));
    239         }
    240         return ruleSource.toString();
    241     }
    242 
    243     // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
    244     // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
    245     void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    246         UnicodeSet currentFilter = new UnicodeSet(filter);
    247         UnicodeSet revisiting = new UnicodeSet();
    248         int count = ruleVector.size();
    249         for (int i=0; i<count; ++i) {
    250             TransliterationRule r = ruleVector.get(i);
    251             r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
    252             currentFilter.addAll(revisiting);
    253         }
    254     }
    255 
    256 }
    257