1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.util.ArrayList; 12 import java.util.List; 13 14 import com.ibm.icu.impl.UtilityExtensions; 15 16 /** 17 * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes 18 * the transliteration in one direction from one set of characters or short 19 * strings to another. A <code>RuleBasedTransliterator</code> consists of up to 20 * two such sets, one for the forward direction, and one for the reverse. 21 * 22 * <p>A <code>TransliterationRuleSet</code> has one important operation, that of 23 * finding a matching rule at a given point in the text. This is accomplished 24 * by the <code>findMatch()</code> method. 25 * 26 * <p>Copyright © IBM Corporation 1999. All rights reserved. 27 * 28 * @author Alan Liu 29 */ 30 class TransliterationRuleSet { 31 /** 32 * Vector of rules, in the order added. 33 */ 34 private List<TransliterationRule> ruleVector; 35 36 /** 37 * Length of the longest preceding context 38 */ 39 private int maxContextLength; 40 41 /** 42 * Sorted and indexed table of rules. This is created by freeze() from 43 * the rules in ruleVector. rules.length >= ruleVector.size(), and the 44 * references in rules[] are aliases of the references in ruleVector. 45 * A single rule in ruleVector is listed one or more times in rules[]. 46 */ 47 private TransliterationRule[] rules; 48 49 /** 50 * Index table. For text having a first character c, compute x = c&0xFF. 51 * Now use rules[index[x]..index[x+1]-1]. This index table is created by 52 * freeze(). 53 */ 54 private int[] index; 55 56 /** 57 * Construct a new empty rule set. 58 */ 59 public TransliterationRuleSet() { 60 ruleVector = new ArrayList<TransliterationRule>(); 61 maxContextLength = 0; 62 } 63 64 /** 65 * Return the maximum context length. 66 * @return the length of the longest preceding context. 67 */ 68 public int getMaximumContextLength() { 69 return maxContextLength; 70 } 71 72 /** 73 * Add a rule to this set. Rules are added in order, and order is 74 * significant. 75 * @param rule the rule to add 76 */ 77 public void addRule(TransliterationRule rule) { 78 ruleVector.add(rule); 79 int len; 80 if ((len = rule.getAnteContextLength()) > maxContextLength) { 81 maxContextLength = len; 82 } 83 84 rules = null; 85 } 86 87 /** 88 * Close this rule set to further additions, check it for masked rules, 89 * and index it to optimize performance. 90 * @exception IllegalArgumentException if some rules are masked 91 */ 92 public void freeze() { 93 /* Construct the rule array and index table. We reorder the 94 * rules by sorting them into 256 bins. Each bin contains all 95 * rules matching the index value for that bin. A rule 96 * matches an index value if string whose first key character 97 * has a low byte equal to the index value can match the rule. 98 * 99 * Each bin contains zero or more rules, in the same order 100 * they were found originally. However, the total rules in 101 * the bins may exceed the number in the original vector, 102 * since rules that have a variable as their first key 103 * character will generally fall into more than one bin. 104 * 105 * That is, each bin contains all rules that either have that 106 * first index value as their first key character, or have 107 * a set containing the index value as their first character. 108 */ 109 int n = ruleVector.size(); 110 index = new int[257]; // [sic] 111 List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed 112 113 /* Precompute the index values. This saves a LOT of time. 114 */ 115 int[] indexValue = new int[n]; 116 for (int j=0; j<n; ++j) { 117 TransliterationRule r = ruleVector.get(j); 118 indexValue[j] = r.getIndexValue(); 119 } 120 for (int x=0; x<256; ++x) { 121 index[x] = v.size(); 122 for (int j=0; j<n; ++j) { 123 if (indexValue[j] >= 0) { 124 if (indexValue[j] == x) { 125 v.add(ruleVector.get(j)); 126 } 127 } else { 128 // If the indexValue is < 0, then the first key character is 129 // a set, and we must use the more time-consuming 130 // matchesIndexValue check. In practice this happens 131 // rarely, so we seldom tread this code path. 132 TransliterationRule r = ruleVector.get(j); 133 if (r.matchesIndexValue(x)) { 134 v.add(r); 135 } 136 } 137 } 138 } 139 index[256] = v.size(); 140 141 /* Freeze things into an array. 142 */ 143 rules = new TransliterationRule[v.size()]; 144 v.toArray(rules); 145 146 StringBuilder errors = null; 147 148 /* Check for masking. This is MUCH faster than our old check, 149 * which was each rule against each following rule, since we 150 * only have to check for masking within each bin now. It's 151 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule 152 * count, and n2 is the per-bin rule count. But n2<<n1, so 153 * it's a big win. 154 */ 155 for (int x=0; x<256; ++x) { 156 for (int j=index[x]; j<index[x+1]-1; ++j) { 157 TransliterationRule r1 = rules[j]; 158 for (int k=j+1; k<index[x+1]; ++k) { 159 TransliterationRule r2 = rules[k]; 160 if (r1.masks(r2)) { 161 if (errors == null) { 162 errors = new StringBuilder(); 163 } else { 164 errors.append("\n"); 165 } 166 errors.append("Rule " + r1 + " masks " + r2); 167 } 168 } 169 } 170 } 171 172 if (errors != null) { 173 throw new IllegalArgumentException(errors.toString()); 174 } 175 } 176 177 /** 178 * Transliterate the given text with the given UTransPosition 179 * indices. Return TRUE if the transliteration should continue 180 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match). 181 * Note that FALSE is only ever returned if isIncremental is TRUE. 182 * @param text the text to be transliterated 183 * @param pos the position indices, which will be updated 184 * @param incremental if TRUE, assume new text may be inserted 185 * at index.limit, and return FALSE if thre is a partial match. 186 * @return TRUE unless a U_PARTIAL_MATCH has been obtained, 187 * indicating that transliteration should stop until more text 188 * arrives. 189 */ 190 public boolean transliterate(Replaceable text, 191 Transliterator.Position pos, 192 boolean incremental) { 193 int indexByte = text.char32At(pos.start) & 0xFF; 194 for (int i=index[indexByte]; i<index[indexByte+1]; ++i) { 195 int m = rules[i].matchAndReplace(text, pos, incremental); 196 switch (m) { 197 case UnicodeMatcher.U_MATCH: 198 if (Transliterator.DEBUG) { 199 System.out.println((incremental ? "Rule.i: match ":"Rule: match ") + 200 rules[i].toRule(true) + " => " + 201 UtilityExtensions.formatInput(text, pos)); 202 } 203 return true; 204 case UnicodeMatcher.U_PARTIAL_MATCH: 205 if (Transliterator.DEBUG) { 206 System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") + 207 rules[i].toRule(true) + " => " + 208 UtilityExtensions.formatInput(text, pos)); 209 } 210 return false; 211 default: 212 if (Transliterator.DEBUG) { 213 System.out.println("Rule: no match " + rules[i]); 214 } 215 } 216 } 217 // No match or partial match from any rule 218 pos.start += UTF16.getCharCount(text.char32At(pos.start)); 219 if (Transliterator.DEBUG) { 220 System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") + 221 UtilityExtensions.formatInput(text, pos)); 222 } 223 return true; 224 } 225 226 /** 227 * Create rule strings that represents this rule set. 228 */ 229 String toRules(boolean escapeUnprintable) { 230 int i; 231 int count = ruleVector.size(); 232 StringBuilder ruleSource = new StringBuilder(); 233 for (i=0; i<count; ++i) { 234 if (i != 0) { 235 ruleSource.append('\n'); 236 } 237 TransliterationRule r = ruleVector.get(i); 238 ruleSource.append(r.toRule(escapeUnprintable)); 239 } 240 return ruleSource.toString(); 241 } 242 243 // TODO Handle the case where we have :: [a] ; a > |b ; b > c ; 244 // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing 245 void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) { 246 UnicodeSet currentFilter = new UnicodeSet(filter); 247 UnicodeSet revisiting = new UnicodeSet(); 248 int count = ruleVector.size(); 249 for (int i=0; i<count; ++i) { 250 TransliterationRule r = ruleVector.get(i); 251 r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear()); 252 currentFilter.addAll(revisiting); 253 } 254 } 255 256 } 257