1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 // 4 // Copyright (C) 2002-2014, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // 8 9 package com.ibm.icu.text; 10 11 import java.io.DataOutputStream; 12 import java.io.IOException; 13 import java.io.OutputStream; 14 import java.util.ArrayList; 15 import java.util.HashMap; 16 import java.util.List; 17 import java.util.Map; 18 import java.util.Set; 19 20 import com.ibm.icu.impl.Assert; 21 import com.ibm.icu.impl.ICUBinary; 22 import com.ibm.icu.impl.ICUDebug; 23 24 class RBBIRuleBuilder { 25 // This is the main class for building (compiling) break rules into the tables 26 // required by the runtime RBBI engine. 27 // 28 29 String fDebugEnv; // controls debug trace output 30 String fRules; // The rule string that we are compiling 31 RBBIRuleScanner fScanner; // The scanner. 32 33 34 // 35 // There are four separate parse trees generated, one for each of the 36 // forward rules, reverse rules, safe forward rules and safe reverse rules. 37 // This array references the root of each of the trees. 38 // 39 RBBINode[] fTreeRoots = new RBBINode[4]; 40 static final int fForwardTree = 0; // Indexes into the above fTreeRoots array 41 static final int fReverseTree = 1; // for each of the trees. 42 static final int fSafeFwdTree = 2; // (in C, these are pointer variables and 43 static final int fSafeRevTree = 3; // there is no array.) 44 int fDefaultTree = fForwardTree; // For rules not qualified with a ! 45 // the tree to which they belong to. 46 47 boolean fChainRules; // True for chained Unicode TR style rules. 48 // False for traditional regexp rules. 49 50 boolean fLBCMNoChain; // True: suppress chaining of rules on 51 // chars with LineBreak property == CM. 52 53 boolean fLookAheadHardBreak; // True: Look ahead matches cause an 54 // immediate break, no continuing for the 55 // longest match. 56 57 RBBISetBuilder fSetBuilder; // Set and Character Category builder. 58 List<RBBINode> fUSetNodes; // Vector of all uset nodes. 59 RBBITableBuilder fForwardTables; // State transition tables 60 RBBITableBuilder fReverseTables; 61 RBBITableBuilder fSafeFwdTables; 62 RBBITableBuilder fSafeRevTables; 63 64 // 65 // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.). 66 // 67 Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far. 68 // Map Key is the set of values. 69 // Map Value is the runtime array index. 70 71 List<Integer> fRuleStatusVals; // List of Integer objects. Has same layout as the 72 // runtime array of status (tag) values - 73 // number of values in group 1 74 // first status value in group 1 75 // 2nd status value in group 1 76 // ... 77 // number of values in group 2 78 // first status value in group 2 79 // etc. 80 // 81 // Error codes from ICU4C. 82 // using these simplified the porting, and consolidated the 83 // creation of Java exceptions 84 // 85 static final int U_BRK_ERROR_START = 0x10200; 86 /**< Start of codes indicating Break Iterator failures */ 87 88 static final int U_BRK_INTERNAL_ERROR = 0x10201; 89 /**< An internal error (bug) was detected. */ 90 91 static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202; 92 /**< Hex digits expected as part of a escaped char in a rule. */ 93 94 static final int U_BRK_SEMICOLON_EXPECTED = 0x10203; 95 /**< Missing ';' at the end of a RBBI rule. */ 96 97 static final int U_BRK_RULE_SYNTAX = 0x10204; 98 /**< Syntax error in RBBI rule. */ 99 100 static final int U_BRK_UNCLOSED_SET = 0x10205; 101 /**< UnicodeSet witing an RBBI rule missing a closing ']'. */ 102 103 static final int U_BRK_ASSIGN_ERROR = 0x10206; 104 /**< Syntax error in RBBI rule assignment statement. */ 105 106 static final int U_BRK_VARIABLE_REDFINITION = 0x10207; 107 /**< RBBI rule $Variable redefined. */ 108 109 static final int U_BRK_MISMATCHED_PAREN = 0x10208; 110 /**< Mis-matched parentheses in an RBBI rule. */ 111 112 static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209; 113 /**< Missing closing quote in an RBBI rule. */ 114 115 static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a; 116 /**< Use of an undefined $Variable in an RBBI rule. */ 117 118 static final int U_BRK_INIT_ERROR = 0x1020b; 119 /**< Initialization failure. Probable missing ICU Data. */ 120 121 static final int U_BRK_RULE_EMPTY_SET = 0x1020c; 122 /**< Rule contains an empty Unicode Set. */ 123 124 static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d; 125 /**< !!option in RBBI rules not recognized. */ 126 127 static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e; 128 /**< The {nnn} tag on a rule is mal formed */ 129 static final int U_BRK_MALFORMED_SET = 0x1020f; 130 131 static final int U_BRK_ERROR_LIMIT = 0x10210; 132 /**< This must always be the last value to indicate the limit for Break Iterator failures */ 133 134 135 //---------------------------------------------------------------------------------------- 136 // 137 // Constructor. 138 // 139 //---------------------------------------------------------------------------------------- 140 RBBIRuleBuilder(String rules) 141 { 142 fDebugEnv = ICUDebug.enabled("rbbi") ? 143 ICUDebug.value("rbbi") : null; 144 fRules = rules; 145 fUSetNodes = new ArrayList<RBBINode>(); 146 fRuleStatusVals = new ArrayList<Integer>(); 147 fScanner = new RBBIRuleScanner(this); 148 fSetBuilder = new RBBISetBuilder(this); 149 } 150 151 //---------------------------------------------------------------------------------------- 152 // 153 // flattenData() - Collect up the compiled RBBI rule data and put it into 154 // the format for saving in ICU data files, 155 // 156 // See the ICU4C file common/rbidata.h for a detailed description. 157 // 158 //---------------------------------------------------------------------------------------- 159 static final int align8(int i) 160 { 161 return (i + 7) & 0xfffffff8; 162 } 163 164 void flattenData(OutputStream os) throws IOException { 165 DataOutputStream dos = new DataOutputStream(os); 166 int i; 167 168 // Remove comments and whitespace from the rules to make it smaller. 169 String strippedRules = RBBIRuleScanner.stripRules(fRules); 170 171 // Calculate the size of each section in the data in bytes. 172 // Sizes here are padded up to a multiple of 8 for better memory alignment. 173 // Sections sizes actually stored in the header are for the actual data 174 // without the padding. 175 // 176 int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader)); 177 int forwardTableSize = align8(fForwardTables.getTableSize()); 178 int reverseTableSize = align8(fReverseTables.getTableSize()); 179 // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize()); 180 int safeRevTableSize = align8(fSafeRevTables.getTableSize()); 181 int trieSize = align8(fSetBuilder.getTrieSize()); 182 int statusTableSize = align8(fRuleStatusVals.size() * 4); 183 int rulesSize = align8((strippedRules.length()) * 2); 184 185 int totalSize = headerSize 186 + forwardTableSize 187 + /* reverseTableSize */ 0 188 + /* safeFwdTableSize */ 0 189 + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize) 190 + statusTableSize + trieSize + rulesSize; 191 int outputPos = 0; // Track stream position, starting from RBBIDataHeader. 192 193 // 194 // Write out an ICU Data Header 195 // 196 ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos); 197 198 // 199 // Write out the RBBIDataHeader 200 // 201 int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader 202 header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0; 203 header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION; 204 header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. 205 header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount. 206 207 // Only save the forward table and the safe reverse table, 208 // because these are the only ones used at run-time. 209 // 210 // For the moment, we still build the other tables if they are present in the rule source files, 211 // for backwards compatibility. Old rule files need to work, and this is the simplest approach. 212 // 213 // Additional backwards compatibility consideration: if no safe rules are provided, consider the 214 // reverse rules to actually be the safe reverse rules. 215 216 header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable 217 header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen 218 219 // Do not save Reverse Table. 220 header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable 221 header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen 222 223 // Do not save the Safe Forward table. 224 header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE] 225 + 0; // fSTable 226 header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen 227 228 // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse. 229 header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE] 230 + 0; // fSRTable 231 if (safeRevTableSize > 0) { 232 header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; 233 } else { 234 assert reverseTableSize > 0; 235 header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize; 236 } 237 238 header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE] 239 + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie 240 header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen 241 header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] 242 + header[RBBIDataWrapper.DH_TRIELEN]; 243 header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen 244 header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE] 245 + statusTableSize; 246 header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2; 247 for (i = 0; i < header.length; i++) { 248 dos.writeInt(header[i]); 249 outputPos += 4; 250 } 251 252 // Write out the actual state tables. 253 short[] tableData; 254 tableData = fForwardTables.exportTable(); 255 Assert.assrt(outputPos == header[4]); 256 for (i = 0; i < tableData.length; i++) { 257 dos.writeShort(tableData[i]); 258 outputPos += 2; 259 } 260 261 /* do not write the reverse table 262 tableData = fReverseTables.exportTable(); 263 Assert.assrt(outputPos == header[6]); 264 for (i = 0; i < tableData.length; i++) { 265 dos.writeShort(tableData[i]); 266 outputPos += 2; 267 } 268 */ 269 270 /* do not write safe forwards table 271 Assert.assrt(outputPos == header[8]); 272 tableData = fSafeFwdTables.exportTable(); 273 for (i = 0; i < tableData.length; i++) { 274 dos.writeShort(tableData[i]); 275 outputPos += 2; 276 } 277 */ 278 279 // Write the safe reverse table. 280 // If not present, write the plain reverse table (old style rule compatibility) 281 Assert.assrt(outputPos == header[10]); 282 if (safeRevTableSize > 0) { 283 tableData = fSafeRevTables.exportTable(); 284 } else { 285 tableData = fReverseTables.exportTable(); 286 } 287 for (i = 0; i < tableData.length; i++) { 288 dos.writeShort(tableData[i]); 289 outputPos += 2; 290 } 291 292 // write out the Trie table 293 Assert.assrt(outputPos == header[12]); 294 fSetBuilder.serializeTrie(os); 295 outputPos += header[13]; 296 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 297 dos.write(0); 298 outputPos += 1; 299 } 300 301 // Write out the status {tag} table. 302 Assert.assrt(outputPos == header[16]); 303 for (Integer val : fRuleStatusVals) { 304 dos.writeInt(val.intValue()); 305 outputPos += 4; 306 } 307 308 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 309 dos.write(0); 310 outputPos += 1; 311 } 312 313 // Write out the stripped rules (rules with extra spaces removed 314 // These go last in the data area, even though they are not last in the header. 315 Assert.assrt(outputPos == header[14]); 316 dos.writeChars(strippedRules); 317 outputPos += strippedRules.length() * 2; 318 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 319 dos.write(0); 320 outputPos += 1; 321 } 322 } 323 324 //---------------------------------------------------------------------------------------- 325 // 326 // compileRules compile source rules, placing the compiled form into a output stream 327 // The compiled form is identical to that from ICU4C (Big Endian). 328 // 329 //---------------------------------------------------------------------------------------- 330 static void compileRules(String rules, OutputStream os) throws IOException 331 { 332 // 333 // Read the input rules, generate a parse tree, symbol table, 334 // and list of all Unicode Sets referenced by the rules. 335 // 336 RBBIRuleBuilder builder = new RBBIRuleBuilder(rules); 337 builder.fScanner.parse(); 338 339 // 340 // UnicodeSet processing. 341 // Munge the Unicode Sets to create a set of character categories. 342 // Generate the mapping tables (TRIE) from input 32-bit characters to 343 // the character categories. 344 // 345 builder.fSetBuilder.build(); 346 347 // 348 // Generate the DFA state transition table. 349 // 350 builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree); 351 builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree); 352 builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree); 353 builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree); 354 builder.fForwardTables.build(); 355 builder.fReverseTables.build(); 356 builder.fSafeFwdTables.build(); 357 builder.fSafeRevTables.build(); 358 if (builder.fDebugEnv != null 359 && builder.fDebugEnv.indexOf("states") >= 0) { 360 builder.fForwardTables.printRuleStatusTable(); 361 } 362 363 // 364 // Package up the compiled data, writing it to an output stream 365 // in the serialization format. This is the same as the ICU4C runtime format. 366 // 367 builder.flattenData(os); 368 } 369 } 370