1 // 2 // file: rbbirb.cpp 3 // 4 // Copyright (C) 2002-2011, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for 8 // building (compiling) break rules into the tables required by the runtime 9 // RBBI engine. 10 // 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/brkiter.h" 17 #include "unicode/rbbi.h" 18 #include "unicode/ubrk.h" 19 #include "unicode/unistr.h" 20 #include "unicode/uniset.h" 21 #include "unicode/uchar.h" 22 #include "unicode/uchriter.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/parseerr.h" 25 #include "cmemory.h" 26 #include "cstring.h" 27 28 #include "rbbirb.h" 29 #include "rbbinode.h" 30 31 #include "rbbiscan.h" 32 #include "rbbisetb.h" 33 #include "rbbitblb.h" 34 #include "rbbidata.h" 35 36 37 U_NAMESPACE_BEGIN 38 39 40 //---------------------------------------------------------------------------------------- 41 // 42 // Constructor. 43 // 44 //---------------------------------------------------------------------------------------- 45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 46 UParseError *parseErr, 47 UErrorCode &status) 48 : fRules(rules) 49 { 50 fStatus = &status; // status is checked below 51 fParseError = parseErr; 52 fDebugEnv = NULL; 53 #ifdef RBBI_DEBUG 54 fDebugEnv = getenv("U_RBBIDEBUG"); 55 #endif 56 57 58 fForwardTree = NULL; 59 fReverseTree = NULL; 60 fSafeFwdTree = NULL; 61 fSafeRevTree = NULL; 62 fDefaultTree = &fForwardTree; 63 fForwardTables = NULL; 64 fReverseTables = NULL; 65 fSafeFwdTables = NULL; 66 fSafeRevTables = NULL; 67 fRuleStatusVals = NULL; 68 fChainRules = FALSE; 69 fLBCMNoChain = FALSE; 70 fLookAheadHardBreak = FALSE; 71 fUSetNodes = NULL; 72 fRuleStatusVals = NULL; 73 fScanner = NULL; 74 fSetBuilder = NULL; 75 if (parseErr) { 76 uprv_memset(parseErr, 0, sizeof(UParseError)); 77 } 78 79 if (U_FAILURE(status)) { 80 return; 81 } 82 83 fUSetNodes = new UVector(status); // bcos status gets overwritten here 84 fRuleStatusVals = new UVector(status); 85 fScanner = new RBBIRuleScanner(this); 86 fSetBuilder = new RBBISetBuilder(this); 87 if (U_FAILURE(status)) { 88 return; 89 } 90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { 91 status = U_MEMORY_ALLOCATION_ERROR; 92 } 93 } 94 95 96 97 //---------------------------------------------------------------------------------------- 98 // 99 // Destructor 100 // 101 //---------------------------------------------------------------------------------------- 102 RBBIRuleBuilder::~RBBIRuleBuilder() { 103 104 int i; 105 for (i=0; ; i++) { 106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); 107 if (n==NULL) { 108 break; 109 } 110 delete n; 111 } 112 113 delete fUSetNodes; 114 delete fSetBuilder; 115 delete fForwardTables; 116 delete fReverseTables; 117 delete fSafeFwdTables; 118 delete fSafeRevTables; 119 120 delete fForwardTree; 121 delete fReverseTree; 122 delete fSafeFwdTree; 123 delete fSafeRevTree; 124 delete fScanner; 125 delete fRuleStatusVals; 126 } 127 128 129 130 131 132 //---------------------------------------------------------------------------------------- 133 // 134 // flattenData() - Collect up the compiled RBBI rule data and put it into 135 // the format for saving in ICU data files, 136 // which is also the format needed by the RBBI runtime engine. 137 // 138 //---------------------------------------------------------------------------------------- 139 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 140 141 RBBIDataHeader *RBBIRuleBuilder::flattenData() { 142 int32_t i; 143 144 if (U_FAILURE(*fStatus)) { 145 return NULL; 146 } 147 148 // Remove comments and whitespace from the rules to make it smaller. 149 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); 150 151 // Calculate the size of each section in the data. 152 // Sizes here are padded up to a multiple of 8 for better memory alignment. 153 // Sections sizes actually stored in the header are for the actual data 154 // without the padding. 155 // 156 int32_t headerSize = align8(sizeof(RBBIDataHeader)); 157 int32_t forwardTableSize = align8(fForwardTables->getTableSize()); 158 int32_t reverseTableSize = align8(fReverseTables->getTableSize()); 159 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); 160 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); 161 int32_t trieSize = align8(fSetBuilder->getTrieSize()); 162 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 163 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); 164 165 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize 166 + safeFwdTableSize + safeRevTableSize 167 + statusTableSize + trieSize + rulesSize; 168 169 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); 170 if (data == NULL) { 171 *fStatus = U_MEMORY_ALLOCATION_ERROR; 172 return NULL; 173 } 174 uprv_memset(data, 0, totalSize); 175 176 177 data->fMagic = 0xb1a0; 178 data->fFormatVersion[0] = 3; 179 data->fFormatVersion[1] = 1; 180 data->fFormatVersion[2] = 0; 181 data->fFormatVersion[3] = 0; 182 data->fLength = totalSize; 183 data->fCatCount = fSetBuilder->getNumCharCategories(); 184 185 data->fFTable = headerSize; 186 data->fFTableLen = forwardTableSize; 187 data->fRTable = data->fFTable + forwardTableSize; 188 data->fRTableLen = reverseTableSize; 189 data->fSFTable = data->fRTable + reverseTableSize; 190 data->fSFTableLen = safeFwdTableSize; 191 data->fSRTable = data->fSFTable + safeFwdTableSize; 192 data->fSRTableLen = safeRevTableSize; 193 194 data->fTrie = data->fSRTable + safeRevTableSize; 195 data->fTrieLen = fSetBuilder->getTrieSize(); 196 data->fStatusTable = data->fTrie + trieSize; 197 data->fStatusTableLen= statusTableSize; 198 data->fRuleSource = data->fStatusTable + statusTableSize; 199 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); 200 201 uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 202 203 fForwardTables->exportTable((uint8_t *)data + data->fFTable); 204 fReverseTables->exportTable((uint8_t *)data + data->fRTable); 205 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); 206 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); 207 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); 208 209 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); 210 for (i=0; i<fRuleStatusVals->size(); i++) { 211 ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 212 } 213 214 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); 215 216 return data; 217 } 218 219 220 221 222 223 224 //---------------------------------------------------------------------------------------- 225 // 226 // createRuleBasedBreakIterator construct from source rules that are passed in 227 // in a UnicodeString 228 // 229 //---------------------------------------------------------------------------------------- 230 BreakIterator * 231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 232 UParseError *parseError, 233 UErrorCode &status) 234 { 235 // status checked below 236 237 // 238 // Read the input rules, generate a parse tree, symbol table, 239 // and list of all Unicode Sets referenced by the rules. 240 // 241 RBBIRuleBuilder builder(rules, parseError, status); 242 if (U_FAILURE(status)) { // status checked here bcos build below doesn't 243 return NULL; 244 } 245 builder.fScanner->parse(); 246 247 // 248 // UnicodeSet processing. 249 // Munge the Unicode Sets to create a set of character categories. 250 // Generate the mapping tables (TRIE) from input 32-bit characters to 251 // the character categories. 252 // 253 builder.fSetBuilder->build(); 254 255 256 // 257 // Generate the DFA state transition table. 258 // 259 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); 260 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); 261 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); 262 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); 263 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || 264 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) 265 { 266 status = U_MEMORY_ALLOCATION_ERROR; 267 delete builder.fForwardTables; builder.fForwardTables = NULL; 268 delete builder.fReverseTables; builder.fReverseTables = NULL; 269 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; 270 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; 271 return NULL; 272 } 273 274 builder.fForwardTables->build(); 275 builder.fReverseTables->build(); 276 builder.fSafeFwdTables->build(); 277 builder.fSafeRevTables->build(); 278 279 #ifdef RBBI_DEBUG 280 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { 281 builder.fForwardTables->printRuleStatusTable(); 282 } 283 #endif 284 285 // 286 // Package up the compiled data into a memory image 287 // in the run-time format. 288 // 289 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error 290 if (U_FAILURE(*builder.fStatus)) { 291 return NULL; 292 } 293 294 295 // 296 // Clean up the compiler related stuff 297 // 298 299 300 // 301 // Create a break iterator from the compiled rules. 302 // (Identical to creation from stored pre-compiled rules) 303 // 304 // status is checked after init in construction. 305 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 306 if (U_FAILURE(status)) { 307 delete This; 308 This = NULL; 309 } 310 else if(This == NULL) { // test for NULL 311 status = U_MEMORY_ALLOCATION_ERROR; 312 } 313 return This; 314 } 315 316 U_NAMESPACE_END 317 318 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 319