1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/uobject.h" 16 #include "unicode/parseerr.h" 17 #include "unicode/parsepos.h" 18 #include "unicode/putil.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uniset.h" 22 #include "cstring.h" 23 #include "funcrepl.h" 24 #include "hash.h" 25 #include "quant.h" 26 #include "rbt.h" 27 #include "rbt_data.h" 28 #include "rbt_pars.h" 29 #include "rbt_rule.h" 30 #include "strmatch.h" 31 #include "strrepl.h" 32 #include "unicode/symtable.h" 33 #include "tridpars.h" 34 #include "uvector.h" 35 #include "hash.h" 36 #include "patternprops.h" 37 #include "util.h" 38 #include "cmemory.h" 39 #include "uprops.h" 40 #include "putilimp.h" 41 42 // Operators 43 #define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/ 44 #define FORWARD_RULE_OP ((UChar)0x003E) /*>*/ 45 #define REVERSE_RULE_OP ((UChar)0x003C) /*<*/ 46 #define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op 47 48 // Other special characters 49 #define QUOTE ((UChar)0x0027) /*'*/ 50 #define ESCAPE ((UChar)0x005C) /*\*/ 51 #define END_OF_RULE ((UChar)0x003B) /*;*/ 52 #define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/ 53 54 #define SEGMENT_OPEN ((UChar)0x0028) /*(*/ 55 #define SEGMENT_CLOSE ((UChar)0x0029) /*)*/ 56 #define CONTEXT_ANTE ((UChar)0x007B) /*{*/ 57 #define CONTEXT_POST ((UChar)0x007D) /*}*/ 58 #define CURSOR_POS ((UChar)0x007C) /*|*/ 59 #define CURSOR_OFFSET ((UChar)0x0040) /*@*/ 60 #define ANCHOR_START ((UChar)0x005E) /*^*/ 61 #define KLEENE_STAR ((UChar)0x002A) /***/ 62 #define ONE_OR_MORE ((UChar)0x002B) /*+*/ 63 #define ZERO_OR_ONE ((UChar)0x003F) /*?*/ 64 65 #define DOT ((UChar)46) /*.*/ 66 67 static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]"; 68 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90, 69 108, 58, 93, 92, 114, 92, 110, 36, 93, 0 70 }; 71 72 // A function is denoted &Source-Target/Variant(text) 73 #define FUNCTION ((UChar)38) /*&*/ 74 75 // Aliases for some of the syntax characters. These are provided so 76 // transliteration rules can be expressed in XML without clashing with 77 // XML syntax characters '<', '>', and '&'. 78 #define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow 79 #define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow 80 #define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow 81 #define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta) 82 83 // Special characters disallowed at the top level 84 static const UChar ILLEGAL_TOP[] = {41,0}; // ")" 85 86 // Special characters disallowed within a segment 87 static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@" 88 89 // Special characters disallowed within a function argument 90 static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@" 91 92 // By definition, the ANCHOR_END special character is a 93 // trailing SymbolTable.SYMBOL_REF character. 94 // private static final char ANCHOR_END = '$'; 95 96 static const UChar gOPERATORS[] = { // "=><" 97 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 98 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 99 0 100 }; 101 102 static const UChar HALF_ENDERS[] = { // "=><;" 103 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 104 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 105 END_OF_RULE, 106 0 107 }; 108 109 // These are also used in Transliterator::toRules() 110 static const int32_t ID_TOKEN_LEN = 2; 111 static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' 112 113 /* 114 commented out until we do real ::BEGIN/::END functionality 115 static const int32_t BEGIN_TOKEN_LEN = 5; 116 static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN' 117 118 static const int32_t END_TOKEN_LEN = 3; 119 static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END' 120 */ 121 122 U_NAMESPACE_BEGIN 123 124 //---------------------------------------------------------------------- 125 // BEGIN ParseData 126 //---------------------------------------------------------------------- 127 128 /** 129 * This class implements the SymbolTable interface. It is used 130 * during parsing to give UnicodeSet access to variables that 131 * have been defined so far. Note that it uses variablesVector, 132 * _not_ data.setVariables. 133 */ 134 class ParseData : public UMemory, public SymbolTable { 135 public: 136 const TransliterationRuleData* data; // alias 137 138 const UVector* variablesVector; // alias 139 140 const Hashtable* variableNames; // alias 141 142 ParseData(const TransliterationRuleData* data = 0, 143 const UVector* variablesVector = 0, 144 const Hashtable* variableNames = 0); 145 146 virtual const UnicodeString* lookup(const UnicodeString& s) const; 147 148 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 149 150 virtual UnicodeString parseReference(const UnicodeString& text, 151 ParsePosition& pos, int32_t limit) const; 152 /** 153 * Return true if the given character is a matcher standin or a plain 154 * character (non standin). 155 */ 156 UBool isMatcher(UChar32 ch); 157 158 /** 159 * Return true if the given character is a replacer standin or a plain 160 * character (non standin). 161 */ 162 UBool isReplacer(UChar32 ch); 163 164 private: 165 ParseData(const ParseData &other); // forbid copying of this class 166 ParseData &operator=(const ParseData &other); // forbid copying of this class 167 }; 168 169 ParseData::ParseData(const TransliterationRuleData* d, 170 const UVector* sets, 171 const Hashtable* vNames) : 172 data(d), variablesVector(sets), variableNames(vNames) {} 173 174 /** 175 * Implement SymbolTable API. 176 */ 177 const UnicodeString* ParseData::lookup(const UnicodeString& name) const { 178 return (const UnicodeString*) variableNames->get(name); 179 } 180 181 /** 182 * Implement SymbolTable API. 183 */ 184 const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { 185 // Note that we cannot use data.lookupSet() because the 186 // set array has not been constructed yet. 187 const UnicodeFunctor* set = NULL; 188 int32_t i = ch - data->variablesBase; 189 if (i >= 0 && i < variablesVector->size()) { 190 int32_t i = ch - data->variablesBase; 191 set = (i < variablesVector->size()) ? 192 (UnicodeFunctor*) variablesVector->elementAt(i) : 0; 193 } 194 return set; 195 } 196 197 /** 198 * Implement SymbolTable API. Parse out a symbol reference 199 * name. 200 */ 201 UnicodeString ParseData::parseReference(const UnicodeString& text, 202 ParsePosition& pos, int32_t limit) const { 203 int32_t start = pos.getIndex(); 204 int32_t i = start; 205 UnicodeString result; 206 while (i < limit) { 207 UChar c = text.charAt(i); 208 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 209 break; 210 } 211 ++i; 212 } 213 if (i == start) { // No valid name chars 214 return result; // Indicate failure with empty string 215 } 216 pos.setIndex(i); 217 text.extractBetween(start, i, result); 218 return result; 219 } 220 221 UBool ParseData::isMatcher(UChar32 ch) { 222 // Note that we cannot use data.lookup() because the 223 // set array has not been constructed yet. 224 int32_t i = ch - data->variablesBase; 225 if (i >= 0 && i < variablesVector->size()) { 226 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 227 return f != NULL && f->toMatcher() != NULL; 228 } 229 return TRUE; 230 } 231 232 /** 233 * Return true if the given character is a replacer standin or a plain 234 * character (non standin). 235 */ 236 UBool ParseData::isReplacer(UChar32 ch) { 237 // Note that we cannot use data.lookup() because the 238 // set array has not been constructed yet. 239 int i = ch - data->variablesBase; 240 if (i >= 0 && i < variablesVector->size()) { 241 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 242 return f != NULL && f->toReplacer() != NULL; 243 } 244 return TRUE; 245 } 246 247 //---------------------------------------------------------------------- 248 // BEGIN RuleHalf 249 //---------------------------------------------------------------------- 250 251 /** 252 * A class representing one side of a rule. This class knows how to 253 * parse half of a rule. It is tightly coupled to the method 254 * RuleBasedTransliterator.Parser.parseRule(). 255 */ 256 class RuleHalf : public UMemory { 257 258 public: 259 260 UnicodeString text; 261 262 int32_t cursor; // position of cursor in text 263 int32_t ante; // position of ante context marker '{' in text 264 int32_t post; // position of post context marker '}' in text 265 266 // Record the offset to the cursor either to the left or to the 267 // right of the key. This is indicated by characters on the output 268 // side that allow the cursor to be positioned arbitrarily within 269 // the matching text. For example, abc{def} > | @@@ xyz; changes 270 // def to xyz and moves the cursor to before abc. Offset characters 271 // must be at the start or end, and they cannot move the cursor past 272 // the ante- or postcontext text. Placeholders are only valid in 273 // output text. The length of the ante and post context is 274 // determined at runtime, because of supplementals and quantifiers. 275 int32_t cursorOffset; // only nonzero on output side 276 277 // Position of first CURSOR_OFFSET on _right_. This will be -1 278 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 279 int32_t cursorOffsetPos; 280 281 UBool anchorStart; 282 UBool anchorEnd; 283 284 /** 285 * The segment number from 1..n of the next '(' we see 286 * during parsing; 1-based. 287 */ 288 int32_t nextSegmentNumber; 289 290 TransliteratorParser& parser; 291 292 //-------------------------------------------------- 293 // Methods 294 295 RuleHalf(TransliteratorParser& parser); 296 ~RuleHalf(); 297 298 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 299 300 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 301 UnicodeString& buf, 302 const UnicodeString& illegal, 303 UBool isSegment, 304 UErrorCode& status); 305 306 /** 307 * Remove context. 308 */ 309 void removeContext(); 310 311 /** 312 * Return true if this half looks like valid output, that is, does not 313 * contain quantifiers or other special input-only elements. 314 */ 315 UBool isValidOutput(TransliteratorParser& parser); 316 317 /** 318 * Return true if this half looks like valid input, that is, does not 319 * contain functions or other special output-only elements. 320 */ 321 UBool isValidInput(TransliteratorParser& parser); 322 323 int syntaxError(UErrorCode code, 324 const UnicodeString& rule, 325 int32_t start, 326 UErrorCode& status) { 327 return parser.syntaxError(code, rule, start, status); 328 } 329 330 private: 331 // Disallowed methods; no impl. 332 RuleHalf(const RuleHalf&); 333 RuleHalf& operator=(const RuleHalf&); 334 }; 335 336 RuleHalf::RuleHalf(TransliteratorParser& p) : 337 parser(p) 338 { 339 cursor = -1; 340 ante = -1; 341 post = -1; 342 cursorOffset = 0; 343 cursorOffsetPos = 0; 344 anchorStart = anchorEnd = FALSE; 345 nextSegmentNumber = 1; 346 } 347 348 RuleHalf::~RuleHalf() { 349 } 350 351 /** 352 * Parse one side of a rule, stopping at either the limit, 353 * the END_OF_RULE character, or an operator. 354 * @return the index after the terminating character, or 355 * if limit was reached, limit 356 */ 357 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 358 int32_t start = pos; 359 text.truncate(0); 360 pos = parseSection(rule, pos, limit, text, ILLEGAL_TOP, FALSE, status); 361 362 if (cursorOffset > 0 && cursor != cursorOffsetPos) { 363 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 364 } 365 366 return pos; 367 } 368 369 /** 370 * Parse a section of one side of a rule, stopping at either 371 * the limit, the END_OF_RULE character, an operator, or a 372 * segment close character. This method parses both a 373 * top-level rule half and a segment within such a rule half. 374 * It calls itself recursively to parse segments and nested 375 * segments. 376 * @param buf buffer into which to accumulate the rule pattern 377 * characters, either literal characters from the rule or 378 * standins for UnicodeMatcher objects including segments. 379 * @param illegal the set of special characters that is illegal during 380 * this parse. 381 * @param isSegment if true, then we've already seen a '(' and 382 * pos on entry points right after it. Accumulate everything 383 * up to the closing ')', put it in a segment matcher object, 384 * generate a standin for it, and add the standin to buf. As 385 * a side effect, update the segments vector with a reference 386 * to the segment matcher. This works recursively for nested 387 * segments. If isSegment is false, just accumulate 388 * characters into buf. 389 * @return the index after the terminating character, or 390 * if limit was reached, limit 391 */ 392 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 393 UnicodeString& buf, 394 const UnicodeString& illegal, 395 UBool isSegment, UErrorCode& status) { 396 int32_t start = pos; 397 ParsePosition pp; 398 UnicodeString scratch; 399 UBool done = FALSE; 400 int32_t quoteStart = -1; // Most recent 'single quoted string' 401 int32_t quoteLimit = -1; 402 int32_t varStart = -1; // Most recent $variableReference 403 int32_t varLimit = -1; 404 int32_t bufStart = buf.length(); 405 406 while (pos < limit && !done) { 407 // Since all syntax characters are in the BMP, fetching 408 // 16-bit code units suffices here. 409 UChar c = rule.charAt(pos++); 410 if (PatternProps::isWhiteSpace(c)) { 411 // Ignore whitespace. Note that this is not Unicode 412 // spaces, but Java spaces -- a subset, representing 413 // whitespace likely to be seen in code. 414 continue; 415 } 416 if (u_strchr(HALF_ENDERS, c) != NULL) { 417 if (isSegment) { 418 // Unclosed segment 419 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); 420 } 421 break; 422 } 423 if (anchorEnd) { 424 // Text after a presumed end anchor is a syntax err 425 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); 426 } 427 if (UnicodeSet::resemblesPattern(rule, pos-1)) { 428 pp.setIndex(pos-1); // Backup to opening '[' 429 buf.append(parser.parseSet(rule, pp, status)); 430 if (U_FAILURE(status)) { 431 return syntaxError(U_MALFORMED_SET, rule, start, status); 432 } 433 pos = pp.getIndex(); 434 continue; 435 } 436 // Handle escapes 437 if (c == ESCAPE) { 438 if (pos == limit) { 439 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); 440 } 441 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' 442 if (escaped == (UChar32) -1) { 443 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); 444 } 445 if (!parser.checkVariableRange(escaped)) { 446 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 447 } 448 buf.append(escaped); 449 continue; 450 } 451 // Handle quoted matter 452 if (c == QUOTE) { 453 int32_t iq = rule.indexOf(QUOTE, pos); 454 if (iq == pos) { 455 buf.append(c); // Parse [''] outside quotes as ['] 456 ++pos; 457 } else { 458 /* This loop picks up a run of quoted text of the 459 * form 'aaaa' each time through. If this run 460 * hasn't really ended ('aaaa''bbbb') then it keeps 461 * looping, each time adding on a new run. When it 462 * reaches the final quote it breaks. 463 */ 464 quoteStart = buf.length(); 465 for (;;) { 466 if (iq < 0) { 467 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); 468 } 469 scratch.truncate(0); 470 rule.extractBetween(pos, iq, scratch); 471 buf.append(scratch); 472 pos = iq+1; 473 if (pos < limit && rule.charAt(pos) == QUOTE) { 474 // Parse [''] inside quotes as ['] 475 iq = rule.indexOf(QUOTE, pos+1); 476 // Continue looping 477 } else { 478 break; 479 } 480 } 481 quoteLimit = buf.length(); 482 483 for (iq=quoteStart; iq<quoteLimit; ++iq) { 484 if (!parser.checkVariableRange(buf.charAt(iq))) { 485 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 486 } 487 } 488 } 489 continue; 490 } 491 492 if (!parser.checkVariableRange(c)) { 493 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 494 } 495 496 if (illegal.indexOf(c) >= 0) { 497 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); 498 } 499 500 switch (c) { 501 502 //------------------------------------------------------ 503 // Elements allowed within and out of segments 504 //------------------------------------------------------ 505 case ANCHOR_START: 506 if (buf.length() == 0 && !anchorStart) { 507 anchorStart = TRUE; 508 } else { 509 return syntaxError(U_MISPLACED_ANCHOR_START, 510 rule, start, status); 511 } 512 break; 513 case SEGMENT_OPEN: 514 { 515 // bufSegStart is the offset in buf to the first 516 // character of the segment we are parsing. 517 int32_t bufSegStart = buf.length(); 518 519 // Record segment number now, since nextSegmentNumber 520 // will be incremented during the call to parseSection 521 // if there are nested segments. 522 int32_t segmentNumber = nextSegmentNumber++; // 1-based 523 524 // Parse the segment 525 pos = parseSection(rule, pos, limit, buf, ILLEGAL_SEG, TRUE, status); 526 527 // After parsing a segment, the relevant characters are 528 // in buf, starting at offset bufSegStart. Extract them 529 // into a string matcher, and replace them with a 530 // standin for that matcher. 531 StringMatcher* m = 532 new StringMatcher(buf, bufSegStart, buf.length(), 533 segmentNumber, *parser.curData); 534 if (m == NULL) { 535 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 536 } 537 538 // Record and associate object and segment number 539 parser.setSegmentObject(segmentNumber, m, status); 540 buf.truncate(bufSegStart); 541 buf.append(parser.getSegmentStandin(segmentNumber, status)); 542 } 543 break; 544 case FUNCTION: 545 case ALT_FUNCTION: 546 { 547 int32_t iref = pos; 548 TransliteratorIDParser::SingleID* single = 549 TransliteratorIDParser::parseFilterID(rule, iref); 550 // The next character MUST be a segment open 551 if (single == NULL || 552 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { 553 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 554 } 555 556 Transliterator *t = single->createInstance(); 557 delete single; 558 if (t == NULL) { 559 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 560 } 561 562 // bufSegStart is the offset in buf to the first 563 // character of the segment we are parsing. 564 int32_t bufSegStart = buf.length(); 565 566 // Parse the segment 567 pos = parseSection(rule, iref, limit, buf, ILLEGAL_FUNC, TRUE, status); 568 569 // After parsing a segment, the relevant characters are 570 // in buf, starting at offset bufSegStart. 571 UnicodeString output; 572 buf.extractBetween(bufSegStart, buf.length(), output); 573 FunctionReplacer *r = 574 new FunctionReplacer(t, new StringReplacer(output, parser.curData)); 575 if (r == NULL) { 576 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 577 } 578 579 // Replace the buffer contents with a stand-in 580 buf.truncate(bufSegStart); 581 buf.append(parser.generateStandInFor(r, status)); 582 } 583 break; 584 case SymbolTable::SYMBOL_REF: 585 // Handle variable references and segment references "$1" .. "$9" 586 { 587 // A variable reference must be followed immediately 588 // by a Unicode identifier start and zero or more 589 // Unicode identifier part characters, or by a digit 590 // 1..9 if it is a segment reference. 591 if (pos == limit) { 592 // A variable ref character at the end acts as 593 // an anchor to the context limit, as in perl. 594 anchorEnd = TRUE; 595 break; 596 } 597 // Parse "$1" "$2" .. "$9" .. (no upper limit) 598 c = rule.charAt(pos); 599 int32_t r = u_digit(c, 10); 600 if (r >= 1 && r <= 9) { 601 r = ICU_Utility::parseNumber(rule, pos, 10); 602 if (r < 0) { 603 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, 604 rule, start, status); 605 } 606 buf.append(parser.getSegmentStandin(r, status)); 607 } else { 608 pp.setIndex(pos); 609 UnicodeString name = parser.parseData-> 610 parseReference(rule, pp, limit); 611 if (name.length() == 0) { 612 // This means the '$' was not followed by a 613 // valid name. Try to interpret it as an 614 // end anchor then. If this also doesn't work 615 // (if we see a following character) then signal 616 // an error. 617 anchorEnd = TRUE; 618 break; 619 } 620 pos = pp.getIndex(); 621 // If this is a variable definition statement, 622 // then the LHS variable will be undefined. In 623 // that case appendVariableDef() will append the 624 // special placeholder char variableLimit-1. 625 varStart = buf.length(); 626 parser.appendVariableDef(name, buf, status); 627 varLimit = buf.length(); 628 } 629 } 630 break; 631 case DOT: 632 buf.append(parser.getDotStandIn(status)); 633 break; 634 case KLEENE_STAR: 635 case ONE_OR_MORE: 636 case ZERO_OR_ONE: 637 // Quantifiers. We handle single characters, quoted strings, 638 // variable references, and segments. 639 // a+ matches aaa 640 // 'foo'+ matches foofoofoo 641 // $v+ matches xyxyxy if $v == xy 642 // (seg)+ matches segsegseg 643 { 644 if (isSegment && buf.length() == bufStart) { 645 // The */+ immediately follows '(' 646 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); 647 } 648 649 int32_t qstart, qlimit; 650 // The */+ follows an isolated character or quote 651 // or variable reference 652 if (buf.length() == quoteLimit) { 653 // The */+ follows a 'quoted string' 654 qstart = quoteStart; 655 qlimit = quoteLimit; 656 } else if (buf.length() == varLimit) { 657 // The */+ follows a $variableReference 658 qstart = varStart; 659 qlimit = varLimit; 660 } else { 661 // The */+ follows a single character, possibly 662 // a segment standin 663 qstart = buf.length() - 1; 664 qlimit = qstart + 1; 665 } 666 667 UnicodeFunctor *m = 668 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData); 669 if (m == NULL) { 670 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 671 } 672 int32_t min = 0; 673 int32_t max = Quantifier::MAX; 674 switch (c) { 675 case ONE_OR_MORE: 676 min = 1; 677 break; 678 case ZERO_OR_ONE: 679 min = 0; 680 max = 1; 681 break; 682 // case KLEENE_STAR: 683 // do nothing -- min, max already set 684 } 685 m = new Quantifier(m, min, max); 686 if (m == NULL) { 687 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 688 } 689 buf.truncate(qstart); 690 buf.append(parser.generateStandInFor(m, status)); 691 } 692 break; 693 694 //------------------------------------------------------ 695 // Elements allowed ONLY WITHIN segments 696 //------------------------------------------------------ 697 case SEGMENT_CLOSE: 698 // assert(isSegment); 699 // We're done parsing a segment. 700 done = TRUE; 701 break; 702 703 //------------------------------------------------------ 704 // Elements allowed ONLY OUTSIDE segments 705 //------------------------------------------------------ 706 case CONTEXT_ANTE: 707 if (ante >= 0) { 708 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); 709 } 710 ante = buf.length(); 711 break; 712 case CONTEXT_POST: 713 if (post >= 0) { 714 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); 715 } 716 post = buf.length(); 717 break; 718 case CURSOR_POS: 719 if (cursor >= 0) { 720 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); 721 } 722 cursor = buf.length(); 723 break; 724 case CURSOR_OFFSET: 725 if (cursorOffset < 0) { 726 if (buf.length() > 0) { 727 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 728 } 729 --cursorOffset; 730 } else if (cursorOffset > 0) { 731 if (buf.length() != cursorOffsetPos || cursor >= 0) { 732 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 733 } 734 ++cursorOffset; 735 } else { 736 if (cursor == 0 && buf.length() == 0) { 737 cursorOffset = -1; 738 } else if (cursor < 0) { 739 cursorOffsetPos = buf.length(); 740 cursorOffset = 1; 741 } else { 742 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 743 } 744 } 745 break; 746 747 748 //------------------------------------------------------ 749 // Non-special characters 750 //------------------------------------------------------ 751 default: 752 // Disallow unquoted characters other than [0-9A-Za-z] 753 // in the printable ASCII range. These characters are 754 // reserved for possible future use. 755 if (c >= 0x0021 && c <= 0x007E && 756 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 757 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 758 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { 759 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 760 } 761 buf.append(c); 762 break; 763 } 764 } 765 766 return pos; 767 } 768 769 /** 770 * Remove context. 771 */ 772 void RuleHalf::removeContext() { 773 //text = text.substring(ante < 0 ? 0 : ante, 774 // post < 0 ? text.length() : post); 775 if (post >= 0) { 776 text.remove(post); 777 } 778 if (ante >= 0) { 779 text.removeBetween(0, ante); 780 } 781 ante = post = -1; 782 anchorStart = anchorEnd = FALSE; 783 } 784 785 /** 786 * Return true if this half looks like valid output, that is, does not 787 * contain quantifiers or other special input-only elements. 788 */ 789 UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { 790 for (int32_t i=0; i<text.length(); ) { 791 UChar32 c = text.char32At(i); 792 i += UTF_CHAR_LENGTH(c); 793 if (!transParser.parseData->isReplacer(c)) { 794 return FALSE; 795 } 796 } 797 return TRUE; 798 } 799 800 /** 801 * Return true if this half looks like valid input, that is, does not 802 * contain functions or other special output-only elements. 803 */ 804 UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { 805 for (int32_t i=0; i<text.length(); ) { 806 UChar32 c = text.char32At(i); 807 i += UTF_CHAR_LENGTH(c); 808 if (!transParser.parseData->isMatcher(c)) { 809 return FALSE; 810 } 811 } 812 return TRUE; 813 } 814 815 //---------------------------------------------------------------------- 816 // PUBLIC API 817 //---------------------------------------------------------------------- 818 819 /** 820 * Constructor. 821 */ 822 TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : 823 dataVector(statusReturn), 824 idBlockVector(statusReturn), 825 variablesVector(statusReturn), 826 segmentObjects(statusReturn) 827 { 828 idBlockVector.setDeleter(uhash_deleteUnicodeString); 829 curData = NULL; 830 compoundFilter = NULL; 831 parseData = NULL; 832 variableNames.setValueDeleter(uhash_deleteUnicodeString); 833 } 834 835 /** 836 * Destructor. 837 */ 838 TransliteratorParser::~TransliteratorParser() { 839 while (!dataVector.isEmpty()) 840 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 841 delete compoundFilter; 842 delete parseData; 843 while (!variablesVector.isEmpty()) 844 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 845 } 846 847 void 848 TransliteratorParser::parse(const UnicodeString& rules, 849 UTransDirection transDirection, 850 UParseError& pe, 851 UErrorCode& ec) { 852 if (U_SUCCESS(ec)) { 853 parseRules(rules, transDirection, ec); 854 pe = parseError; 855 } 856 } 857 858 /** 859 * Return the compound filter parsed by parse(). Caller owns result. 860 */ 861 UnicodeSet* TransliteratorParser::orphanCompoundFilter() { 862 UnicodeSet* f = compoundFilter; 863 compoundFilter = NULL; 864 return f; 865 } 866 867 //---------------------------------------------------------------------- 868 // Private implementation 869 //---------------------------------------------------------------------- 870 871 /** 872 * Parse the given string as a sequence of rules, separated by newline 873 * characters ('\n'), and cause this object to implement those rules. Any 874 * previous rules are discarded. Typically this method is called exactly 875 * once, during construction. 876 * @exception IllegalArgumentException if there is a syntax error in the 877 * rules 878 */ 879 void TransliteratorParser::parseRules(const UnicodeString& rule, 880 UTransDirection theDirection, 881 UErrorCode& status) 882 { 883 // Clear error struct 884 uprv_memset(&parseError, 0, sizeof(parseError)); 885 parseError.line = parseError.offset = -1; 886 887 UBool parsingIDs = TRUE; 888 int32_t ruleCount = 0; 889 890 while (!dataVector.isEmpty()) { 891 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 892 } 893 if (U_FAILURE(status)) { 894 return; 895 } 896 897 idBlockVector.removeAllElements(); 898 curData = NULL; 899 direction = theDirection; 900 ruleCount = 0; 901 902 delete compoundFilter; 903 compoundFilter = NULL; 904 905 while (!variablesVector.isEmpty()) { 906 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 907 } 908 variableNames.removeAll(); 909 parseData = new ParseData(0, &variablesVector, &variableNames); 910 if (parseData == NULL) { 911 status = U_MEMORY_ALLOCATION_ERROR; 912 return; 913 } 914 915 dotStandIn = (UChar) -1; 916 917 UnicodeString *tempstr = NULL; // used for memory allocation error checking 918 UnicodeString str; // scratch 919 UnicodeString idBlockResult; 920 int32_t pos = 0; 921 int32_t limit = rule.length(); 922 923 // The compound filter offset is an index into idBlockResult. 924 // If it is 0, then the compound filter occurred at the start, 925 // and it is the offset to the _start_ of the compound filter 926 // pattern. Otherwise it is the offset to the _limit_ of the 927 // compound filter pattern within idBlockResult. 928 compoundFilter = NULL; 929 int32_t compoundFilterOffset = -1; 930 931 while (pos < limit && U_SUCCESS(status)) { 932 UChar c = rule.charAt(pos++); 933 if (PatternProps::isWhiteSpace(c)) { 934 // Ignore leading whitespace. 935 continue; 936 } 937 // Skip lines starting with the comment character 938 if (c == RULE_COMMENT_CHAR) { 939 pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1; 940 if (pos == 0) { 941 break; // No "\n" found; rest of rule is a commnet 942 } 943 continue; // Either fall out or restart with next line 944 } 945 946 // skip empty rules 947 if (c == END_OF_RULE) 948 continue; 949 950 // keep track of how many rules we've seen 951 ++ruleCount; 952 953 // We've found the start of a rule or ID. c is its first 954 // character, and pos points past c. 955 --pos; 956 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 957 // chars left. 958 if ((pos + ID_TOKEN_LEN + 1) <= limit && 959 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { 960 pos += ID_TOKEN_LEN; 961 c = rule.charAt(pos); 962 while (PatternProps::isWhiteSpace(c) && pos < limit) { 963 ++pos; 964 c = rule.charAt(pos); 965 } 966 967 int32_t p = pos; 968 969 if (!parsingIDs) { 970 if (curData != NULL) { 971 if (direction == UTRANS_FORWARD) 972 dataVector.addElement(curData, status); 973 else 974 dataVector.insertElementAt(curData, 0, status); 975 curData = NULL; 976 } 977 parsingIDs = TRUE; 978 } 979 980 TransliteratorIDParser::SingleID* id = 981 TransliteratorIDParser::parseSingleID(rule, p, direction, status); 982 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { 983 // Successful ::ID parse. 984 985 if (direction == UTRANS_FORWARD) { 986 idBlockResult.append(id->canonID).append(END_OF_RULE); 987 } else { 988 idBlockResult.insert(0, END_OF_RULE); 989 idBlockResult.insert(0, id->canonID); 990 } 991 992 } else { 993 // Couldn't parse an ID. Try to parse a global filter 994 int32_t withParens = -1; 995 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL); 996 if (f != NULL) { 997 if (ICU_Utility::parseChar(rule, p, END_OF_RULE) 998 && (direction == UTRANS_FORWARD) == (withParens == 0)) 999 { 1000 if (compoundFilter != NULL) { 1001 // Multiple compound filters 1002 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); 1003 delete f; 1004 } else { 1005 compoundFilter = f; 1006 compoundFilterOffset = ruleCount; 1007 } 1008 } else { 1009 delete f; 1010 } 1011 } else { 1012 // Invalid ::id 1013 // Can be parsed as neither an ID nor a global filter 1014 syntaxError(U_INVALID_ID, rule, pos, status); 1015 } 1016 } 1017 delete id; 1018 pos = p; 1019 } else { 1020 if (parsingIDs) { 1021 tempstr = new UnicodeString(idBlockResult); 1022 // NULL pointer check 1023 if (tempstr == NULL) { 1024 status = U_MEMORY_ALLOCATION_ERROR; 1025 return; 1026 } 1027 if (direction == UTRANS_FORWARD) 1028 idBlockVector.addElement(tempstr, status); 1029 else 1030 idBlockVector.insertElementAt(tempstr, 0, status); 1031 idBlockResult.remove(); 1032 parsingIDs = FALSE; 1033 curData = new TransliterationRuleData(status); 1034 // NULL pointer check 1035 if (curData == NULL) { 1036 status = U_MEMORY_ALLOCATION_ERROR; 1037 return; 1038 } 1039 parseData->data = curData; 1040 1041 // By default, rules use part of the private use area 1042 // E000..F8FF for variables and other stand-ins. Currently 1043 // the range F000..F8FF is typically sufficient. The 'use 1044 // variable range' pragma allows rule sets to modify this. 1045 setVariableRange(0xF000, 0xF8FF, status); 1046 } 1047 1048 if (resemblesPragma(rule, pos, limit)) { 1049 int32_t ppp = parsePragma(rule, pos, limit, status); 1050 if (ppp < 0) { 1051 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); 1052 } 1053 pos = ppp; 1054 // Parse a rule 1055 } else { 1056 pos = parseRule(rule, pos, limit, status); 1057 } 1058 } 1059 } 1060 1061 if (parsingIDs && idBlockResult.length() > 0) { 1062 tempstr = new UnicodeString(idBlockResult); 1063 // NULL pointer check 1064 if (tempstr == NULL) { 1065 status = U_MEMORY_ALLOCATION_ERROR; 1066 return; 1067 } 1068 if (direction == UTRANS_FORWARD) 1069 idBlockVector.addElement(tempstr, status); 1070 else 1071 idBlockVector.insertElementAt(tempstr, 0, status); 1072 } 1073 else if (!parsingIDs && curData != NULL) { 1074 if (direction == UTRANS_FORWARD) 1075 dataVector.addElement(curData, status); 1076 else 1077 dataVector.insertElementAt(curData, 0, status); 1078 } 1079 1080 if (U_SUCCESS(status)) { 1081 // Convert the set vector to an array 1082 int32_t i, dataVectorSize = dataVector.size(); 1083 for (i = 0; i < dataVectorSize; i++) { 1084 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1085 data->variablesLength = variablesVector.size(); 1086 if (data->variablesLength == 0) { 1087 data->variables = 0; 1088 } else { 1089 data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*)); 1090 // NULL pointer check 1091 if (data->variables == NULL) { 1092 status = U_MEMORY_ALLOCATION_ERROR; 1093 return; 1094 } 1095 data->variablesAreOwned = (i == 0); 1096 } 1097 1098 for (int32_t j = 0; j < data->variablesLength; j++) { 1099 data->variables[j] = 1100 ((UnicodeSet*)variablesVector.elementAt(j)); 1101 } 1102 1103 data->variableNames.removeAll(); 1104 int32_t pos = -1; 1105 const UHashElement* he = variableNames.nextElement(pos); 1106 while (he != NULL) { 1107 UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone()); 1108 if (tempus == NULL) { 1109 status = U_MEMORY_ALLOCATION_ERROR; 1110 return; 1111 } 1112 data->variableNames.put(*((UnicodeString*)(he->key.pointer)), 1113 tempus, status); 1114 he = variableNames.nextElement(pos); 1115 } 1116 } 1117 variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed 1118 1119 // Index the rules 1120 if (compoundFilter != NULL) { 1121 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || 1122 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { 1123 status = U_MISPLACED_COMPOUND_FILTER; 1124 } 1125 } 1126 1127 for (i = 0; i < dataVectorSize; i++) { 1128 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1129 data->ruleSet.freeze(parseError, status); 1130 } 1131 if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) { 1132 idBlockVector.removeElementAt(0); 1133 } 1134 } 1135 } 1136 1137 /** 1138 * Set the variable range to [start, end] (inclusive). 1139 */ 1140 void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { 1141 if (start > end || start < 0 || end > 0xFFFF) { 1142 status = U_MALFORMED_PRAGMA; 1143 return; 1144 } 1145 1146 curData->variablesBase = (UChar) start; 1147 if (dataVector.size() == 0) { 1148 variableNext = (UChar) start; 1149 variableLimit = (UChar) (end + 1); 1150 } 1151 } 1152 1153 /** 1154 * Assert that the given character is NOT within the variable range. 1155 * If it is, return FALSE. This is neccesary to ensure that the 1156 * variable range does not overlap characters used in a rule. 1157 */ 1158 UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { 1159 return !(ch >= curData->variablesBase && ch < variableLimit); 1160 } 1161 1162 /** 1163 * Set the maximum backup to 'backup', in response to a pragma 1164 * statement. 1165 */ 1166 void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { 1167 //TODO Finish 1168 } 1169 1170 /** 1171 * Begin normalizing all rules using the given mode, in response 1172 * to a pragma statement. 1173 */ 1174 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { 1175 //TODO Finish 1176 } 1177 1178 static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " 1179 1180 static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" 1181 1182 static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" 1183 1184 static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" 1185 1186 static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" 1187 1188 /** 1189 * Return true if the given rule looks like a pragma. 1190 * @param pos offset to the first non-whitespace character 1191 * of the rule. 1192 * @param limit pointer past the last character of the rule. 1193 */ 1194 UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { 1195 // Must start with /use\s/i 1196 return ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_USE, NULL) >= 0; 1197 } 1198 1199 /** 1200 * Parse a pragma. This method assumes resemblesPragma() has 1201 * already returned true. 1202 * @param pos offset to the first non-whitespace character 1203 * of the rule. 1204 * @param limit pointer past the last character of the rule. 1205 * @return the position index after the final ';' of the pragma, 1206 * or -1 on failure. 1207 */ 1208 int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1209 int32_t array[2]; 1210 1211 // resemblesPragma() has already returned true, so we 1212 // know that pos points to /use\s/i; we can skip 4 characters 1213 // immediately 1214 pos += 4; 1215 1216 // Here are the pragmas we recognize: 1217 // use variable range 0xE000 0xEFFF; 1218 // use maximum backup 16; 1219 // use nfd rules; 1220 // use nfc rules; 1221 int p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_VARIABLE_RANGE, array); 1222 if (p >= 0) { 1223 setVariableRange(array[0], array[1], status); 1224 return p; 1225 } 1226 1227 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_MAXIMUM_BACKUP, array); 1228 if (p >= 0) { 1229 pragmaMaximumBackup(array[0]); 1230 return p; 1231 } 1232 1233 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFD_RULES, NULL); 1234 if (p >= 0) { 1235 pragmaNormalizeRules(UNORM_NFD); 1236 return p; 1237 } 1238 1239 p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFC_RULES, NULL); 1240 if (p >= 0) { 1241 pragmaNormalizeRules(UNORM_NFC); 1242 return p; 1243 } 1244 1245 // Syntax error: unable to parse pragma 1246 return -1; 1247 } 1248 1249 /** 1250 * MAIN PARSER. Parse the next rule in the given rule string, starting 1251 * at pos. Return the index after the last character parsed. Do not 1252 * parse characters at or after limit. 1253 * 1254 * Important: The character at pos must be a non-whitespace character 1255 * that is not the comment character. 1256 * 1257 * This method handles quoting, escaping, and whitespace removal. It 1258 * parses the end-of-rule character. It recognizes context and cursor 1259 * indicators. Once it does a lexical breakdown of the rule at pos, it 1260 * creates a rule object and adds it to our rule list. 1261 */ 1262 int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1263 // Locate the left side, operator, and right side 1264 int32_t start = pos; 1265 UChar op = 0; 1266 int32_t i; 1267 1268 // Set up segments data 1269 segmentStandins.truncate(0); 1270 segmentObjects.removeAllElements(); 1271 1272 // Use pointers to automatics to make swapping possible. 1273 RuleHalf _left(*this), _right(*this); 1274 RuleHalf* left = &_left; 1275 RuleHalf* right = &_right; 1276 1277 undefinedVariableName.remove(); 1278 pos = left->parse(rule, pos, limit, status); 1279 if (U_FAILURE(status)) { 1280 return start; 1281 } 1282 1283 if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) { 1284 return syntaxError(U_MISSING_OPERATOR, rule, start, status); 1285 } 1286 ++pos; 1287 1288 // Found an operator char. Check for forward-reverse operator. 1289 if (op == REVERSE_RULE_OP && 1290 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1291 ++pos; 1292 op = FWDREV_RULE_OP; 1293 } 1294 1295 // Translate alternate op characters. 1296 switch (op) { 1297 case ALT_FORWARD_RULE_OP: 1298 op = FORWARD_RULE_OP; 1299 break; 1300 case ALT_REVERSE_RULE_OP: 1301 op = REVERSE_RULE_OP; 1302 break; 1303 case ALT_FWDREV_RULE_OP: 1304 op = FWDREV_RULE_OP; 1305 break; 1306 } 1307 1308 pos = right->parse(rule, pos, limit, status); 1309 if (U_FAILURE(status)) { 1310 return start; 1311 } 1312 1313 if (pos < limit) { 1314 if (rule.charAt(--pos) == END_OF_RULE) { 1315 ++pos; 1316 } else { 1317 // RuleHalf parser must have terminated at an operator 1318 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 1319 } 1320 } 1321 1322 if (op == VARIABLE_DEF_OP) { 1323 // LHS is the name. RHS is a single character, either a literal 1324 // or a set (already parsed). If RHS is longer than one 1325 // character, it is either a multi-character string, or multiple 1326 // sets, or a mixture of chars and sets -- syntax error. 1327 1328 // We expect to see a single undefined variable (the one being 1329 // defined). 1330 if (undefinedVariableName.length() == 0) { 1331 // "Missing '$' or duplicate definition" 1332 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); 1333 } 1334 if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { 1335 // "Malformed LHS" 1336 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1337 } 1338 if (left->anchorStart || left->anchorEnd || 1339 right->anchorStart || right->anchorEnd) { 1340 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1341 } 1342 // We allow anything on the right, including an empty string. 1343 UnicodeString* value = new UnicodeString(right->text); 1344 // NULL pointer check 1345 if (value == NULL) { 1346 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1347 } 1348 variableNames.put(undefinedVariableName, value, status); 1349 ++variableLimit; 1350 return pos; 1351 } 1352 1353 // If this is not a variable definition rule, we shouldn't have 1354 // any undefined variable names. 1355 if (undefinedVariableName.length() != 0) { 1356 return syntaxError(// "Undefined variable $" + undefinedVariableName, 1357 U_UNDEFINED_VARIABLE, 1358 rule, start, status); 1359 } 1360 1361 // Verify segments 1362 if (segmentStandins.length() > segmentObjects.size()) { 1363 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); 1364 } 1365 for (i=0; i<segmentStandins.length(); ++i) { 1366 if (segmentStandins.charAt(i) == 0) { 1367 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1368 } 1369 } 1370 for (i=0; i<segmentObjects.size(); ++i) { 1371 if (segmentObjects.elementAt(i) == NULL) { 1372 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1373 } 1374 } 1375 1376 // If the direction we want doesn't match the rule 1377 // direction, do nothing. 1378 if (op != FWDREV_RULE_OP && 1379 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) { 1380 return pos; 1381 } 1382 1383 // Transform the rule into a forward rule by swapping the 1384 // sides if necessary. 1385 if (direction == UTRANS_REVERSE) { 1386 left = &_right; 1387 right = &_left; 1388 } 1389 1390 // Remove non-applicable elements in forward-reverse 1391 // rules. Bidirectional rules ignore elements that do not 1392 // apply. 1393 if (op == FWDREV_RULE_OP) { 1394 right->removeContext(); 1395 left->cursor = -1; 1396 left->cursorOffset = 0; 1397 } 1398 1399 // Normalize context 1400 if (left->ante < 0) { 1401 left->ante = 0; 1402 } 1403 if (left->post < 0) { 1404 left->post = left->text.length(); 1405 } 1406 1407 // Context is only allowed on the input side. Cursors are only 1408 // allowed on the output side. Segment delimiters can only appear 1409 // on the left, and references on the right. Cursor offset 1410 // cannot appear without an explicit cursor. Cursor offset 1411 // cannot place the cursor outside the limits of the context. 1412 // Anchors are only allowed on the input side. 1413 if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || 1414 (right->cursorOffset != 0 && right->cursor < 0) || 1415 // - The following two checks were used to ensure that the 1416 // - the cursor offset stayed within the ante- or postcontext. 1417 // - However, with the addition of quantifiers, we have to 1418 // - allow arbitrary cursor offsets and do runtime checking. 1419 //(right->cursorOffset > (left->text.length() - left->post)) || 1420 //(-right->cursorOffset > left->ante) || 1421 right->anchorStart || right->anchorEnd || 1422 !left->isValidInput(*this) || !right->isValidOutput(*this) || 1423 left->ante > left->post) { 1424 1425 return syntaxError(U_MALFORMED_RULE, rule, start, status); 1426 } 1427 1428 // Flatten segment objects vector to an array 1429 UnicodeFunctor** segmentsArray = NULL; 1430 if (segmentObjects.size() > 0) { 1431 segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *)); 1432 // Null pointer check 1433 if (segmentsArray == NULL) { 1434 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1435 } 1436 segmentObjects.toArray((void**) segmentsArray); 1437 } 1438 TransliterationRule* temptr = new TransliterationRule( 1439 left->text, left->ante, left->post, 1440 right->text, right->cursor, right->cursorOffset, 1441 segmentsArray, 1442 segmentObjects.size(), 1443 left->anchorStart, left->anchorEnd, 1444 curData, 1445 status); 1446 //Null pointer check 1447 if (temptr == NULL) { 1448 uprv_free(segmentsArray); 1449 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1450 } 1451 1452 curData->ruleSet.addRule(temptr, status); 1453 1454 return pos; 1455 } 1456 1457 /** 1458 * Called by main parser upon syntax error. Search the rule string 1459 * for the probable end of the rule. Of course, if the error is that 1460 * the end of rule marker is missing, then the rule end will not be found. 1461 * In any case the rule start will be correctly reported. 1462 * @param msg error description 1463 * @param rule pattern string 1464 * @param start position of first character of current rule 1465 */ 1466 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, 1467 const UnicodeString& rule, 1468 int32_t pos, 1469 UErrorCode& status) 1470 { 1471 parseError.offset = pos; 1472 parseError.line = 0 ; /* we are not using line numbers */ 1473 1474 // for pre-context 1475 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; 1476 int32_t start = uprv_max(pos - LEN, 0); 1477 int32_t stop = pos; 1478 1479 rule.extract(start,stop-start,parseError.preContext); 1480 //null terminate the buffer 1481 parseError.preContext[stop-start] = 0; 1482 1483 //for post-context 1484 start = pos; 1485 stop = uprv_min(pos + LEN, rule.length()); 1486 1487 rule.extract(start,stop-start,parseError.postContext); 1488 //null terminate the buffer 1489 parseError.postContext[stop-start]= 0; 1490 1491 status = (UErrorCode)parseErrorCode; 1492 return pos; 1493 1494 } 1495 1496 /** 1497 * Parse a UnicodeSet out, store it, and return the stand-in character 1498 * used to represent it. 1499 */ 1500 UChar TransliteratorParser::parseSet(const UnicodeString& rule, 1501 ParsePosition& pos, 1502 UErrorCode& status) { 1503 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); 1504 // Null pointer check 1505 if (set == NULL) { 1506 status = U_MEMORY_ALLOCATION_ERROR; 1507 return (UChar)0x0000; // Return empty character with error. 1508 } 1509 set->compact(); 1510 return generateStandInFor(set, status); 1511 } 1512 1513 /** 1514 * Generate and return a stand-in for a new UnicodeFunctor. Store 1515 * the matcher (adopt it). 1516 */ 1517 UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { 1518 // assert(obj != null); 1519 1520 // Look up previous stand-in, if any. This is a short list 1521 // (typical n is 0, 1, or 2); linear search is optimal. 1522 for (int32_t i=0; i<variablesVector.size(); ++i) { 1523 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison 1524 return (UChar) (curData->variablesBase + i); 1525 } 1526 } 1527 1528 if (variableNext >= variableLimit) { 1529 delete adopted; 1530 status = U_VARIABLE_RANGE_EXHAUSTED; 1531 return 0; 1532 } 1533 variablesVector.addElement(adopted, status); 1534 return variableNext++; 1535 } 1536 1537 /** 1538 * Return the standin for segment seg (1-based). 1539 */ 1540 UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { 1541 // Special character used to indicate an empty spot 1542 UChar empty = curData->variablesBase - 1; 1543 while (segmentStandins.length() < seg) { 1544 segmentStandins.append(empty); 1545 } 1546 UChar c = segmentStandins.charAt(seg-1); 1547 if (c == empty) { 1548 if (variableNext >= variableLimit) { 1549 status = U_VARIABLE_RANGE_EXHAUSTED; 1550 return 0; 1551 } 1552 c = variableNext++; 1553 // Set a placeholder in the master variables vector that will be 1554 // filled in later by setSegmentObject(). We know that we will get 1555 // called first because setSegmentObject() will call us. 1556 variablesVector.addElement((void*) NULL, status); 1557 segmentStandins.setCharAt(seg-1, c); 1558 } 1559 return c; 1560 } 1561 1562 /** 1563 * Set the object for segment seg (1-based). 1564 */ 1565 void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { 1566 // Since we call parseSection() recursively, nested 1567 // segments will result in segment i+1 getting parsed 1568 // and stored before segment i; be careful with the 1569 // vector handling here. 1570 if (segmentObjects.size() < seg) { 1571 segmentObjects.setSize(seg, status); 1572 } 1573 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; 1574 if (segmentObjects.elementAt(seg-1) != NULL || 1575 variablesVector.elementAt(index) != NULL) { 1576 // should never happen 1577 status = U_INTERNAL_TRANSLITERATOR_ERROR; 1578 return; 1579 } 1580 segmentObjects.setElementAt(adopted, seg-1); 1581 variablesVector.setElementAt(adopted, index); 1582 } 1583 1584 /** 1585 * Return the stand-in for the dot set. It is allocated the first 1586 * time and reused thereafter. 1587 */ 1588 UChar TransliteratorParser::getDotStandIn(UErrorCode& status) { 1589 if (dotStandIn == (UChar) -1) { 1590 UnicodeSet* tempus = new UnicodeSet(DOT_SET, status); 1591 // Null pointer check. 1592 if (tempus == NULL) { 1593 status = U_MEMORY_ALLOCATION_ERROR; 1594 return (UChar)0x0000; 1595 } 1596 dotStandIn = generateStandInFor(tempus, status); 1597 } 1598 return dotStandIn; 1599 } 1600 1601 /** 1602 * Append the value of the given variable name to the given 1603 * UnicodeString. 1604 */ 1605 void TransliteratorParser::appendVariableDef(const UnicodeString& name, 1606 UnicodeString& buf, 1607 UErrorCode& status) { 1608 const UnicodeString* s = (const UnicodeString*) variableNames.get(name); 1609 if (s == NULL) { 1610 // We allow one undefined variable so that variable definition 1611 // statements work. For the first undefined variable we return 1612 // the special placeholder variableLimit-1, and save the variable 1613 // name. 1614 if (undefinedVariableName.length() == 0) { 1615 undefinedVariableName = name; 1616 if (variableNext >= variableLimit) { 1617 // throw new RuntimeException("Private use variables exhausted"); 1618 status = U_ILLEGAL_ARGUMENT_ERROR; 1619 return; 1620 } 1621 buf.append((UChar) --variableLimit); 1622 } else { 1623 //throw new IllegalArgumentException("Undefined variable $" 1624 // + name); 1625 status = U_ILLEGAL_ARGUMENT_ERROR; 1626 return; 1627 } 1628 } else { 1629 buf.append(*s); 1630 } 1631 } 1632 1633 /** 1634 * Glue method to get around access restrictions in C++. 1635 */ 1636 /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1637 return Transliterator::createBasicInstance(id, canonID); 1638 }*/ 1639 1640 U_NAMESPACE_END 1641 1642 U_CAPI int32_t 1643 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) { 1644 U_NAMESPACE_USE 1645 1646 //const UChar *sourceStart = source; 1647 const UChar *targetStart = target; 1648 const UChar *sourceLimit = source+sourceLen; 1649 UChar *targetLimit = target+sourceLen; 1650 UChar32 c = 0; 1651 UBool quoted = FALSE; 1652 int32_t index; 1653 1654 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR); 1655 1656 /* read the rules into the buffer */ 1657 while (source < sourceLimit) 1658 { 1659 index=0; 1660 U16_NEXT_UNSAFE(source, index, c); 1661 source+=index; 1662 if(c == QUOTE) { 1663 quoted = (UBool)!quoted; 1664 } 1665 else if (!quoted) { 1666 if (c == RULE_COMMENT_CHAR) { 1667 /* skip comments and all preceding spaces */ 1668 while (targetStart < target && *(target - 1) == 0x0020) { 1669 target--; 1670 } 1671 do { 1672 c = *(source++); 1673 } 1674 while (c != CR && c != LF); 1675 } 1676 else if (c == ESCAPE) { 1677 UChar32 c2 = *source; 1678 if (c2 == CR || c2 == LF) { 1679 /* A backslash at the end of a line. */ 1680 /* Since we're stripping lines, ignore the backslash. */ 1681 source++; 1682 continue; 1683 } 1684 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */ 1685 int32_t escapeOffset = 0; 1686 UnicodeString escapedStr(source, 5); 1687 c2 = escapedStr.unescapeAt(escapeOffset); 1688 1689 if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) 1690 { 1691 *status = U_PARSE_ERROR; 1692 return 0; 1693 } 1694 if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) { 1695 /* It was escaped for a reason. Write what it was suppose to be. */ 1696 source+=5; 1697 c = c2; 1698 } 1699 } 1700 else if (c2 == QUOTE) { 1701 /* \' seen. Make sure we don't do anything when we see it again. */ 1702 quoted = (UBool)!quoted; 1703 } 1704 } 1705 } 1706 if (c == CR || c == LF) 1707 { 1708 /* ignore spaces carriage returns, and all leading spaces on the next line. 1709 * and line feed unless in the form \uXXXX 1710 */ 1711 quoted = FALSE; 1712 while (source < sourceLimit) { 1713 c = *(source); 1714 if (c != CR && c != LF && c != 0x0020) { 1715 break; 1716 } 1717 source++; 1718 } 1719 continue; 1720 } 1721 1722 /* Append UChar * after dissembling if c > 0xffff*/ 1723 index=0; 1724 U16_APPEND_UNSAFE(target, index, c); 1725 target+=index; 1726 } 1727 if (target < targetLimit) { 1728 *target = 0; 1729 } 1730 return (int32_t)(target-targetStart); 1731 } 1732 1733 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1734