1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/uobject.h" 18 #include "unicode/parseerr.h" 19 #include "unicode/parsepos.h" 20 #include "unicode/putil.h" 21 #include "unicode/uchar.h" 22 #include "unicode/ustring.h" 23 #include "unicode/uniset.h" 24 #include "unicode/utf16.h" 25 #include "cstring.h" 26 #include "funcrepl.h" 27 #include "hash.h" 28 #include "quant.h" 29 #include "rbt.h" 30 #include "rbt_data.h" 31 #include "rbt_pars.h" 32 #include "rbt_rule.h" 33 #include "strmatch.h" 34 #include "strrepl.h" 35 #include "unicode/symtable.h" 36 #include "tridpars.h" 37 #include "uvector.h" 38 #include "hash.h" 39 #include "patternprops.h" 40 #include "util.h" 41 #include "cmemory.h" 42 #include "uprops.h" 43 #include "putilimp.h" 44 45 // Operators 46 #define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/ 47 #define FORWARD_RULE_OP ((UChar)0x003E) /*>*/ 48 #define REVERSE_RULE_OP ((UChar)0x003C) /*<*/ 49 #define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op 50 51 // Other special characters 52 #define QUOTE ((UChar)0x0027) /*'*/ 53 #define ESCAPE ((UChar)0x005C) /*\*/ 54 #define END_OF_RULE ((UChar)0x003B) /*;*/ 55 #define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/ 56 57 #define SEGMENT_OPEN ((UChar)0x0028) /*(*/ 58 #define SEGMENT_CLOSE ((UChar)0x0029) /*)*/ 59 #define CONTEXT_ANTE ((UChar)0x007B) /*{*/ 60 #define CONTEXT_POST ((UChar)0x007D) /*}*/ 61 #define CURSOR_POS ((UChar)0x007C) /*|*/ 62 #define CURSOR_OFFSET ((UChar)0x0040) /*@*/ 63 #define ANCHOR_START ((UChar)0x005E) /*^*/ 64 #define KLEENE_STAR ((UChar)0x002A) /***/ 65 #define ONE_OR_MORE ((UChar)0x002B) /*+*/ 66 #define ZERO_OR_ONE ((UChar)0x003F) /*?*/ 67 68 #define DOT ((UChar)46) /*.*/ 69 70 static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]"; 71 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90, 72 108, 58, 93, 92, 114, 92, 110, 36, 93, 0 73 }; 74 75 // A function is denoted &Source-Target/Variant(text) 76 #define FUNCTION ((UChar)38) /*&*/ 77 78 // Aliases for some of the syntax characters. These are provided so 79 // transliteration rules can be expressed in XML without clashing with 80 // XML syntax characters '<', '>', and '&'. 81 #define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow 82 #define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow 83 #define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow 84 #define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta) 85 86 // Special characters disallowed at the top level 87 static const UChar ILLEGAL_TOP[] = {41,0}; // ")" 88 89 // Special characters disallowed within a segment 90 static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@" 91 92 // Special characters disallowed within a function argument 93 static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@" 94 95 // By definition, the ANCHOR_END special character is a 96 // trailing SymbolTable.SYMBOL_REF character. 97 // private static final char ANCHOR_END = '$'; 98 99 static const UChar gOPERATORS[] = { // "=><" 100 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 101 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 102 0 103 }; 104 105 static const UChar HALF_ENDERS[] = { // "=><;" 106 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 107 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 108 END_OF_RULE, 109 0 110 }; 111 112 // These are also used in Transliterator::toRules() 113 static const int32_t ID_TOKEN_LEN = 2; 114 static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' 115 116 /* 117 commented out until we do real ::BEGIN/::END functionality 118 static const int32_t BEGIN_TOKEN_LEN = 5; 119 static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN' 120 121 static const int32_t END_TOKEN_LEN = 3; 122 static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END' 123 */ 124 125 U_NAMESPACE_BEGIN 126 127 //---------------------------------------------------------------------- 128 // BEGIN ParseData 129 //---------------------------------------------------------------------- 130 131 /** 132 * This class implements the SymbolTable interface. It is used 133 * during parsing to give UnicodeSet access to variables that 134 * have been defined so far. Note that it uses variablesVector, 135 * _not_ data.setVariables. 136 */ 137 class ParseData : public UMemory, public SymbolTable { 138 public: 139 const TransliterationRuleData* data; // alias 140 141 const UVector* variablesVector; // alias 142 143 const Hashtable* variableNames; // alias 144 145 ParseData(const TransliterationRuleData* data = 0, 146 const UVector* variablesVector = 0, 147 const Hashtable* variableNames = 0); 148 149 virtual ~ParseData(); 150 151 virtual const UnicodeString* lookup(const UnicodeString& s) const; 152 153 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 154 155 virtual UnicodeString parseReference(const UnicodeString& text, 156 ParsePosition& pos, int32_t limit) const; 157 /** 158 * Return true if the given character is a matcher standin or a plain 159 * character (non standin). 160 */ 161 UBool isMatcher(UChar32 ch); 162 163 /** 164 * Return true if the given character is a replacer standin or a plain 165 * character (non standin). 166 */ 167 UBool isReplacer(UChar32 ch); 168 169 private: 170 ParseData(const ParseData &other); // forbid copying of this class 171 ParseData &operator=(const ParseData &other); // forbid copying of this class 172 }; 173 174 ParseData::ParseData(const TransliterationRuleData* d, 175 const UVector* sets, 176 const Hashtable* vNames) : 177 data(d), variablesVector(sets), variableNames(vNames) {} 178 179 ParseData::~ParseData() {} 180 181 /** 182 * Implement SymbolTable API. 183 */ 184 const UnicodeString* ParseData::lookup(const UnicodeString& name) const { 185 return (const UnicodeString*) variableNames->get(name); 186 } 187 188 /** 189 * Implement SymbolTable API. 190 */ 191 const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { 192 // Note that we cannot use data.lookupSet() because the 193 // set array has not been constructed yet. 194 const UnicodeFunctor* set = NULL; 195 int32_t i = ch - data->variablesBase; 196 if (i >= 0 && i < variablesVector->size()) { 197 int32_t j = ch - data->variablesBase; 198 set = (j < variablesVector->size()) ? 199 (UnicodeFunctor*) variablesVector->elementAt(j) : 0; 200 } 201 return set; 202 } 203 204 /** 205 * Implement SymbolTable API. Parse out a symbol reference 206 * name. 207 */ 208 UnicodeString ParseData::parseReference(const UnicodeString& text, 209 ParsePosition& pos, int32_t limit) const { 210 int32_t start = pos.getIndex(); 211 int32_t i = start; 212 UnicodeString result; 213 while (i < limit) { 214 UChar c = text.charAt(i); 215 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 216 break; 217 } 218 ++i; 219 } 220 if (i == start) { // No valid name chars 221 return result; // Indicate failure with empty string 222 } 223 pos.setIndex(i); 224 text.extractBetween(start, i, result); 225 return result; 226 } 227 228 UBool ParseData::isMatcher(UChar32 ch) { 229 // Note that we cannot use data.lookup() because the 230 // set array has not been constructed yet. 231 int32_t i = ch - data->variablesBase; 232 if (i >= 0 && i < variablesVector->size()) { 233 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 234 return f != NULL && f->toMatcher() != NULL; 235 } 236 return TRUE; 237 } 238 239 /** 240 * Return true if the given character is a replacer standin or a plain 241 * character (non standin). 242 */ 243 UBool ParseData::isReplacer(UChar32 ch) { 244 // Note that we cannot use data.lookup() because the 245 // set array has not been constructed yet. 246 int i = ch - data->variablesBase; 247 if (i >= 0 && i < variablesVector->size()) { 248 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 249 return f != NULL && f->toReplacer() != NULL; 250 } 251 return TRUE; 252 } 253 254 //---------------------------------------------------------------------- 255 // BEGIN RuleHalf 256 //---------------------------------------------------------------------- 257 258 /** 259 * A class representing one side of a rule. This class knows how to 260 * parse half of a rule. It is tightly coupled to the method 261 * RuleBasedTransliterator.Parser.parseRule(). 262 */ 263 class RuleHalf : public UMemory { 264 265 public: 266 267 UnicodeString text; 268 269 int32_t cursor; // position of cursor in text 270 int32_t ante; // position of ante context marker '{' in text 271 int32_t post; // position of post context marker '}' in text 272 273 // Record the offset to the cursor either to the left or to the 274 // right of the key. This is indicated by characters on the output 275 // side that allow the cursor to be positioned arbitrarily within 276 // the matching text. For example, abc{def} > | @@@ xyz; changes 277 // def to xyz and moves the cursor to before abc. Offset characters 278 // must be at the start or end, and they cannot move the cursor past 279 // the ante- or postcontext text. Placeholders are only valid in 280 // output text. The length of the ante and post context is 281 // determined at runtime, because of supplementals and quantifiers. 282 int32_t cursorOffset; // only nonzero on output side 283 284 // Position of first CURSOR_OFFSET on _right_. This will be -1 285 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 286 int32_t cursorOffsetPos; 287 288 UBool anchorStart; 289 UBool anchorEnd; 290 291 /** 292 * The segment number from 1..n of the next '(' we see 293 * during parsing; 1-based. 294 */ 295 int32_t nextSegmentNumber; 296 297 TransliteratorParser& parser; 298 299 //-------------------------------------------------- 300 // Methods 301 302 RuleHalf(TransliteratorParser& parser); 303 ~RuleHalf(); 304 305 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 306 307 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 308 UnicodeString& buf, 309 const UnicodeString& illegal, 310 UBool isSegment, 311 UErrorCode& status); 312 313 /** 314 * Remove context. 315 */ 316 void removeContext(); 317 318 /** 319 * Return true if this half looks like valid output, that is, does not 320 * contain quantifiers or other special input-only elements. 321 */ 322 UBool isValidOutput(TransliteratorParser& parser); 323 324 /** 325 * Return true if this half looks like valid input, that is, does not 326 * contain functions or other special output-only elements. 327 */ 328 UBool isValidInput(TransliteratorParser& parser); 329 330 int syntaxError(UErrorCode code, 331 const UnicodeString& rule, 332 int32_t start, 333 UErrorCode& status) { 334 return parser.syntaxError(code, rule, start, status); 335 } 336 337 private: 338 // Disallowed methods; no impl. 339 RuleHalf(const RuleHalf&); 340 RuleHalf& operator=(const RuleHalf&); 341 }; 342 343 RuleHalf::RuleHalf(TransliteratorParser& p) : 344 parser(p) 345 { 346 cursor = -1; 347 ante = -1; 348 post = -1; 349 cursorOffset = 0; 350 cursorOffsetPos = 0; 351 anchorStart = anchorEnd = FALSE; 352 nextSegmentNumber = 1; 353 } 354 355 RuleHalf::~RuleHalf() { 356 } 357 358 /** 359 * Parse one side of a rule, stopping at either the limit, 360 * the END_OF_RULE character, or an operator. 361 * @return the index after the terminating character, or 362 * if limit was reached, limit 363 */ 364 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 365 int32_t start = pos; 366 text.truncate(0); 367 pos = parseSection(rule, pos, limit, text, UnicodeString(TRUE, ILLEGAL_TOP, -1), FALSE, status); 368 369 if (cursorOffset > 0 && cursor != cursorOffsetPos) { 370 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 371 } 372 373 return pos; 374 } 375 376 /** 377 * Parse a section of one side of a rule, stopping at either 378 * the limit, the END_OF_RULE character, an operator, or a 379 * segment close character. This method parses both a 380 * top-level rule half and a segment within such a rule half. 381 * It calls itself recursively to parse segments and nested 382 * segments. 383 * @param buf buffer into which to accumulate the rule pattern 384 * characters, either literal characters from the rule or 385 * standins for UnicodeMatcher objects including segments. 386 * @param illegal the set of special characters that is illegal during 387 * this parse. 388 * @param isSegment if true, then we've already seen a '(' and 389 * pos on entry points right after it. Accumulate everything 390 * up to the closing ')', put it in a segment matcher object, 391 * generate a standin for it, and add the standin to buf. As 392 * a side effect, update the segments vector with a reference 393 * to the segment matcher. This works recursively for nested 394 * segments. If isSegment is false, just accumulate 395 * characters into buf. 396 * @return the index after the terminating character, or 397 * if limit was reached, limit 398 */ 399 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 400 UnicodeString& buf, 401 const UnicodeString& illegal, 402 UBool isSegment, UErrorCode& status) { 403 int32_t start = pos; 404 ParsePosition pp; 405 UnicodeString scratch; 406 UBool done = FALSE; 407 int32_t quoteStart = -1; // Most recent 'single quoted string' 408 int32_t quoteLimit = -1; 409 int32_t varStart = -1; // Most recent $variableReference 410 int32_t varLimit = -1; 411 int32_t bufStart = buf.length(); 412 413 while (pos < limit && !done) { 414 // Since all syntax characters are in the BMP, fetching 415 // 16-bit code units suffices here. 416 UChar c = rule.charAt(pos++); 417 if (PatternProps::isWhiteSpace(c)) { 418 // Ignore whitespace. Note that this is not Unicode 419 // spaces, but Java spaces -- a subset, representing 420 // whitespace likely to be seen in code. 421 continue; 422 } 423 if (u_strchr(HALF_ENDERS, c) != NULL) { 424 if (isSegment) { 425 // Unclosed segment 426 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); 427 } 428 break; 429 } 430 if (anchorEnd) { 431 // Text after a presumed end anchor is a syntax err 432 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); 433 } 434 if (UnicodeSet::resemblesPattern(rule, pos-1)) { 435 pp.setIndex(pos-1); // Backup to opening '[' 436 buf.append(parser.parseSet(rule, pp, status)); 437 if (U_FAILURE(status)) { 438 return syntaxError(U_MALFORMED_SET, rule, start, status); 439 } 440 pos = pp.getIndex(); 441 continue; 442 } 443 // Handle escapes 444 if (c == ESCAPE) { 445 if (pos == limit) { 446 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); 447 } 448 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' 449 if (escaped == (UChar32) -1) { 450 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); 451 } 452 if (!parser.checkVariableRange(escaped)) { 453 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 454 } 455 buf.append(escaped); 456 continue; 457 } 458 // Handle quoted matter 459 if (c == QUOTE) { 460 int32_t iq = rule.indexOf(QUOTE, pos); 461 if (iq == pos) { 462 buf.append(c); // Parse [''] outside quotes as ['] 463 ++pos; 464 } else { 465 /* This loop picks up a run of quoted text of the 466 * form 'aaaa' each time through. If this run 467 * hasn't really ended ('aaaa''bbbb') then it keeps 468 * looping, each time adding on a new run. When it 469 * reaches the final quote it breaks. 470 */ 471 quoteStart = buf.length(); 472 for (;;) { 473 if (iq < 0) { 474 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); 475 } 476 scratch.truncate(0); 477 rule.extractBetween(pos, iq, scratch); 478 buf.append(scratch); 479 pos = iq+1; 480 if (pos < limit && rule.charAt(pos) == QUOTE) { 481 // Parse [''] inside quotes as ['] 482 iq = rule.indexOf(QUOTE, pos+1); 483 // Continue looping 484 } else { 485 break; 486 } 487 } 488 quoteLimit = buf.length(); 489 490 for (iq=quoteStart; iq<quoteLimit; ++iq) { 491 if (!parser.checkVariableRange(buf.charAt(iq))) { 492 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 493 } 494 } 495 } 496 continue; 497 } 498 499 if (!parser.checkVariableRange(c)) { 500 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 501 } 502 503 if (illegal.indexOf(c) >= 0) { 504 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); 505 } 506 507 switch (c) { 508 509 //------------------------------------------------------ 510 // Elements allowed within and out of segments 511 //------------------------------------------------------ 512 case ANCHOR_START: 513 if (buf.length() == 0 && !anchorStart) { 514 anchorStart = TRUE; 515 } else { 516 return syntaxError(U_MISPLACED_ANCHOR_START, 517 rule, start, status); 518 } 519 break; 520 case SEGMENT_OPEN: 521 { 522 // bufSegStart is the offset in buf to the first 523 // character of the segment we are parsing. 524 int32_t bufSegStart = buf.length(); 525 526 // Record segment number now, since nextSegmentNumber 527 // will be incremented during the call to parseSection 528 // if there are nested segments. 529 int32_t segmentNumber = nextSegmentNumber++; // 1-based 530 531 // Parse the segment 532 pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status); 533 534 // After parsing a segment, the relevant characters are 535 // in buf, starting at offset bufSegStart. Extract them 536 // into a string matcher, and replace them with a 537 // standin for that matcher. 538 StringMatcher* m = 539 new StringMatcher(buf, bufSegStart, buf.length(), 540 segmentNumber, *parser.curData); 541 if (m == NULL) { 542 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 543 } 544 545 // Record and associate object and segment number 546 parser.setSegmentObject(segmentNumber, m, status); 547 buf.truncate(bufSegStart); 548 buf.append(parser.getSegmentStandin(segmentNumber, status)); 549 } 550 break; 551 case FUNCTION: 552 case ALT_FUNCTION: 553 { 554 int32_t iref = pos; 555 TransliteratorIDParser::SingleID* single = 556 TransliteratorIDParser::parseFilterID(rule, iref); 557 // The next character MUST be a segment open 558 if (single == NULL || 559 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { 560 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 561 } 562 563 Transliterator *t = single->createInstance(); 564 delete single; 565 if (t == NULL) { 566 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 567 } 568 569 // bufSegStart is the offset in buf to the first 570 // character of the segment we are parsing. 571 int32_t bufSegStart = buf.length(); 572 573 // Parse the segment 574 pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status); 575 576 // After parsing a segment, the relevant characters are 577 // in buf, starting at offset bufSegStart. 578 UnicodeString output; 579 buf.extractBetween(bufSegStart, buf.length(), output); 580 FunctionReplacer *r = 581 new FunctionReplacer(t, new StringReplacer(output, parser.curData)); 582 if (r == NULL) { 583 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 584 } 585 586 // Replace the buffer contents with a stand-in 587 buf.truncate(bufSegStart); 588 buf.append(parser.generateStandInFor(r, status)); 589 } 590 break; 591 case SymbolTable::SYMBOL_REF: 592 // Handle variable references and segment references "$1" .. "$9" 593 { 594 // A variable reference must be followed immediately 595 // by a Unicode identifier start and zero or more 596 // Unicode identifier part characters, or by a digit 597 // 1..9 if it is a segment reference. 598 if (pos == limit) { 599 // A variable ref character at the end acts as 600 // an anchor to the context limit, as in perl. 601 anchorEnd = TRUE; 602 break; 603 } 604 // Parse "$1" "$2" .. "$9" .. (no upper limit) 605 c = rule.charAt(pos); 606 int32_t r = u_digit(c, 10); 607 if (r >= 1 && r <= 9) { 608 r = ICU_Utility::parseNumber(rule, pos, 10); 609 if (r < 0) { 610 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, 611 rule, start, status); 612 } 613 buf.append(parser.getSegmentStandin(r, status)); 614 } else { 615 pp.setIndex(pos); 616 UnicodeString name = parser.parseData-> 617 parseReference(rule, pp, limit); 618 if (name.length() == 0) { 619 // This means the '$' was not followed by a 620 // valid name. Try to interpret it as an 621 // end anchor then. If this also doesn't work 622 // (if we see a following character) then signal 623 // an error. 624 anchorEnd = TRUE; 625 break; 626 } 627 pos = pp.getIndex(); 628 // If this is a variable definition statement, 629 // then the LHS variable will be undefined. In 630 // that case appendVariableDef() will append the 631 // special placeholder char variableLimit-1. 632 varStart = buf.length(); 633 parser.appendVariableDef(name, buf, status); 634 varLimit = buf.length(); 635 } 636 } 637 break; 638 case DOT: 639 buf.append(parser.getDotStandIn(status)); 640 break; 641 case KLEENE_STAR: 642 case ONE_OR_MORE: 643 case ZERO_OR_ONE: 644 // Quantifiers. We handle single characters, quoted strings, 645 // variable references, and segments. 646 // a+ matches aaa 647 // 'foo'+ matches foofoofoo 648 // $v+ matches xyxyxy if $v == xy 649 // (seg)+ matches segsegseg 650 { 651 if (isSegment && buf.length() == bufStart) { 652 // The */+ immediately follows '(' 653 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); 654 } 655 656 int32_t qstart, qlimit; 657 // The */+ follows an isolated character or quote 658 // or variable reference 659 if (buf.length() == quoteLimit) { 660 // The */+ follows a 'quoted string' 661 qstart = quoteStart; 662 qlimit = quoteLimit; 663 } else if (buf.length() == varLimit) { 664 // The */+ follows a $variableReference 665 qstart = varStart; 666 qlimit = varLimit; 667 } else { 668 // The */+ follows a single character, possibly 669 // a segment standin 670 qstart = buf.length() - 1; 671 qlimit = qstart + 1; 672 } 673 674 UnicodeFunctor *m = 675 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData); 676 if (m == NULL) { 677 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 678 } 679 int32_t min = 0; 680 int32_t max = Quantifier::MAX; 681 switch (c) { 682 case ONE_OR_MORE: 683 min = 1; 684 break; 685 case ZERO_OR_ONE: 686 min = 0; 687 max = 1; 688 break; 689 // case KLEENE_STAR: 690 // do nothing -- min, max already set 691 } 692 m = new Quantifier(m, min, max); 693 if (m == NULL) { 694 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 695 } 696 buf.truncate(qstart); 697 buf.append(parser.generateStandInFor(m, status)); 698 } 699 break; 700 701 //------------------------------------------------------ 702 // Elements allowed ONLY WITHIN segments 703 //------------------------------------------------------ 704 case SEGMENT_CLOSE: 705 // assert(isSegment); 706 // We're done parsing a segment. 707 done = TRUE; 708 break; 709 710 //------------------------------------------------------ 711 // Elements allowed ONLY OUTSIDE segments 712 //------------------------------------------------------ 713 case CONTEXT_ANTE: 714 if (ante >= 0) { 715 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); 716 } 717 ante = buf.length(); 718 break; 719 case CONTEXT_POST: 720 if (post >= 0) { 721 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); 722 } 723 post = buf.length(); 724 break; 725 case CURSOR_POS: 726 if (cursor >= 0) { 727 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); 728 } 729 cursor = buf.length(); 730 break; 731 case CURSOR_OFFSET: 732 if (cursorOffset < 0) { 733 if (buf.length() > 0) { 734 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 735 } 736 --cursorOffset; 737 } else if (cursorOffset > 0) { 738 if (buf.length() != cursorOffsetPos || cursor >= 0) { 739 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 740 } 741 ++cursorOffset; 742 } else { 743 if (cursor == 0 && buf.length() == 0) { 744 cursorOffset = -1; 745 } else if (cursor < 0) { 746 cursorOffsetPos = buf.length(); 747 cursorOffset = 1; 748 } else { 749 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 750 } 751 } 752 break; 753 754 755 //------------------------------------------------------ 756 // Non-special characters 757 //------------------------------------------------------ 758 default: 759 // Disallow unquoted characters other than [0-9A-Za-z] 760 // in the printable ASCII range. These characters are 761 // reserved for possible future use. 762 if (c >= 0x0021 && c <= 0x007E && 763 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 764 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 765 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { 766 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 767 } 768 buf.append(c); 769 break; 770 } 771 } 772 773 return pos; 774 } 775 776 /** 777 * Remove context. 778 */ 779 void RuleHalf::removeContext() { 780 //text = text.substring(ante < 0 ? 0 : ante, 781 // post < 0 ? text.length() : post); 782 if (post >= 0) { 783 text.remove(post); 784 } 785 if (ante >= 0) { 786 text.removeBetween(0, ante); 787 } 788 ante = post = -1; 789 anchorStart = anchorEnd = FALSE; 790 } 791 792 /** 793 * Return true if this half looks like valid output, that is, does not 794 * contain quantifiers or other special input-only elements. 795 */ 796 UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { 797 for (int32_t i=0; i<text.length(); ) { 798 UChar32 c = text.char32At(i); 799 i += U16_LENGTH(c); 800 if (!transParser.parseData->isReplacer(c)) { 801 return FALSE; 802 } 803 } 804 return TRUE; 805 } 806 807 /** 808 * Return true if this half looks like valid input, that is, does not 809 * contain functions or other special output-only elements. 810 */ 811 UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { 812 for (int32_t i=0; i<text.length(); ) { 813 UChar32 c = text.char32At(i); 814 i += U16_LENGTH(c); 815 if (!transParser.parseData->isMatcher(c)) { 816 return FALSE; 817 } 818 } 819 return TRUE; 820 } 821 822 //---------------------------------------------------------------------- 823 // PUBLIC API 824 //---------------------------------------------------------------------- 825 826 /** 827 * Constructor. 828 */ 829 TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : 830 dataVector(statusReturn), 831 idBlockVector(statusReturn), 832 variablesVector(statusReturn), 833 segmentObjects(statusReturn) 834 { 835 idBlockVector.setDeleter(uprv_deleteUObject); 836 curData = NULL; 837 compoundFilter = NULL; 838 parseData = NULL; 839 variableNames.setValueDeleter(uprv_deleteUObject); 840 } 841 842 /** 843 * Destructor. 844 */ 845 TransliteratorParser::~TransliteratorParser() { 846 while (!dataVector.isEmpty()) 847 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 848 delete compoundFilter; 849 delete parseData; 850 while (!variablesVector.isEmpty()) 851 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 852 } 853 854 void 855 TransliteratorParser::parse(const UnicodeString& rules, 856 UTransDirection transDirection, 857 UParseError& pe, 858 UErrorCode& ec) { 859 if (U_SUCCESS(ec)) { 860 parseRules(rules, transDirection, ec); 861 pe = parseError; 862 } 863 } 864 865 /** 866 * Return the compound filter parsed by parse(). Caller owns result. 867 */ 868 UnicodeSet* TransliteratorParser::orphanCompoundFilter() { 869 UnicodeSet* f = compoundFilter; 870 compoundFilter = NULL; 871 return f; 872 } 873 874 //---------------------------------------------------------------------- 875 // Private implementation 876 //---------------------------------------------------------------------- 877 878 /** 879 * Parse the given string as a sequence of rules, separated by newline 880 * characters ('\n'), and cause this object to implement those rules. Any 881 * previous rules are discarded. Typically this method is called exactly 882 * once, during construction. 883 * @exception IllegalArgumentException if there is a syntax error in the 884 * rules 885 */ 886 void TransliteratorParser::parseRules(const UnicodeString& rule, 887 UTransDirection theDirection, 888 UErrorCode& status) 889 { 890 // Clear error struct 891 uprv_memset(&parseError, 0, sizeof(parseError)); 892 parseError.line = parseError.offset = -1; 893 894 UBool parsingIDs = TRUE; 895 int32_t ruleCount = 0; 896 897 while (!dataVector.isEmpty()) { 898 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 899 } 900 if (U_FAILURE(status)) { 901 return; 902 } 903 904 idBlockVector.removeAllElements(); 905 curData = NULL; 906 direction = theDirection; 907 ruleCount = 0; 908 909 delete compoundFilter; 910 compoundFilter = NULL; 911 912 while (!variablesVector.isEmpty()) { 913 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 914 } 915 variableNames.removeAll(); 916 parseData = new ParseData(0, &variablesVector, &variableNames); 917 if (parseData == NULL) { 918 status = U_MEMORY_ALLOCATION_ERROR; 919 return; 920 } 921 922 dotStandIn = (UChar) -1; 923 924 UnicodeString *tempstr = NULL; // used for memory allocation error checking 925 UnicodeString str; // scratch 926 UnicodeString idBlockResult; 927 int32_t pos = 0; 928 int32_t limit = rule.length(); 929 930 // The compound filter offset is an index into idBlockResult. 931 // If it is 0, then the compound filter occurred at the start, 932 // and it is the offset to the _start_ of the compound filter 933 // pattern. Otherwise it is the offset to the _limit_ of the 934 // compound filter pattern within idBlockResult. 935 compoundFilter = NULL; 936 int32_t compoundFilterOffset = -1; 937 938 while (pos < limit && U_SUCCESS(status)) { 939 UChar c = rule.charAt(pos++); 940 if (PatternProps::isWhiteSpace(c)) { 941 // Ignore leading whitespace. 942 continue; 943 } 944 // Skip lines starting with the comment character 945 if (c == RULE_COMMENT_CHAR) { 946 pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1; 947 if (pos == 0) { 948 break; // No "\n" found; rest of rule is a commnet 949 } 950 continue; // Either fall out or restart with next line 951 } 952 953 // skip empty rules 954 if (c == END_OF_RULE) 955 continue; 956 957 // keep track of how many rules we've seen 958 ++ruleCount; 959 960 // We've found the start of a rule or ID. c is its first 961 // character, and pos points past c. 962 --pos; 963 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 964 // chars left. 965 if ((pos + ID_TOKEN_LEN + 1) <= limit && 966 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { 967 pos += ID_TOKEN_LEN; 968 c = rule.charAt(pos); 969 while (PatternProps::isWhiteSpace(c) && pos < limit) { 970 ++pos; 971 c = rule.charAt(pos); 972 } 973 974 int32_t p = pos; 975 976 if (!parsingIDs) { 977 if (curData != NULL) { 978 if (direction == UTRANS_FORWARD) 979 dataVector.addElement(curData, status); 980 else 981 dataVector.insertElementAt(curData, 0, status); 982 curData = NULL; 983 } 984 parsingIDs = TRUE; 985 } 986 987 TransliteratorIDParser::SingleID* id = 988 TransliteratorIDParser::parseSingleID(rule, p, direction, status); 989 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { 990 // Successful ::ID parse. 991 992 if (direction == UTRANS_FORWARD) { 993 idBlockResult.append(id->canonID).append(END_OF_RULE); 994 } else { 995 idBlockResult.insert(0, END_OF_RULE); 996 idBlockResult.insert(0, id->canonID); 997 } 998 999 } else { 1000 // Couldn't parse an ID. Try to parse a global filter 1001 int32_t withParens = -1; 1002 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL); 1003 if (f != NULL) { 1004 if (ICU_Utility::parseChar(rule, p, END_OF_RULE) 1005 && (direction == UTRANS_FORWARD) == (withParens == 0)) 1006 { 1007 if (compoundFilter != NULL) { 1008 // Multiple compound filters 1009 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); 1010 delete f; 1011 } else { 1012 compoundFilter = f; 1013 compoundFilterOffset = ruleCount; 1014 } 1015 } else { 1016 delete f; 1017 } 1018 } else { 1019 // Invalid ::id 1020 // Can be parsed as neither an ID nor a global filter 1021 syntaxError(U_INVALID_ID, rule, pos, status); 1022 } 1023 } 1024 delete id; 1025 pos = p; 1026 } else { 1027 if (parsingIDs) { 1028 tempstr = new UnicodeString(idBlockResult); 1029 // NULL pointer check 1030 if (tempstr == NULL) { 1031 status = U_MEMORY_ALLOCATION_ERROR; 1032 return; 1033 } 1034 if (direction == UTRANS_FORWARD) 1035 idBlockVector.addElement(tempstr, status); 1036 else 1037 idBlockVector.insertElementAt(tempstr, 0, status); 1038 idBlockResult.remove(); 1039 parsingIDs = FALSE; 1040 curData = new TransliterationRuleData(status); 1041 // NULL pointer check 1042 if (curData == NULL) { 1043 status = U_MEMORY_ALLOCATION_ERROR; 1044 return; 1045 } 1046 parseData->data = curData; 1047 1048 // By default, rules use part of the private use area 1049 // E000..F8FF for variables and other stand-ins. Currently 1050 // the range F000..F8FF is typically sufficient. The 'use 1051 // variable range' pragma allows rule sets to modify this. 1052 setVariableRange(0xF000, 0xF8FF, status); 1053 } 1054 1055 if (resemblesPragma(rule, pos, limit)) { 1056 int32_t ppp = parsePragma(rule, pos, limit, status); 1057 if (ppp < 0) { 1058 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); 1059 } 1060 pos = ppp; 1061 // Parse a rule 1062 } else { 1063 pos = parseRule(rule, pos, limit, status); 1064 } 1065 } 1066 } 1067 1068 if (parsingIDs && idBlockResult.length() > 0) { 1069 tempstr = new UnicodeString(idBlockResult); 1070 // NULL pointer check 1071 if (tempstr == NULL) { 1072 status = U_MEMORY_ALLOCATION_ERROR; 1073 return; 1074 } 1075 if (direction == UTRANS_FORWARD) 1076 idBlockVector.addElement(tempstr, status); 1077 else 1078 idBlockVector.insertElementAt(tempstr, 0, status); 1079 } 1080 else if (!parsingIDs && curData != NULL) { 1081 if (direction == UTRANS_FORWARD) 1082 dataVector.addElement(curData, status); 1083 else 1084 dataVector.insertElementAt(curData, 0, status); 1085 } 1086 1087 if (U_SUCCESS(status)) { 1088 // Convert the set vector to an array 1089 int32_t i, dataVectorSize = dataVector.size(); 1090 for (i = 0; i < dataVectorSize; i++) { 1091 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1092 data->variablesLength = variablesVector.size(); 1093 if (data->variablesLength == 0) { 1094 data->variables = 0; 1095 } else { 1096 data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*)); 1097 // NULL pointer check 1098 if (data->variables == NULL) { 1099 status = U_MEMORY_ALLOCATION_ERROR; 1100 return; 1101 } 1102 data->variablesAreOwned = (i == 0); 1103 } 1104 1105 for (int32_t j = 0; j < data->variablesLength; j++) { 1106 data->variables[j] = 1107 static_cast<UnicodeFunctor *>(variablesVector.elementAt(j)); 1108 } 1109 1110 data->variableNames.removeAll(); 1111 int32_t p = UHASH_FIRST; 1112 const UHashElement* he = variableNames.nextElement(p); 1113 while (he != NULL) { 1114 UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone()); 1115 if (tempus == NULL) { 1116 status = U_MEMORY_ALLOCATION_ERROR; 1117 return; 1118 } 1119 data->variableNames.put(*((UnicodeString*)(he->key.pointer)), 1120 tempus, status); 1121 he = variableNames.nextElement(p); 1122 } 1123 } 1124 variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed 1125 1126 // Index the rules 1127 if (compoundFilter != NULL) { 1128 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || 1129 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { 1130 status = U_MISPLACED_COMPOUND_FILTER; 1131 } 1132 } 1133 1134 for (i = 0; i < dataVectorSize; i++) { 1135 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1136 data->ruleSet.freeze(parseError, status); 1137 } 1138 if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) { 1139 idBlockVector.removeElementAt(0); 1140 } 1141 } 1142 } 1143 1144 /** 1145 * Set the variable range to [start, end] (inclusive). 1146 */ 1147 void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { 1148 if (start > end || start < 0 || end > 0xFFFF) { 1149 status = U_MALFORMED_PRAGMA; 1150 return; 1151 } 1152 1153 curData->variablesBase = (UChar) start; 1154 if (dataVector.size() == 0) { 1155 variableNext = (UChar) start; 1156 variableLimit = (UChar) (end + 1); 1157 } 1158 } 1159 1160 /** 1161 * Assert that the given character is NOT within the variable range. 1162 * If it is, return FALSE. This is neccesary to ensure that the 1163 * variable range does not overlap characters used in a rule. 1164 */ 1165 UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { 1166 return !(ch >= curData->variablesBase && ch < variableLimit); 1167 } 1168 1169 /** 1170 * Set the maximum backup to 'backup', in response to a pragma 1171 * statement. 1172 */ 1173 void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { 1174 //TODO Finish 1175 } 1176 1177 /** 1178 * Begin normalizing all rules using the given mode, in response 1179 * to a pragma statement. 1180 */ 1181 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { 1182 //TODO Finish 1183 } 1184 1185 static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " 1186 1187 static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" 1188 1189 static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" 1190 1191 static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" 1192 1193 static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" 1194 1195 /** 1196 * Return true if the given rule looks like a pragma. 1197 * @param pos offset to the first non-whitespace character 1198 * of the rule. 1199 * @param limit pointer past the last character of the rule. 1200 */ 1201 UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { 1202 // Must start with /use\s/i 1203 return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0; 1204 } 1205 1206 /** 1207 * Parse a pragma. This method assumes resemblesPragma() has 1208 * already returned true. 1209 * @param pos offset to the first non-whitespace character 1210 * of the rule. 1211 * @param limit pointer past the last character of the rule. 1212 * @return the position index after the final ';' of the pragma, 1213 * or -1 on failure. 1214 */ 1215 int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1216 int32_t array[2]; 1217 1218 // resemblesPragma() has already returned true, so we 1219 // know that pos points to /use\s/i; we can skip 4 characters 1220 // immediately 1221 pos += 4; 1222 1223 // Here are the pragmas we recognize: 1224 // use variable range 0xE000 0xEFFF; 1225 // use maximum backup 16; 1226 // use nfd rules; 1227 // use nfc rules; 1228 int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array); 1229 if (p >= 0) { 1230 setVariableRange(array[0], array[1], status); 1231 return p; 1232 } 1233 1234 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array); 1235 if (p >= 0) { 1236 pragmaMaximumBackup(array[0]); 1237 return p; 1238 } 1239 1240 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL); 1241 if (p >= 0) { 1242 pragmaNormalizeRules(UNORM_NFD); 1243 return p; 1244 } 1245 1246 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL); 1247 if (p >= 0) { 1248 pragmaNormalizeRules(UNORM_NFC); 1249 return p; 1250 } 1251 1252 // Syntax error: unable to parse pragma 1253 return -1; 1254 } 1255 1256 /** 1257 * MAIN PARSER. Parse the next rule in the given rule string, starting 1258 * at pos. Return the index after the last character parsed. Do not 1259 * parse characters at or after limit. 1260 * 1261 * Important: The character at pos must be a non-whitespace character 1262 * that is not the comment character. 1263 * 1264 * This method handles quoting, escaping, and whitespace removal. It 1265 * parses the end-of-rule character. It recognizes context and cursor 1266 * indicators. Once it does a lexical breakdown of the rule at pos, it 1267 * creates a rule object and adds it to our rule list. 1268 */ 1269 int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1270 // Locate the left side, operator, and right side 1271 int32_t start = pos; 1272 UChar op = 0; 1273 int32_t i; 1274 1275 // Set up segments data 1276 segmentStandins.truncate(0); 1277 segmentObjects.removeAllElements(); 1278 1279 // Use pointers to automatics to make swapping possible. 1280 RuleHalf _left(*this), _right(*this); 1281 RuleHalf* left = &_left; 1282 RuleHalf* right = &_right; 1283 1284 undefinedVariableName.remove(); 1285 pos = left->parse(rule, pos, limit, status); 1286 if (U_FAILURE(status)) { 1287 return start; 1288 } 1289 1290 if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) { 1291 return syntaxError(U_MISSING_OPERATOR, rule, start, status); 1292 } 1293 ++pos; 1294 1295 // Found an operator char. Check for forward-reverse operator. 1296 if (op == REVERSE_RULE_OP && 1297 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1298 ++pos; 1299 op = FWDREV_RULE_OP; 1300 } 1301 1302 // Translate alternate op characters. 1303 switch (op) { 1304 case ALT_FORWARD_RULE_OP: 1305 op = FORWARD_RULE_OP; 1306 break; 1307 case ALT_REVERSE_RULE_OP: 1308 op = REVERSE_RULE_OP; 1309 break; 1310 case ALT_FWDREV_RULE_OP: 1311 op = FWDREV_RULE_OP; 1312 break; 1313 } 1314 1315 pos = right->parse(rule, pos, limit, status); 1316 if (U_FAILURE(status)) { 1317 return start; 1318 } 1319 1320 if (pos < limit) { 1321 if (rule.charAt(--pos) == END_OF_RULE) { 1322 ++pos; 1323 } else { 1324 // RuleHalf parser must have terminated at an operator 1325 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 1326 } 1327 } 1328 1329 if (op == VARIABLE_DEF_OP) { 1330 // LHS is the name. RHS is a single character, either a literal 1331 // or a set (already parsed). If RHS is longer than one 1332 // character, it is either a multi-character string, or multiple 1333 // sets, or a mixture of chars and sets -- syntax error. 1334 1335 // We expect to see a single undefined variable (the one being 1336 // defined). 1337 if (undefinedVariableName.length() == 0) { 1338 // "Missing '$' or duplicate definition" 1339 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); 1340 } 1341 if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { 1342 // "Malformed LHS" 1343 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1344 } 1345 if (left->anchorStart || left->anchorEnd || 1346 right->anchorStart || right->anchorEnd) { 1347 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1348 } 1349 // We allow anything on the right, including an empty string. 1350 UnicodeString* value = new UnicodeString(right->text); 1351 // NULL pointer check 1352 if (value == NULL) { 1353 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1354 } 1355 variableNames.put(undefinedVariableName, value, status); 1356 ++variableLimit; 1357 return pos; 1358 } 1359 1360 // If this is not a variable definition rule, we shouldn't have 1361 // any undefined variable names. 1362 if (undefinedVariableName.length() != 0) { 1363 return syntaxError(// "Undefined variable $" + undefinedVariableName, 1364 U_UNDEFINED_VARIABLE, 1365 rule, start, status); 1366 } 1367 1368 // Verify segments 1369 if (segmentStandins.length() > segmentObjects.size()) { 1370 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); 1371 } 1372 for (i=0; i<segmentStandins.length(); ++i) { 1373 if (segmentStandins.charAt(i) == 0) { 1374 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1375 } 1376 } 1377 for (i=0; i<segmentObjects.size(); ++i) { 1378 if (segmentObjects.elementAt(i) == NULL) { 1379 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1380 } 1381 } 1382 1383 // If the direction we want doesn't match the rule 1384 // direction, do nothing. 1385 if (op != FWDREV_RULE_OP && 1386 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) { 1387 return pos; 1388 } 1389 1390 // Transform the rule into a forward rule by swapping the 1391 // sides if necessary. 1392 if (direction == UTRANS_REVERSE) { 1393 left = &_right; 1394 right = &_left; 1395 } 1396 1397 // Remove non-applicable elements in forward-reverse 1398 // rules. Bidirectional rules ignore elements that do not 1399 // apply. 1400 if (op == FWDREV_RULE_OP) { 1401 right->removeContext(); 1402 left->cursor = -1; 1403 left->cursorOffset = 0; 1404 } 1405 1406 // Normalize context 1407 if (left->ante < 0) { 1408 left->ante = 0; 1409 } 1410 if (left->post < 0) { 1411 left->post = left->text.length(); 1412 } 1413 1414 // Context is only allowed on the input side. Cursors are only 1415 // allowed on the output side. Segment delimiters can only appear 1416 // on the left, and references on the right. Cursor offset 1417 // cannot appear without an explicit cursor. Cursor offset 1418 // cannot place the cursor outside the limits of the context. 1419 // Anchors are only allowed on the input side. 1420 if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || 1421 (right->cursorOffset != 0 && right->cursor < 0) || 1422 // - The following two checks were used to ensure that the 1423 // - the cursor offset stayed within the ante- or postcontext. 1424 // - However, with the addition of quantifiers, we have to 1425 // - allow arbitrary cursor offsets and do runtime checking. 1426 //(right->cursorOffset > (left->text.length() - left->post)) || 1427 //(-right->cursorOffset > left->ante) || 1428 right->anchorStart || right->anchorEnd || 1429 !left->isValidInput(*this) || !right->isValidOutput(*this) || 1430 left->ante > left->post) { 1431 1432 return syntaxError(U_MALFORMED_RULE, rule, start, status); 1433 } 1434 1435 // Flatten segment objects vector to an array 1436 UnicodeFunctor** segmentsArray = NULL; 1437 if (segmentObjects.size() > 0) { 1438 segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *)); 1439 // Null pointer check 1440 if (segmentsArray == NULL) { 1441 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1442 } 1443 segmentObjects.toArray((void**) segmentsArray); 1444 } 1445 TransliterationRule* temptr = new TransliterationRule( 1446 left->text, left->ante, left->post, 1447 right->text, right->cursor, right->cursorOffset, 1448 segmentsArray, 1449 segmentObjects.size(), 1450 left->anchorStart, left->anchorEnd, 1451 curData, 1452 status); 1453 //Null pointer check 1454 if (temptr == NULL) { 1455 uprv_free(segmentsArray); 1456 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1457 } 1458 1459 curData->ruleSet.addRule(temptr, status); 1460 1461 return pos; 1462 } 1463 1464 /** 1465 * Called by main parser upon syntax error. Search the rule string 1466 * for the probable end of the rule. Of course, if the error is that 1467 * the end of rule marker is missing, then the rule end will not be found. 1468 * In any case the rule start will be correctly reported. 1469 * @param msg error description 1470 * @param rule pattern string 1471 * @param start position of first character of current rule 1472 */ 1473 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, 1474 const UnicodeString& rule, 1475 int32_t pos, 1476 UErrorCode& status) 1477 { 1478 parseError.offset = pos; 1479 parseError.line = 0 ; /* we are not using line numbers */ 1480 1481 // for pre-context 1482 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; 1483 int32_t start = uprv_max(pos - LEN, 0); 1484 int32_t stop = pos; 1485 1486 rule.extract(start,stop-start,parseError.preContext); 1487 //null terminate the buffer 1488 parseError.preContext[stop-start] = 0; 1489 1490 //for post-context 1491 start = pos; 1492 stop = uprv_min(pos + LEN, rule.length()); 1493 1494 rule.extract(start,stop-start,parseError.postContext); 1495 //null terminate the buffer 1496 parseError.postContext[stop-start]= 0; 1497 1498 status = (UErrorCode)parseErrorCode; 1499 return pos; 1500 1501 } 1502 1503 /** 1504 * Parse a UnicodeSet out, store it, and return the stand-in character 1505 * used to represent it. 1506 */ 1507 UChar TransliteratorParser::parseSet(const UnicodeString& rule, 1508 ParsePosition& pos, 1509 UErrorCode& status) { 1510 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); 1511 // Null pointer check 1512 if (set == NULL) { 1513 status = U_MEMORY_ALLOCATION_ERROR; 1514 return (UChar)0x0000; // Return empty character with error. 1515 } 1516 set->compact(); 1517 return generateStandInFor(set, status); 1518 } 1519 1520 /** 1521 * Generate and return a stand-in for a new UnicodeFunctor. Store 1522 * the matcher (adopt it). 1523 */ 1524 UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { 1525 // assert(obj != null); 1526 1527 // Look up previous stand-in, if any. This is a short list 1528 // (typical n is 0, 1, or 2); linear search is optimal. 1529 for (int32_t i=0; i<variablesVector.size(); ++i) { 1530 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison 1531 return (UChar) (curData->variablesBase + i); 1532 } 1533 } 1534 1535 if (variableNext >= variableLimit) { 1536 delete adopted; 1537 status = U_VARIABLE_RANGE_EXHAUSTED; 1538 return 0; 1539 } 1540 variablesVector.addElement(adopted, status); 1541 return variableNext++; 1542 } 1543 1544 /** 1545 * Return the standin for segment seg (1-based). 1546 */ 1547 UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { 1548 // Special character used to indicate an empty spot 1549 UChar empty = curData->variablesBase - 1; 1550 while (segmentStandins.length() < seg) { 1551 segmentStandins.append(empty); 1552 } 1553 UChar c = segmentStandins.charAt(seg-1); 1554 if (c == empty) { 1555 if (variableNext >= variableLimit) { 1556 status = U_VARIABLE_RANGE_EXHAUSTED; 1557 return 0; 1558 } 1559 c = variableNext++; 1560 // Set a placeholder in the master variables vector that will be 1561 // filled in later by setSegmentObject(). We know that we will get 1562 // called first because setSegmentObject() will call us. 1563 variablesVector.addElement((void*) NULL, status); 1564 segmentStandins.setCharAt(seg-1, c); 1565 } 1566 return c; 1567 } 1568 1569 /** 1570 * Set the object for segment seg (1-based). 1571 */ 1572 void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { 1573 // Since we call parseSection() recursively, nested 1574 // segments will result in segment i+1 getting parsed 1575 // and stored before segment i; be careful with the 1576 // vector handling here. 1577 if (segmentObjects.size() < seg) { 1578 segmentObjects.setSize(seg, status); 1579 } 1580 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; 1581 if (segmentObjects.elementAt(seg-1) != NULL || 1582 variablesVector.elementAt(index) != NULL) { 1583 // should never happen 1584 status = U_INTERNAL_TRANSLITERATOR_ERROR; 1585 return; 1586 } 1587 segmentObjects.setElementAt(adopted, seg-1); 1588 variablesVector.setElementAt(adopted, index); 1589 } 1590 1591 /** 1592 * Return the stand-in for the dot set. It is allocated the first 1593 * time and reused thereafter. 1594 */ 1595 UChar TransliteratorParser::getDotStandIn(UErrorCode& status) { 1596 if (dotStandIn == (UChar) -1) { 1597 UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status); 1598 // Null pointer check. 1599 if (tempus == NULL) { 1600 status = U_MEMORY_ALLOCATION_ERROR; 1601 return (UChar)0x0000; 1602 } 1603 dotStandIn = generateStandInFor(tempus, status); 1604 } 1605 return dotStandIn; 1606 } 1607 1608 /** 1609 * Append the value of the given variable name to the given 1610 * UnicodeString. 1611 */ 1612 void TransliteratorParser::appendVariableDef(const UnicodeString& name, 1613 UnicodeString& buf, 1614 UErrorCode& status) { 1615 const UnicodeString* s = (const UnicodeString*) variableNames.get(name); 1616 if (s == NULL) { 1617 // We allow one undefined variable so that variable definition 1618 // statements work. For the first undefined variable we return 1619 // the special placeholder variableLimit-1, and save the variable 1620 // name. 1621 if (undefinedVariableName.length() == 0) { 1622 undefinedVariableName = name; 1623 if (variableNext >= variableLimit) { 1624 // throw new RuntimeException("Private use variables exhausted"); 1625 status = U_ILLEGAL_ARGUMENT_ERROR; 1626 return; 1627 } 1628 buf.append((UChar) --variableLimit); 1629 } else { 1630 //throw new IllegalArgumentException("Undefined variable $" 1631 // + name); 1632 status = U_ILLEGAL_ARGUMENT_ERROR; 1633 return; 1634 } 1635 } else { 1636 buf.append(*s); 1637 } 1638 } 1639 1640 /** 1641 * Glue method to get around access restrictions in C++. 1642 */ 1643 /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1644 return Transliterator::createBasicInstance(id, canonID); 1645 }*/ 1646 1647 U_NAMESPACE_END 1648 1649 U_CAPI int32_t 1650 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) { 1651 U_NAMESPACE_USE 1652 1653 //const UChar *sourceStart = source; 1654 const UChar *targetStart = target; 1655 const UChar *sourceLimit = source+sourceLen; 1656 UChar *targetLimit = target+sourceLen; 1657 UChar32 c = 0; 1658 UBool quoted = FALSE; 1659 int32_t index; 1660 1661 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR); 1662 1663 /* read the rules into the buffer */ 1664 while (source < sourceLimit) 1665 { 1666 index=0; 1667 U16_NEXT_UNSAFE(source, index, c); 1668 source+=index; 1669 if(c == QUOTE) { 1670 quoted = (UBool)!quoted; 1671 } 1672 else if (!quoted) { 1673 if (c == RULE_COMMENT_CHAR) { 1674 /* skip comments and all preceding spaces */ 1675 while (targetStart < target && *(target - 1) == 0x0020) { 1676 target--; 1677 } 1678 do { 1679 if (source == sourceLimit) { 1680 c = U_SENTINEL; 1681 break; 1682 } 1683 c = *(source++); 1684 } 1685 while (c != CR && c != LF); 1686 if (c < 0) { 1687 break; 1688 } 1689 } 1690 else if (c == ESCAPE && source < sourceLimit) { 1691 UChar32 c2 = *source; 1692 if (c2 == CR || c2 == LF) { 1693 /* A backslash at the end of a line. */ 1694 /* Since we're stripping lines, ignore the backslash. */ 1695 source++; 1696 continue; 1697 } 1698 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */ 1699 int32_t escapeOffset = 0; 1700 UnicodeString escapedStr(source, 5); 1701 c2 = escapedStr.unescapeAt(escapeOffset); 1702 1703 if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) 1704 { 1705 *status = U_PARSE_ERROR; 1706 return 0; 1707 } 1708 if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) { 1709 /* It was escaped for a reason. Write what it was suppose to be. */ 1710 source+=5; 1711 c = c2; 1712 } 1713 } 1714 else if (c2 == QUOTE) { 1715 /* \' seen. Make sure we don't do anything when we see it again. */ 1716 quoted = (UBool)!quoted; 1717 } 1718 } 1719 } 1720 if (c == CR || c == LF) 1721 { 1722 /* ignore spaces carriage returns, and all leading spaces on the next line. 1723 * and line feed unless in the form \uXXXX 1724 */ 1725 quoted = FALSE; 1726 while (source < sourceLimit) { 1727 c = *(source); 1728 if (c != CR && c != LF && c != 0x0020) { 1729 break; 1730 } 1731 source++; 1732 } 1733 continue; 1734 } 1735 1736 /* Append UChar * after dissembling if c > 0xffff*/ 1737 index=0; 1738 U16_APPEND_UNSAFE(target, index, c); 1739 target+=index; 1740 } 1741 if (target < targetLimit) { 1742 *target = 0; 1743 } 1744 return (int32_t)(target-targetStart); 1745 } 1746 1747 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1748