// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
**********************************************************************
*   Copyright (c) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
package com.ibm.icu.text;

import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.RuleBasedTransliterator.Data;

/**
 * Parser for transliterator rule source.  A parse fills in the public data
 * members {@link #dataVector}, {@link #idBlockVector} and
 * {@link #compoundFilter}; callers read the results from those fields.
 */
class TransliteratorParser {

    //----------------------------------------------------------------------
    // Data members
    //----------------------------------------------------------------------

    /**
     * PUBLIC data member.
     * A list of RuleBasedTransliterator.Data objects, one for each discrete group
     * of rules in the rule set
     */
    public List<Data> dataVector;

    /**
     * PUBLIC data member.
     * A list of Strings containing all of the ID blocks in the rule set
     */
    public List<String> idBlockVector;

    /**
     * The current data object for which we are parsing rules
     */
    private Data curData;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     */
    public UnicodeSet compoundFilter;


    /**
     * Direction passed to the current parse.  It is compared against
     * Transliterator.FORWARD / Transliterator.REVERSE while parsing.
     */
    private int direction;

    /**
     * Temporary symbol table used during parsing.
     */
    private ParseData parseData;

    /**
     * Temporary list of set variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    private List<Object> variablesVector;

    /**
     * Temporary table of variable names.  When parsing is complete, this is
     * copied into data.variableNames.
     */
    private Map<String, char[]> variableNames;

    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.get(0), etc.
     */
    private StringBuffer segmentStandins;

    /**
     * List of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.get(0), etc.
     */
    private List<StringMatcher> segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    private char variableNext;

    /**
     * The last available stand-in for variables.  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.  During variable definition
     * we use the special value variableLimit-1 as a placeholder.
     */
    private char variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    private String undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.  The value -1 means "not yet allocated".
     */
    private int dotStandIn = -1;

    //----------------------------------------------------------------------
    // Constants
    //----------------------------------------------------------------------

    // Indicator for ID blocks
    private static final String ID_TOKEN = "::";
    private static final int ID_TOKEN_LEN = 2;

/*
(reserved for future expansion)
    // markers for beginning and end of rule groups
    private static final String BEGIN_TOKEN = "BEGIN";
    private static final String END_TOKEN = "END";
*/

    // Operators
    private static final char VARIABLE_DEF_OP   = '=';
    private static final char FORWARD_RULE_OP   = '>';
    private static final char REVERSE_RULE_OP   = '<';
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op

    // OPERATORS lists every rule operator (ASCII and arrow aliases);
    // HALF_ENDERS additionally contains the end-of-rule character ';'.
    private static final String OPERATORS = "=><\u2190\u2192\u2194";
    private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";

    // Other special characters
    private static final char QUOTE               = '\'';
    private static final char ESCAPE              = '\\';
    private static final char END_OF_RULE         = ';';
    private static final char RULE_COMMENT_CHAR   = '#';

    private static final char CONTEXT_ANTE        = '{'; // ante{key
    private static final char CONTEXT_POST        = '}'; // key}post
    private static final char CURSOR_POS          = '|';
    private static final char CURSOR_OFFSET       = '@';
    private static final char ANCHOR_START        = '^';

    private static final char KLEENE_STAR         = '*';
    private static final char ONE_OR_MORE         = '+';
    private static final char ZERO_OR_ONE         = '?';

    private static final char DOT                 = '.';
    private static final String DOT_SET           = "[^[:Zp:][:Zl:]\\r\\n$]";

    // By definition, the ANCHOR_END special character is a
    // trailing SymbolTable.SYMBOL_REF character.
    // private static final char ANCHOR_END       = '$';

    // Segments of the input string are delimited by "(" and ")".
In the 168 // output string these segments are referenced as "$1", "$2", etc. 169 private static final char SEGMENT_OPEN = '('; 170 private static final char SEGMENT_CLOSE = ')'; 171 172 // A function is denoted &Source-Target/Variant(text) 173 private static final char FUNCTION = '&'; 174 175 // Aliases for some of the syntax characters. These are provided so 176 // transliteration rules can be expressed in XML without clashing with 177 // XML syntax characters '<', '>', and '&'. 178 private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow 179 private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow 180 private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow 181 private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta) 182 183 // Special characters disallowed at the top level 184 private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]"); 185 186 // Special characters disallowed within a segment 187 private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]"); 188 189 // Special characters disallowed within a function argument 190 private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]"); 191 192 //---------------------------------------------------------------------- 193 // class ParseData 194 //---------------------------------------------------------------------- 195 196 /** 197 * This class implements the SymbolTable interface. It is used 198 * during parsing to give UnicodeSet access to variables that 199 * have been defined so far. Note that it uses variablesVector, 200 * _not_ data.variables. 201 */ 202 private class ParseData implements SymbolTable { 203 204 /** 205 * Implement SymbolTable API. 206 */ 207 @Override 208 public char[] lookup(String name) { 209 return variableNames.get(name); 210 } 211 212 /** 213 * Implement SymbolTable API. 
214 */ 215 @Override 216 public UnicodeMatcher lookupMatcher(int ch) { 217 // Note that we cannot use data.lookup() because the 218 // set array has not been constructed yet. 219 int i = ch - curData.variablesBase; 220 if (i >= 0 && i < variablesVector.size()) { 221 return (UnicodeMatcher) variablesVector.get(i); 222 } 223 return null; 224 } 225 226 /** 227 * Implement SymbolTable API. Parse out a symbol reference 228 * name. 229 */ 230 @Override 231 public String parseReference(String text, ParsePosition pos, int limit) { 232 int start = pos.getIndex(); 233 int i = start; 234 while (i < limit) { 235 char c = text.charAt(i); 236 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) || 237 !UCharacter.isUnicodeIdentifierPart(c)) { 238 break; 239 } 240 ++i; 241 } 242 if (i == start) { // No valid name chars 243 return null; 244 } 245 pos.setIndex(i); 246 return text.substring(start, i); 247 } 248 249 /** 250 * Return true if the given character is a matcher standin or a plain 251 * character (non standin). 252 */ 253 public boolean isMatcher(int ch) { 254 // Note that we cannot use data.lookup() because the 255 // set array has not been constructed yet. 256 int i = ch - curData.variablesBase; 257 if (i >= 0 && i < variablesVector.size()) { 258 return variablesVector.get(i) instanceof UnicodeMatcher; 259 } 260 return true; 261 } 262 263 /** 264 * Return true if the given character is a replacer standin or a plain 265 * character (non standin). 266 */ 267 public boolean isReplacer(int ch) { 268 // Note that we cannot use data.lookup() because the 269 // set array has not been constructed yet. 
270 int i = ch - curData.variablesBase; 271 if (i >= 0 && i < variablesVector.size()) { 272 return variablesVector.get(i) instanceof UnicodeReplacer; 273 } 274 return true; 275 } 276 } 277 278 //---------------------------------------------------------------------- 279 // classes RuleBody, RuleArray, and RuleReader 280 //---------------------------------------------------------------------- 281 282 /** 283 * A private abstract class representing the interface to rule 284 * source code that is broken up into lines. Handles the 285 * folding of lines terminated by a backslash. This folding 286 * is limited; it does not account for comments, quotes, or 287 * escapes, so its use to be limited. 288 */ 289 private static abstract class RuleBody { 290 291 /** 292 * Retrieve the next line of the source, or return null if 293 * none. Folds lines terminated by a backslash into the 294 * next line, without regard for comments, quotes, or 295 * escapes. 296 */ 297 String nextLine() { 298 String s = handleNextLine(); 299 if (s != null && 300 s.length() > 0 && 301 s.charAt(s.length() - 1) == '\\') { 302 StringBuilder b = new StringBuilder(s); 303 do { 304 b.deleteCharAt(b.length()-1); 305 s = handleNextLine(); 306 if (s == null) { 307 break; 308 } 309 b.append(s); 310 } while (s.length() > 0 && 311 s.charAt(s.length() - 1) == '\\'); 312 s = b.toString(); 313 } 314 return s; 315 } 316 317 /** 318 * Reset to the first line of the source. 319 */ 320 abstract void reset(); 321 322 /** 323 * Subclass method to return the next line of the source. 324 */ 325 abstract String handleNextLine(); 326 } 327 328 /** 329 * RuleBody subclass for a String[] array. 330 */ 331 private static class RuleArray extends RuleBody { 332 String[] array; 333 int i; 334 public RuleArray(String[] array) { this.array = array; i = 0; } 335 @Override 336 public String handleNextLine() { 337 return (i < array.length) ? 
array[i++] : null; 338 } 339 @Override 340 public void reset() { 341 i = 0; 342 } 343 } 344 345 /* 346 * RuleBody subclass for a ResourceReader. 347 */ 348 /* private static class RuleReader extends RuleBody { 349 ResourceReader reader; 350 public RuleReader(ResourceReader reader) { this.reader = reader; } 351 public String handleNextLine() { 352 try { 353 return reader.readLine(); 354 } catch (java.io.IOException e) {} 355 return null; 356 } 357 public void reset() { 358 reader.reset(); 359 } 360 }*/ 361 362 //---------------------------------------------------------------------- 363 // class RuleHalf 364 //---------------------------------------------------------------------- 365 366 /** 367 * A class representing one side of a rule. This class knows how to 368 * parse half of a rule. It is tightly coupled to the method 369 * TransliteratorParser.parseRule(). 370 */ 371 private static class RuleHalf { 372 373 public String text; 374 375 public int cursor = -1; // position of cursor in text 376 public int ante = -1; // position of ante context marker '{' in text 377 public int post = -1; // position of post context marker '}' in text 378 379 // Record the offset to the cursor either to the left or to the 380 // right of the key. This is indicated by characters on the output 381 // side that allow the cursor to be positioned arbitrarily within 382 // the matching text. For example, abc{def} > | @@@ xyz; changes 383 // def to xyz and moves the cursor to before abc. Offset characters 384 // must be at the start or end, and they cannot move the cursor past 385 // the ante- or postcontext text. Placeholders are only valid in 386 // output text. The length of the ante and post context is 387 // determined at runtime, because of supplementals and quantifiers. 388 public int cursorOffset = 0; // only nonzero on output side 389 390 // Position of first CURSOR_OFFSET on _right_. This will be -1 391 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 
private int cursorOffsetPos = 0;

        // Set when a leading '^' (anchorStart) or a trailing '$' that is not
        // a variable/segment reference (anchorEnd) is seen by parseSection.
        public boolean anchorStart = false;
        public boolean anchorEnd = false;

        /**
         * The segment number from 1..n of the next '(' we see
         * during parsing; 1-based.
         */
        private int nextSegmentNumber = 1;

        /**
         * Parse one side of a rule, stopping at either the limit,
         * the END_OF_RULE character, or an operator.  The accumulated
         * pattern is stored in {@link #text}.
         * @param rule the rule source
         * @param pos index in rule at which to start parsing
         * @param limit index in rule at which parsing must stop
         * @param parser the owning parser, used for stand-in allocation
         * and variable lookup
         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit,
                         TransliteratorParser parser) {
            int start = pos;
            StringBuffer buf = new StringBuffer();
            pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
            text = buf.toString();

            // A positive cursor offset (@@|) is only accepted when the '|'
            // sits exactly where the run of '@' was recorded.
            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
                syntaxError("Misplaced " + CURSOR_POS, rule, start);
            }

            return pos;
        }

        /**
         * Parse a section of one side of a rule, stopping at either
         * the limit, the END_OF_RULE character, an operator, or a
         * segment close character.  This method parses both a
         * top-level rule half and a segment within such a rule half.
         * It calls itself recursively to parse segments and nested
         * segments.
         * @param rule the rule source
         * @param pos index in rule at which to start parsing
         * @param limit index in rule at which parsing must stop
         * @param parser the owning parser, used for stand-in allocation
         * and variable lookup
         * @param buf buffer into which to accumulate the rule pattern
         * characters, either literal characters from the rule or
         * standins for UnicodeMatcher objects including segments.
         * @param illegal the set of special characters that is illegal during
         * this parse.
         * @param isSegment if true, then we've already seen a '(' and
         * pos on entry points right after it.  Accumulate everything
         * up to the closing ')', put it in a segment matcher object,
         * generate a standin for it, and add the standin to buf.  As
         * a side effect, update the segments vector with a reference
         * to the segment matcher.  This works recursively for nested
         * segments.  If isSegment is false, just accumulate
         * characters into buf.
         * @return the index after the terminating character, or
         * if limit was reached, limit
         */
        private int parseSection(String rule, int pos, int limit,
                                 TransliteratorParser parser,
                                 StringBuffer buf,
                                 UnicodeSet illegal,
                                 boolean isSegment) {
            int start = pos;
            ParsePosition pp = null;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];
            int bufStart = buf.length();

        main:
            while (pos < limit) {
                // Since all syntax characters are in the BMP, fetching
                // 16-bit code units suffices here.
                char c = rule.charAt(pos++);
                if (PatternProps.isWhiteSpace(c)) {
                    continue;
                }
                // HALF_ENDERS is all chars that end a rule half: "<>=;"
                // (plus the arrow aliases listed in the constant)
                if (HALF_ENDERS.indexOf(c) >= 0) {
                    ///CLOVER:OFF
                    // isSegment is always false
                    // NOTE(review): the remark above looks stale; this method
                    // is invoked with isSegment == true for '(' and for
                    // function arguments below.
                    if (isSegment) {
                        syntaxError("Unclosed segment", rule, start);
                    }
                    ///CLOVER:ON
                    break main;
                }
                if (anchorEnd) {
                    // Text after a presumed end anchor is a syntax err
                    syntaxError("Malformed variable reference", rule, start);
                }
                if (UnicodeSet.resemblesPattern(rule, pos-1)) {
                    if (pp == null) {
                        pp = new ParsePosition(0);
                    }
                    pp.setIndex(pos-1); // Backup to opening '['
                    buf.append(parser.parseSet(rule, pp));
                    pos = pp.getIndex();
                    continue;
                }
                // Handle escapes
                if (c == ESCAPE) {
                    if (pos == limit) {
                        syntaxError("Trailing backslash", rule, start);
                    }
                    iref[0] = pos;
                    int escaped = Utility.unescapeAt(rule, iref);
                    pos = iref[0];
                    if (escaped == -1) {
                        syntaxError("Malformed escape", rule, start);
                    }
                    parser.checkVariableRange(escaped, rule, start);
                    UTF16.append(buf, escaped);
                    continue;
                }
                // Handle quoted matter
                if (c == QUOTE) {
                    int iq = rule.indexOf(QUOTE, pos);
                    if (iq == pos) {
                        buf.append(c); // Parse [''] outside quotes as [']
                        ++pos;
                    } else {
                        /* This loop picks up a run of quoted text of the
                         * form 'aaaa' each time through.  If this run
                         * hasn't really ended ('aaaa''bbbb') then it keeps
                         * looping, each time adding on a new run.  When it
                         * reaches the final quote it breaks.
                         */
                        quoteStart = buf.length();
                        for (;;) {
                            if (iq < 0) {
                                syntaxError("Unterminated quote", rule, start);
                            }
                            buf.append(rule.substring(pos, iq));
                            pos = iq+1;
                            if (pos < limit && rule.charAt(pos) == QUOTE) {
                                // Parse [''] inside quotes as [']
                                // (pos stays on the second quote, so the next
                                // substring() copies it as a literal)
                                iq = rule.indexOf(QUOTE, pos+1);
                                // Continue looping
                            } else {
                                break;
                            }
                        }
                        quoteLimit = buf.length();

                        // Every quoted code unit is passed through the
                        // parser's variable-range check.
                        for (iq=quoteStart; iq<quoteLimit; ++iq) {
                            parser.checkVariableRange(buf.charAt(iq), rule, start);
                        }
                    }
                    continue;
                }

                parser.checkVariableRange(c, rule, start);

                if (illegal.contains(c)) {
                    syntaxError("Illegal character '" + c + '\'', rule, start);
                }

                switch (c) {

                //------------------------------------------------------
                // Elements allowed within and out of segments
                //------------------------------------------------------
                case ANCHOR_START:
                    // Only accepted once, and only before any pattern text.
                    if (buf.length() == 0 && !anchorStart) {
                        anchorStart = true;
                    } else {
                        syntaxError("Misplaced anchor start",
                                    rule, start);
                    }
                    break;
                case SEGMENT_OPEN:
                    {
                        // bufSegStart is the offset in buf to the first
                        // character of the segment we are parsing.
                        int bufSegStart = buf.length();

                        // Record segment number now, since nextSegmentNumber
                        // will be incremented during the call to parseSection
                        // if there are nested segments.
                        int segmentNumber = nextSegmentNumber++; // 1-based

                        // Parse the segment
                        pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);

                        // After parsing a segment, the relevant characters are
                        // in buf, starting at offset bufSegStart.  Extract them
                        // into a string matcher, and replace them with a
                        // standin for that matcher.
                        StringMatcher m =
                            new StringMatcher(buf.substring(bufSegStart),
                                              segmentNumber, parser.curData);

                        // Record and associate object and segment number
                        parser.setSegmentObject(segmentNumber, m);
                        buf.setLength(bufSegStart);
                        buf.append(parser.getSegmentStandin(segmentNumber));
                    }
                    break;
                case FUNCTION:
                case ALT_FUNCTION:
                    {
                        iref[0] = pos;
                        TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
                        // The next character MUST be a segment open
                        if (single == null ||
                            !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
                            syntaxError("Invalid function", rule, start);
                        }

                        Transliterator t = single.getInstance();
                        if (t == null) {
                            syntaxError("Invalid function ID", rule, start);
                        }

                        // bufSegStart is the offset in buf to the first
                        // character of the segment we are parsing.
                        int bufSegStart = buf.length();

                        // Parse the segment
                        pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);

                        // After parsing a segment, the relevant characters are
                        // in buf, starting at offset bufSegStart.
                        FunctionReplacer r =
                            new FunctionReplacer(t,
                                new StringReplacer(buf.substring(bufSegStart), parser.curData));

                        // Replace the buffer contents with a stand-in
                        buf.setLength(bufSegStart);
                        buf.append(parser.generateStandInFor(r));
                    }
                    break;
                case SymbolTable.SYMBOL_REF:
                    // Handle variable references and segment references "$1" .. "$9"
                    {
                        // A variable reference must be followed immediately
                        // by a Unicode identifier start and zero or more
                        // Unicode identifier part characters, or by a digit
                        // 1..9 if it is a segment reference.
                        if (pos == limit) {
                            // A variable ref character at the end acts as
                            // an anchor to the context limit, as in perl.
                            anchorEnd = true;
                            break;
                        }
                        // Parse "$1" "$2" .. "$9" .. (no upper limit)
                        c = rule.charAt(pos);
                        int r = UCharacter.digit(c, 10);
                        if (r >= 1 && r <= 9) {
                            iref[0] = pos;
                            r = Utility.parseNumber(rule, iref, 10);
                            if (r < 0) {
                                syntaxError("Undefined segment reference",
                                            rule, start);
                            }
                            pos = iref[0];
                            buf.append(parser.getSegmentStandin(r));
                        } else {
                            if (pp == null) { // Lazy create
                                pp = new ParsePosition(0);
                            }
                            pp.setIndex(pos);
                            String name = parser.parseData.
                                parseReference(rule, pp, limit);
                            if (name == null) {
                                // This means the '$' was not followed by a
                                // valid name.  Try to interpret it as an
                                // end anchor then.  If this also doesn't work
                                // (if we see a following character) then signal
                                // an error.
                                anchorEnd = true;
                                break;
                            }
                            pos = pp.getIndex();
                            // If this is a variable definition statement,
                            // then the LHS variable will be undefined.  In
                            // that case appendVariableDef() will append the
                            // special placeholder char variableLimit-1.
                            varStart = buf.length();
                            parser.appendVariableDef(name, buf);
                            varLimit = buf.length();
                        }
                    }
                    break;
                case DOT:
                    buf.append(parser.getDotStandIn());
                    break;
                case KLEENE_STAR:
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
                    // Quantifiers.  We handle single characters, quoted strings,
                    // variable references, and segments.
                    //  a+      matches  aaa
                    //  'foo'+  matches  foofoofoo
                    //  $v+     matches  xyxyxy if $v == xy
                    //  (seg)+  matches  segsegseg
                    {
                        ///CLOVER:OFF
                        // isSegment is always false
                        if (isSegment && buf.length() == bufStart) {
                            // The */+ immediately follows '('
                            syntaxError("Misplaced quantifier", rule, start);
                            break;
                        }
                        ///CLOVER:ON

                        int qstart, qlimit;
                        // The */+ follows an isolated character or quote
                        // or variable reference
                        if (buf.length() == quoteLimit) {
                            // The */+ follows a 'quoted string'
                            qstart = quoteStart;
                            qlimit = quoteLimit;
                        } else if (buf.length() == varLimit) {
                            // The */+ follows a $variableReference
                            qstart = varStart;
                            qlimit = varLimit;
                        } else {
                            // The */+ follows a single character, possibly
                            // a segment standin
                            qstart = buf.length() - 1;
                            qlimit = qstart + 1;
                        }

                        UnicodeMatcher m;
                        try {
                            m = new StringMatcher(buf.toString(), qstart, qlimit,
                                              0, parser.curData);
                        } catch (RuntimeException e) {
                            // Re-throw with up to 50 characters of rule text
                            // on either side of pos, marked by "$$$".
                            final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
                            final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
                            throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
                                    + postContext).initCause(e);
                        }
                        int min = 0;
                        int max = Quantifier.MAX;
                        switch (c) {
                        case ONE_OR_MORE:
                            min = 1;
                            break;
                        case ZERO_OR_ONE:
                            min = 0;
                            max = 1;
                            break;
                            // case KLEENE_STAR:
                            //    do nothing -- min, max already set
                        }
                        m = new Quantifier(m, min, max);
                        // Replace the quantified span with one stand-in.
                        buf.setLength(qstart);
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;

                //------------------------------------------------------
                // Elements allowed ONLY WITHIN segments
                //------------------------------------------------------
                case SEGMENT_CLOSE:
                    // assert(isSegment);
                    // We're done parsing a segment.
                    break main;

                //------------------------------------------------------
                // Elements allowed ONLY OUTSIDE segments
                //------------------------------------------------------
                case CONTEXT_ANTE:
                    if (ante >= 0) {
                        syntaxError("Multiple ante contexts", rule, start);
                    }
                    ante = buf.length();
                    break;
                case CONTEXT_POST:
                    if (post >= 0) {
                        syntaxError("Multiple post contexts", rule, start);
                    }
                    post = buf.length();
                    break;
                case CURSOR_POS:
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
                    }
                    cursor = buf.length();
                    break;
                case CURSOR_OFFSET:
                    if (cursorOffset < 0) {
                        // Continuing a |@@ run: no pattern text may have
                        // been accumulated yet.
                        if (buf.length() > 0) {
                            syntaxError("Misplaced " + c, rule, start);
                        }
                        --cursorOffset;
                    } else if (cursorOffset > 0) {
                        // Continuing a @@| run: must be contiguous and the
                        // '|' must not have been seen yet.
                        if (buf.length() != cursorOffsetPos || cursor >= 0) {
                            syntaxError("Misplaced " + c, rule, start);
                        }
                        ++cursorOffset;
                    } else {
                        // First '@' of a run.
                        if (cursor == 0 && buf.length() == 0) {
                            cursorOffset = -1;
                        } else if (cursor < 0) {
                            cursorOffsetPos = buf.length();
                            cursorOffset = 1;
                        } else {
                            syntaxError("Misplaced " + c, rule, start);
                        }
                    }
                    break;

                //------------------------------------------------------
                // Non-special characters
                //------------------------------------------------------
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
                    // reserved for possible future use.
                    if (c >= 0x0021 && c <= 0x007E &&
                        !((c >= '0' && c <= '9') ||
                          (c >= 'A' && c <= 'Z') ||
                          (c >= 'a' && c <= 'z'))) {
                        syntaxError("Unquoted " + c, rule, start);
                    }
                    buf.append(c);
                    break;
                }
            }
            return pos;
        }

        /**
         * Remove context: trim text to the span between the ante and post
         * markers (or the respective end of text when a marker is absent),
         * then clear the markers and both anchor flags.
         */
        void removeContext() {
            text = text.substring(ante < 0 ? 0 : ante,
                                  post < 0 ? text.length() : post);
            ante = post = -1;
            anchorStart = anchorEnd = false;
        }

        /**
         * Return true if this half looks like valid output, that is, does not
         * contain quantifiers or other special input-only elements.
         */
        public boolean isValidOutput(TransliteratorParser parser) {
            // Walk text by code point; each one must be a replacer stand-in
            // or a plain character.
            for (int i=0; i<text.length(); ) {
                int c = UTF16.charAt(text, i);
                i += UTF16.getCharCount(c);
                if (!parser.parseData.isReplacer(c)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Return true if this half looks like valid input, that is, does not
         * contain functions or other special output-only elements.
         */
        public boolean isValidInput(TransliteratorParser parser) {
            // Walk text by code point; each one must be a matcher stand-in
            // or a plain character.
            for (int i=0; i<text.length(); ) {
                int c = UTF16.charAt(text, i);
                i += UTF16.getCharCount(c);
                if (!parser.parseData.isMatcher(c)) {
                    return false;
                }
            }
            return true;
        }
    }

    //----------------------------------------------------------------------
    // PUBLIC methods
    //----------------------------------------------------------------------

    /**
     * Constructor.
     */
    public TransliteratorParser() {
    }

    /**
     * Parse a set of rules.
After the parse completes, examine the public 870 * data members for results. 871 */ 872 public void parse(String rules, int dir) { 873 parseRules(new RuleArray(new String[] { rules }), dir); 874 } 875 876 /* 877 * Parse a set of rules. After the parse completes, examine the public 878 * data members for results. 879 */ 880 /* public void parse(ResourceReader rules, int direction) { 881 parseRules(new RuleReader(rules), direction); 882 }*/ 883 884 //---------------------------------------------------------------------- 885 // PRIVATE methods 886 //---------------------------------------------------------------------- 887 888 /** 889 * Parse an array of zero or more rules. The strings in the array are 890 * treated as if they were concatenated together, with rule terminators 891 * inserted between array elements if not present already. 892 * 893 * Any previous rules are discarded. Typically this method is called exactly 894 * once, during construction. 895 * 896 * The member this.data will be set to null if there are no rules. 897 * 898 * @exception IllegalIcuArgumentException if there is a syntax error in the 899 * rules 900 */ 901 void parseRules(RuleBody ruleArray, int dir) { 902 boolean parsingIDs = true; 903 int ruleCount = 0; 904 905 dataVector = new ArrayList<Data>(); 906 idBlockVector = new ArrayList<String>(); 907 curData = null; 908 direction = dir; 909 compoundFilter = null; 910 variablesVector = new ArrayList<Object>(); 911 variableNames = new HashMap<String, char[]>(); 912 parseData = new ParseData(); 913 914 List<RuntimeException> errors = new ArrayList<RuntimeException>(); 915 int errorCount = 0; 916 917 ruleArray.reset(); 918 919 StringBuilder idBlockResult = new StringBuilder(); 920 921 // The compound filter offset is an index into idBlockResult. 922 // If it is 0, then the compound filter occurred at the start, 923 // and it is the offset to the _start_ of the compound filter 924 // pattern. 
Otherwise it is the offset to the _limit_ of the 925 // compound filter pattern within idBlockResult. 926 this.compoundFilter = null; 927 int compoundFilterOffset = -1; 928 929 main: 930 for (;;) { 931 String rule = ruleArray.nextLine(); 932 if (rule == null) { 933 break; 934 } 935 int pos = 0; 936 int limit = rule.length(); 937 while (pos < limit) { 938 char c = rule.charAt(pos++); 939 if (PatternProps.isWhiteSpace(c)) { 940 continue; 941 } 942 // Skip lines starting with the comment character 943 if (c == RULE_COMMENT_CHAR) { 944 pos = rule.indexOf("\n", pos) + 1; 945 if (pos == 0) { 946 break; // No "\n" found; rest of rule is a commnet 947 } 948 continue; // Either fall out or restart with next line 949 } 950 951 // skip empty rules 952 if (c == END_OF_RULE) 953 continue; 954 955 // Often a rule file contains multiple errors. It's 956 // convenient to the rule author if these are all reported 957 // at once. We keep parsing rules even after a failure, up 958 // to a specified limit, and report all errors at once. 959 try { 960 ++ruleCount; 961 962 // We've found the start of a rule or ID. c is its first 963 // character, and pos points past c. 964 --pos; 965 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 966 // chars left. 967 if ((pos + ID_TOKEN_LEN + 1) <= limit && 968 rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) { 969 pos += ID_TOKEN_LEN; 970 c = rule.charAt(pos); 971 while (PatternProps.isWhiteSpace(c) && pos < limit) { 972 ++pos; 973 c = rule.charAt(pos); 974 } 975 int[] p = new int[] { pos }; 976 977 if (!parsingIDs) { 978 if (curData != null) { 979 if (direction == Transliterator.FORWARD) 980 dataVector.add(curData); 981 else 982 dataVector.add(0, curData); 983 curData = null; 984 } 985 parsingIDs = true; 986 } 987 988 TransliteratorIDParser.SingleID id = 989 TransliteratorIDParser.parseSingleID( 990 rule, p, direction); 991 if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) { 992 // Successful ::ID parse. 
993 994 if (direction == Transliterator.FORWARD) { 995 idBlockResult.append(id.canonID).append(END_OF_RULE); 996 } else { 997 idBlockResult.insert(0, id.canonID + END_OF_RULE); 998 } 999 1000 } else { 1001 // Couldn't parse an ID. Try to parse a global filter 1002 int[] withParens = new int[] { -1 }; 1003 UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null); 1004 if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) { 1005 if ((direction == Transliterator.FORWARD) == 1006 (withParens[0] == 0)) { 1007 if (compoundFilter != null) { 1008 // Multiple compound filters 1009 syntaxError("Multiple global filters", rule, pos); 1010 } 1011 compoundFilter = f; 1012 compoundFilterOffset = ruleCount; 1013 } 1014 } else { 1015 // Invalid ::id 1016 // Can be parsed as neither an ID nor a global filter 1017 syntaxError("Invalid ::ID", rule, pos); 1018 } 1019 } 1020 1021 pos = p[0]; 1022 } else { 1023 if (parsingIDs) { 1024 if (direction == Transliterator.FORWARD) 1025 idBlockVector.add(idBlockResult.toString()); 1026 else 1027 idBlockVector.add(0, idBlockResult.toString()); 1028 idBlockResult.delete(0, idBlockResult.length()); 1029 parsingIDs = false; 1030 curData = new RuleBasedTransliterator.Data(); 1031 1032 // By default, rules use part of the private use area 1033 // E000..F8FF for variables and other stand-ins. Currently 1034 // the range F000..F8FF is typically sufficient. The 'use 1035 // variable range' pragma allows rule sets to modify this. 
1036 setVariableRange(0xF000, 0xF8FF); 1037 } 1038 1039 if (resemblesPragma(rule, pos, limit)) { 1040 int ppp = parsePragma(rule, pos, limit); 1041 if (ppp < 0) { 1042 syntaxError("Unrecognized pragma", rule, pos); 1043 } 1044 pos = ppp; 1045 // Parse a rule 1046 } else { 1047 pos = parseRule(rule, pos, limit); 1048 } 1049 } 1050 } catch (IllegalArgumentException e) { 1051 if (errorCount == 30) { 1052 IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched"); 1053 icuEx.initCause(e); 1054 errors.add(icuEx); 1055 break main; 1056 } 1057 e.fillInStackTrace(); 1058 errors.add(e); 1059 ++errorCount; 1060 pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';' 1061 } 1062 } 1063 } 1064 if (parsingIDs && idBlockResult.length() > 0) { 1065 if (direction == Transliterator.FORWARD) 1066 idBlockVector.add(idBlockResult.toString()); 1067 else 1068 idBlockVector.add(0, idBlockResult.toString()); 1069 } 1070 else if (!parsingIDs && curData != null) { 1071 if (direction == Transliterator.FORWARD) 1072 dataVector.add(curData); 1073 else 1074 dataVector.add(0, curData); 1075 } 1076 1077 // Convert the set vector to an array 1078 for (int i = 0; i < dataVector.size(); i++) { 1079 Data data = dataVector.get(i); 1080 data.variables = new Object[variablesVector.size()]; 1081 variablesVector.toArray(data.variables); 1082 data.variableNames = new HashMap<String, char[]>(); 1083 data.variableNames.putAll(variableNames); 1084 } 1085 variablesVector = null; 1086 1087 // Do more syntax checking and index the rules 1088 try { 1089 if (compoundFilter != null) { 1090 if ((direction == Transliterator.FORWARD && 1091 compoundFilterOffset != 1) || 1092 (direction == Transliterator.REVERSE && 1093 compoundFilterOffset != ruleCount)) { 1094 throw new IllegalIcuArgumentException("Compound filters misplaced"); 1095 } 1096 } 1097 1098 for (int i = 0; i < dataVector.size(); i++) { 1099 Data data = dataVector.get(i); 1100 
data.ruleSet.freeze(); 1101 } 1102 1103 if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0) 1104 idBlockVector.remove(0); 1105 1106 } catch (IllegalArgumentException e) { 1107 e.fillInStackTrace(); 1108 errors.add(e); 1109 } 1110 1111 if (errors.size() != 0) { 1112 for (int i = errors.size()-1; i > 0; --i) { 1113 RuntimeException previous = errors.get(i-1); 1114 while (previous.getCause() != null) { 1115 previous = (RuntimeException) previous.getCause(); // chain specially 1116 } 1117 previous.initCause(errors.get(i)); 1118 } 1119 throw errors.get(0); 1120 // if initCause not supported: throw new IllegalArgumentException(errors.toString()); 1121 } 1122 } 1123 1124 /** 1125 * MAIN PARSER. Parse the next rule in the given rule string, starting 1126 * at pos. Return the index after the last character parsed. Do not 1127 * parse characters at or after limit. 1128 * 1129 * Important: The character at pos must be a non-whitespace character 1130 * that is not the comment character. 1131 * 1132 * This method handles quoting, escaping, and whitespace removal. It 1133 * parses the end-of-rule character. It recognizes context and cursor 1134 * indicators. Once it does a lexical breakdown of the rule at pos, it 1135 * creates a rule object and adds it to our rule list. 1136 * 1137 * This method is tightly coupled to the inner class RuleHalf. 
1138 */ 1139 private int parseRule(String rule, int pos, int limit) { 1140 // Locate the left side, operator, and right side 1141 int start = pos; 1142 char operator = 0; 1143 1144 // Set up segments data 1145 segmentStandins = new StringBuffer(); 1146 segmentObjects = new ArrayList<StringMatcher>(); 1147 1148 RuleHalf left = new RuleHalf(); 1149 RuleHalf right = new RuleHalf(); 1150 1151 undefinedVariableName = null; 1152 pos = left.parse(rule, pos, limit, this); 1153 1154 if (pos == limit || 1155 OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) { 1156 syntaxError("No operator pos=" + pos, rule, start); 1157 } 1158 ++pos; 1159 1160 // Found an operator char. Check for forward-reverse operator. 1161 if (operator == REVERSE_RULE_OP && 1162 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1163 ++pos; 1164 operator = FWDREV_RULE_OP; 1165 } 1166 1167 // Translate alternate op characters. 1168 switch (operator) { 1169 case ALT_FORWARD_RULE_OP: 1170 operator = FORWARD_RULE_OP; 1171 break; 1172 case ALT_REVERSE_RULE_OP: 1173 operator = REVERSE_RULE_OP; 1174 break; 1175 case ALT_FWDREV_RULE_OP: 1176 operator = FWDREV_RULE_OP; 1177 break; 1178 } 1179 1180 pos = right.parse(rule, pos, limit, this); 1181 1182 if (pos < limit) { 1183 if (rule.charAt(--pos) == END_OF_RULE) { 1184 ++pos; 1185 } else { 1186 // RuleHalf parser must have terminated at an operator 1187 syntaxError("Unquoted operator", rule, start); 1188 } 1189 } 1190 1191 if (operator == VARIABLE_DEF_OP) { 1192 // LHS is the name. RHS is a single character, either a literal 1193 // or a set (already parsed). If RHS is longer than one 1194 // character, it is either a multi-character string, or multiple 1195 // sets, or a mixture of chars and sets -- syntax error. 1196 1197 // We expect to see a single undefined variable (the one being 1198 // defined). 
1199 if (undefinedVariableName == null) { 1200 syntaxError("Missing '$' or duplicate definition", rule, start); 1201 } 1202 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { 1203 syntaxError("Malformed LHS", rule, start); 1204 } 1205 if (left.anchorStart || left.anchorEnd || 1206 right.anchorStart || right.anchorEnd) { 1207 syntaxError("Malformed variable def", rule, start); 1208 } 1209 // We allow anything on the right, including an empty string. 1210 int n = right.text.length(); 1211 char[] value = new char[n]; 1212 right.text.getChars(0, n, value, 0); 1213 variableNames.put(undefinedVariableName, value); 1214 1215 ++variableLimit; 1216 return pos; 1217 } 1218 1219 // If this is not a variable definition rule, we shouldn't have 1220 // any undefined variable names. 1221 if (undefinedVariableName != null) { 1222 syntaxError("Undefined variable $" + undefinedVariableName, 1223 rule, start); 1224 } 1225 1226 // Verify segments 1227 if (segmentStandins.length() > segmentObjects.size()) { 1228 syntaxError("Undefined segment reference", rule, start); 1229 } 1230 for (int i=0; i<segmentStandins.length(); ++i) { 1231 if (segmentStandins.charAt(i) == 0) { 1232 syntaxError("Internal error", rule, start); // will never happen 1233 } 1234 } 1235 for (int i=0; i<segmentObjects.size(); ++i) { 1236 if (segmentObjects.get(i) == null) { 1237 syntaxError("Internal error", rule, start); // will never happen 1238 } 1239 } 1240 1241 // If the direction we want doesn't match the rule 1242 // direction, do nothing. 1243 if (operator != FWDREV_RULE_OP && 1244 ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) { 1245 return pos; 1246 } 1247 1248 // Transform the rule into a forward rule by swapping the 1249 // sides if necessary. 1250 if (direction == Transliterator.REVERSE) { 1251 RuleHalf temp = left; 1252 left = right; 1253 right = temp; 1254 } 1255 1256 // Remove non-applicable elements in forward-reverse 1257 // rules. 
Bidirectional rules ignore elements that do not 1258 // apply. 1259 if (operator == FWDREV_RULE_OP) { 1260 right.removeContext(); 1261 left.cursor = -1; 1262 left.cursorOffset = 0; 1263 } 1264 1265 // Normalize context 1266 if (left.ante < 0) { 1267 left.ante = 0; 1268 } 1269 if (left.post < 0) { 1270 left.post = left.text.length(); 1271 } 1272 1273 // Context is only allowed on the input side. Cursors are only 1274 // allowed on the output side. Segment delimiters can only appear 1275 // on the left, and references on the right. Cursor offset 1276 // cannot appear without an explicit cursor. Cursor offset 1277 // cannot place the cursor outside the limits of the context. 1278 // Anchors are only allowed on the input side. 1279 if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || 1280 (right.cursorOffset != 0 && right.cursor < 0) || 1281 // - The following two checks were used to ensure that the 1282 // - the cursor offset stayed within the ante- or postcontext. 1283 // - However, with the addition of quantifiers, we have to 1284 // - allow arbitrary cursor offsets and do runtime checking. 1285 //(right.cursorOffset > (left.text.length() - left.post)) || 1286 //(-right.cursorOffset > left.ante) || 1287 right.anchorStart || right.anchorEnd || 1288 !left.isValidInput(this) || !right.isValidOutput(this) || 1289 left.ante > left.post) { 1290 syntaxError("Malformed rule", rule, start); 1291 } 1292 1293 // Flatten segment objects vector to an array 1294 UnicodeMatcher[] segmentsArray = null; 1295 if (segmentObjects.size() > 0) { 1296 segmentsArray = new UnicodeMatcher[segmentObjects.size()]; 1297 segmentObjects.toArray(segmentsArray); 1298 } 1299 1300 curData.ruleSet.addRule(new TransliterationRule( 1301 left.text, left.ante, left.post, 1302 right.text, right.cursor, right.cursorOffset, 1303 segmentsArray, 1304 left.anchorStart, left.anchorEnd, 1305 curData)); 1306 1307 return pos; 1308 } 1309 1310 /** 1311 * Set the variable range to [start, end] (inclusive). 
1312 */ 1313 private void setVariableRange(int start, int end) { 1314 if (start > end || start < 0 || end > 0xFFFF) { 1315 throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end); 1316 } 1317 1318 curData.variablesBase = (char) start; // first private use 1319 1320 if (dataVector.size() == 0) { 1321 variableNext = (char) start; 1322 variableLimit = (char) (end + 1); 1323 } 1324 } 1325 1326 /** 1327 * Assert that the given character is NOT within the variable range. 1328 * If it is, signal an error. This is neccesary to ensure that the 1329 * variable range does not overlap characters used in a rule. 1330 */ 1331 private void checkVariableRange(int ch, String rule, int start) { 1332 if (ch >= curData.variablesBase && ch < variableLimit) { 1333 syntaxError("Variable range character in rule", rule, start); 1334 } 1335 } 1336 1337 // (The following method is part of an unimplemented feature. 1338 // Remove this clover pragma after the feature is implemented. 1339 // 2003-06-11 ICU 2.6 Alan) 1340 ///CLOVER:OFF 1341 /** 1342 * Set the maximum backup to 'backup', in response to a pragma 1343 * statement. 1344 */ 1345 private void pragmaMaximumBackup(int backup) { 1346 //TODO Finish 1347 throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet"); 1348 } 1349 ///CLOVER:ON 1350 1351 // (The following method is part of an unimplemented feature. 1352 // Remove this clover pragma after the feature is implemented. 1353 // 2003-06-11 ICU 2.6 Alan) 1354 ///CLOVER:OFF 1355 /** 1356 * Begin normalizing all rules using the given mode, in response 1357 * to a pragma statement. 1358 */ 1359 private void pragmaNormalizeRules(Normalizer.Mode mode) { 1360 //TODO Finish 1361 throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet"); 1362 } 1363 ///CLOVER:ON 1364 1365 /** 1366 * Return true if the given rule looks like a pragma. 
1367 * @param pos offset to the first non-whitespace character 1368 * of the rule. 1369 * @param limit pointer past the last character of the rule. 1370 */ 1371 static boolean resemblesPragma(String rule, int pos, int limit) { 1372 // Must start with /use\s/i 1373 return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0; 1374 } 1375 1376 /** 1377 * Parse a pragma. This method assumes resemblesPragma() has 1378 * already returned true. 1379 * @param pos offset to the first non-whitespace character 1380 * of the rule. 1381 * @param limit pointer past the last character of the rule. 1382 * @return the position index after the final ';' of the pragma, 1383 * or -1 on failure. 1384 */ 1385 private int parsePragma(String rule, int pos, int limit) { 1386 int[] array = new int[2]; 1387 1388 // resemblesPragma() has already returned true, so we 1389 // know that pos points to /use\s/i; we can skip 4 characters 1390 // immediately 1391 pos += 4; 1392 1393 // Here are the pragmas we recognize: 1394 // use variable range 0xE000 0xEFFF; 1395 // use maximum backup 16; 1396 // use nfd rules; 1397 int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array); 1398 if (p >= 0) { 1399 setVariableRange(array[0], array[1]); 1400 return p; 1401 } 1402 1403 p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array); 1404 if (p >= 0) { 1405 pragmaMaximumBackup(array[0]); 1406 return p; 1407 } 1408 1409 p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null); 1410 if (p >= 0) { 1411 pragmaNormalizeRules(Normalizer.NFD); 1412 return p; 1413 } 1414 1415 p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null); 1416 if (p >= 0) { 1417 pragmaNormalizeRules(Normalizer.NFC); 1418 return p; 1419 } 1420 1421 // Syntax error: unable to parse pragma 1422 return -1; 1423 } 1424 1425 /** 1426 * Throw an exception indicating a syntax error. Search the rule string 1427 * for the probable end of the rule. 
Of course, if the error is that 1428 * the end of rule marker is missing, then the rule end will not be found. 1429 * In any case the rule start will be correctly reported. 1430 * @param msg error description 1431 * @param rule pattern string 1432 * @param start position of first character of current rule 1433 */ 1434 static final void syntaxError(String msg, String rule, int start) { 1435 int end = ruleEnd(rule, start, rule.length()); 1436 throw new IllegalIcuArgumentException(msg + " in \"" + 1437 Utility.escape(rule.substring(start, end)) + '"'); 1438 } 1439 1440 static final int ruleEnd(String rule, int start, int limit) { 1441 int end = Utility.quotedIndexOf(rule, start, limit, ";"); 1442 if (end < 0) { 1443 end = limit; 1444 } 1445 return end; 1446 } 1447 1448 /** 1449 * Parse a UnicodeSet out, store it, and return the stand-in character 1450 * used to represent it. 1451 */ 1452 private final char parseSet(String rule, ParsePosition pos) { 1453 UnicodeSet set = new UnicodeSet(rule, pos, parseData); 1454 if (variableNext >= variableLimit) { 1455 throw new RuntimeException("Private use variables exhausted"); 1456 } 1457 set.compact(); 1458 return generateStandInFor(set); 1459 } 1460 1461 /** 1462 * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer. 1463 * Store the object. 1464 */ 1465 char generateStandInFor(Object obj) { 1466 // assert(obj != null); 1467 1468 // Look up previous stand-in, if any. This is a short list 1469 // (typical n is 0, 1, or 2); linear search is optimal. 1470 for (int i=0; i<variablesVector.size(); ++i) { 1471 if (variablesVector.get(i) == obj) { // [sic] pointer comparison 1472 return (char) (curData.variablesBase + i); 1473 } 1474 } 1475 1476 if (variableNext >= variableLimit) { 1477 throw new RuntimeException("Variable range exhausted"); 1478 } 1479 variablesVector.add(obj); 1480 return variableNext++; 1481 } 1482 1483 /** 1484 * Return the standin for segment seg (1-based). 
1485 */ 1486 public char getSegmentStandin(int seg) { 1487 if (segmentStandins.length() < seg) { 1488 segmentStandins.setLength(seg); 1489 } 1490 char c = segmentStandins.charAt(seg-1); 1491 if (c == 0) { 1492 if (variableNext >= variableLimit) { 1493 throw new RuntimeException("Variable range exhausted"); 1494 } 1495 c = variableNext++; 1496 // Set a placeholder in the master variables vector that will be 1497 // filled in later by setSegmentObject(). We know that we will get 1498 // called first because setSegmentObject() will call us. 1499 variablesVector.add(null); 1500 segmentStandins.setCharAt(seg-1, c); 1501 } 1502 return c; 1503 } 1504 1505 /** 1506 * Set the object for segment seg (1-based). 1507 */ 1508 public void setSegmentObject(int seg, StringMatcher obj) { 1509 // Since we call parseSection() recursively, nested 1510 // segments will result in segment i+1 getting parsed 1511 // and stored before segment i; be careful with the 1512 // vector handling here. 1513 while (segmentObjects.size() < seg) { 1514 segmentObjects.add(null); 1515 } 1516 int index = getSegmentStandin(seg) - curData.variablesBase; 1517 if (segmentObjects.get(seg-1) != null || 1518 variablesVector.get(index) != null) { 1519 throw new RuntimeException(); // should never happen 1520 } 1521 segmentObjects.set(seg-1, obj); 1522 variablesVector.set(index, obj); 1523 } 1524 1525 /** 1526 * Return the stand-in for the dot set. It is allocated the first 1527 * time and reused thereafter. 1528 */ 1529 char getDotStandIn() { 1530 if (dotStandIn == -1) { 1531 dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); 1532 } 1533 return (char) dotStandIn; 1534 } 1535 1536 /** 1537 * Append the value of the given variable name to the given 1538 * StringBuffer. 1539 * @exception IllegalIcuArgumentException if the name is unknown. 
* @exception RuntimeException if a placeholder is needed but the variable
     *            range is exhausted.
     */
    private void appendVariableDef(String name, StringBuffer buf) {
        char[] definition = variableNames.get(name);
        if (definition != null) {
            buf.append(definition);
            return;
        }

        // We allow one undefined variable so that variable definition
        // statements work; any further unknown name is an error.
        if (undefinedVariableName != null) {
            throw new IllegalIcuArgumentException("Undefined variable $"
                                                  + name);
        }

        // For the first undefined variable, save its name and emit the
        // special placeholder variableLimit-1, shrinking the range by one.
        undefinedVariableName = name;
        if (variableNext >= variableLimit) {
            throw new RuntimeException("Private use variables exhausted");
        }
        --variableLimit;
        buf.append(variableLimit);
    }
}

//eof