1 /* 2 ********************************************************************** 3 * Copyright (c) 2003-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Author: Alan Liu 7 * Created: September 24 2003 8 * Since: ICU 2.8 9 ********************************************************************** 10 */ 11 #include "ruleiter.h" 12 #include "unicode/parsepos.h" 13 #include "unicode/symtable.h" 14 #include "unicode/unistr.h" 15 #include "unicode/utf16.h" 16 #include "patternprops.h" 17 18 /* \U87654321 or \ud800\udc00 */ 19 #define MAX_U_NOTATION_LEN 12 20 21 U_NAMESPACE_BEGIN 22 23 RuleCharacterIterator::RuleCharacterIterator(const UnicodeString& theText, const SymbolTable* theSym, 24 ParsePosition& thePos) : 25 text(theText), 26 pos(thePos), 27 sym(theSym), 28 buf(0), 29 bufPos(0) 30 {} 31 32 UBool RuleCharacterIterator::atEnd() const { 33 return buf == 0 && pos.getIndex() == text.length(); 34 } 35 36 UChar32 RuleCharacterIterator::next(int32_t options, UBool& isEscaped, UErrorCode& ec) { 37 if (U_FAILURE(ec)) return DONE; 38 39 UChar32 c = DONE; 40 isEscaped = FALSE; 41 42 for (;;) { 43 c = _current(); 44 _advance(U16_LENGTH(c)); 45 46 if (c == SymbolTable::SYMBOL_REF && buf == 0 && 47 (options & PARSE_VARIABLES) != 0 && sym != 0) { 48 UnicodeString name = sym->parseReference(text, pos, text.length()); 49 // If name is empty there was an isolated SYMBOL_REF; 50 // return it. Caller must be prepared for this. 51 if (name.length() == 0) { 52 break; 53 } 54 bufPos = 0; 55 buf = sym->lookup(name); 56 if (buf == 0) { 57 ec = U_UNDEFINED_VARIABLE; 58 return DONE; 59 } 60 // Handle empty variable value 61 if (buf->length() == 0) { 62 buf = 0; 63 } 64 continue; 65 } 66 67 if ((options & SKIP_WHITESPACE) != 0 && PatternProps::isWhiteSpace(c)) { 68 continue; 69 } 70 71 if (c == 0x5C /*'\\'*/ && (options & PARSE_ESCAPES) != 0) { 72 UnicodeString tempEscape; 73 int32_t offset = 0; 74 c = lookahead(tempEscape, MAX_U_NOTATION_LEN).unescapeAt(offset); 75 jumpahead(offset); 76 isEscaped = TRUE; 77 if (c < 0) { 78 ec = U_MALFORMED_UNICODE_ESCAPE; 79 return DONE; 80 } 81 } 82 83 break; 84 } 85 86 return c; 87 } 88 89 void RuleCharacterIterator::getPos(RuleCharacterIterator::Pos& p) const { 90 p.buf = buf; 91 p.pos = pos.getIndex(); 92 p.bufPos = bufPos; 93 } 94 95 void RuleCharacterIterator::setPos(const RuleCharacterIterator::Pos& p) { 96 buf = p.buf; 97 pos.setIndex(p.pos); 98 bufPos = p.bufPos; 99 } 100 101 void RuleCharacterIterator::skipIgnored(int32_t options) { 102 if ((options & SKIP_WHITESPACE) != 0) { 103 for (;;) { 104 UChar32 a = _current(); 105 if (!PatternProps::isWhiteSpace(a)) break; 106 _advance(U16_LENGTH(a)); 107 } 108 } 109 } 110 111 UnicodeString& RuleCharacterIterator::lookahead(UnicodeString& result, int32_t maxLookAhead) const { 112 if (maxLookAhead < 0) { 113 maxLookAhead = 0x7FFFFFFF; 114 } 115 if (buf != 0) { 116 buf->extract(bufPos, maxLookAhead, result); 117 } else { 118 text.extract(pos.getIndex(), maxLookAhead, result); 119 } 120 return result; 121 } 122 123 void RuleCharacterIterator::jumpahead(int32_t count) { 124 _advance(count); 125 } 126 127 /* 128 UnicodeString& RuleCharacterIterator::toString(UnicodeString& result) const { 129 int32_t b = pos.getIndex(); 130 text.extract(0, b, result); 131 return result.append((UChar) 0x7C).append(text, b, 0x7FFFFFFF); // Insert '|' at index 132 } 133 */ 134 135 UChar32 RuleCharacterIterator::_current() const { 136 if (buf != 0) { 137 return buf->char32At(bufPos); 138 } else { 139 int i = pos.getIndex(); 140 return (i < text.length()) ? text.char32At(i) : (UChar32)DONE; 141 } 142 } 143 144 void RuleCharacterIterator::_advance(int32_t count) { 145 if (buf != 0) { 146 bufPos += count; 147 if (bufPos == buf->length()) { 148 buf = 0; 149 } 150 } else { 151 pos.setIndex(pos.getIndex() + count); 152 if (pos.getIndex() > text.length()) { 153 pos.setIndex(text.length()); 154 } 155 } 156 } 157 158 U_NAMESPACE_END 159 160 //eof 161