1 /* 2 ********************************************************************** 3 * Copyright (C) 2001-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 06/07/01 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/unifilt.h" 16 #include "unicode/uchar.h" 17 #include "unicode/uniset.h" 18 #include "name2uni.h" 19 #include "cmemory.h" 20 #include "uprops.h" 21 #include "uinvchar.h" 22 #include "util.h" 23 24 U_NAMESPACE_BEGIN 25 26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NameUnicodeTransliterator) 27 28 static const UChar OPEN[] = {92,78,126,123,126,0}; // "\N~{~" 29 static const UChar OPEN_DELIM = 92; // '\\' first char of OPEN 30 static const UChar CLOSE_DELIM = 125; // '}' 31 static const UChar SPACE = 32; // ' ' 32 33 U_CDECL_BEGIN 34 35 // USetAdder implementation 36 // Does not use uset.h to reduce code dependencies 37 static void U_CALLCONV 38 _set_add(USet *set, UChar32 c) { 39 uset_add(set, c); 40 } 41 42 // These functions aren't used. 43 /*static void U_CALLCONV 44 _set_addRange(USet *set, UChar32 start, UChar32 end) { 45 ((UnicodeSet *)set)->add(start, end); 46 } 47 48 static void U_CALLCONV 49 _set_addString(USet *set, const UChar *str, int32_t length) { 50 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 51 }*/ 52 53 U_CDECL_END 54 55 /** 56 * Constructs a transliterator with the default delimiters '{' and 57 * '}'. 58 */ 59 NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) : 60 Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) { 61 62 UnicodeSet *legalPtr = &legal; 63 // Get the legal character set 64 USetAdder sa = { 65 (USet *)legalPtr, // USet* == UnicodeSet* 66 _set_add, 67 NULL, // Don't need _set_addRange 68 NULL, // Don't need _set_addString 69 NULL, // Don't need remove() 70 NULL 71 }; 72 uprv_getCharNameCharacters(&sa); 73 } 74 75 /** 76 * Destructor. 77 */ 78 NameUnicodeTransliterator::~NameUnicodeTransliterator() {} 79 80 /** 81 * Copy constructor. 82 */ 83 NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) : 84 Transliterator(o), legal(o.legal) {} 85 86 /** 87 * Assignment operator. 88 */ 89 /*NameUnicodeTransliterator& NameUnicodeTransliterator::operator=( 90 const NameUnicodeTransliterator& o) { 91 Transliterator::operator=(o); 92 // not necessary: the legal sets should all be the same -- legal=o.legal; 93 return *this; 94 }*/ 95 96 /** 97 * Transliterator API. 98 */ 99 Transliterator* NameUnicodeTransliterator::clone(void) const { 100 return new NameUnicodeTransliterator(*this); 101 } 102 103 /** 104 * Implements {@link Transliterator#handleTransliterate}. 105 */ 106 void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 107 UBool isIncremental) const { 108 // The failure mode, here and below, is to behave like Any-Null, 109 // if either there is no name data (max len == 0) or there is no 110 // memory (malloc() => NULL). 111 112 int32_t maxLen = uprv_getMaxCharNameLength(); 113 if (maxLen == 0) { 114 offsets.start = offsets.limit; 115 return; 116 } 117 118 // Accomodate the longest possible name 119 ++maxLen; // allow for temporary trailing space 120 char* cbuf = (char*) uprv_malloc(maxLen); 121 if (cbuf == NULL) { 122 offsets.start = offsets.limit; 123 return; 124 } 125 126 UnicodeString openPat(TRUE, OPEN, -1); 127 UnicodeString str, name; 128 129 int32_t cursor = offsets.start; 130 int32_t limit = offsets.limit; 131 132 // Modes: 133 // 0 - looking for open delimiter 134 // 1 - after open delimiter 135 int32_t mode = 0; 136 int32_t openPos = -1; // open delim candidate pos 137 138 UChar32 c; 139 while (cursor < limit) { 140 c = text.char32At(cursor); 141 142 switch (mode) { 143 case 0: // looking for open delimiter 144 if (c == OPEN_DELIM) { // quick check first 145 openPos = cursor; 146 int32_t i = 147 ICU_Utility::parsePattern(openPat, text, cursor, limit); 148 if (i >= 0 && i < limit) { 149 mode = 1; 150 name.truncate(0); 151 cursor = i; 152 continue; // *** reprocess char32At(cursor) 153 } 154 } 155 break; 156 157 case 1: // after open delimiter 158 // Look for legal chars. If \s+ is found, convert it 159 // to a single space. If closeDelimiter is found, exit 160 // the loop. If any other character is found, exit the 161 // loop. If the limit is reached, exit the loop. 162 163 // Convert \s+ => SPACE. This assumes there are no 164 // runs of >1 space characters in names. 165 if (uprv_isRuleWhiteSpace(c)) { 166 // Ignore leading whitespace 167 if (name.length() > 0 && 168 name.charAt(name.length()-1) != SPACE) { 169 name.append(SPACE); 170 // If we are too long then abort. maxLen includes 171 // temporary trailing space, so use '>'. 172 if (name.length() > maxLen) { 173 mode = 0; 174 } 175 } 176 break; 177 } 178 179 if (c == CLOSE_DELIM) { 180 int32_t len = name.length(); 181 182 // Delete trailing space, if any 183 if (len > 0 && 184 name.charAt(len-1) == SPACE) { 185 --len; 186 } 187 188 if (uprv_isInvariantUString(name.getBuffer(), len)) { 189 name.extract(0, len, cbuf, maxLen, US_INV); 190 191 UErrorCode status = U_ZERO_ERROR; 192 c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); 193 if (U_SUCCESS(status)) { 194 // Lookup succeeded 195 196 // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1); 197 cursor++; // advance over CLOSE_DELIM 198 199 str.truncate(0); 200 str.append(c); 201 text.handleReplaceBetween(openPos, cursor, str); 202 203 // Adjust indices for the change in the length of 204 // the string. Do not assume that str.length() == 205 // 1, in case of surrogates. 206 int32_t delta = cursor - openPos - str.length(); 207 cursor -= delta; 208 limit -= delta; 209 // assert(cursor == openPos + str.length()); 210 } 211 } 212 // If the lookup failed, we leave things as-is and 213 // still switch to mode 0 and continue. 214 mode = 0; 215 openPos = -1; // close off candidate 216 continue; // *** reprocess char32At(cursor) 217 } 218 219 // Check if c is a legal char. We assume here that 220 // legal.contains(OPEN_DELIM) is FALSE, so when we abort a 221 // name, we don't have to go back to openPos+1. 222 if (legal.contains(c)) { 223 name.append(c); 224 // If we go past the longest possible name then abort. 225 // maxLen includes temporary trailing space, so use '>='. 226 if (name.length() >= maxLen) { 227 mode = 0; 228 } 229 } 230 231 // Invalid character 232 else { 233 --cursor; // Backup and reprocess this character 234 mode = 0; 235 } 236 237 break; 238 } 239 240 cursor += UTF_CHAR_LENGTH(c); 241 } 242 243 offsets.contextLimit += limit - offsets.limit; 244 offsets.limit = limit; 245 // In incremental mode, only advance the cursor up to the last 246 // open delimiter candidate. 247 offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; 248 249 uprv_free(cbuf); 250 } 251 252 U_NAMESPACE_END 253 254 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 255