1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/rep.h" 16 #include "unicode/uniset.h" 17 #include "rbt_pars.h" 18 #include "rbt_data.h" 19 #include "rbt_rule.h" 20 #include "rbt.h" 21 #include "umutex.h" 22 23 U_NAMESPACE_BEGIN 24 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) 26 27 static UMTX transliteratorDataMutex = NULL; 28 static Replaceable *gLockedText = NULL; 29 30 void RuleBasedTransliterator::_construct(const UnicodeString& rules, 31 UTransDirection direction, 32 UParseError& parseError, 33 UErrorCode& status) { 34 fData = 0; 35 isDataOwned = TRUE; 36 if (U_FAILURE(status)) { 37 return; 38 } 39 40 TransliteratorParser parser(status); 41 parser.parse(rules, direction, parseError, status); 42 if (U_FAILURE(status)) { 43 return; 44 } 45 46 if (parser.idBlockVector.size() != 0 || 47 parser.compoundFilter != NULL || 48 parser.dataVector.size() == 0) { 49 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT 50 return; 51 } 52 53 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); 54 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 55 } 56 57 /** 58 * Constructs a new transliterator from the given rules. 59 * @param id the id for the transliterator. 60 * @param rules rules, separated by ';' 61 * @param direction either FORWARD or REVERSE. 62 * @param adoptedFilter the filter for this transliterator. 63 * @param parseError Struct to recieve information on position 64 * of error if an error is encountered 65 * @param status Output param set to success/failure code. 66 * @exception IllegalArgumentException if rules are malformed 67 * or direction is invalid. 68 */ 69 RuleBasedTransliterator::RuleBasedTransliterator( 70 const UnicodeString& id, 71 const UnicodeString& rules, 72 UTransDirection direction, 73 UnicodeFilter* adoptedFilter, 74 UParseError& parseError, 75 UErrorCode& status) : 76 Transliterator(id, adoptedFilter) { 77 _construct(rules, direction,parseError,status); 78 } 79 80 /** 81 * Constructs a new transliterator from the given rules. 82 * @param id the id for the transliterator. 83 * @param rules rules, separated by ';' 84 * @param direction either FORWARD or REVERSE. 85 * @param adoptedFilter the filter for this transliterator. 86 * @param status Output param set to success/failure code. 87 * @exception IllegalArgumentException if rules are malformed 88 * or direction is invalid. 89 */ 90 /*RuleBasedTransliterator::RuleBasedTransliterator( 91 const UnicodeString& id, 92 const UnicodeString& rules, 93 UTransDirection direction, 94 UnicodeFilter* adoptedFilter, 95 UErrorCode& status) : 96 Transliterator(id, adoptedFilter) { 97 UParseError parseError; 98 _construct(rules, direction,parseError, status); 99 }*/ 100 101 /** 102 * Covenience constructor with no filter. 103 */ 104 /*RuleBasedTransliterator::RuleBasedTransliterator( 105 const UnicodeString& id, 106 const UnicodeString& rules, 107 UTransDirection direction, 108 UErrorCode& status) : 109 Transliterator(id, 0) { 110 UParseError parseError; 111 _construct(rules, direction,parseError, status); 112 }*/ 113 114 /** 115 * Covenience constructor with no filter and FORWARD direction. 116 */ 117 /*RuleBasedTransliterator::RuleBasedTransliterator( 118 const UnicodeString& id, 119 const UnicodeString& rules, 120 UErrorCode& status) : 121 Transliterator(id, 0) { 122 UParseError parseError; 123 _construct(rules, UTRANS_FORWARD, parseError, status); 124 }*/ 125 126 /** 127 * Covenience constructor with FORWARD direction. 128 */ 129 /*RuleBasedTransliterator::RuleBasedTransliterator( 130 const UnicodeString& id, 131 const UnicodeString& rules, 132 UnicodeFilter* adoptedFilter, 133 UErrorCode& status) : 134 Transliterator(id, adoptedFilter) { 135 UParseError parseError; 136 _construct(rules, UTRANS_FORWARD,parseError, status); 137 }*/ 138 139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 140 const TransliterationRuleData* theData, 141 UnicodeFilter* adoptedFilter) : 142 Transliterator(id, adoptedFilter), 143 fData((TransliterationRuleData*)theData), // cast away const 144 isDataOwned(FALSE) { 145 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 146 } 147 148 /** 149 * Internal constructor. 150 */ 151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 152 TransliterationRuleData* theData, 153 UBool isDataAdopted) : 154 Transliterator(id, 0), 155 fData(theData), 156 isDataOwned(isDataAdopted) { 157 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 158 } 159 160 /** 161 * Copy constructor. 162 */ 163 RuleBasedTransliterator::RuleBasedTransliterator( 164 const RuleBasedTransliterator& other) : 165 Transliterator(other), fData(other.fData), 166 isDataOwned(other.isDataOwned) { 167 168 // The data object may or may not be owned. If it is not owned we 169 // share it; it is invariant. If it is owned, it's still 170 // invariant, but we need to copy it to prevent double-deletion. 171 // If this becomes a performance issue (if people do a lot of RBT 172 // copying -- unlikely) we can reference count the data object. 173 174 // Only do a deep copy if this is owned data, that is, data that 175 // will be later deleted. System transliterators contain 176 // non-owned data. 177 if (isDataOwned) { 178 fData = new TransliterationRuleData(*other.fData); 179 } 180 } 181 182 /** 183 * Destructor. 184 */ 185 RuleBasedTransliterator::~RuleBasedTransliterator() { 186 // Delete the data object only if we own it. 187 if (isDataOwned) { 188 delete fData; 189 } 190 } 191 192 Transliterator* // Covariant return NOT ALLOWED (for portability) 193 RuleBasedTransliterator::clone(void) const { 194 return new RuleBasedTransliterator(*this); 195 } 196 197 /** 198 * Implements {@link Transliterator#handleTransliterate}. 199 */ 200 void 201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, 202 UBool isIncremental) const { 203 /* We keep contextStart and contextLimit fixed the entire time, 204 * relative to the text -- contextLimit may move numerically if 205 * text is inserted or removed. The start offset moves toward 206 * limit, with replacements happening under it. 207 * 208 * Example: rules 1. ab>x|y 209 * 2. yc>z 210 * 211 * |eabcd begin - no match, advance start 212 * e|abcd match rule 1 - change text & adjust start 213 * ex|ycd match rule 2 - change text & adjust start 214 * exz|d no match, advance start 215 * exzd| done 216 */ 217 218 /* A rule like 219 * a>b|a 220 * creates an infinite loop. To prevent that, we put an arbitrary 221 * limit on the number of iterations that we take, one that is 222 * high enough that any reasonable rules are ok, but low enough to 223 * prevent a server from hanging. The limit is 16 times the 224 * number of characters n, unless n is so large that 16n exceeds a 225 * uint32_t. 226 */ 227 uint32_t loopCount = 0; 228 uint32_t loopLimit = index.limit - index.start; 229 if (loopLimit >= 0x10000000) { 230 loopLimit = 0xFFFFFFFF; 231 } else { 232 loopLimit <<= 4; 233 } 234 235 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent 236 // operations must be prevented. 237 // A Complication: compound transliterators can result in recursive entries to this 238 // function, sometimes with different "This" objects, always with the same text. 239 // Double-locking must be prevented in these cases. 240 // 241 242 // If the transliteration data is exclusively owned by this transliterator object, 243 // we don't need to do any locking. No sharing between transliterators is possible, 244 // so no concurrent access from multiple threads is possible. 245 UBool lockedMutexAtThisLevel = FALSE; 246 if (isDataOwned == FALSE) { 247 // Test whether this request is operating on the same text string as some 248 // some other transliteration that is still in progress and holding the 249 // transliteration mutex. If so, do not lock the transliteration 250 // mutex again. 251 UBool needToLock; 252 UMTX_CHECK(NULL, (&text != gLockedText), needToLock); 253 if (needToLock) { 254 umtx_lock(&transliteratorDataMutex); 255 gLockedText = &text; 256 lockedMutexAtThisLevel = TRUE; 257 } 258 } 259 260 // Check to make sure we don't dereference a null pointer. 261 if (fData != NULL) { 262 while (index.start < index.limit && 263 loopCount <= loopLimit && 264 fData->ruleSet.transliterate(text, index, isIncremental)) { 265 ++loopCount; 266 } 267 } 268 if (lockedMutexAtThisLevel) { 269 gLockedText = NULL; 270 umtx_unlock(&transliteratorDataMutex); 271 } 272 } 273 274 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, 275 UBool escapeUnprintable) const { 276 return fData->ruleSet.toRules(rulesSource, escapeUnprintable); 277 } 278 279 /** 280 * Implement Transliterator framework 281 */ 282 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { 283 fData->ruleSet.getSourceTargetSet(result, FALSE); 284 } 285 286 /** 287 * Override Transliterator framework 288 */ 289 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { 290 return fData->ruleSet.getSourceTargetSet(result, TRUE); 291 } 292 293 U_NAMESPACE_END 294 295 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 296