Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2008, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   11/17/99    aliu        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/rep.h"
     16 #include "unicode/uniset.h"
     17 #include "rbt_pars.h"
     18 #include "rbt_data.h"
     19 #include "rbt_rule.h"
     20 #include "rbt.h"
     21 #include "umutex.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
     26 
// Mutex taken by handleTransliterate() around rule application whenever the
// rule data is not owned by the transliterator doing the work.
static UMTX  transliteratorDataMutex = NULL;
// The text object for which transliteratorDataMutex is currently held, or NULL
// when it is not held.  handleTransliterate() compares its own text argument
// against this so that a nested entry operating on the same text does not try
// to take the mutex a second time.
static Replaceable *gLockedText = NULL;
     29 
     30 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
     31                                          UTransDirection direction,
     32                                          UParseError& parseError,
     33                                          UErrorCode& status) {
     34     fData = 0;
     35     isDataOwned = TRUE;
     36     if (U_FAILURE(status)) {
     37         return;
     38     }
     39 
     40     TransliteratorParser parser(status);
     41     parser.parse(rules, direction, parseError, status);
     42     if (U_FAILURE(status)) {
     43         return;
     44     }
     45 
     46     if (parser.idBlockVector.size() != 0 ||
     47         parser.compoundFilter != NULL ||
     48         parser.dataVector.size() == 0) {
     49         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
     50         return;
     51     }
     52 
     53     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
     54     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
     55 }
     56 
     57 /**
     58  * Constructs a new transliterator from the given rules.
     59  * @param id            the id for the transliterator.
     60  * @param rules         rules, separated by ';'
     61  * @param direction     either FORWARD or REVERSE.
     62  * @param adoptedFilter the filter for this transliterator.
     63  * @param parseError    Struct to recieve information on position
     64  *                      of error if an error is encountered
     65  * @param status        Output param set to success/failure code.
     66  * @exception IllegalArgumentException if rules are malformed
     67  * or direction is invalid.
     68  */
     69 RuleBasedTransliterator::RuleBasedTransliterator(
     70                             const UnicodeString& id,
     71                             const UnicodeString& rules,
     72                             UTransDirection direction,
     73                             UnicodeFilter* adoptedFilter,
     74                             UParseError& parseError,
     75                             UErrorCode& status) :
     76     Transliterator(id, adoptedFilter) {
     77     _construct(rules, direction,parseError,status);
     78 }
     79 
     80 /**
     81  * Constructs a new transliterator from the given rules.
     82  * @param id            the id for the transliterator.
     83  * @param rules         rules, separated by ';'
     84  * @param direction     either FORWARD or REVERSE.
     85  * @param adoptedFilter the filter for this transliterator.
     86  * @param status        Output param set to success/failure code.
     87  * @exception IllegalArgumentException if rules are malformed
     88  * or direction is invalid.
     89  */
     90 /*RuleBasedTransliterator::RuleBasedTransliterator(
     91                             const UnicodeString& id,
     92                             const UnicodeString& rules,
     93                             UTransDirection direction,
     94                             UnicodeFilter* adoptedFilter,
     95                             UErrorCode& status) :
     96     Transliterator(id, adoptedFilter) {
     97     UParseError parseError;
     98     _construct(rules, direction,parseError, status);
     99 }*/
    100 
    101 /**
 * Convenience constructor with no filter.
    103  */
    104 /*RuleBasedTransliterator::RuleBasedTransliterator(
    105                             const UnicodeString& id,
    106                             const UnicodeString& rules,
    107                             UTransDirection direction,
    108                             UErrorCode& status) :
    109     Transliterator(id, 0) {
    110     UParseError parseError;
    111     _construct(rules, direction,parseError, status);
    112 }*/
    113 
    114 /**
 * Convenience constructor with no filter and FORWARD direction.
    116  */
    117 /*RuleBasedTransliterator::RuleBasedTransliterator(
    118                             const UnicodeString& id,
    119                             const UnicodeString& rules,
    120                             UErrorCode& status) :
    121     Transliterator(id, 0) {
    122     UParseError parseError;
    123     _construct(rules, UTRANS_FORWARD, parseError, status);
    124 }*/
    125 
    126 /**
 * Convenience constructor with FORWARD direction.
    128  */
    129 /*RuleBasedTransliterator::RuleBasedTransliterator(
    130                             const UnicodeString& id,
    131                             const UnicodeString& rules,
    132                             UnicodeFilter* adoptedFilter,
    133                             UErrorCode& status) :
    134     Transliterator(id, adoptedFilter) {
    135     UParseError parseError;
    136     _construct(rules, UTRANS_FORWARD,parseError, status);
    137 }*/
    138 
    139 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    140                                  const TransliterationRuleData* theData,
    141                                  UnicodeFilter* adoptedFilter) :
    142     Transliterator(id, adoptedFilter),
    143     fData((TransliterationRuleData*)theData), // cast away const
    144     isDataOwned(FALSE) {
    145     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    146 }
    147 
    148 /**
    149  * Internal constructor.
    150  */
    151 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    152                                                  TransliterationRuleData* theData,
    153                                                  UBool isDataAdopted) :
    154     Transliterator(id, 0),
    155     fData(theData),
    156     isDataOwned(isDataAdopted) {
    157     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    158 }
    159 
    160 /**
    161  * Copy constructor.
    162  */
    163 RuleBasedTransliterator::RuleBasedTransliterator(
    164         const RuleBasedTransliterator& other) :
    165     Transliterator(other), fData(other.fData),
    166     isDataOwned(other.isDataOwned) {
    167 
    168     // The data object may or may not be owned.  If it is not owned we
    169     // share it; it is invariant.  If it is owned, it's still
    170     // invariant, but we need to copy it to prevent double-deletion.
    171     // If this becomes a performance issue (if people do a lot of RBT
    172     // copying -- unlikely) we can reference count the data object.
    173 
    174     // Only do a deep copy if this is owned data, that is, data that
    175     // will be later deleted.  System transliterators contain
    176     // non-owned data.
    177     if (isDataOwned) {
    178         fData = new TransliterationRuleData(*other.fData);
    179     }
    180 }
    181 
    182 /**
    183  * Destructor.
    184  */
    185 RuleBasedTransliterator::~RuleBasedTransliterator() {
    186     // Delete the data object only if we own it.
    187     if (isDataOwned) {
    188         delete fData;
    189     }
    190 }
    191 
    192 Transliterator* // Covariant return NOT ALLOWED (for portability)
    193 RuleBasedTransliterator::clone(void) const {
    194     return new RuleBasedTransliterator(*this);
    195 }
    196 
    197 /**
    198  * Implements {@link Transliterator#handleTransliterate}.
    199  */
    200 void
    201 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
    202                                              UBool isIncremental) const {
    203     /* We keep contextStart and contextLimit fixed the entire time,
    204      * relative to the text -- contextLimit may move numerically if
    205      * text is inserted or removed.  The start offset moves toward
    206      * limit, with replacements happening under it.
    207      *
    208      * Example: rules 1. ab>x|y
    209      *                2. yc>z
    210      *
    211      * |eabcd   begin - no match, advance start
    212      * e|abcd   match rule 1 - change text & adjust start
    213      * ex|ycd   match rule 2 - change text & adjust start
    214      * exz|d    no match, advance start
    215      * exzd|    done
    216      */
    217 
    218     /* A rule like
    219      *   a>b|a
    220      * creates an infinite loop. To prevent that, we put an arbitrary
    221      * limit on the number of iterations that we take, one that is
    222      * high enough that any reasonable rules are ok, but low enough to
    223      * prevent a server from hanging.  The limit is 16 times the
    224      * number of characters n, unless n is so large that 16n exceeds a
    225      * uint32_t.
    226      */
    227     uint32_t loopCount = 0;
    228     uint32_t loopLimit = index.limit - index.start;
    229     if (loopLimit >= 0x10000000) {
    230         loopLimit = 0xFFFFFFFF;
    231     } else {
    232         loopLimit <<= 4;
    233     }
    234 
    235     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
    236     //   operations must be prevented.
    237     // A Complication: compound transliterators can result in recursive entries to this
    238     //   function, sometimes with different "This" objects, always with the same text.
    239     //   Double-locking must be prevented in these cases.
    240     //
    241 
    242     // If the transliteration data is exclusively owned by this transliterator object,
    243     //   we don't need to do any locking.  No sharing between transliterators is possible,
    244     //   so no concurrent access from multiple threads is possible.
    245     UBool    lockedMutexAtThisLevel = FALSE;
    246     if (isDataOwned == FALSE) {
    247         // Test whether this request is operating on the same text string as some
    248         //   some other transliteration that is still in progress and holding the
    249         //   transliteration mutex.  If so, do not lock the transliteration
    250         //    mutex again.
    251         UBool needToLock;
    252         UMTX_CHECK(NULL, (&text != gLockedText), needToLock);
    253         if (needToLock) {
    254             umtx_lock(&transliteratorDataMutex);
    255             gLockedText = &text;
    256             lockedMutexAtThisLevel = TRUE;
    257         }
    258     }
    259 
    260     // Check to make sure we don't dereference a null pointer.
    261     if (fData != NULL) {
    262 	    while (index.start < index.limit &&
    263 	           loopCount <= loopLimit &&
    264 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
    265 	        ++loopCount;
    266 	    }
    267     }
    268     if (lockedMutexAtThisLevel) {
    269         gLockedText = NULL;
    270         umtx_unlock(&transliteratorDataMutex);
    271     }
    272 }
    273 
    274 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
    275                                                 UBool escapeUnprintable) const {
    276     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
    277 }
    278 
    279 /**
    280  * Implement Transliterator framework
    281  */
    282 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    283     fData->ruleSet.getSourceTargetSet(result, FALSE);
    284 }
    285 
    286 /**
    287  * Override Transliterator framework
    288  */
    289 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
    290     return fData->ruleSet.getSourceTargetSet(result, TRUE);
    291 }
    292 
    293 U_NAMESPACE_END
    294 
    295 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    296