Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/rep.h"
     18 #include "unicode/uniset.h"
     19 #include "rbt_pars.h"
     20 #include "rbt_data.h"
     21 #include "rbt_rule.h"
     22 #include "rbt.h"
     23 #include "mutex.h"
     24 #include "umutex.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
     29 
        // Mutex that handleTransliterate() holds for the whole duration of a
        // rule-based transliteration; it protects the shared rule data.
     30 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
        // Address of the text currently being transliterated while
        // transliteratorDataMutex is held, or NULL when nobody holds it.
        // handleTransliterate() reads and writes this only inside a
        // default-constructed Mutex scope (the global ICU mutex), and uses it
        // to detect re-entrant calls on the same text so that the data mutex
        // is not locked twice.
     31 static Replaceable *gLockedText = NULL;
     32 
     33 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
     34                                          UTransDirection direction,
     35                                          UParseError& parseError,
     36                                          UErrorCode& status) {
     37     fData = 0;
     38     isDataOwned = TRUE;
     39     if (U_FAILURE(status)) {
     40         return;
     41     }
     42 
     43     TransliteratorParser parser(status);
     44     parser.parse(rules, direction, parseError, status);
     45     if (U_FAILURE(status)) {
     46         return;
     47     }
     48 
     49     if (parser.idBlockVector.size() != 0 ||
     50         parser.compoundFilter != NULL ||
     51         parser.dataVector.size() == 0) {
     52         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
     53         return;
     54     }
     55 
     56     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
     57     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
     58 }
     59 
     60 /**
     61  * Constructs a new transliterator from the given rules.
     62  * @param id            the id for the transliterator.
     63  * @param rules         rules, separated by ';'
     64  * @param direction     either FORWARD or REVERSE.
     65  * @param adoptedFilter the filter for this transliterator.
     66  * @param parseError    Struct to recieve information on position
     67  *                      of error if an error is encountered
     68  * @param status        Output param set to success/failure code.
     69  * @exception IllegalArgumentException if rules are malformed
     70  * or direction is invalid.
     71  */
     72 RuleBasedTransliterator::RuleBasedTransliterator(
     73                             const UnicodeString& id,
     74                             const UnicodeString& rules,
     75                             UTransDirection direction,
     76                             UnicodeFilter* adoptedFilter,
     77                             UParseError& parseError,
     78                             UErrorCode& status) :
     79     Transliterator(id, adoptedFilter) {
     80     _construct(rules, direction,parseError,status);
     81 }
     82 
     83 /**
     84  * Constructs a new transliterator from the given rules.
     85  * @param id            the id for the transliterator.
     86  * @param rules         rules, separated by ';'
     87  * @param direction     either FORWARD or REVERSE.
     88  * @param adoptedFilter the filter for this transliterator.
     89  * @param status        Output param set to success/failure code.
     90  * @exception IllegalArgumentException if rules are malformed
     91  * or direction is invalid.
     92  */
     93 /*RuleBasedTransliterator::RuleBasedTransliterator(
     94                             const UnicodeString& id,
     95                             const UnicodeString& rules,
     96                             UTransDirection direction,
     97                             UnicodeFilter* adoptedFilter,
     98                             UErrorCode& status) :
     99     Transliterator(id, adoptedFilter) {
    100     UParseError parseError;
    101     _construct(rules, direction,parseError, status);
    102 }*/
    103 
    104 /**
    105  * Convenience constructor with no filter.
    106  */
    107 /*RuleBasedTransliterator::RuleBasedTransliterator(
    108                             const UnicodeString& id,
    109                             const UnicodeString& rules,
    110                             UTransDirection direction,
    111                             UErrorCode& status) :
    112     Transliterator(id, 0) {
    113     UParseError parseError;
    114     _construct(rules, direction,parseError, status);
    115 }*/
    116 
    117 /**
    118  * Convenience constructor with no filter and FORWARD direction.
    119  */
    120 /*RuleBasedTransliterator::RuleBasedTransliterator(
    121                             const UnicodeString& id,
    122                             const UnicodeString& rules,
    123                             UErrorCode& status) :
    124     Transliterator(id, 0) {
    125     UParseError parseError;
    126     _construct(rules, UTRANS_FORWARD, parseError, status);
    127 }*/
    128 
    129 /**
    130  * Convenience constructor with FORWARD direction.
    131  */
    132 /*RuleBasedTransliterator::RuleBasedTransliterator(
    133                             const UnicodeString& id,
    134                             const UnicodeString& rules,
    135                             UnicodeFilter* adoptedFilter,
    136                             UErrorCode& status) :
    137     Transliterator(id, adoptedFilter) {
    138     UParseError parseError;
    139     _construct(rules, UTRANS_FORWARD,parseError, status);
    140 }*/
    141 
    142 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    143                                  const TransliterationRuleData* theData,
    144                                  UnicodeFilter* adoptedFilter) :
    145     Transliterator(id, adoptedFilter),
    146     fData((TransliterationRuleData*)theData), // cast away const
    147     isDataOwned(FALSE) {
    148     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    149 }
    150 
    151 /**
    152  * Internal constructor.
    153  */
    154 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    155                                                  TransliterationRuleData* theData,
    156                                                  UBool isDataAdopted) :
    157     Transliterator(id, 0),
    158     fData(theData),
    159     isDataOwned(isDataAdopted) {
    160     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    161 }
    162 
    163 /**
    164  * Copy constructor.
    165  */
    166 RuleBasedTransliterator::RuleBasedTransliterator(
    167         const RuleBasedTransliterator& other) :
    168     Transliterator(other), fData(other.fData),
    169     isDataOwned(other.isDataOwned) {
    170 
    171     // The data object may or may not be owned.  If it is not owned we
    172     // share it; it is invariant.  If it is owned, it's still
    173     // invariant, but we need to copy it to prevent double-deletion.
    174     // If this becomes a performance issue (if people do a lot of RBT
    175     // copying -- unlikely) we can reference count the data object.
    176 
    177     // Only do a deep copy if this is owned data, that is, data that
    178     // will be later deleted.  System transliterators contain
    179     // non-owned data.
    180     if (isDataOwned) {
    181         fData = new TransliterationRuleData(*other.fData);
    182     }
    183 }
    184 
    185 /**
    186  * Destructor.
    187  */
    188 RuleBasedTransliterator::~RuleBasedTransliterator() {
    189     // Delete the data object only if we own it.
    190     if (isDataOwned) {
    191         delete fData;
    192     }
    193 }
    194 
    195 Transliterator* // Covariant return NOT ALLOWED (for portability)
    196 RuleBasedTransliterator::clone(void) const {
    197     return new RuleBasedTransliterator(*this);
    198 }
    199 
    200 /**
    201  * Implements {@link Transliterator#handleTransliterate}.
    202  */
    203 void
    204 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
    205                                              UBool isIncremental) const {
    206     /* We keep contextStart and contextLimit fixed the entire time,
    207      * relative to the text -- contextLimit may move numerically if
    208      * text is inserted or removed.  The start offset moves toward
    209      * limit, with replacements happening under it.
    210      *
    211      * Example: rules 1. ab>x|y
    212      *                2. yc>z
    213      *
    214      * |eabcd   begin - no match, advance start
    215      * e|abcd   match rule 1 - change text & adjust start
    216      * ex|ycd   match rule 2 - change text & adjust start
    217      * exz|d    no match, advance start
    218      * exzd|    done
    219      */
    220 
    221     /* A rule like
    222      *   a>b|a
    223      * creates an infinite loop. To prevent that, we put an arbitrary
    224      * limit on the number of iterations that we take, one that is
    225      * high enough that any reasonable rules are ok, but low enough to
    226      * prevent a server from hanging.  The limit is 16 times the
    227      * number of characters n, unless n is so large that 16n exceeds a
    228      * uint32_t.
    229      */
    230     uint32_t loopCount = 0;
    231     uint32_t loopLimit = index.limit - index.start;
    232     if (loopLimit >= 0x10000000) {
    233         loopLimit = 0xFFFFFFFF;
    234     } else {
    235         loopLimit <<= 4;
    236     }
    237 
    238     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
    239     //   operations must be prevented.
    240     // A Complication: compound transliterators can result in recursive entries to this
    241     //   function, sometimes with different "This" objects, always with the same text.
    242     //   Double-locking must be prevented in these cases.
    243     //
    244 
    245     UBool    lockedMutexAtThisLevel = FALSE;
    246 
    247     // Test whether this request is operating on the same text string as
    248     //   some other transliteration that is still in progress and holding the
    249     //   transliteration mutex.  If so, do not lock the transliteration
    250     //    mutex again.
    251     //
    252     //  gLockedText variable is protected by the global ICU mutex.
    253     //  Shared RBT data protected by transliteratorDataMutex.
    254     //
    255     // TODO(andy): Need a better scheme for handling this.
    256     UBool needToLock;
    257     {
    258         Mutex m;
    259         needToLock = (&text != gLockedText);
    260     }
    261     if (needToLock) {
    262         umtx_lock(&transliteratorDataMutex);  // Contention, longish waits possible here.
    263         Mutex m;
    264         gLockedText = &text;
    265         lockedMutexAtThisLevel = TRUE;
    266     }
    267 
    268     // Check to make sure we don't dereference a null pointer.
    269     if (fData != NULL) {
    270 	    while (index.start < index.limit &&
    271 	           loopCount <= loopLimit &&
    272 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
    273 	        ++loopCount;
    274 	    }
    275     }
    276     if (lockedMutexAtThisLevel) {
    277         {
    278             Mutex m;
    279             gLockedText = NULL;
    280         }
    281         umtx_unlock(&transliteratorDataMutex);
    282     }
    283 }
    284 
    285 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
    286                                                 UBool escapeUnprintable) const {
    287     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
    288 }
    289 
    290 /**
    291  * Implement Transliterator framework
    292  */
    293 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    294     fData->ruleSet.getSourceTargetSet(result, FALSE);
    295 }
    296 
    297 /**
    298  * Override Transliterator framework
    299  */
    300 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
    301     return fData->ruleSet.getSourceTargetSet(result, TRUE);
    302 }
    303 
    304 U_NAMESPACE_END
    305 
    306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    307