Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 1996-2014, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 
     10 /*
     11 * File coleitr.cpp
     12 *
     13 * Created by: Helena Shih
     14 *
     15 * Modification History:
     16 *
     17 *  Date      Name        Description
     18 *
     19 *  6/23/97   helena      Adding comments to make code more readable.
     20 * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
     21 * 12/10/99   aliu        Ported Thai collation support from Java.
     22 * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
     23 * 02/19/01   swquek      Removed CollationElementIterator() since it is
     24 *                        private constructor and no calls are made to it
     25 * 2012-2014  markus      Rewritten in C++ again.
     26 */
     27 
     28 #include "unicode/utypes.h"
     29 
     30 #if !UCONFIG_NO_COLLATION
     31 
     32 #include "unicode/coleitr.h"
     33 #include "unicode/tblcoll.h"
     34 #include "unicode/ustring.h"
     35 #include "cmemory.h"
     36 #include "collation.h"
     37 #include "collationdata.h"
     38 #include "collationiterator.h"
     39 #include "collationsets.h"
     40 #include "collationtailoring.h"
     41 #include "uassert.h"
     42 #include "uhash.h"
     43 #include "utf16collationiterator.h"
     44 #include "uvectr32.h"
     45 
     46 /* Constants --------------------------------------------------------------- */
     47 
     48 U_NAMESPACE_BEGIN
     49 
     50 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
     51 
     52 /* CollationElementIterator public constructor/destructor ------------------ */
     53 
     54 CollationElementIterator::CollationElementIterator(
     55                                          const CollationElementIterator& other)
     56         : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
     57     *this = other;
     58 }
     59 
     60 CollationElementIterator::~CollationElementIterator()
     61 {
     62     delete iter_;
     63     delete offsets_;
     64 }
     65 
     66 /* CollationElementIterator public methods --------------------------------- */
     67 
     68 namespace {
     69 
     70 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
     71     return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
     72 }
     73 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
     74     return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
     75 }
     76 UBool ceNeedsTwoParts(int64_t ce) {
     77     return (ce & INT64_C(0xffff00ff003f)) != 0;
     78 }
     79 
     80 }  // namespace
     81 
     82 int32_t CollationElementIterator::getOffset() const
     83 {
     84     if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
     85         // CollationIterator::previousCE() decrements the CEs length
     86         // while it pops CEs from its internal buffer.
     87         int32_t i = iter_->getCEsLength();
     88         if (otherHalf_ != 0) {
     89             // Return the trailing CE offset while we are in the middle of a 64-bit CE.
     90             ++i;
     91         }
     92         U_ASSERT(i < offsets_->size());
     93         return offsets_->elementAti(i);
     94     }
     95     return iter_->getOffset();
     96 }
     97 
     98 /**
     99 * Get the ordering priority of the next character in the string.
    100 * @return the next character's ordering. Returns NULLORDER if an error has
    101 *         occured or if the end of string has been reached
    102 */
    103 int32_t CollationElementIterator::next(UErrorCode& status)
    104 {
    105     if (U_FAILURE(status)) { return NULLORDER; }
    106     if (dir_ > 1) {
    107         // Continue forward iteration. Test this first.
    108         if (otherHalf_ != 0) {
    109             uint32_t oh = otherHalf_;
    110             otherHalf_ = 0;
    111             return oh;
    112         }
    113     } else if (dir_ == 1) {
    114         // next() after setOffset()
    115         dir_ = 2;
    116     } else if (dir_ == 0) {
    117         // The iter_ is already reset to the start of the text.
    118         dir_ = 2;
    119     } else /* dir_ < 0 */ {
    120         // illegal change of direction
    121         status = U_INVALID_STATE_ERROR;
    122         return NULLORDER;
    123     }
    124     // No need to keep all CEs in the buffer when we iterate.
    125     iter_->clearCEsIfNoneRemaining();
    126     int64_t ce = iter_->nextCE(status);
    127     if (ce == Collation::NO_CE) { return NULLORDER; }
    128     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    129     uint32_t p = (uint32_t)(ce >> 32);
    130     uint32_t lower32 = (uint32_t)ce;
    131     uint32_t firstHalf = getFirstHalf(p, lower32);
    132     uint32_t secondHalf = getSecondHalf(p, lower32);
    133     if (secondHalf != 0) {
    134         otherHalf_ = secondHalf | 0xc0;  // continuation CE
    135     }
    136     return firstHalf;
    137 }
    138 
    139 UBool CollationElementIterator::operator!=(
    140                                   const CollationElementIterator& other) const
    141 {
    142     return !(*this == other);
    143 }
    144 
    145 UBool CollationElementIterator::operator==(
    146                                     const CollationElementIterator& that) const
    147 {
    148     if (this == &that) {
    149         return TRUE;
    150     }
    151 
    152     return
    153         (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
    154         otherHalf_ == that.otherHalf_ &&
    155         normalizeDir() == that.normalizeDir() &&
    156         string_ == that.string_ &&
    157         *iter_ == *that.iter_;
    158 }
    159 
    160 /**
    161 * Get the ordering priority of the previous collation element in the string.
    162 * @param status the error code status.
    163 * @return the previous element's ordering. Returns NULLORDER if an error has
    164 *         occured or if the start of string has been reached.
    165 */
    166 int32_t CollationElementIterator::previous(UErrorCode& status)
    167 {
    168     if (U_FAILURE(status)) { return NULLORDER; }
    169     if (dir_ < 0) {
    170         // Continue backwards iteration. Test this first.
    171         if (otherHalf_ != 0) {
    172             uint32_t oh = otherHalf_;
    173             otherHalf_ = 0;
    174             return oh;
    175         }
    176     } else if (dir_ == 0) {
    177         iter_->resetToOffset(string_.length());
    178         dir_ = -1;
    179     } else if (dir_ == 1) {
    180         // previous() after setOffset()
    181         dir_ = -1;
    182     } else /* dir_ > 1 */ {
    183         // illegal change of direction
    184         status = U_INVALID_STATE_ERROR;
    185         return NULLORDER;
    186     }
    187     if (offsets_ == NULL) {
    188         offsets_ = new UVector32(status);
    189         if (offsets_ == NULL) {
    190             status = U_MEMORY_ALLOCATION_ERROR;
    191             return NULLORDER;
    192         }
    193     }
    194     // If we already have expansion CEs, then we also have offsets.
    195     // Otherwise remember the trailing offset in case we need to
    196     // write offsets for an artificial expansion.
    197     int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
    198     int64_t ce = iter_->previousCE(*offsets_, status);
    199     if (ce == Collation::NO_CE) { return NULLORDER; }
    200     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    201     uint32_t p = (uint32_t)(ce >> 32);
    202     uint32_t lower32 = (uint32_t)ce;
    203     uint32_t firstHalf = getFirstHalf(p, lower32);
    204     uint32_t secondHalf = getSecondHalf(p, lower32);
    205     if (secondHalf != 0) {
    206         if (offsets_->isEmpty()) {
    207             // When we convert a single 64-bit CE into two 32-bit CEs,
    208             // we need to make this artificial expansion behave like a normal expansion.
    209             // See CollationIterator::previousCE().
    210             offsets_->addElement(iter_->getOffset(), status);
    211             offsets_->addElement(limitOffset, status);
    212         }
    213         otherHalf_ = firstHalf;
    214         return secondHalf | 0xc0;  // continuation CE
    215     }
    216     return firstHalf;
    217 }
    218 
    219 /**
    220 * Resets the cursor to the beginning of the string.
    221 */
    222 void CollationElementIterator::reset()
    223 {
    224     iter_ ->resetToOffset(0);
    225     otherHalf_ = 0;
    226     dir_ = 0;
    227 }
    228 
    229 void CollationElementIterator::setOffset(int32_t newOffset,
    230                                          UErrorCode& status)
    231 {
    232     if (U_FAILURE(status)) { return; }
    233     if (0 < newOffset && newOffset < string_.length()) {
    234         int32_t offset = newOffset;
    235         do {
    236             UChar c = string_.charAt(offset);
    237             if (!rbc_->isUnsafe(c) ||
    238                     (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
    239                 break;
    240             }
    241             // Back up to before this unsafe character.
    242             --offset;
    243         } while (offset > 0);
    244         if (offset < newOffset) {
    245             // We might have backed up more than necessary.
    246             // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
    247             // but for text "chu" setOffset(2) should remain at 2
    248             // although we initially back up to offset 0.
    249             // Find the last safe offset no greater than newOffset by iterating forward.
    250             int32_t lastSafeOffset = offset;
    251             do {
    252                 iter_->resetToOffset(lastSafeOffset);
    253                 do {
    254                     iter_->nextCE(status);
    255                     if (U_FAILURE(status)) { return; }
    256                 } while ((offset = iter_->getOffset()) == lastSafeOffset);
    257                 if (offset <= newOffset) {
    258                     lastSafeOffset = offset;
    259                 }
    260             } while (offset < newOffset);
    261             newOffset = lastSafeOffset;
    262         }
    263     }
    264     iter_->resetToOffset(newOffset);
    265     otherHalf_ = 0;
    266     dir_ = 1;
    267 }
    268 
    269 /**
    270 * Sets the source to the new source string.
    271 */
    272 void CollationElementIterator::setText(const UnicodeString& source,
    273                                        UErrorCode& status)
    274 {
    275     if (U_FAILURE(status)) {
    276         return;
    277     }
    278 
    279     string_ = source;
    280     const UChar *s = string_.getBuffer();
    281     CollationIterator *newIter;
    282     UBool numeric = rbc_->settings->isNumeric();
    283     if (rbc_->settings->dontCheckFCD()) {
    284         newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    285     } else {
    286         newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    287     }
    288     if (newIter == NULL) {
    289         status = U_MEMORY_ALLOCATION_ERROR;
    290         return;
    291     }
    292     delete iter_;
    293     iter_ = newIter;
    294     otherHalf_ = 0;
    295     dir_ = 0;
    296 }
    297 
    298 // Sets the source to the new character iterator.
    299 void CollationElementIterator::setText(CharacterIterator& source,
    300                                        UErrorCode& status)
    301 {
    302     if (U_FAILURE(status))
    303         return;
    304 
    305     source.getText(string_);
    306     setText(string_, status);
    307 }
    308 
    309 int32_t CollationElementIterator::strengthOrder(int32_t order) const
    310 {
    311     UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
    312     // Mask off the unwanted differences.
    313     if (s == UCOL_PRIMARY) {
    314         order &= 0xffff0000;
    315     }
    316     else if (s == UCOL_SECONDARY) {
    317         order &= 0xffffff00;
    318     }
    319 
    320     return order;
    321 }
    322 
    323 /* CollationElementIterator private constructors/destructors --------------- */
    324 
    325 /**
    326 * This is the "real" constructor for this class; it constructs an iterator
    327 * over the source text using the specified collator
    328 */
    329 CollationElementIterator::CollationElementIterator(
    330                                                const UnicodeString &source,
    331                                                const RuleBasedCollator *coll,
    332                                                UErrorCode &status)
    333         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
    334     setText(source, status);
    335 }
    336 
    337 /**
    338 * This is the "real" constructor for this class; it constructs an iterator over
    339 * the source text using the specified collator
    340 */
    341 CollationElementIterator::CollationElementIterator(
    342                                            const CharacterIterator &source,
    343                                            const RuleBasedCollator *coll,
    344                                            UErrorCode &status)
    345         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
    346     // We only call source.getText() which should be const anyway.
    347     setText(const_cast<CharacterIterator &>(source), status);
    348 }
    349 
    350 /* CollationElementIterator private methods -------------------------------- */
    351 
    352 const CollationElementIterator& CollationElementIterator::operator=(
    353                                          const CollationElementIterator& other)
    354 {
    355     if (this == &other) {
    356         return *this;
    357     }
    358 
    359     CollationIterator *newIter;
    360     const FCDUTF16CollationIterator *otherFCDIter =
    361             dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    362     if(otherFCDIter != NULL) {
    363         newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
    364     } else {
    365         const UTF16CollationIterator *otherIter =
    366                 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
    367         if(otherIter != NULL) {
    368             newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
    369         } else {
    370             newIter = NULL;
    371         }
    372     }
    373     if(newIter != NULL) {
    374         delete iter_;
    375         iter_ = newIter;
    376         rbc_ = other.rbc_;
    377         otherHalf_ = other.otherHalf_;
    378         dir_ = other.dir_;
    379 
    380         string_ = other.string_;
    381     }
    382     if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
    383         UErrorCode errorCode = U_ZERO_ERROR;
    384         if(offsets_ == NULL) {
    385             offsets_ = new UVector32(other.offsets_->size(), errorCode);
    386         }
    387         if(offsets_ != NULL) {
    388             offsets_->assign(*other.offsets_, errorCode);
    389         }
    390     }
    391     return *this;
    392 }
    393 
    394 namespace {
    395 
    396 class MaxExpSink : public ContractionsAndExpansions::CESink {
    397 public:
    398     MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
    399     virtual ~MaxExpSink();
    400     virtual void handleCE(int64_t /*ce*/) {}
    401     virtual void handleExpansion(const int64_t ces[], int32_t length) {
    402         if (length <= 1) {
    403             // We do not need to add single CEs into the map.
    404             return;
    405         }
    406         int32_t count = 0;  // number of CE "halves"
    407         for (int32_t i = 0; i < length; ++i) {
    408             count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
    409         }
    410         // last "half" of the last CE
    411         int64_t ce = ces[length - 1];
    412         uint32_t p = (uint32_t)(ce >> 32);
    413         uint32_t lower32 = (uint32_t)ce;
    414         uint32_t lastHalf = getSecondHalf(p, lower32);
    415         if (lastHalf == 0) {
    416             lastHalf = getFirstHalf(p, lower32);
    417             U_ASSERT(lastHalf != 0);
    418         } else {
    419             lastHalf |= 0xc0;  // old-style continuation CE
    420         }
    421         if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
    422             uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
    423         }
    424     }
    425 
    426 private:
    427     UHashtable *maxExpansions;
    428     UErrorCode &errorCode;
    429 };
    430 
    431 MaxExpSink::~MaxExpSink() {}
    432 
    433 }  // namespace
    434 
    435 UHashtable *
    436 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    437     if (U_FAILURE(errorCode)) { return NULL; }
    438     UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
    439                                            uhash_compareLong, &errorCode);
    440     if (U_FAILURE(errorCode)) { return NULL; }
    441     MaxExpSink sink(maxExpansions, errorCode);
    442     ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
    443     if (U_FAILURE(errorCode)) {
    444         uhash_close(maxExpansions);
    445         return NULL;
    446     }
    447     return maxExpansions;
    448 }
    449 
    450 int32_t
    451 CollationElementIterator::getMaxExpansion(int32_t order) const {
    452     return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
    453 }
    454 
    455 int32_t
    456 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
    457     if (order == 0) { return 1; }
    458     int32_t max;
    459     if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
    460         return max;
    461     }
    462     if ((order & 0xc0) == 0xc0) {
    463         // old-style continuation CE
    464         return 2;
    465     } else {
    466         return 1;
    467     }
    468 }
    469 
    470 U_NAMESPACE_END
    471 
    472 #endif /* #if !UCONFIG_NO_COLLATION */
    473