Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 1996-2014, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 *******************************************************************************
      6 */
      7 
      8 /*
      9 * File coleitr.cpp
     10 *
     11 * Created by: Helena Shih
     12 *
     13 * Modification History:
     14 *
     15 *  Date      Name        Description
     16 *
     17 *  6/23/97   helena      Adding comments to make code more readable.
     18 * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
     19 * 12/10/99   aliu        Ported Thai collation support from Java.
     20 * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
     21 * 02/19/01   swquek      Removed CollationElementIterator() since it is
     22 *                        private constructor and no calls are made to it
     23 * 2012-2014  markus      Rewritten in C++ again.
     24 */
     25 
     26 #include "unicode/utypes.h"
     27 
     28 #if !UCONFIG_NO_COLLATION
     29 
     30 #include "unicode/coleitr.h"
     31 #include "unicode/tblcoll.h"
     32 #include "unicode/ustring.h"
     33 #include "cmemory.h"
     34 #include "collation.h"
     35 #include "collationdata.h"
     36 #include "collationiterator.h"
     37 #include "collationsets.h"
     38 #include "collationtailoring.h"
     39 #include "uassert.h"
     40 #include "uhash.h"
     41 #include "utf16collationiterator.h"
     42 #include "uvectr32.h"
     43 
     44 /* Constants --------------------------------------------------------------- */
     45 
     46 U_NAMESPACE_BEGIN
     47 
     48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
     49 
     50 /* CollationElementIterator public constructor/destructor ------------------ */
     51 
     52 CollationElementIterator::CollationElementIterator(
     53                                          const CollationElementIterator& other)
     54         : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
     55     *this = other;
     56 }
     57 
     58 CollationElementIterator::~CollationElementIterator()
     59 {
     60     delete iter_;
     61     delete offsets_;
     62 }
     63 
     64 /* CollationElementIterator public methods --------------------------------- */
     65 
     66 namespace {
     67 
     // Builds the first old-style 32-bit CE from a 64-bit CE that is passed as
     // its upper 32 bits (p) and its lower 32 bits (lower32):
     //   result bits 31..16 = p bits 31..16
     //   result bits 15..8  = lower32 bits 31..24
     //   result bits  7..0  = lower32 bits 15..8
     uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
         const uint32_t top16 = p & 0xffff0000;
         const uint32_t middle8 = (lower32 >> 16) & 0xff00;
         const uint32_t low8 = (lower32 >> 8) & 0xff;
         return top16 | middle8 | low8;
     }
     // Builds the second old-style 32-bit CE from a 64-bit CE that is passed as
     // its upper 32 bits (p) and its lower 32 bits (lower32):
     //   result bits 31..16 = p bits 15..0
     //   result bits 15..8  = lower32 bits 23..16
     //   result bits  5..0  = lower32 bits 5..0
     // A result of 0 means that the 64-bit CE has nothing left over for a
     // second 32-bit CE.
     uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
         const uint32_t top16 = p << 16;
         const uint32_t middle8 = (lower32 >> 8) & 0xff00;
         const uint32_t low6 = lower32 & 0x3f;
         return top16 | middle8 | low6;
     }
     74 UBool ceNeedsTwoParts(int64_t ce) {
     75     return (ce & INT64_C(0xffff00ff003f)) != 0;
     76 }
     77 
     78 }  // namespace
     79 
     80 int32_t CollationElementIterator::getOffset() const
     81 {
     82     if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
     83         // CollationIterator::previousCE() decrements the CEs length
     84         // while it pops CEs from its internal buffer.
     85         int32_t i = iter_->getCEsLength();
     86         if (otherHalf_ != 0) {
     87             // Return the trailing CE offset while we are in the middle of a 64-bit CE.
     88             ++i;
     89         }
     90         U_ASSERT(i < offsets_->size());
     91         return offsets_->elementAti(i);
     92     }
     93     return iter_->getOffset();
     94 }
     95 
     96 /**
     97 * Get the ordering priority of the next character in the string.
     98 * @return the next character's ordering. Returns NULLORDER if an error has
     99 *         occured or if the end of string has been reached
    100 */
    101 int32_t CollationElementIterator::next(UErrorCode& status)
    102 {
    103     if (U_FAILURE(status)) { return NULLORDER; }
    104     if (dir_ > 1) {
    105         // Continue forward iteration. Test this first.
    106         if (otherHalf_ != 0) {
    107             uint32_t oh = otherHalf_;
    108             otherHalf_ = 0;
    109             return oh;
    110         }
    111     } else if (dir_ == 1) {
    112         // next() after setOffset()
    113         dir_ = 2;
    114     } else if (dir_ == 0) {
    115         // The iter_ is already reset to the start of the text.
    116         dir_ = 2;
    117     } else /* dir_ < 0 */ {
    118         // illegal change of direction
    119         status = U_INVALID_STATE_ERROR;
    120         return NULLORDER;
    121     }
    122     // No need to keep all CEs in the buffer when we iterate.
    123     iter_->clearCEsIfNoneRemaining();
    124     int64_t ce = iter_->nextCE(status);
    125     if (ce == Collation::NO_CE) { return NULLORDER; }
    126     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    127     uint32_t p = (uint32_t)(ce >> 32);
    128     uint32_t lower32 = (uint32_t)ce;
    129     uint32_t firstHalf = getFirstHalf(p, lower32);
    130     uint32_t secondHalf = getSecondHalf(p, lower32);
    131     if (secondHalf != 0) {
    132         otherHalf_ = secondHalf | 0xc0;  // continuation CE
    133     }
    134     return firstHalf;
    135 }
    136 
    137 UBool CollationElementIterator::operator!=(
    138                                   const CollationElementIterator& other) const
    139 {
    140     return !(*this == other);
    141 }
    142 
    143 UBool CollationElementIterator::operator==(
    144                                     const CollationElementIterator& that) const
    145 {
    146     if (this == &that) {
    147         return TRUE;
    148     }
    149 
    150     return
    151         (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
    152         otherHalf_ == that.otherHalf_ &&
    153         normalizeDir() == that.normalizeDir() &&
    154         string_ == that.string_ &&
    155         *iter_ == *that.iter_;
    156 }
    157 
    158 /**
    159 * Get the ordering priority of the previous collation element in the string.
    160 * @param status the error code status.
    161 * @return the previous element's ordering. Returns NULLORDER if an error has
    162 *         occured or if the start of string has been reached.
    163 */
    164 int32_t CollationElementIterator::previous(UErrorCode& status)
    165 {
    166     if (U_FAILURE(status)) { return NULLORDER; }
    167     if (dir_ < 0) {
    168         // Continue backwards iteration. Test this first.
    169         if (otherHalf_ != 0) {
    170             uint32_t oh = otherHalf_;
    171             otherHalf_ = 0;
    172             return oh;
    173         }
    174     } else if (dir_ == 0) {
    175         iter_->resetToOffset(string_.length());
    176         dir_ = -1;
    177     } else if (dir_ == 1) {
    178         // previous() after setOffset()
    179         dir_ = -1;
    180     } else /* dir_ > 1 */ {
    181         // illegal change of direction
    182         status = U_INVALID_STATE_ERROR;
    183         return NULLORDER;
    184     }
    185     if (offsets_ == NULL) {
    186         offsets_ = new UVector32(status);
    187         if (offsets_ == NULL) {
    188             status = U_MEMORY_ALLOCATION_ERROR;
    189             return NULLORDER;
    190         }
    191     }
    192     // If we already have expansion CEs, then we also have offsets.
    193     // Otherwise remember the trailing offset in case we need to
    194     // write offsets for an artificial expansion.
    195     int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
    196     int64_t ce = iter_->previousCE(*offsets_, status);
    197     if (ce == Collation::NO_CE) { return NULLORDER; }
    198     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    199     uint32_t p = (uint32_t)(ce >> 32);
    200     uint32_t lower32 = (uint32_t)ce;
    201     uint32_t firstHalf = getFirstHalf(p, lower32);
    202     uint32_t secondHalf = getSecondHalf(p, lower32);
    203     if (secondHalf != 0) {
    204         if (offsets_->isEmpty()) {
    205             // When we convert a single 64-bit CE into two 32-bit CEs,
    206             // we need to make this artificial expansion behave like a normal expansion.
    207             // See CollationIterator::previousCE().
    208             offsets_->addElement(iter_->getOffset(), status);
    209             offsets_->addElement(limitOffset, status);
    210         }
    211         otherHalf_ = firstHalf;
    212         return secondHalf | 0xc0;  // continuation CE
    213     }
    214     return firstHalf;
    215 }
    216 
    217 /**
    218 * Resets the cursor to the beginning of the string.
    219 */
    220 void CollationElementIterator::reset()
    221 {
    222     iter_ ->resetToOffset(0);
    223     otherHalf_ = 0;
    224     dir_ = 0;
    225 }
    226 
    227 void CollationElementIterator::setOffset(int32_t newOffset,
    228                                          UErrorCode& status)
    229 {
    230     if (U_FAILURE(status)) { return; }
    231     if (0 < newOffset && newOffset < string_.length()) {
    232         int32_t offset = newOffset;
    233         do {
    234             UChar c = string_.charAt(offset);
    235             if (!rbc_->isUnsafe(c) ||
    236                     (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
    237                 break;
    238             }
    239             // Back up to before this unsafe character.
    240             --offset;
    241         } while (offset > 0);
    242         if (offset < newOffset) {
    243             // We might have backed up more than necessary.
    244             // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
    245             // but for text "chu" setOffset(2) should remain at 2
    246             // although we initially back up to offset 0.
    247             // Find the last safe offset no greater than newOffset by iterating forward.
    248             int32_t lastSafeOffset = offset;
    249             do {
    250                 iter_->resetToOffset(lastSafeOffset);
    251                 do {
    252                     iter_->nextCE(status);
    253                     if (U_FAILURE(status)) { return; }
    254                 } while ((offset = iter_->getOffset()) == lastSafeOffset);
    255                 if (offset <= newOffset) {
    256                     lastSafeOffset = offset;
    257                 }
    258             } while (offset < newOffset);
    259             newOffset = lastSafeOffset;
    260         }
    261     }
    262     iter_->resetToOffset(newOffset);
    263     otherHalf_ = 0;
    264     dir_ = 1;
    265 }
    266 
    267 /**
    268 * Sets the source to the new source string.
    269 */
    270 void CollationElementIterator::setText(const UnicodeString& source,
    271                                        UErrorCode& status)
    272 {
    273     if (U_FAILURE(status)) {
    274         return;
    275     }
    276 
    277     string_ = source;
    278     const UChar *s = string_.getBuffer();
    279     CollationIterator *newIter;
    280     UBool numeric = rbc_->settings->isNumeric();
    281     if (rbc_->settings->dontCheckFCD()) {
    282         newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    283     } else {
    284         newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    285     }
    286     if (newIter == NULL) {
    287         status = U_MEMORY_ALLOCATION_ERROR;
    288         return;
    289     }
    290     delete iter_;
    291     iter_ = newIter;
    292     otherHalf_ = 0;
    293     dir_ = 0;
    294 }
    295 
    296 // Sets the source to the new character iterator.
    297 void CollationElementIterator::setText(CharacterIterator& source,
    298                                        UErrorCode& status)
    299 {
    300     if (U_FAILURE(status))
    301         return;
    302 
    303     source.getText(string_);
    304     setText(string_, status);
    305 }
    306 
    307 int32_t CollationElementIterator::strengthOrder(int32_t order) const
    308 {
    309     UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
    310     // Mask off the unwanted differences.
    311     if (s == UCOL_PRIMARY) {
    312         order &= 0xffff0000;
    313     }
    314     else if (s == UCOL_SECONDARY) {
    315         order &= 0xffffff00;
    316     }
    317 
    318     return order;
    319 }
    320 
    321 /* CollationElementIterator private constructors/destructors --------------- */
    322 
    323 /**
    324 * This is the "real" constructor for this class; it constructs an iterator
    325 * over the source text using the specified collator
    326 */
    327 CollationElementIterator::CollationElementIterator(
    328                                                const UnicodeString &source,
    329                                                const RuleBasedCollator *coll,
    330                                                UErrorCode &status)
    331         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
    332     setText(source, status);
    333 }
    334 
    335 /**
    336 * This is the "real" constructor for this class; it constructs an iterator over
    337 * the source text using the specified collator
    338 */
    339 CollationElementIterator::CollationElementIterator(
    340                                            const CharacterIterator &source,
    341                                            const RuleBasedCollator *coll,
    342                                            UErrorCode &status)
    343         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
    344     // We only call source.getText() which should be const anyway.
    345     setText(const_cast<CharacterIterator &>(source), status);
    346 }
    347 
    348 /* CollationElementIterator private methods -------------------------------- */
    349 
    350 const CollationElementIterator& CollationElementIterator::operator=(
    351                                          const CollationElementIterator& other)
    352 {
    353     if (this == &other) {
    354         return *this;
    355     }
    356 
    357     CollationIterator *newIter;
    358     const FCDUTF16CollationIterator *otherFCDIter =
    359             dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    360     if(otherFCDIter != NULL) {
    361         newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
    362     } else {
    363         const UTF16CollationIterator *otherIter =
    364                 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
    365         if(otherIter != NULL) {
    366             newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
    367         } else {
    368             newIter = NULL;
    369         }
    370     }
    371     if(newIter != NULL) {
    372         delete iter_;
    373         iter_ = newIter;
    374         rbc_ = other.rbc_;
    375         otherHalf_ = other.otherHalf_;
    376         dir_ = other.dir_;
    377 
    378         string_ = other.string_;
    379     }
    380     if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
    381         UErrorCode errorCode = U_ZERO_ERROR;
    382         if(offsets_ == NULL) {
    383             offsets_ = new UVector32(other.offsets_->size(), errorCode);
    384         }
    385         if(offsets_ != NULL) {
    386             offsets_->assign(*other.offsets_, errorCode);
    387         }
    388     }
    389     return *this;
    390 }
    391 
    392 namespace {
    393 
    394 class MaxExpSink : public ContractionsAndExpansions::CESink {
    395 public:
    396     MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
    397     virtual ~MaxExpSink();
    398     virtual void handleCE(int64_t /*ce*/) {}
    399     virtual void handleExpansion(const int64_t ces[], int32_t length) {
    400         if (length <= 1) {
    401             // We do not need to add single CEs into the map.
    402             return;
    403         }
    404         int32_t count = 0;  // number of CE "halves"
    405         for (int32_t i = 0; i < length; ++i) {
    406             count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
    407         }
    408         // last "half" of the last CE
    409         int64_t ce = ces[length - 1];
    410         uint32_t p = (uint32_t)(ce >> 32);
    411         uint32_t lower32 = (uint32_t)ce;
    412         uint32_t lastHalf = getSecondHalf(p, lower32);
    413         if (lastHalf == 0) {
    414             lastHalf = getFirstHalf(p, lower32);
    415             U_ASSERT(lastHalf != 0);
    416         } else {
    417             lastHalf |= 0xc0;  // old-style continuation CE
    418         }
    419         if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
    420             uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
    421         }
    422     }
    423 
    424 private:
    425     UHashtable *maxExpansions;
    426     UErrorCode &errorCode;
    427 };
    428 
    429 MaxExpSink::~MaxExpSink() {}
    430 
    431 }  // namespace
    432 
    433 UHashtable *
    434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    435     if (U_FAILURE(errorCode)) { return NULL; }
    436     UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
    437                                            uhash_compareLong, &errorCode);
    438     if (U_FAILURE(errorCode)) { return NULL; }
    439     MaxExpSink sink(maxExpansions, errorCode);
    440     ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
    441     if (U_FAILURE(errorCode)) {
    442         uhash_close(maxExpansions);
    443         return NULL;
    444     }
    445     return maxExpansions;
    446 }
    447 
    448 int32_t
    449 CollationElementIterator::getMaxExpansion(int32_t order) const {
    450     return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
    451 }
    452 
    453 int32_t
    454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
    455     if (order == 0) { return 1; }
    456     int32_t max;
    457     if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
    458         return max;
    459     }
    460     if ((order & 0xc0) == 0xc0) {
    461         // old-style continuation CE
    462         return 2;
    463     } else {
    464         return 1;
    465     }
    466 }
    467 
    468 U_NAMESPACE_END
    469 
    470 #endif /* #if !UCONFIG_NO_COLLATION */
    471