Home | History | Annotate | Download | only in common
      1 /*
      2  *************************************************************************
      3  * COPYRIGHT:
      4  * Copyright (c) 1996-2010, International Business Machines Corporation and
      5  * others. All Rights Reserved.
      6  *************************************************************************
      7  */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_NORMALIZATION
     12 
     13 #include "unicode/uniset.h"
     14 #include "unicode/unistr.h"
     15 #include "unicode/chariter.h"
     16 #include "unicode/schriter.h"
     17 #include "unicode/uchriter.h"
     18 #include "unicode/normlzr.h"
     19 #include "cmemory.h"
     20 #include "normalizer2impl.h"
     21 #include "uprops.h"  // for uniset_getUnicode32Instance()
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
     26 
     27 //-------------------------------------------------------------------------
     28 // Constructors and other boilerplate
     29 //-------------------------------------------------------------------------
     30 
     31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
     32     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     33     text(new StringCharacterIterator(str)),
     34     currentIndex(0), nextIndex(0),
     35     buffer(), bufferPos(0)
     36 {
     37     init();
     38 }
     39 
     40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
     41     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     42     text(new UCharCharacterIterator(str, length)),
     43     currentIndex(0), nextIndex(0),
     44     buffer(), bufferPos(0)
     45 {
     46     init();
     47 }
     48 
     49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
     50     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     51     text(iter.clone()),
     52     currentIndex(0), nextIndex(0),
     53     buffer(), bufferPos(0)
     54 {
     55     init();
     56 }
     57 
     58 Normalizer::Normalizer(const Normalizer &copy) :
     59     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
     60     text(copy.text->clone()),
     61     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
     62     buffer(copy.buffer), bufferPos(copy.bufferPos)
     63 {
     64     init();
     65 }
     66 
     67 static const UChar _NUL=0;
     68 
     69 void
     70 Normalizer::init() {
     71     UErrorCode errorCode=U_ZERO_ERROR;
     72     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
     73     if(fOptions&UNORM_UNICODE_3_2) {
     74         delete fFilteredNorm2;
     75         fNorm2=fFilteredNorm2=
     76             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
     77     }
     78     if(U_FAILURE(errorCode)) {
     79         errorCode=U_ZERO_ERROR;
     80         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
     81     }
     82 }
     83 
     84 Normalizer::~Normalizer()
     85 {
     86     delete fFilteredNorm2;
     87     delete text;
     88 }
     89 
     90 Normalizer*
     91 Normalizer::clone() const
     92 {
     93     return new Normalizer(*this);
     94 }
     95 
     96 /**
     97  * Generates a hash code for this iterator.
     98  */
     99 int32_t Normalizer::hashCode() const
    100 {
    101     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
    102 }
    103 
    104 UBool Normalizer::operator==(const Normalizer& that) const
    105 {
    106     return
    107         this==&that ||
    108         (fUMode==that.fUMode &&
    109         fOptions==that.fOptions &&
    110         *text==*that.text &&
    111         buffer==that.buffer &&
    112         bufferPos==that.bufferPos &&
    113         nextIndex==that.nextIndex);
    114 }
    115 
    116 //-------------------------------------------------------------------------
    117 // Static utility methods
    118 //-------------------------------------------------------------------------
    119 
    120 void U_EXPORT2
    121 Normalizer::normalize(const UnicodeString& source,
    122                       UNormalizationMode mode, int32_t options,
    123                       UnicodeString& result,
    124                       UErrorCode &status) {
    125     if(source.isBogus() || U_FAILURE(status)) {
    126         result.setToBogus();
    127         if(U_SUCCESS(status)) {
    128             status=U_ILLEGAL_ARGUMENT_ERROR;
    129         }
    130     } else {
    131         UnicodeString localDest;
    132         UnicodeString *dest;
    133 
    134         if(&source!=&result) {
    135             dest=&result;
    136         } else {
    137             // the source and result strings are the same object, use a temporary one
    138             dest=&localDest;
    139         }
    140         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    141         if(U_SUCCESS(status)) {
    142             if(options&UNORM_UNICODE_3_2) {
    143                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    144                     normalize(source, *dest, status);
    145             } else {
    146                 n2->normalize(source, *dest, status);
    147             }
    148         }
    149         if(dest==&localDest && U_SUCCESS(status)) {
    150             result=*dest;
    151         }
    152     }
    153 }
    154 
    155 void U_EXPORT2
    156 Normalizer::compose(const UnicodeString& source,
    157                     UBool compat, int32_t options,
    158                     UnicodeString& result,
    159                     UErrorCode &status) {
    160     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
    161 }
    162 
    163 void U_EXPORT2
    164 Normalizer::decompose(const UnicodeString& source,
    165                       UBool compat, int32_t options,
    166                       UnicodeString& result,
    167                       UErrorCode &status) {
    168     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
    169 }
    170 
    171 UNormalizationCheckResult
    172 Normalizer::quickCheck(const UnicodeString& source,
    173                        UNormalizationMode mode, int32_t options,
    174                        UErrorCode &status) {
    175     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    176     if(U_SUCCESS(status)) {
    177         if(options&UNORM_UNICODE_3_2) {
    178             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    179                 quickCheck(source, status);
    180         } else {
    181             return n2->quickCheck(source, status);
    182         }
    183     } else {
    184         return UNORM_MAYBE;
    185     }
    186 }
    187 
    188 UBool
    189 Normalizer::isNormalized(const UnicodeString& source,
    190                          UNormalizationMode mode, int32_t options,
    191                          UErrorCode &status) {
    192     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    193     if(U_SUCCESS(status)) {
    194         if(options&UNORM_UNICODE_3_2) {
    195             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    196                 isNormalized(source, status);
    197         } else {
    198             return n2->isNormalized(source, status);
    199         }
    200     } else {
    201         return FALSE;
    202     }
    203 }
    204 
    205 UnicodeString & U_EXPORT2
    206 Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
    207                         UnicodeString &result,
    208                         UNormalizationMode mode, int32_t options,
    209                         UErrorCode &errorCode) {
    210     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
    211         result.setToBogus();
    212         if(U_SUCCESS(errorCode)) {
    213             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    214         }
    215     } else {
    216         UnicodeString localDest;
    217         UnicodeString *dest;
    218 
    219         if(&right!=&result) {
    220             dest=&result;
    221         } else {
    222             // the right and result strings are the same object, use a temporary one
    223             dest=&localDest;
    224         }
    225         *dest=left;
    226         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    227         if(U_SUCCESS(errorCode)) {
    228             if(options&UNORM_UNICODE_3_2) {
    229                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
    230                     append(*dest, right, errorCode);
    231             } else {
    232                 n2->append(*dest, right, errorCode);
    233             }
    234         }
    235         if(dest==&localDest && U_SUCCESS(errorCode)) {
    236             result=*dest;
    237         }
    238     }
    239     return result;
    240 }
    241 
    242 //-------------------------------------------------------------------------
    243 // Iteration API
    244 //-------------------------------------------------------------------------
    245 
    246 /**
    247  * Return the current character in the normalized text.
    248  */
    249 UChar32 Normalizer::current() {
    250     if(bufferPos<buffer.length() || nextNormalize()) {
    251         return buffer.char32At(bufferPos);
    252     } else {
    253         return DONE;
    254     }
    255 }
    256 
    257 /**
    258  * Return the next character in the normalized text and advance
    259  * the iteration position by one.  If the end
    260  * of the text has already been reached, {@link #DONE} is returned.
    261  */
    262 UChar32 Normalizer::next() {
    263     if(bufferPos<buffer.length() ||  nextNormalize()) {
    264         UChar32 c=buffer.char32At(bufferPos);
    265         bufferPos+=UTF_CHAR_LENGTH(c);
    266         return c;
    267     } else {
    268         return DONE;
    269     }
    270 }
    271 
    272 /**
    273  * Return the previous character in the normalized text and decrement
    274  * the iteration position by one.  If the beginning
    275  * of the text has already been reached, {@link #DONE} is returned.
    276  */
    277 UChar32 Normalizer::previous() {
    278     if(bufferPos>0 || previousNormalize()) {
    279         UChar32 c=buffer.char32At(bufferPos-1);
    280         bufferPos-=UTF_CHAR_LENGTH(c);
    281         return c;
    282     } else {
    283         return DONE;
    284     }
    285 }
    286 
    287 void Normalizer::reset() {
    288     currentIndex=nextIndex=text->setToStart();
    289     clearBuffer();
    290 }
    291 
    292 void
    293 Normalizer::setIndexOnly(int32_t index) {
    294     text->setIndex(index);  // pins index
    295     currentIndex=nextIndex=text->getIndex();
    296     clearBuffer();
    297 }
    298 
    299 /**
    300  * Return the first character in the normalized text.  This resets
    301  * the <tt>Normalizer's</tt> position to the beginning of the text.
    302  */
    303 UChar32 Normalizer::first() {
    304     reset();
    305     return next();
    306 }
    307 
    308 /**
    309  * Return the last character in the normalized text.  This resets
    310  * the <tt>Normalizer's</tt> position to be just before the
    311  * the input text corresponding to that normalized character.
    312  */
    313 UChar32 Normalizer::last() {
    314     currentIndex=nextIndex=text->setToEnd();
    315     clearBuffer();
    316     return previous();
    317 }
    318 
    319 /**
    320  * Retrieve the current iteration position in the input text that is
    321  * being normalized.  This method is useful in applications such as
    322  * searching, where you need to be able to determine the position in
    323  * the input text that corresponds to a given normalized output character.
    324  * <p>
    325  * <b>Note:</b> This method sets the position in the <em>input</em>, while
    326  * {@link #next} and {@link #previous} iterate through characters in the
    327  * <em>output</em>.  This means that there is not necessarily a one-to-one
    328  * correspondence between characters returned by <tt>next</tt> and
    329  * <tt>previous</tt> and the indices passed to and returned from
    330  * <tt>setIndex</tt> and {@link #getIndex}.
    331  *
    332  */
    333 int32_t Normalizer::getIndex() const {
    334     if(bufferPos<buffer.length()) {
    335         return currentIndex;
    336     } else {
    337         return nextIndex;
    338     }
    339 }
    340 
    341 /**
    342  * Retrieve the index of the start of the input text.  This is the begin index
    343  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
    344  * over which this <tt>Normalizer</tt> is iterating
    345  */
    346 int32_t Normalizer::startIndex() const {
    347     return text->startIndex();
    348 }
    349 
    350 /**
    351  * Retrieve the index of the end of the input text.  This is the end index
    352  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
    353  * over which this <tt>Normalizer</tt> is iterating
    354  */
    355 int32_t Normalizer::endIndex() const {
    356     return text->endIndex();
    357 }
    358 
    359 //-------------------------------------------------------------------------
    360 // Property access methods
    361 //-------------------------------------------------------------------------
    362 
    363 void
    364 Normalizer::setMode(UNormalizationMode newMode)
    365 {
    366     fUMode = newMode;
    367     init();
    368 }
    369 
    370 UNormalizationMode
    371 Normalizer::getUMode() const
    372 {
    373     return fUMode;
    374 }
    375 
    376 void
    377 Normalizer::setOption(int32_t option,
    378                       UBool value)
    379 {
    380     if (value) {
    381         fOptions |= option;
    382     } else {
    383         fOptions &= (~option);
    384     }
    385     init();
    386 }
    387 
    388 UBool
    389 Normalizer::getOption(int32_t option) const
    390 {
    391     return (fOptions & option) != 0;
    392 }
    393 
    394 /**
    395  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    396  * The iteration position is set to the beginning of the input text.
    397  */
    398 void
    399 Normalizer::setText(const UnicodeString& newText,
    400                     UErrorCode &status)
    401 {
    402     if (U_FAILURE(status)) {
    403         return;
    404     }
    405     CharacterIterator *newIter = new StringCharacterIterator(newText);
    406     if (newIter == NULL) {
    407         status = U_MEMORY_ALLOCATION_ERROR;
    408         return;
    409     }
    410     delete text;
    411     text = newIter;
    412     reset();
    413 }
    414 
    415 /**
    416  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    417  * The iteration position is set to the beginning of the string.
    418  */
    419 void
    420 Normalizer::setText(const CharacterIterator& newText,
    421                     UErrorCode &status)
    422 {
    423     if (U_FAILURE(status)) {
    424         return;
    425     }
    426     CharacterIterator *newIter = newText.clone();
    427     if (newIter == NULL) {
    428         status = U_MEMORY_ALLOCATION_ERROR;
    429         return;
    430     }
    431     delete text;
    432     text = newIter;
    433     reset();
    434 }
    435 
    436 void
    437 Normalizer::setText(const UChar* newText,
    438                     int32_t length,
    439                     UErrorCode &status)
    440 {
    441     if (U_FAILURE(status)) {
    442         return;
    443     }
    444     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
    445     if (newIter == NULL) {
    446         status = U_MEMORY_ALLOCATION_ERROR;
    447         return;
    448     }
    449     delete text;
    450     text = newIter;
    451     reset();
    452 }
    453 
    454 /**
    455  * Copies the text under iteration into the UnicodeString referred to by "result".
    456  * @param result Receives a copy of the text under iteration.
    457  */
    458 void
    459 Normalizer::getText(UnicodeString&  result)
    460 {
    461     text->getText(result);
    462 }
    463 
    464 //-------------------------------------------------------------------------
    465 // Private utility methods
    466 //-------------------------------------------------------------------------
    467 
    468 void Normalizer::clearBuffer() {
    469     buffer.remove();
    470     bufferPos=0;
    471 }
    472 
    473 UBool
    474 Normalizer::nextNormalize() {
    475     clearBuffer();
    476     currentIndex=nextIndex;
    477     text->setIndex(nextIndex);
    478     if(!text->hasNext()) {
    479         return FALSE;
    480     }
    481     // Skip at least one character so we make progress.
    482     UnicodeString segment(text->next32PostInc());
    483     while(text->hasNext()) {
    484         UChar32 c;
    485         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
    486             text->move32(-1, CharacterIterator::kCurrent);
    487             break;
    488         }
    489         segment.append(c);
    490     }
    491     nextIndex=text->getIndex();
    492     UErrorCode errorCode=U_ZERO_ERROR;
    493     fNorm2->normalize(segment, buffer, errorCode);
    494     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    495 }
    496 
    497 UBool
    498 Normalizer::previousNormalize() {
    499     clearBuffer();
    500     nextIndex=currentIndex;
    501     text->setIndex(currentIndex);
    502     if(!text->hasPrevious()) {
    503         return FALSE;
    504     }
    505     UnicodeString segment;
    506     while(text->hasPrevious()) {
    507         UChar32 c=text->previous32();
    508         segment.insert(0, c);
    509         if(fNorm2->hasBoundaryBefore(c)) {
    510             break;
    511         }
    512     }
    513     currentIndex=text->getIndex();
    514     UErrorCode errorCode=U_ZERO_ERROR;
    515     fNorm2->normalize(segment, buffer, errorCode);
    516     bufferPos=buffer.length();
    517     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    518 }
    519 
    520 U_NAMESPACE_END
    521 
    522 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    523