Home | History | Annotate | Download | only in common
      1 /*
      2  *************************************************************************
      3  * COPYRIGHT:
      4  * Copyright (c) 1996-2012, International Business Machines Corporation and
      5  * others. All Rights Reserved.
      6  *************************************************************************
      7  */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_NORMALIZATION
     12 
     13 #include "unicode/uniset.h"
     14 #include "unicode/unistr.h"
     15 #include "unicode/chariter.h"
     16 #include "unicode/schriter.h"
     17 #include "unicode/uchriter.h"
     18 #include "unicode/normlzr.h"
     19 #include "unicode/utf16.h"
     20 #include "cmemory.h"
     21 #include "normalizer2impl.h"
     22 #include "uprops.h"  // for uniset_getUnicode32Instance()
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
     27 
     28 //-------------------------------------------------------------------------
     29 // Constructors and other boilerplate
     30 //-------------------------------------------------------------------------
     31 
     32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
     33     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     34     text(new StringCharacterIterator(str)),
     35     currentIndex(0), nextIndex(0),
     36     buffer(), bufferPos(0)
     37 {
     38     init();
     39 }
     40 
     41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
     42     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     43     text(new UCharCharacterIterator(str, length)),
     44     currentIndex(0), nextIndex(0),
     45     buffer(), bufferPos(0)
     46 {
     47     init();
     48 }
     49 
     50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
     51     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
     52     text(iter.clone()),
     53     currentIndex(0), nextIndex(0),
     54     buffer(), bufferPos(0)
     55 {
     56     init();
     57 }
     58 
     59 Normalizer::Normalizer(const Normalizer &copy) :
     60     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
     61     text(copy.text->clone()),
     62     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
     63     buffer(copy.buffer), bufferPos(copy.bufferPos)
     64 {
     65     init();
     66 }
     67 
     68 void
     69 Normalizer::init() {
     70     UErrorCode errorCode=U_ZERO_ERROR;
     71     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
     72     if(fOptions&UNORM_UNICODE_3_2) {
     73         delete fFilteredNorm2;
     74         fNorm2=fFilteredNorm2=
     75             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
     76     }
     77     if(U_FAILURE(errorCode)) {
     78         errorCode=U_ZERO_ERROR;
     79         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
     80     }
     81 }
     82 
     83 Normalizer::~Normalizer()
     84 {
     85     delete fFilteredNorm2;
     86     delete text;
     87 }
     88 
     89 Normalizer*
     90 Normalizer::clone() const
     91 {
     92     return new Normalizer(*this);
     93 }
     94 
     95 /**
     96  * Generates a hash code for this iterator.
     97  */
     98 int32_t Normalizer::hashCode() const
     99 {
    100     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
    101 }
    102 
    103 UBool Normalizer::operator==(const Normalizer& that) const
    104 {
    105     return
    106         this==&that ||
    107         (fUMode==that.fUMode &&
    108         fOptions==that.fOptions &&
    109         *text==*that.text &&
    110         buffer==that.buffer &&
    111         bufferPos==that.bufferPos &&
    112         nextIndex==that.nextIndex);
    113 }
    114 
    115 //-------------------------------------------------------------------------
    116 // Static utility methods
    117 //-------------------------------------------------------------------------
    118 
    119 void U_EXPORT2
    120 Normalizer::normalize(const UnicodeString& source,
    121                       UNormalizationMode mode, int32_t options,
    122                       UnicodeString& result,
    123                       UErrorCode &status) {
    124     if(source.isBogus() || U_FAILURE(status)) {
    125         result.setToBogus();
    126         if(U_SUCCESS(status)) {
    127             status=U_ILLEGAL_ARGUMENT_ERROR;
    128         }
    129     } else {
    130         UnicodeString localDest;
    131         UnicodeString *dest;
    132 
    133         if(&source!=&result) {
    134             dest=&result;
    135         } else {
    136             // the source and result strings are the same object, use a temporary one
    137             dest=&localDest;
    138         }
    139         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    140         if(U_SUCCESS(status)) {
    141             if(options&UNORM_UNICODE_3_2) {
    142                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    143                     normalize(source, *dest, status);
    144             } else {
    145                 n2->normalize(source, *dest, status);
    146             }
    147         }
    148         if(dest==&localDest && U_SUCCESS(status)) {
    149             result=*dest;
    150         }
    151     }
    152 }
    153 
    154 void U_EXPORT2
    155 Normalizer::compose(const UnicodeString& source,
    156                     UBool compat, int32_t options,
    157                     UnicodeString& result,
    158                     UErrorCode &status) {
    159     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
    160 }
    161 
    162 void U_EXPORT2
    163 Normalizer::decompose(const UnicodeString& source,
    164                       UBool compat, int32_t options,
    165                       UnicodeString& result,
    166                       UErrorCode &status) {
    167     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
    168 }
    169 
    170 UNormalizationCheckResult
    171 Normalizer::quickCheck(const UnicodeString& source,
    172                        UNormalizationMode mode, int32_t options,
    173                        UErrorCode &status) {
    174     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    175     if(U_SUCCESS(status)) {
    176         if(options&UNORM_UNICODE_3_2) {
    177             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    178                 quickCheck(source, status);
    179         } else {
    180             return n2->quickCheck(source, status);
    181         }
    182     } else {
    183         return UNORM_MAYBE;
    184     }
    185 }
    186 
    187 UBool
    188 Normalizer::isNormalized(const UnicodeString& source,
    189                          UNormalizationMode mode, int32_t options,
    190                          UErrorCode &status) {
    191     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    192     if(U_SUCCESS(status)) {
    193         if(options&UNORM_UNICODE_3_2) {
    194             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    195                 isNormalized(source, status);
    196         } else {
    197             return n2->isNormalized(source, status);
    198         }
    199     } else {
    200         return FALSE;
    201     }
    202 }
    203 
    204 UnicodeString & U_EXPORT2
    205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
    206                         UnicodeString &result,
    207                         UNormalizationMode mode, int32_t options,
    208                         UErrorCode &errorCode) {
    209     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
    210         result.setToBogus();
    211         if(U_SUCCESS(errorCode)) {
    212             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    213         }
    214     } else {
    215         UnicodeString localDest;
    216         UnicodeString *dest;
    217 
    218         if(&right!=&result) {
    219             dest=&result;
    220         } else {
    221             // the right and result strings are the same object, use a temporary one
    222             dest=&localDest;
    223         }
    224         *dest=left;
    225         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    226         if(U_SUCCESS(errorCode)) {
    227             if(options&UNORM_UNICODE_3_2) {
    228                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
    229                     append(*dest, right, errorCode);
    230             } else {
    231                 n2->append(*dest, right, errorCode);
    232             }
    233         }
    234         if(dest==&localDest && U_SUCCESS(errorCode)) {
    235             result=*dest;
    236         }
    237     }
    238     return result;
    239 }
    240 
    241 //-------------------------------------------------------------------------
    242 // Iteration API
    243 //-------------------------------------------------------------------------
    244 
    245 /**
    246  * Return the current character in the normalized text.
    247  */
    248 UChar32 Normalizer::current() {
    249     if(bufferPos<buffer.length() || nextNormalize()) {
    250         return buffer.char32At(bufferPos);
    251     } else {
    252         return DONE;
    253     }
    254 }
    255 
    256 /**
    257  * Return the next character in the normalized text and advance
    258  * the iteration position by one.  If the end
    259  * of the text has already been reached, {@link #DONE} is returned.
    260  */
    261 UChar32 Normalizer::next() {
    262     if(bufferPos<buffer.length() ||  nextNormalize()) {
    263         UChar32 c=buffer.char32At(bufferPos);
    264         bufferPos+=U16_LENGTH(c);
    265         return c;
    266     } else {
    267         return DONE;
    268     }
    269 }
    270 
    271 /**
    272  * Return the previous character in the normalized text and decrement
    273  * the iteration position by one.  If the beginning
    274  * of the text has already been reached, {@link #DONE} is returned.
    275  */
    276 UChar32 Normalizer::previous() {
    277     if(bufferPos>0 || previousNormalize()) {
    278         UChar32 c=buffer.char32At(bufferPos-1);
    279         bufferPos-=U16_LENGTH(c);
    280         return c;
    281     } else {
    282         return DONE;
    283     }
    284 }
    285 
    286 void Normalizer::reset() {
    287     currentIndex=nextIndex=text->setToStart();
    288     clearBuffer();
    289 }
    290 
    291 void
    292 Normalizer::setIndexOnly(int32_t index) {
    293     text->setIndex(index);  // pins index
    294     currentIndex=nextIndex=text->getIndex();
    295     clearBuffer();
    296 }
    297 
    298 /**
    299  * Return the first character in the normalized text.  This resets
    300  * the <tt>Normalizer's</tt> position to the beginning of the text.
    301  */
    302 UChar32 Normalizer::first() {
    303     reset();
    304     return next();
    305 }
    306 
    307 /**
    308  * Return the last character in the normalized text.  This resets
    309  * the <tt>Normalizer's</tt> position to be just before the
    310  * the input text corresponding to that normalized character.
    311  */
    312 UChar32 Normalizer::last() {
    313     currentIndex=nextIndex=text->setToEnd();
    314     clearBuffer();
    315     return previous();
    316 }
    317 
    318 /**
    319  * Retrieve the current iteration position in the input text that is
    320  * being normalized.  This method is useful in applications such as
    321  * searching, where you need to be able to determine the position in
    322  * the input text that corresponds to a given normalized output character.
    323  * <p>
    324  * <b>Note:</b> This method sets the position in the <em>input</em>, while
    325  * {@link #next} and {@link #previous} iterate through characters in the
    326  * <em>output</em>.  This means that there is not necessarily a one-to-one
    327  * correspondence between characters returned by <tt>next</tt> and
    328  * <tt>previous</tt> and the indices passed to and returned from
    329  * <tt>setIndex</tt> and {@link #getIndex}.
    330  *
    331  */
    332 int32_t Normalizer::getIndex() const {
    333     if(bufferPos<buffer.length()) {
    334         return currentIndex;
    335     } else {
    336         return nextIndex;
    337     }
    338 }
    339 
    340 /**
    341  * Retrieve the index of the start of the input text.  This is the begin index
    342  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
    343  * over which this <tt>Normalizer</tt> is iterating
    344  */
    345 int32_t Normalizer::startIndex() const {
    346     return text->startIndex();
    347 }
    348 
    349 /**
    350  * Retrieve the index of the end of the input text.  This is the end index
    351  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
    352  * over which this <tt>Normalizer</tt> is iterating
    353  */
    354 int32_t Normalizer::endIndex() const {
    355     return text->endIndex();
    356 }
    357 
    358 //-------------------------------------------------------------------------
    359 // Property access methods
    360 //-------------------------------------------------------------------------
    361 
    362 void
    363 Normalizer::setMode(UNormalizationMode newMode)
    364 {
    365     fUMode = newMode;
    366     init();
    367 }
    368 
    369 UNormalizationMode
    370 Normalizer::getUMode() const
    371 {
    372     return fUMode;
    373 }
    374 
    375 void
    376 Normalizer::setOption(int32_t option,
    377                       UBool value)
    378 {
    379     if (value) {
    380         fOptions |= option;
    381     } else {
    382         fOptions &= (~option);
    383     }
    384     init();
    385 }
    386 
    387 UBool
    388 Normalizer::getOption(int32_t option) const
    389 {
    390     return (fOptions & option) != 0;
    391 }
    392 
    393 /**
    394  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    395  * The iteration position is set to the beginning of the input text.
    396  */
    397 void
    398 Normalizer::setText(const UnicodeString& newText,
    399                     UErrorCode &status)
    400 {
    401     if (U_FAILURE(status)) {
    402         return;
    403     }
    404     CharacterIterator *newIter = new StringCharacterIterator(newText);
    405     if (newIter == NULL) {
    406         status = U_MEMORY_ALLOCATION_ERROR;
    407         return;
    408     }
    409     delete text;
    410     text = newIter;
    411     reset();
    412 }
    413 
    414 /**
    415  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    416  * The iteration position is set to the beginning of the string.
    417  */
    418 void
    419 Normalizer::setText(const CharacterIterator& newText,
    420                     UErrorCode &status)
    421 {
    422     if (U_FAILURE(status)) {
    423         return;
    424     }
    425     CharacterIterator *newIter = newText.clone();
    426     if (newIter == NULL) {
    427         status = U_MEMORY_ALLOCATION_ERROR;
    428         return;
    429     }
    430     delete text;
    431     text = newIter;
    432     reset();
    433 }
    434 
    435 void
    436 Normalizer::setText(const UChar* newText,
    437                     int32_t length,
    438                     UErrorCode &status)
    439 {
    440     if (U_FAILURE(status)) {
    441         return;
    442     }
    443     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
    444     if (newIter == NULL) {
    445         status = U_MEMORY_ALLOCATION_ERROR;
    446         return;
    447     }
    448     delete text;
    449     text = newIter;
    450     reset();
    451 }
    452 
    453 /**
    454  * Copies the text under iteration into the UnicodeString referred to by "result".
    455  * @param result Receives a copy of the text under iteration.
    456  */
    457 void
    458 Normalizer::getText(UnicodeString&  result)
    459 {
    460     text->getText(result);
    461 }
    462 
    463 //-------------------------------------------------------------------------
    464 // Private utility methods
    465 //-------------------------------------------------------------------------
    466 
    467 void Normalizer::clearBuffer() {
    468     buffer.remove();
    469     bufferPos=0;
    470 }
    471 
    472 UBool
    473 Normalizer::nextNormalize() {
    474     clearBuffer();
    475     currentIndex=nextIndex;
    476     text->setIndex(nextIndex);
    477     if(!text->hasNext()) {
    478         return FALSE;
    479     }
    480     // Skip at least one character so we make progress.
    481     UnicodeString segment(text->next32PostInc());
    482     while(text->hasNext()) {
    483         UChar32 c;
    484         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
    485             text->move32(-1, CharacterIterator::kCurrent);
    486             break;
    487         }
    488         segment.append(c);
    489     }
    490     nextIndex=text->getIndex();
    491     UErrorCode errorCode=U_ZERO_ERROR;
    492     fNorm2->normalize(segment, buffer, errorCode);
    493     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    494 }
    495 
    496 UBool
    497 Normalizer::previousNormalize() {
    498     clearBuffer();
    499     nextIndex=currentIndex;
    500     text->setIndex(currentIndex);
    501     if(!text->hasPrevious()) {
    502         return FALSE;
    503     }
    504     UnicodeString segment;
    505     while(text->hasPrevious()) {
    506         UChar32 c=text->previous32();
    507         segment.insert(0, c);
    508         if(fNorm2->hasBoundaryBefore(c)) {
    509             break;
    510         }
    511     }
    512     currentIndex=text->getIndex();
    513     UErrorCode errorCode=U_ZERO_ERROR;
    514     fNorm2->normalize(segment, buffer, errorCode);
    515     bufferPos=buffer.length();
    516     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    517 }
    518 
    519 U_NAMESPACE_END
    520 
    521 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    522