Home | History | Annotate | Download | only in common
      1 /*
      2  *************************************************************************
      3  * COPYRIGHT:
      4  * Copyright (c) 1996-2005, International Business Machines Corporation and
      5  * others. All Rights Reserved.
      6  *************************************************************************
      7  */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_NORMALIZATION
     12 
     13 #include "unicode/unistr.h"
     14 #include "unicode/chariter.h"
     15 #include "unicode/schriter.h"
     16 #include "unicode/uchriter.h"
     17 #include "unicode/uiter.h"
     18 #include "unicode/normlzr.h"
     19 #include "cmemory.h"
     20 #include "unormimp.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
     25 
     26 //-------------------------------------------------------------------------
     27 // Constructors and other boilerplate
     28 //-------------------------------------------------------------------------
     29 
     30 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
     31     UObject(), fUMode(mode), fOptions(0),
     32     currentIndex(0), nextIndex(0),
     33     buffer(), bufferPos(0)
     34 {
     35     init(new StringCharacterIterator(str));
     36 }
     37 
     38 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
     39     UObject(), fUMode(mode), fOptions(0),
     40     currentIndex(0), nextIndex(0),
     41     buffer(), bufferPos(0)
     42 {
     43     init(new UCharCharacterIterator(str, length));
     44 }
     45 
     46 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
     47     UObject(), fUMode(mode), fOptions(0),
     48     currentIndex(0), nextIndex(0),
     49     buffer(), bufferPos(0)
     50 {
     51     init(iter.clone());
     52 }
     53 
     54 Normalizer::Normalizer(const Normalizer &copy) :
     55     UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
     56     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
     57     buffer(copy.buffer), bufferPos(copy.bufferPos)
     58 {
     59     init(((CharacterIterator *)(copy.text->context))->clone());
     60 }
     61 
     62 static const UChar _NUL=0;
     63 
     64 void
     65 Normalizer::init(CharacterIterator *iter) {
     66     UErrorCode errorCode=U_ZERO_ERROR;
     67 
     68     text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
     69     if(text!=NULL) {
     70         if(unorm_haveData(&errorCode)) {
     71             uiter_setCharacterIterator(text, iter);
     72         } else {
     73             delete iter;
     74             uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
     75         }
     76     } else {
     77         delete iter;
     78     }
     79 }
     80 
     81 Normalizer::~Normalizer()
     82 {
     83     if(text!=NULL) {
     84         delete (CharacterIterator *)text->context;
     85         uprv_free(text);
     86     }
     87 }
     88 
     89 Normalizer*
     90 Normalizer::clone() const
     91 {
     92     if(this!=0) {
     93         return new Normalizer(*this);
     94     } else {
     95         return 0;
     96     }
     97 }
     98 
     99 /**
    100  * Generates a hash code for this iterator.
    101  */
    102 int32_t Normalizer::hashCode() const
    103 {
    104     return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
    105 }
    106 
    107 UBool Normalizer::operator==(const Normalizer& that) const
    108 {
    109     return
    110         this==&that ||
    111         fUMode==that.fUMode &&
    112         fOptions==that.fOptions &&
    113         *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
    114         buffer==that.buffer &&
    115         bufferPos==that.bufferPos &&
    116         nextIndex==that.nextIndex;
    117 }
    118 
    119 //-------------------------------------------------------------------------
    120 // Static utility methods
    121 //-------------------------------------------------------------------------
    122 
    123 void U_EXPORT2
    124 Normalizer::normalize(const UnicodeString& source,
    125                       UNormalizationMode mode, int32_t options,
    126                       UnicodeString& result,
    127                       UErrorCode &status) {
    128     if(source.isBogus() || U_FAILURE(status)) {
    129         result.setToBogus();
    130         if(U_SUCCESS(status)) {
    131             status=U_ILLEGAL_ARGUMENT_ERROR;
    132         }
    133     } else {
    134         UnicodeString localDest;
    135         UnicodeString *dest;
    136 
    137         if(&source!=&result) {
    138             dest=&result;
    139         } else {
    140             // the source and result strings are the same object, use a temporary one
    141             dest=&localDest;
    142         }
    143 
    144         UChar *buffer=dest->getBuffer(source.length());
    145         int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
    146                                                source.getBuffer(), source.length(),
    147                                                mode, options,
    148                                                &status);
    149         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    150         if(status==U_BUFFER_OVERFLOW_ERROR) {
    151             status=U_ZERO_ERROR;
    152             buffer=dest->getBuffer(length);
    153             length=unorm_internalNormalize(buffer, dest->getCapacity(),
    154                                            source.getBuffer(), source.length(),
    155                                            mode, options,
    156                                            &status);
    157             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    158         }
    159 
    160         if(dest==&localDest) {
    161             result=*dest;
    162         }
    163         if(U_FAILURE(status)) {
    164             result.setToBogus();
    165         }
    166     }
    167 }
    168 
    169 void U_EXPORT2
    170 Normalizer::compose(const UnicodeString& source,
    171                     UBool compat, int32_t options,
    172                     UnicodeString& result,
    173                     UErrorCode &status) {
    174     if(source.isBogus() || U_FAILURE(status)) {
    175         result.setToBogus();
    176         if(U_SUCCESS(status)) {
    177             status=U_ILLEGAL_ARGUMENT_ERROR;
    178         }
    179     } else {
    180         UnicodeString localDest;
    181         UnicodeString *dest;
    182 
    183         if(&source!=&result) {
    184             dest=&result;
    185         } else {
    186             // the source and result strings are the same object, use a temporary one
    187             dest=&localDest;
    188         }
    189 
    190         UChar *buffer=dest->getBuffer(source.length());
    191         int32_t length=unorm_compose(buffer, dest->getCapacity(),
    192                                      source.getBuffer(), source.length(),
    193                                      compat, options,
    194                                      &status);
    195         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    196         if(status==U_BUFFER_OVERFLOW_ERROR) {
    197             status=U_ZERO_ERROR;
    198             buffer=dest->getBuffer(length);
    199             length=unorm_compose(buffer, dest->getCapacity(),
    200                                  source.getBuffer(), source.length(),
    201                                  compat, options,
    202                                  &status);
    203             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    204         }
    205 
    206         if(dest==&localDest) {
    207             result=*dest;
    208         }
    209         if(U_FAILURE(status)) {
    210             result.setToBogus();
    211         }
    212     }
    213 }
    214 
    215 void U_EXPORT2
    216 Normalizer::decompose(const UnicodeString& source,
    217                       UBool compat, int32_t options,
    218                       UnicodeString& result,
    219                       UErrorCode &status) {
    220     if(source.isBogus() || U_FAILURE(status)) {
    221         result.setToBogus();
    222         if(U_SUCCESS(status)) {
    223             status=U_ILLEGAL_ARGUMENT_ERROR;
    224         }
    225     } else {
    226         UnicodeString localDest;
    227         UnicodeString *dest;
    228 
    229         if(&source!=&result) {
    230             dest=&result;
    231         } else {
    232             // the source and result strings are the same object, use a temporary one
    233             dest=&localDest;
    234         }
    235 
    236         UChar *buffer=dest->getBuffer(source.length());
    237         int32_t length=unorm_decompose(buffer, dest->getCapacity(),
    238                                      source.getBuffer(), source.length(),
    239                                      compat, options,
    240                                      &status);
    241         dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    242         if(status==U_BUFFER_OVERFLOW_ERROR) {
    243             status=U_ZERO_ERROR;
    244             buffer=dest->getBuffer(length);
    245             length=unorm_decompose(buffer, dest->getCapacity(),
    246                                    source.getBuffer(), source.length(),
    247                                    compat, options,
    248                                    &status);
    249             dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
    250         }
    251 
    252         if(dest==&localDest) {
    253             result=*dest;
    254         }
    255         if(U_FAILURE(status)) {
    256             result.setToBogus();
    257         }
    258     }
    259 }
    260 
    261 UnicodeString & U_EXPORT2
    262 Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
    263                         UnicodeString &result,
    264                         UNormalizationMode mode, int32_t options,
    265                         UErrorCode &errorCode) {
    266     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
    267         result.setToBogus();
    268         if(U_SUCCESS(errorCode)) {
    269             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    270         }
    271     } else {
    272         UnicodeString localDest;
    273         UnicodeString *dest;
    274 
    275         if(&left!=&result && &right!=&result) {
    276             dest=&result;
    277         } else {
    278             // the source and result strings are the same object, use a temporary one
    279             dest=&localDest;
    280         }
    281 
    282         UChar *buffer=dest->getBuffer(left.length()+right.length());
    283         int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
    284                                          right.getBuffer(), right.length(),
    285                                          buffer, dest->getCapacity(),
    286                                          mode, options,
    287                                          &errorCode);
    288         dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    289         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    290             errorCode=U_ZERO_ERROR;
    291             buffer=dest->getBuffer(length);
    292             int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
    293                                              right.getBuffer(), right.length(),
    294                                              buffer, dest->getCapacity(),
    295                                              mode, options,
    296                                              &errorCode);
    297             dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    298         }
    299 
    300         if(dest==&localDest) {
    301             result=*dest;
    302         }
    303         if(U_FAILURE(errorCode)) {
    304             result.setToBogus();
    305         }
    306     }
    307     return result;
    308 }
    309 
    310 //-------------------------------------------------------------------------
    311 // Iteration API
    312 //-------------------------------------------------------------------------
    313 
    314 /**
    315  * Return the current character in the normalized text.
    316  */
    317 UChar32 Normalizer::current() {
    318     if(bufferPos<buffer.length() || nextNormalize()) {
    319         return buffer.char32At(bufferPos);
    320     } else {
    321         return DONE;
    322     }
    323 }
    324 
    325 /**
    326  * Return the next character in the normalized text and advance
    327  * the iteration position by one.  If the end
    328  * of the text has already been reached, {@link #DONE} is returned.
    329  */
    330 UChar32 Normalizer::next() {
    331     if(bufferPos<buffer.length() ||  nextNormalize()) {
    332         UChar32 c=buffer.char32At(bufferPos);
    333         bufferPos+=UTF_CHAR_LENGTH(c);
    334         return c;
    335     } else {
    336         return DONE;
    337     }
    338 }
    339 
    340 /**
    341  * Return the previous character in the normalized text and decrement
    342  * the iteration position by one.  If the beginning
    343  * of the text has already been reached, {@link #DONE} is returned.
    344  */
    345 UChar32 Normalizer::previous() {
    346     if(bufferPos>0 || previousNormalize()) {
    347         UChar32 c=buffer.char32At(bufferPos-1);
    348         bufferPos-=UTF_CHAR_LENGTH(c);
    349         return c;
    350     } else {
    351         return DONE;
    352     }
    353 }
    354 
    355 void Normalizer::reset() {
    356     currentIndex=nextIndex=text->move(text, 0, UITER_START);
    357     clearBuffer();
    358 }
    359 
    360 void
    361 Normalizer::setIndexOnly(int32_t index) {
    362     currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
    363     clearBuffer();
    364 }
    365 
    366 /**
    367  * Return the first character in the normalized text->  This resets
    368  * the <tt>Normalizer's</tt> position to the beginning of the text->
    369  */
    370 UChar32 Normalizer::first() {
    371     reset();
    372     return next();
    373 }
    374 
    375 /**
    376  * Return the last character in the normalized text->  This resets
    377  * the <tt>Normalizer's</tt> position to be just before the
    378  * the input text corresponding to that normalized character.
    379  */
    380 UChar32 Normalizer::last() {
    381     currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
    382     clearBuffer();
    383     return previous();
    384 }
    385 
    386 /**
    387  * Retrieve the current iteration position in the input text that is
    388  * being normalized.  This method is useful in applications such as
    389  * searching, where you need to be able to determine the position in
    390  * the input text that corresponds to a given normalized output character.
    391  * <p>
    392  * <b>Note:</b> This method sets the position in the <em>input</em>, while
    393  * {@link #next} and {@link #previous} iterate through characters in the
    394  * <em>output</em>.  This means that there is not necessarily a one-to-one
    395  * correspondence between characters returned by <tt>next</tt> and
    396  * <tt>previous</tt> and the indices passed to and returned from
    397  * <tt>setIndex</tt> and {@link #getIndex}.
    398  *
    399  */
    400 int32_t Normalizer::getIndex() const {
    401     if(bufferPos<buffer.length()) {
    402         return currentIndex;
    403     } else {
    404         return nextIndex;
    405     }
    406 }
    407 
    408 /**
    409  * Retrieve the index of the start of the input text->  This is the begin index
    410  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
    411  * over which this <tt>Normalizer</tt> is iterating
    412  */
    413 int32_t Normalizer::startIndex() const {
    414     return text->getIndex(text, UITER_START);
    415 }
    416 
    417 /**
    418  * Retrieve the index of the end of the input text->  This is the end index
    419  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
    420  * over which this <tt>Normalizer</tt> is iterating
    421  */
    422 int32_t Normalizer::endIndex() const {
    423     return text->getIndex(text, UITER_LIMIT);
    424 }
    425 
    426 //-------------------------------------------------------------------------
    427 // Property access methods
    428 //-------------------------------------------------------------------------
    429 
    430 void
    431 Normalizer::setMode(UNormalizationMode newMode)
    432 {
    433     fUMode = newMode;
    434 }
    435 
    436 UNormalizationMode
    437 Normalizer::getUMode() const
    438 {
    439     return fUMode;
    440 }
    441 
    442 void
    443 Normalizer::setOption(int32_t option,
    444                       UBool value)
    445 {
    446     if (value) {
    447         fOptions |= option;
    448     } else {
    449         fOptions &= (~option);
    450     }
    451 }
    452 
    453 UBool
    454 Normalizer::getOption(int32_t option) const
    455 {
    456     return (fOptions & option) != 0;
    457 }
    458 
    459 /**
    460  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    461  * The iteration position is set to the beginning of the input text->
    462  */
    463 void
    464 Normalizer::setText(const UnicodeString& newText,
    465                     UErrorCode &status)
    466 {
    467     if (U_FAILURE(status)) {
    468         return;
    469     }
    470     CharacterIterator *newIter = new StringCharacterIterator(newText);
    471     if (newIter == NULL) {
    472         status = U_MEMORY_ALLOCATION_ERROR;
    473         return;
    474     }
    475     delete (CharacterIterator *)(text->context);
    476     text->context = newIter;
    477     reset();
    478 }
    479 
    480 /**
    481  * Set the input text over which this <tt>Normalizer</tt> will iterate.
    482  * The iteration position is set to the beginning of the string.
    483  */
    484 void
    485 Normalizer::setText(const CharacterIterator& newText,
    486                     UErrorCode &status)
    487 {
    488     if (U_FAILURE(status)) {
    489         return;
    490     }
    491     CharacterIterator *newIter = newText.clone();
    492     if (newIter == NULL) {
    493         status = U_MEMORY_ALLOCATION_ERROR;
    494         return;
    495     }
    496     delete (CharacterIterator *)(text->context);
    497     text->context = newIter;
    498     reset();
    499 }
    500 
    501 void
    502 Normalizer::setText(const UChar* newText,
    503                     int32_t length,
    504                     UErrorCode &status)
    505 {
    506     if (U_FAILURE(status)) {
    507         return;
    508     }
    509     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
    510     if (newIter == NULL) {
    511         status = U_MEMORY_ALLOCATION_ERROR;
    512         return;
    513     }
    514     delete (CharacterIterator *)(text->context);
    515     text->context = newIter;
    516     reset();
    517 }
    518 
    519 /**
    520  * Copies the text under iteration into the UnicodeString referred to by "result".
    521  * @param result Receives a copy of the text under iteration.
    522  */
    523 void
    524 Normalizer::getText(UnicodeString&  result)
    525 {
    526     ((CharacterIterator *)(text->context))->getText(result);
    527 }
    528 
    529 //-------------------------------------------------------------------------
    530 // Private utility methods
    531 //-------------------------------------------------------------------------
    532 
    533 void Normalizer::clearBuffer() {
    534     buffer.remove();
    535     bufferPos=0;
    536 }
    537 
    538 UBool
    539 Normalizer::nextNormalize() {
    540     UChar *p;
    541     int32_t length;
    542     UErrorCode errorCode;
    543 
    544     clearBuffer();
    545     currentIndex=nextIndex;
    546     text->move(text, nextIndex, UITER_ZERO);
    547     if(!text->hasNext(text)) {
    548         return FALSE;
    549     }
    550 
    551     errorCode=U_ZERO_ERROR;
    552     p=buffer.getBuffer(-1);
    553     length=unorm_next(text, p, buffer.getCapacity(),
    554                       fUMode, fOptions,
    555                       TRUE, 0,
    556                       &errorCode);
    557     buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    558     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    559         errorCode=U_ZERO_ERROR;
    560         text->move(text, nextIndex, UITER_ZERO);
    561         p=buffer.getBuffer(length);
    562         length=unorm_next(text, p, buffer.getCapacity(),
    563                           fUMode, fOptions,
    564                           TRUE, 0,
    565                           &errorCode);
    566         buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    567     }
    568 
    569     nextIndex=text->getIndex(text, UITER_CURRENT);
    570     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    571 }
    572 
    573 UBool
    574 Normalizer::previousNormalize() {
    575     UChar *p;
    576     int32_t length;
    577     UErrorCode errorCode;
    578 
    579     clearBuffer();
    580     nextIndex=currentIndex;
    581     text->move(text, currentIndex, UITER_ZERO);
    582     if(!text->hasPrevious(text)) {
    583         return FALSE;
    584     }
    585 
    586     errorCode=U_ZERO_ERROR;
    587     p=buffer.getBuffer(-1);
    588     length=unorm_previous(text, p, buffer.getCapacity(),
    589                           fUMode, fOptions,
    590                           TRUE, 0,
    591                           &errorCode);
    592     buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    593     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    594         errorCode=U_ZERO_ERROR;
    595         text->move(text, currentIndex, UITER_ZERO);
    596         p=buffer.getBuffer(length);
    597         length=unorm_previous(text, p, buffer.getCapacity(),
    598                               fUMode, fOptions,
    599                               TRUE, 0,
    600                               &errorCode);
    601         buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
    602     }
    603 
    604     bufferPos=buffer.length();
    605     currentIndex=text->getIndex(text, UITER_CURRENT);
    606     return U_SUCCESS(errorCode) && !buffer.isEmpty();
    607 }
    608 
    609 U_NAMESPACE_END
    610 
    611 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    612