Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2008-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   05/11/2008  Andy Heninger  Port from Java
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
     16 
     17 #include "unicode/brkiter.h"
     18 #include "unicode/localpointer.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/unifilt.h"
     21 #include "unicode/uniset.h"
     22 
     23 #include "brktrans.h"
     24 #include "cmemory.h"
     25 #include "mutex.h"
     26 #include "uprops.h"
     27 #include "uinvchar.h"
     28 #include "util.h"
     29 #include "uvectr32.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
     34 
     35 static const UChar SPACE       = 32;  // ' '
     36 
     37 
     38 /**
     39  * Constructs a transliterator with the default delimiters '{' and
     40  * '}'.
     41  */
     42 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
     43         Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
     44         cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
     45     }
     46 
     47 
     48 /**
     49  * Destructor.
     50  */
     51 BreakTransliterator::~BreakTransliterator() {
     52 }
     53 
     54 /**
     55  * Copy constructor.
     56  */
     57 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
     58         Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
     59 }
     60 
     61 
     62 /**
     63  * Transliterator API.
     64  */
     65 Transliterator* BreakTransliterator::clone(void) const {
     66     return new BreakTransliterator(*this);
     67 }
     68 
     69 /**
     70  * Implements {@link Transliterator#handleTransliterate}.
     71  */
     72 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
     73                                                     UBool isIncremental ) const {
     74 
     75         UErrorCode status = U_ZERO_ERROR;
     76         LocalPointer<BreakIterator> bi;
     77         LocalPointer<UVector32> boundaries;
     78 
     79         {
     80             Mutex m;
     81             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
     82             boundaries.moveFrom(nonConstThis->cachedBoundaries);
     83             bi.moveFrom(nonConstThis->cachedBI);
     84         }
     85         if (bi.isNull()) {
     86             bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
     87         }
     88         if (boundaries.isNull()) {
     89             boundaries.adoptInstead(new UVector32(status));
     90         }
     91 
     92         if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
     93             return;
     94         }
     95 
     96         boundaries->removeAllElements();
     97         UnicodeString sText = replaceableAsString(text);
     98         bi->setText(sText);
     99         bi->preceding(offsets.start);
    100 
    101         // To make things much easier, we will stack the boundaries, and then insert at the end.
    102         // generally, we won't need too many, since we will be filtered.
    103 
    104         int32_t boundary;
    105         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
    106             if (boundary == 0) continue;
    107             // HACK: Check to see that preceeding item was a letter
    108 
    109             UChar32 cp = sText.char32At(boundary-1);
    110             int type = u_charType(cp);
    111             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
    112             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    113 
    114             cp = sText.char32At(boundary);
    115             type = u_charType(cp);
    116             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
    117             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    118 
    119             boundaries->addElement(boundary, status);
    120             // printf("Boundary at %d\n", boundary);
    121         }
    122 
    123         int delta = 0;
    124         int lastBoundary = 0;
    125 
    126         if (boundaries->size() != 0) { // if we found something, adjust
    127             delta = boundaries->size() * fInsertion.length();
    128             lastBoundary = boundaries->lastElementi();
    129 
    130             // we do this from the end backwards, so that we don't have to keep updating.
    131 
    132             while (boundaries->size() > 0) {
    133                 boundary = boundaries->popi();
    134                 text.handleReplaceBetween(boundary, boundary, fInsertion);
    135             }
    136         }
    137 
    138         // Now fix up the return values
    139         offsets.contextLimit += delta;
    140         offsets.limit += delta;
    141         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
    142 
    143         // Return break iterator & boundaries vector to the cache.
    144         {
    145             Mutex m;
    146             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
    147             if (nonConstThis->cachedBI.isNull()) {
    148                 nonConstThis->cachedBI.moveFrom(bi);
    149             }
    150             if (nonConstThis->cachedBoundaries.isNull()) {
    151                 nonConstThis->cachedBoundaries.moveFrom(boundaries);
    152             }
    153         }
    154 
    155         // TODO:  do something with U_FAILURE(status);
    156         //        (need to look at transliterators overall, not just here.)
    157 }
    158 
    159 //
    160 //  getInsertion()
    161 //
    162 const UnicodeString &BreakTransliterator::getInsertion() const {
    163     return fInsertion;
    164 }
    165 
    166 //
    167 //  setInsertion()
    168 //
    169 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    170     this->fInsertion = insertion;
    171 }
    172 
    173 //
    174 //   replaceableAsString   Hack to let break iterators work
    175 //                         on the replaceable text from transliterators.
    176 //                         In practice, the only real Replaceable type that we
    177 //                         will be seeing is UnicodeString, so this function
    178 //                         will normally be efficient.
    179 //
    180 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    181     UnicodeString s;
    182     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
    183     if (rs != NULL) {
    184         s = *rs;
    185     } else {
    186         r.extractBetween(0, r.length(), s);
    187     }
    188     return s;
    189 }
    190 
    191 U_NAMESPACE_END
    192 
    193 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    194