Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2008-2015, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   05/11/2008  Andy Heninger  Port from Java
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
     14 
     15 #include "unicode/brkiter.h"
     16 #include "unicode/localpointer.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/unifilt.h"
     19 #include "unicode/uniset.h"
     20 
     21 #include "brktrans.h"
     22 #include "cmemory.h"
     23 #include "mutex.h"
     24 #include "uprops.h"
     25 #include "uinvchar.h"
     26 #include "util.h"
     27 #include "uvectr32.h"
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
     32 
     33 static const UChar SPACE       = 32;  // ' '
     34 
     35 
     36 /**
     37  * Constructs a transliterator with the default delimiters '{' and
     38  * '}'.
     39  */
     40 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
     41         Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
     42         cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) {
     43     }
     44 
     45 
     46 /**
     47  * Destructor.
     48  */
     49 BreakTransliterator::~BreakTransliterator() {
     50 }
     51 
     52 /**
     53  * Copy constructor.
     54  */
     55 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
     56         Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) {
     57 }
     58 
     59 
     60 /**
     61  * Transliterator API.
     62  */
     63 Transliterator* BreakTransliterator::clone(void) const {
     64     return new BreakTransliterator(*this);
     65 }
     66 
     67 /**
     68  * Implements {@link Transliterator#handleTransliterate}.
     69  */
     70 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
     71                                                     UBool isIncremental ) const {
     72 
     73         UErrorCode status = U_ZERO_ERROR;
     74         LocalPointer<BreakIterator> bi;
     75         LocalPointer<UVector32> boundaries;
     76 
     77         {
     78             Mutex m;
     79             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
     80             boundaries.moveFrom(nonConstThis->cachedBoundaries);
     81             bi.moveFrom(nonConstThis->cachedBI);
     82         }
     83         if (bi.isNull()) {
     84             bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
     85         }
     86         if (boundaries.isNull()) {
     87             boundaries.adoptInstead(new UVector32(status));
     88         }
     89 
     90         if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
     91             return;
     92         }
     93 
     94         boundaries->removeAllElements();
     95         UnicodeString sText = replaceableAsString(text);
     96         bi->setText(sText);
     97         bi->preceding(offsets.start);
     98 
     99         // To make things much easier, we will stack the boundaries, and then insert at the end.
    100         // generally, we won't need too many, since we will be filtered.
    101 
    102         int32_t boundary;
    103         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
    104             if (boundary == 0) continue;
    105             // HACK: Check to see that preceeding item was a letter
    106 
    107             UChar32 cp = sText.char32At(boundary-1);
    108             int type = u_charType(cp);
    109             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
    110             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    111 
    112             cp = sText.char32At(boundary);
    113             type = u_charType(cp);
    114             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
    115             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    116 
    117             boundaries->addElement(boundary, status);
    118             // printf("Boundary at %d\n", boundary);
    119         }
    120 
    121         int delta = 0;
    122         int lastBoundary = 0;
    123 
    124         if (boundaries->size() != 0) { // if we found something, adjust
    125             delta = boundaries->size() * fInsertion.length();
    126             lastBoundary = boundaries->lastElementi();
    127 
    128             // we do this from the end backwards, so that we don't have to keep updating.
    129 
    130             while (boundaries->size() > 0) {
    131                 boundary = boundaries->popi();
    132                 text.handleReplaceBetween(boundary, boundary, fInsertion);
    133             }
    134         }
    135 
    136         // Now fix up the return values
    137         offsets.contextLimit += delta;
    138         offsets.limit += delta;
    139         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
    140 
    141         // Return break iterator & boundaries vector to the cache.
    142         {
    143             Mutex m;
    144             BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
    145             if (nonConstThis->cachedBI.isNull()) {
    146                 nonConstThis->cachedBI.moveFrom(bi);
    147             }
    148             if (nonConstThis->cachedBoundaries.isNull()) {
    149                 nonConstThis->cachedBoundaries.moveFrom(boundaries);
    150             }
    151         }
    152 
    153         // TODO:  do something with U_FAILURE(status);
    154         //        (need to look at transliterators overall, not just here.)
    155 }
    156 
    157 //
    158 //  getInsertion()
    159 //
    160 const UnicodeString &BreakTransliterator::getInsertion() const {
    161     return fInsertion;
    162 }
    163 
    164 //
    165 //  setInsertion()
    166 //
    167 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    168     this->fInsertion = insertion;
    169 }
    170 
    171 //
    172 //   replaceableAsString   Hack to let break iterators work
    173 //                         on the replaceable text from transliterators.
    174 //                         In practice, the only real Replaceable type that we
    175 //                         will be seeing is UnicodeString, so this function
    176 //                         will normally be efficient.
    177 //
    178 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    179     UnicodeString s;
    180     UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
    181     if (rs != NULL) {
    182         s = *rs;
    183     } else {
    184         r.extractBetween(0, r.length(), s);
    185     }
    186     return s;
    187 }
    188 
    189 U_NAMESPACE_END
    190 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
    192