Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2008, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   05/11/2008  Andy Heninger  Port from Java
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
     14 
     15 #include "unicode/unifilt.h"
     16 #include "unicode/uchar.h"
     17 #include "unicode/uniset.h"
     18 #include "unicode/brkiter.h"
     19 #include "brktrans.h"
     20 #include "unicode/uchar.h"
     21 #include "cmemory.h"
     22 #include "uprops.h"
     23 #include "uinvchar.h"
     24 #include "util.h"
     25 #include "uvectr32.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
     30 
     31 static const UChar SPACE       = 32;  // ' '
     32 
     33 
     34 /**
     35  * Constructs a transliterator with the default delimiters '{' and
     36  * '}'.
     37  */
     38 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
     39     Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
     40     fInsertion(SPACE) {
     41         bi = NULL;
     42         UErrorCode status = U_ZERO_ERROR;
     43         boundaries = new UVector32(status);
     44     }
     45 
     46 
     47 /**
     48  * Destructor.
     49  */
     50 BreakTransliterator::~BreakTransliterator() {
     51     delete bi;
     52     bi = NULL;
     53     delete boundaries;
     54     boundaries = NULL;
     55 }
     56 
     57 /**
     58  * Copy constructor.
     59  */
     60 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
     61     Transliterator(o) {
     62         bi = NULL;
     63         if (o.bi != NULL) {
     64             bi = o.bi->clone();
     65         }
     66         fInsertion = o.fInsertion;
     67         UErrorCode status = U_ZERO_ERROR;
     68         boundaries = new UVector32(status);
     69     }
     70 
     71 
     72 /**
     73  * Transliterator API.
     74  */
     75 Transliterator* BreakTransliterator::clone(void) const {
     76     return new BreakTransliterator(*this);
     77 }
     78 
     79 /**
     80  * Implements {@link Transliterator#handleTransliterate}.
     81  */
     82 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
     83                                                     UBool isIncremental ) const {
     84 
     85         UErrorCode status = U_ZERO_ERROR;
     86         boundaries->removeAllElements();
     87         BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
     88         nonConstThis->getBreakIterator(); // Lazy-create it if necessary
     89         UnicodeString sText = replaceableAsString(text);
     90         bi->setText(sText);
     91         bi->preceding(offsets.start);
     92 
     93         // To make things much easier, we will stack the boundaries, and then insert at the end.
     94         // generally, we won't need too many, since we will be filtered.
     95 
     96         int32_t boundary;
     97         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
     98             if (boundary == 0) continue;
     99             // HACK: Check to see that preceeding item was a letter
    100 
    101             UChar32 cp = sText.char32At(boundary-1);
    102             int type = u_charType(cp);
    103             //System.out.println(Integer.toString(cp,16) + " (before): " + type);
    104             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    105 
    106             cp = sText.char32At(boundary);
    107             type = u_charType(cp);
    108             //System.out.println(Integer.toString(cp,16) + " (after): " + type);
    109             if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    110 
    111             boundaries->addElement(boundary, status);
    112             // printf("Boundary at %d\n", boundary);
    113         }
    114 
    115         int delta = 0;
    116         int lastBoundary = 0;
    117 
    118         if (boundaries->size() != 0) { // if we found something, adjust
    119             delta = boundaries->size() * fInsertion.length();
    120             lastBoundary = boundaries->lastElementi();
    121 
    122             // we do this from the end backwards, so that we don't have to keep updating.
    123 
    124             while (boundaries->size() > 0) {
    125                 boundary = boundaries->popi();
    126                 text.handleReplaceBetween(boundary, boundary, fInsertion);
    127             }
    128         }
    129 
    130         // Now fix up the return values
    131         offsets.contextLimit += delta;
    132         offsets.limit += delta;
    133         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
    134 
    135         // TODO:  do something with U_FAILURE(status);
    136         //        (need to look at transliterators overall, not just here.)
    137 }
    138 
    139 //
    140 //  getInsertion()
    141 //
    142 const UnicodeString &BreakTransliterator::getInsertion() const {
    143     return fInsertion;
    144 }
    145 
    146 //
    147 //  setInsertion()
    148 //
    149 void BreakTransliterator::setInsertsion(const UnicodeString &insertion) {
    150     this->fInsertion = insertion;
    151 }
    152 
    153 //
    154 //  getBreakIterator     Lazily create the break iterator if it does
    155 //                       not already exist.  Copied from Java, probably
    156 //                       better to just create it in the constructor.
    157 //
    158 BreakIterator *BreakTransliterator::getBreakIterator() {
    159     UErrorCode status = U_ZERO_ERROR;
    160     if (bi == NULL) {
    161         // Note:  Thai breaking behavior is universal, it is not
    162         //        tied to the Thai locale.
    163         bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
    164     }
    165     return bi;
    166 }
    167 
    168 //
    169 //   replaceableAsString   Hack to let break iterators work
    170 //                         on the replaceable text from transliterators.
    171 //                         In practice, the only real Replaceable type that we
    172 //                         will be seeing is UnicodeString, so this function
    173 //                         will normally be efficient.
    174 //
    175 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    176     if (r.getDynamicClassID() == UnicodeString::getStaticClassID()) {
    177         return (UnicodeString &) r;
    178     }
    179     UnicodeString s;
    180     r.extractBetween(0, r.length(), s);
    181     return s;
    182 }
    183 
    184 U_NAMESPACE_END
    185 
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
    187