1 /* 2 ********************************************************************** 3 * Copyright (C) 2008-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 05/11/2008 Andy Heninger Port from Java 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 14 15 #include "unicode/unifilt.h" 16 #include "unicode/uchar.h" 17 #include "unicode/uniset.h" 18 #include "unicode/brkiter.h" 19 #include "brktrans.h" 20 #include "unicode/uchar.h" 21 #include "cmemory.h" 22 #include "uprops.h" 23 #include "uinvchar.h" 24 #include "util.h" 25 #include "uvectr32.h" 26 27 U_NAMESPACE_BEGIN 28 29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 30 31 static const UChar SPACE = 32; // ' ' 32 33 34 /** 35 * Constructs a transliterator with the default delimiters '{' and 36 * '}'. 37 */ 38 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 39 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 40 fInsertion(SPACE) { 41 bi = NULL; 42 UErrorCode status = U_ZERO_ERROR; 43 boundaries = new UVector32(status); 44 } 45 46 47 /** 48 * Destructor. 49 */ 50 BreakTransliterator::~BreakTransliterator() { 51 delete bi; 52 bi = NULL; 53 delete boundaries; 54 boundaries = NULL; 55 } 56 57 /** 58 * Copy constructor. 59 */ 60 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 61 Transliterator(o) { 62 bi = NULL; 63 if (o.bi != NULL) { 64 bi = o.bi->clone(); 65 } 66 fInsertion = o.fInsertion; 67 UErrorCode status = U_ZERO_ERROR; 68 boundaries = new UVector32(status); 69 } 70 71 72 /** 73 * Transliterator API. 74 */ 75 Transliterator* BreakTransliterator::clone(void) const { 76 return new BreakTransliterator(*this); 77 } 78 79 /** 80 * Implements {@link Transliterator#handleTransliterate}. 81 */ 82 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 83 UBool isIncremental ) const { 84 85 UErrorCode status = U_ZERO_ERROR; 86 boundaries->removeAllElements(); 87 BreakTransliterator *nonConstThis = (BreakTransliterator *)this; 88 nonConstThis->getBreakIterator(); // Lazy-create it if necessary 89 UnicodeString sText = replaceableAsString(text); 90 bi->setText(sText); 91 bi->preceding(offsets.start); 92 93 // To make things much easier, we will stack the boundaries, and then insert at the end. 94 // generally, we won't need too many, since we will be filtered. 95 96 int32_t boundary; 97 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 98 if (boundary == 0) continue; 99 // HACK: Check to see that preceeding item was a letter 100 101 UChar32 cp = sText.char32At(boundary-1); 102 int type = u_charType(cp); 103 //System.out.println(Integer.toString(cp,16) + " (before): " + type); 104 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 105 106 cp = sText.char32At(boundary); 107 type = u_charType(cp); 108 //System.out.println(Integer.toString(cp,16) + " (after): " + type); 109 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 110 111 boundaries->addElement(boundary, status); 112 // printf("Boundary at %d\n", boundary); 113 } 114 115 int delta = 0; 116 int lastBoundary = 0; 117 118 if (boundaries->size() != 0) { // if we found something, adjust 119 delta = boundaries->size() * fInsertion.length(); 120 lastBoundary = boundaries->lastElementi(); 121 122 // we do this from the end backwards, so that we don't have to keep updating. 123 124 while (boundaries->size() > 0) { 125 boundary = boundaries->popi(); 126 text.handleReplaceBetween(boundary, boundary, fInsertion); 127 } 128 } 129 130 // Now fix up the return values 131 offsets.contextLimit += delta; 132 offsets.limit += delta; 133 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 134 135 // TODO: do something with U_FAILURE(status); 136 // (need to look at transliterators overall, not just here.) 137 } 138 139 // 140 // getInsertion() 141 // 142 const UnicodeString &BreakTransliterator::getInsertion() const { 143 return fInsertion; 144 } 145 146 // 147 // setInsertion() 148 // 149 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { 150 this->fInsertion = insertion; 151 } 152 153 // 154 // getBreakIterator Lazily create the break iterator if it does 155 // not already exist. Copied from Java, probably 156 // better to just create it in the constructor. 157 // 158 BreakIterator *BreakTransliterator::getBreakIterator() { 159 UErrorCode status = U_ZERO_ERROR; 160 if (bi == NULL) { 161 // Note: Thai breaking behavior is universal, it is not 162 // tied to the Thai locale. 163 bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); 164 } 165 return bi; 166 } 167 168 // 169 // replaceableAsString Hack to let break iterators work 170 // on the replaceable text from transliterators. 171 // In practice, the only real Replaceable type that we 172 // will be seeing is UnicodeString, so this function 173 // will normally be efficient. 174 // 175 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 176 UnicodeString s; 177 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 178 if (rs != NULL) { 179 s = *rs; 180 } else { 181 r.extractBetween(0, r.length(), s); 182 } 183 return s; 184 } 185 186 U_NAMESPACE_END 187 188 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 189