1 /* 2 ********************************************************************** 3 * Copyright (C) 2008-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 05/11/2008 Andy Heninger Port from Java 8 ********************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 14 15 #include "unicode/brkiter.h" 16 #include "unicode/localpointer.h" 17 #include "unicode/uchar.h" 18 #include "unicode/unifilt.h" 19 #include "unicode/uniset.h" 20 21 #include "brktrans.h" 22 #include "cmemory.h" 23 #include "mutex.h" 24 #include "uprops.h" 25 #include "uinvchar.h" 26 #include "util.h" 27 #include "uvectr32.h" 28 29 U_NAMESPACE_BEGIN 30 31 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 32 33 static const UChar SPACE = 32; // ' ' 34 35 36 /** 37 * Constructs a transliterator with the default delimiters '{' and 38 * '}'. 39 */ 40 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 41 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 42 cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { 43 } 44 45 46 /** 47 * Destructor. 48 */ 49 BreakTransliterator::~BreakTransliterator() { 50 } 51 52 /** 53 * Copy constructor. 54 */ 55 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 56 Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { 57 } 58 59 60 /** 61 * Transliterator API. 62 */ 63 Transliterator* BreakTransliterator::clone(void) const { 64 return new BreakTransliterator(*this); 65 } 66 67 /** 68 * Implements {@link Transliterator#handleTransliterate}. 69 */ 70 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 71 UBool isIncremental ) const { 72 73 UErrorCode status = U_ZERO_ERROR; 74 LocalPointer<BreakIterator> bi; 75 LocalPointer<UVector32> boundaries; 76 77 { 78 Mutex m; 79 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 80 boundaries.moveFrom(nonConstThis->cachedBoundaries); 81 bi.moveFrom(nonConstThis->cachedBI); 82 } 83 if (bi.isNull()) { 84 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); 85 } 86 if (boundaries.isNull()) { 87 boundaries.adoptInstead(new UVector32(status)); 88 } 89 90 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { 91 return; 92 } 93 94 boundaries->removeAllElements(); 95 UnicodeString sText = replaceableAsString(text); 96 bi->setText(sText); 97 bi->preceding(offsets.start); 98 99 // To make things much easier, we will stack the boundaries, and then insert at the end. 100 // generally, we won't need too many, since we will be filtered. 101 102 int32_t boundary; 103 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 104 if (boundary == 0) continue; 105 // HACK: Check to see that preceeding item was a letter 106 107 UChar32 cp = sText.char32At(boundary-1); 108 int type = u_charType(cp); 109 //System.out.println(Integer.toString(cp,16) + " (before): " + type); 110 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 111 112 cp = sText.char32At(boundary); 113 type = u_charType(cp); 114 //System.out.println(Integer.toString(cp,16) + " (after): " + type); 115 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 116 117 boundaries->addElement(boundary, status); 118 // printf("Boundary at %d\n", boundary); 119 } 120 121 int delta = 0; 122 int lastBoundary = 0; 123 124 if (boundaries->size() != 0) { // if we found something, adjust 125 delta = boundaries->size() * fInsertion.length(); 126 lastBoundary = boundaries->lastElementi(); 127 128 // we do this from the end backwards, so that we don't have to keep updating. 129 130 while (boundaries->size() > 0) { 131 boundary = boundaries->popi(); 132 text.handleReplaceBetween(boundary, boundary, fInsertion); 133 } 134 } 135 136 // Now fix up the return values 137 offsets.contextLimit += delta; 138 offsets.limit += delta; 139 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 140 141 // Return break iterator & boundaries vector to the cache. 142 { 143 Mutex m; 144 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 145 if (nonConstThis->cachedBI.isNull()) { 146 nonConstThis->cachedBI.moveFrom(bi); 147 } 148 if (nonConstThis->cachedBoundaries.isNull()) { 149 nonConstThis->cachedBoundaries.moveFrom(boundaries); 150 } 151 } 152 153 // TODO: do something with U_FAILURE(status); 154 // (need to look at transliterators overall, not just here.) 155 } 156 157 // 158 // getInsertion() 159 // 160 const UnicodeString &BreakTransliterator::getInsertion() const { 161 return fInsertion; 162 } 163 164 // 165 // setInsertion() 166 // 167 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { 168 this->fInsertion = insertion; 169 } 170 171 // 172 // replaceableAsString Hack to let break iterators work 173 // on the replaceable text from transliterators. 174 // In practice, the only real Replaceable type that we 175 // will be seeing is UnicodeString, so this function 176 // will normally be efficient. 177 // 178 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 179 UnicodeString s; 180 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 181 if (rs != NULL) { 182 s = *rs; 183 } else { 184 r.extractBetween(0, r.length(), s); 185 } 186 return s; 187 } 188 189 U_NAMESPACE_END 190 191 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 192