1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2008-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 05/11/2008 Andy Heninger Port from Java 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 16 17 #include "unicode/brkiter.h" 18 #include "unicode/localpointer.h" 19 #include "unicode/uchar.h" 20 #include "unicode/unifilt.h" 21 #include "unicode/uniset.h" 22 23 #include "brktrans.h" 24 #include "cmemory.h" 25 #include "mutex.h" 26 #include "uprops.h" 27 #include "uinvchar.h" 28 #include "util.h" 29 #include "uvectr32.h" 30 31 U_NAMESPACE_BEGIN 32 33 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 34 35 static const UChar SPACE = 32; // ' ' 36 37 38 /** 39 * Constructs a transliterator with the default delimiters '{' and 40 * '}'. 41 */ 42 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 43 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 44 cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { 45 } 46 47 48 /** 49 * Destructor. 50 */ 51 BreakTransliterator::~BreakTransliterator() { 52 } 53 54 /** 55 * Copy constructor. 56 */ 57 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 58 Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { 59 } 60 61 62 /** 63 * Transliterator API. 64 */ 65 Transliterator* BreakTransliterator::clone(void) const { 66 return new BreakTransliterator(*this); 67 } 68 69 /** 70 * Implements {@link Transliterator#handleTransliterate}. 71 */ 72 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 73 UBool isIncremental ) const { 74 75 UErrorCode status = U_ZERO_ERROR; 76 LocalPointer<BreakIterator> bi; 77 LocalPointer<UVector32> boundaries; 78 79 { 80 Mutex m; 81 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 82 boundaries.moveFrom(nonConstThis->cachedBoundaries); 83 bi.moveFrom(nonConstThis->cachedBI); 84 } 85 if (bi.isNull()) { 86 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); 87 } 88 if (boundaries.isNull()) { 89 boundaries.adoptInstead(new UVector32(status)); 90 } 91 92 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { 93 return; 94 } 95 96 boundaries->removeAllElements(); 97 UnicodeString sText = replaceableAsString(text); 98 bi->setText(sText); 99 bi->preceding(offsets.start); 100 101 // To make things much easier, we will stack the boundaries, and then insert at the end. 102 // generally, we won't need too many, since we will be filtered. 103 104 int32_t boundary; 105 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 106 if (boundary == 0) continue; 107 // HACK: Check to see that preceeding item was a letter 108 109 UChar32 cp = sText.char32At(boundary-1); 110 int type = u_charType(cp); 111 //System.out.println(Integer.toString(cp,16) + " (before): " + type); 112 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 113 114 cp = sText.char32At(boundary); 115 type = u_charType(cp); 116 //System.out.println(Integer.toString(cp,16) + " (after): " + type); 117 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 118 119 boundaries->addElement(boundary, status); 120 // printf("Boundary at %d\n", boundary); 121 } 122 123 int delta = 0; 124 int lastBoundary = 0; 125 126 if (boundaries->size() != 0) { // if we found something, adjust 127 delta = boundaries->size() * fInsertion.length(); 128 lastBoundary = boundaries->lastElementi(); 129 130 // we do this from the end backwards, so that we don't have to keep updating. 131 132 while (boundaries->size() > 0) { 133 boundary = boundaries->popi(); 134 text.handleReplaceBetween(boundary, boundary, fInsertion); 135 } 136 } 137 138 // Now fix up the return values 139 offsets.contextLimit += delta; 140 offsets.limit += delta; 141 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 142 143 // Return break iterator & boundaries vector to the cache. 144 { 145 Mutex m; 146 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 147 if (nonConstThis->cachedBI.isNull()) { 148 nonConstThis->cachedBI.moveFrom(bi); 149 } 150 if (nonConstThis->cachedBoundaries.isNull()) { 151 nonConstThis->cachedBoundaries.moveFrom(boundaries); 152 } 153 } 154 155 // TODO: do something with U_FAILURE(status); 156 // (need to look at transliterators overall, not just here.) 157 } 158 159 // 160 // getInsertion() 161 // 162 const UnicodeString &BreakTransliterator::getInsertion() const { 163 return fInsertion; 164 } 165 166 // 167 // setInsertion() 168 // 169 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { 170 this->fInsertion = insertion; 171 } 172 173 // 174 // replaceableAsString Hack to let break iterators work 175 // on the replaceable text from transliterators. 176 // In practice, the only real Replaceable type that we 177 // will be seeing is UnicodeString, so this function 178 // will normally be efficient. 179 // 180 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 181 UnicodeString s; 182 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 183 if (rs != NULL) { 184 s = *rs; 185 } else { 186 r.extractBetween(0, r.length(), s); 187 } 188 return s; 189 } 190 191 U_NAMESPACE_END 192 193 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 194