1 /* 2 ******************************************************************************* 3 * Copyright (C) 2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/filteredbrk.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION 11 12 #include <unicode/ucharstriebuilder.h> 13 14 #include <set> 15 #include <string> 16 #include <functional> 17 #include "uresimp.h" 18 #include "ubrkimpl.h" 19 20 U_NAMESPACE_BEGIN 21 22 using namespace std; 23 24 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie 25 static const int32_t kMATCH = (1<<1); //< exact match - skip this one. 26 static const int32_t kSuppressInReverse = (1<<0); 27 static const int32_t kAddToForward = (1<<1); 28 static const UChar kFULLSTOP = 0x002E; // '.' 29 30 class ULISentenceBreakIterator : public BreakIterator { 31 public: 32 ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); 33 virtual ~ULISentenceBreakIterator() {} 34 ULISentenceBreakIterator(const ULISentenceBreakIterator& other); 35 private: 36 LocalPointer<BreakIterator> fDelegate; 37 LocalUTextPointer fText; 38 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. 39 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." 40 41 /* -- subclass interface -- */ 42 public: 43 /* -- cloning and other subclass stuff -- */ 44 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, 45 int32_t &/*BufferSize*/, 46 UErrorCode &status) { 47 // for now - always deep clone 48 status = U_SAFECLONE_ALLOCATED_WARNING; 49 return clone(); 50 } 51 virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); } 52 virtual UClassID getDynamicClassID(void) const { return NULL; } 53 virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; } 54 55 /* -- text modifying -- */ 56 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } 57 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } 58 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } 59 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } 60 61 /* -- other functions that are just delegated -- */ 62 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } 63 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } 64 65 /* -- ITERATION -- */ 66 virtual int32_t first(void) { return fDelegate->first(); } 67 virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); } 68 virtual int32_t previous(void) { return fDelegate->previous(); } 69 virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); } 70 virtual int32_t current(void) const { return fDelegate->current(); } 71 72 virtual int32_t next(void); 73 74 virtual int32_t next(int32_t n) { return fDelegate->next(n); } 75 virtual int32_t following(int32_t offset) { return fDelegate->following(offset); } 76 virtual int32_t last(void) { return fDelegate->last(); } 77 78 }; 79 80 ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other) 81 : BreakIterator(other), fDelegate(other.fDelegate->clone()) 82 { 83 /* 84 TODO: not able to clone Tries. Should be a refcounted hidden master instead. 85 if(other.fBackwardsTrie.isValid()) { 86 fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone()); 87 } 88 if(other.fForwardsPartialTrie.isValid()) { 89 fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone()); 90 } 91 */ 92 } 93 94 95 ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : 96 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), 97 fDelegate(adopt), 98 fBackwardsTrie(backwards), 99 fForwardsPartialTrie(forwards) 100 { 101 // all set.. 102 } 103 104 int32_t ULISentenceBreakIterator::next() { 105 int32_t n = fDelegate->next(); 106 if(n == UBRK_DONE || // at end or 107 fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 108 return n; 109 } 110 // OK, do we need to break here? 111 UErrorCode status = U_ZERO_ERROR; 112 // refresh text 113 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); 114 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 115 do { // outer loop runs once per underlying break (from fDelegate). 116 // loops while 'n' points to an exception. 117 utext_setNativeIndex(fText.getAlias(), n); // from n.. 118 fBackwardsTrie->reset(); 119 UChar32 uch; 120 //if(debug2) u_printf(" n@ %d\n", n); 121 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 122 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? 123 // TODO only do this the 1st time? 124 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); 125 } else { 126 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); 127 uch = utext_next32(fText.getAlias()); 128 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); 129 } 130 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; 131 132 int32_t bestPosn = -1; 133 int32_t bestValue = -1; 134 135 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. 136 USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie 137 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far 138 bestPosn = utext_getNativeIndex(fText.getAlias()); 139 bestValue = fBackwardsTrie->getValue(); 140 } 141 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); 142 } 143 144 if(USTRINGTRIE_MATCHES(r)) { // exact match? 145 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 146 bestValue = fBackwardsTrie->getValue(); 147 bestPosn = utext_getNativeIndex(fText.getAlias()); 148 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 149 } 150 151 if(bestPosn>=0) { 152 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 153 154 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? 155 //int32_t bestValue = fBackwardsTrie->getValue(); 156 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); 157 158 if(bestValue == kMATCH) { // exact match! 159 //if(debug2) u_printf(" exact backward match\n"); 160 n = fDelegate->next(); // skip this one. Find the next lowerlevel break. 161 if(n==UBRK_DONE) return n; 162 continue; // See if the next is another exception. 163 } else if(bestValue == kPARTIAL 164 && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie 165 //if(debug2) u_printf(" partial backward match\n"); 166 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 167 // to see if it matches something going forward. 168 fForwardsPartialTrie->reset(); 169 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; 170 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. 171 //if(debug2) u_printf("Retrying at %d\n", bestPosn); 172 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && 173 USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) { 174 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); 175 } 176 if(USTRINGTRIE_MATCHES(rfwd)) { 177 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); 178 // only full matches here, nothing to check 179 // skip the next: 180 n = fDelegate->next(); 181 if(n==UBRK_DONE) return n; 182 continue; 183 } else { 184 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); 185 // no match (no exception) -return the 'underlying' break 186 return n; 187 } 188 } else { 189 return n; // internal error and/or no forwards trie 190 } 191 } else { 192 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match 193 return n; // No match - so exit. Not an exception. 194 } 195 } while(n != UBRK_DONE); 196 return n; 197 } 198 199 U_NAMESPACE_END 200 201 #if 0 202 // Would improve performance - but, platform issues. 203 // for the 'set' 204 namespace std { 205 template <> struct hash<icu::UnicodeString> { 206 size_t operator()( const UnicodeString& str ) const { 207 return (size_t)str.hashCode(); 208 } 209 }; 210 } 211 #endif 212 213 U_NAMESPACE_BEGIN 214 215 class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { 216 public: 217 virtual ~SimpleFilteredBreakIteratorBuilder(); 218 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); 219 SimpleFilteredBreakIteratorBuilder(); 220 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 221 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 222 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); 223 private: 224 set<UnicodeString> fSet; 225 }; 226 227 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() 228 { 229 } 230 231 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) 232 : fSet() 233 { 234 if(U_SUCCESS(status)) { 235 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status)); 236 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status)); 237 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status)); 238 if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it. 239 240 LocalUResourceBundlePointer strs; 241 UErrorCode subStatus = status; 242 do { 243 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); 244 if(strs.isValid() && U_SUCCESS(subStatus)) { 245 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); 246 suppressBreakAfter(str, status); // load the string 247 } 248 } while (strs.isValid() && U_SUCCESS(subStatus)); 249 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { 250 status = subStatus; 251 } 252 } 253 } 254 255 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder() 256 : fSet() 257 { 258 } 259 260 UBool 261 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 262 { 263 if( U_FAILURE(status) ) return FALSE; 264 return fSet.insert(exception).second; 265 } 266 267 UBool 268 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 269 { 270 if( U_FAILURE(status) ) return FALSE; 271 return ((fSet.erase(exception)) != 0); 272 } 273 274 BreakIterator * 275 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { 276 LocalPointer<BreakIterator> adopt(adoptBreakIterator); 277 278 if(U_FAILURE(status)) { 279 return NULL; 280 } 281 282 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status)); 283 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status)); 284 285 int32_t revCount = 0; 286 int32_t fwdCount = 0; 287 288 int32_t subCount = fSet.size(); 289 LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]); 290 LocalArray<int> partials(new int[subCount]); 291 292 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. 293 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." 294 295 int n=0; 296 for ( set<UnicodeString>::iterator i = fSet.begin(); 297 i != fSet.end(); 298 i++) { 299 const UnicodeString &abbr = *i; 300 ustrs[n] = abbr; 301 partials[n] = 0; // default: not partial 302 n++; 303 } 304 // first pass - find partials. 305 for(int i=0;i<subCount;i++) { 306 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations 307 if(nn>-1 && (nn+1)!=ustrs[i].length()) { 308 //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer()); 309 // is partial. 310 // is it unique? 311 int sameAs = -1; 312 for(int j=0;j<subCount;j++) { 313 if(j==i) continue; 314 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { 315 //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1); 316 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn 317 if(partials[j]==0) { // hasn't been processed yet 318 partials[j] = kSuppressInReverse | kAddToForward; 319 //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer()); 320 } else if(partials[j] & kSuppressInReverse) { 321 sameAs = j; // the other entry is already in the reverse table. 322 } 323 } 324 } 325 //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n", ustrs[i].getTerminatedBuffer(), sameAs, partials[i]); 326 UnicodeString prefix(ustrs[i], 0, nn+1); 327 if(sameAs == -1 && partials[i] == 0) { 328 // first one - add the prefix to the reverse table. 329 prefix.reverse(); 330 builder->add(prefix, kPARTIAL, status); 331 revCount++; 332 //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status)); 333 partials[i] = kSuppressInReverse | kAddToForward; 334 } else { 335 //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer()); 336 } 337 } 338 } 339 for(int i=0;i<subCount;i++) { 340 if(partials[i]==0) { 341 ustrs[i].reverse(); 342 builder->add(ustrs[i], kMATCH, status); 343 revCount++; 344 //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status)); 345 } else { 346 //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer()); 347 348 // an optimization would be to only add the portion after the '.' 349 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, 350 // instead of "Ph.D." since we already know the "Ph." part is a match. 351 // would need the trie to be able to hold 0-length strings, though. 352 builder2->add(ustrs[i], kMATCH, status); // forward 353 fwdCount++; 354 //ustrs[i].reverse(); 355 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); 356 } 357 } 358 //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount); 359 360 if(revCount>0) { 361 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); 362 if(U_FAILURE(status)) { 363 //printf("Error %s building backwards\n", u_errorName(status)); 364 return NULL; 365 } 366 } 367 368 if(fwdCount>0) { 369 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); 370 if(U_FAILURE(status)) { 371 //printf("Error %s building forwards\n", u_errorName(status)); 372 return NULL; 373 } 374 } 375 376 return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); 377 } 378 379 380 // ----------- 381 382 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { 383 } 384 385 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { 386 } 387 388 FilteredBreakIteratorBuilder * 389 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { 390 if(U_FAILURE(status)) return NULL; 391 392 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status)); 393 if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; 394 return ret.orphan(); 395 } 396 397 FilteredBreakIteratorBuilder * 398 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { 399 if(U_FAILURE(status)) return NULL; 400 401 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder()); 402 if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; 403 return ret.orphan(); 404 } 405 406 U_NAMESPACE_END 407 408 #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION 409