1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1997-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 * 9 * File brkiter.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 02/18/97 aliu Converted from OpenClass. Added DONE. 15 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods. 16 ***************************************************************************************** 17 */ 18 19 // ***************************************************************************** 20 // This file was generated from the java source file BreakIterator.java 21 // ***************************************************************************** 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_BREAK_ITERATION 26 27 #include "unicode/rbbi.h" 28 #include "unicode/brkiter.h" 29 #include "unicode/udata.h" 30 #include "unicode/ures.h" 31 #include "unicode/ustring.h" 32 #include "unicode/filteredbrk.h" 33 #include "ucln_cmn.h" 34 #include "cstring.h" 35 #include "umutex.h" 36 #include "servloc.h" 37 #include "locbased.h" 38 #include "uresimp.h" 39 #include "uassert.h" 40 #include "ubrkimpl.h" 41 #include "charstr.h" 42 43 // ***************************************************************************** 44 // class BreakIterator 45 // This class implements methods for finding the location of boundaries in text. 46 // Instances of BreakIterator maintain a current position and scan over text 47 // returning the index of characters where boundaries occur. 48 // ***************************************************************************** 49 50 U_NAMESPACE_BEGIN 51 52 // ------------------------------------- 53 54 BreakIterator* 55 BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status) 56 { 57 char fnbuff[256]; 58 char ext[4]={'\0'}; 59 CharString actualLocale; 60 int32_t size; 61 const UChar* brkfname = NULL; 62 UResourceBundle brkRulesStack; 63 UResourceBundle brkNameStack; 64 UResourceBundle *brkRules = &brkRulesStack; 65 UResourceBundle *brkName = &brkNameStack; 66 RuleBasedBreakIterator *result = NULL; 67 68 if (U_FAILURE(status)) 69 return NULL; 70 71 ures_initStackObject(brkRules); 72 ures_initStackObject(brkName); 73 74 // Get the locale 75 UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status); 76 77 // Get the "boundaries" array. 78 if (U_SUCCESS(status)) { 79 brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status); 80 // Get the string object naming the rules file 81 brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status); 82 // Get the actual string 83 brkfname = ures_getString(brkName, &size, &status); 84 U_ASSERT((size_t)size<sizeof(fnbuff)); 85 if ((size_t)size>=sizeof(fnbuff)) { 86 size=0; 87 if (U_SUCCESS(status)) { 88 status = U_BUFFER_OVERFLOW_ERROR; 89 } 90 } 91 92 // Use the string if we found it 93 if (U_SUCCESS(status) && brkfname) { 94 actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status); 95 96 UChar* extStart=u_strchr(brkfname, 0x002e); 97 int len = 0; 98 if(extStart!=NULL){ 99 len = (int)(extStart-brkfname); 100 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff 101 u_UCharsToChars(brkfname, fnbuff, len); 102 } 103 fnbuff[len]=0; // nul terminate 104 } 105 } 106 107 ures_close(brkRules); 108 ures_close(brkName); 109 110 UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status); 111 if (U_FAILURE(status)) { 112 ures_close(b); 113 return NULL; 114 } 115 116 // Create a RuleBasedBreakIterator 117 result = new RuleBasedBreakIterator(file, status); 118 119 // If there is a result, set the valid locale and actual locale, and the kind 120 if (U_SUCCESS(status) && result != NULL) { 121 U_LOCALE_BASED(locBased, *(BreakIterator*)result); 122 locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), 123 actualLocale.data()); 124 result->setBreakType(kind); 125 } 126 127 ures_close(b); 128 129 if (U_FAILURE(status) && result != NULL) { // Sometimes redundant check, but simple 130 delete result; 131 return NULL; 132 } 133 134 if (result == NULL) { 135 udata_close(file); 136 if (U_SUCCESS(status)) { 137 status = U_MEMORY_ALLOCATION_ERROR; 138 } 139 } 140 141 return result; 142 } 143 144 // Creates a break iterator for word breaks. 145 BreakIterator* U_EXPORT2 146 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) 147 { 148 return createInstance(key, UBRK_WORD, status); 149 } 150 151 // ------------------------------------- 152 153 // Creates a break iterator for line breaks. 154 BreakIterator* U_EXPORT2 155 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) 156 { 157 return createInstance(key, UBRK_LINE, status); 158 } 159 160 // ------------------------------------- 161 162 // Creates a break iterator for character breaks. 163 BreakIterator* U_EXPORT2 164 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status) 165 { 166 return createInstance(key, UBRK_CHARACTER, status); 167 } 168 169 // ------------------------------------- 170 171 // Creates a break iterator for sentence breaks. 172 BreakIterator* U_EXPORT2 173 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) 174 { 175 return createInstance(key, UBRK_SENTENCE, status); 176 } 177 178 // ------------------------------------- 179 180 // Creates a break iterator for title casing breaks. 181 BreakIterator* U_EXPORT2 182 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) 183 { 184 return createInstance(key, UBRK_TITLE, status); 185 } 186 187 // ------------------------------------- 188 189 // Gets all the available locales that has localized text boundary data. 190 const Locale* U_EXPORT2 191 BreakIterator::getAvailableLocales(int32_t& count) 192 { 193 return Locale::getAvailableLocales(count); 194 } 195 196 // ------------------------------------------ 197 // 198 // Default constructor and destructor 199 // 200 //------------------------------------------- 201 202 BreakIterator::BreakIterator() 203 { 204 *validLocale = *actualLocale = 0; 205 } 206 207 BreakIterator::~BreakIterator() 208 { 209 } 210 211 // ------------------------------------------ 212 // 213 // Registration 214 // 215 //------------------------------------------- 216 #if !UCONFIG_NO_SERVICE 217 218 // ------------------------------------- 219 220 class ICUBreakIteratorFactory : public ICUResourceBundleFactory { 221 public: 222 virtual ~ICUBreakIteratorFactory(); 223 protected: 224 virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const { 225 return BreakIterator::makeInstance(loc, kind, status); 226 } 227 }; 228 229 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {} 230 231 // ------------------------------------- 232 233 class ICUBreakIteratorService : public ICULocaleService { 234 public: 235 ICUBreakIteratorService() 236 : ICULocaleService(UNICODE_STRING("Break Iterator", 14)) 237 { 238 UErrorCode status = U_ZERO_ERROR; 239 registerFactory(new ICUBreakIteratorFactory(), status); 240 } 241 242 virtual ~ICUBreakIteratorService(); 243 244 virtual UObject* cloneInstance(UObject* instance) const { 245 return ((BreakIterator*)instance)->clone(); 246 } 247 248 virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const { 249 LocaleKey& lkey = (LocaleKey&)key; 250 int32_t kind = lkey.kind(); 251 Locale loc; 252 lkey.currentLocale(loc); 253 return BreakIterator::makeInstance(loc, kind, status); 254 } 255 256 virtual UBool isDefault() const { 257 return countFactories() == 1; 258 } 259 }; 260 261 ICUBreakIteratorService::~ICUBreakIteratorService() {} 262 263 // ------------------------------------- 264 265 // defined in ucln_cmn.h 266 U_NAMESPACE_END 267 268 static icu::UInitOnce gInitOnce; 269 static icu::ICULocaleService* gService = NULL; 270 271 272 273 /** 274 * Release all static memory held by breakiterator. 275 */ 276 U_CDECL_BEGIN 277 static UBool U_CALLCONV breakiterator_cleanup(void) { 278 #if !UCONFIG_NO_SERVICE 279 if (gService) { 280 delete gService; 281 gService = NULL; 282 } 283 gInitOnce.reset(); 284 #endif 285 return TRUE; 286 } 287 U_CDECL_END 288 U_NAMESPACE_BEGIN 289 290 static void U_CALLCONV 291 initService(void) { 292 gService = new ICUBreakIteratorService(); 293 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup); 294 } 295 296 static ICULocaleService* 297 getService(void) 298 { 299 umtx_initOnce(gInitOnce, &initService); 300 return gService; 301 } 302 303 304 // ------------------------------------- 305 306 static inline UBool 307 hasService(void) 308 { 309 return !gInitOnce.isReset() && getService() != NULL; 310 } 311 312 // ------------------------------------- 313 314 URegistryKey U_EXPORT2 315 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) 316 { 317 ICULocaleService *service = getService(); 318 if (service == NULL) { 319 status = U_MEMORY_ALLOCATION_ERROR; 320 return NULL; 321 } 322 return service->registerInstance(toAdopt, locale, kind, status); 323 } 324 325 // ------------------------------------- 326 327 UBool U_EXPORT2 328 BreakIterator::unregister(URegistryKey key, UErrorCode& status) 329 { 330 if (U_SUCCESS(status)) { 331 if (hasService()) { 332 return gService->unregister(key, status); 333 } 334 status = U_MEMORY_ALLOCATION_ERROR; 335 } 336 return FALSE; 337 } 338 339 // ------------------------------------- 340 341 StringEnumeration* U_EXPORT2 342 BreakIterator::getAvailableLocales(void) 343 { 344 ICULocaleService *service = getService(); 345 if (service == NULL) { 346 return NULL; 347 } 348 return service->getAvailableLocales(); 349 } 350 #endif /* UCONFIG_NO_SERVICE */ 351 352 // ------------------------------------- 353 354 BreakIterator* 355 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status) 356 { 357 if (U_FAILURE(status)) { 358 return NULL; 359 } 360 361 #if !UCONFIG_NO_SERVICE 362 if (hasService()) { 363 Locale actualLoc(""); 364 BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status); 365 // TODO: The way the service code works in ICU 2.8 is that if 366 // there is a real registered break iterator, the actualLoc 367 // will be populated, but if the handleDefault path is taken 368 // (because nothing is registered that can handle the 369 // requested locale) then the actualLoc comes back empty. In 370 // that case, the returned object already has its actual/valid 371 // locale data populated (by makeInstance, which is what 372 // handleDefault calls), so we don't touch it. YES, A COMMENT 373 // THIS LONG is a sign of bad code -- so the action item is to 374 // revisit this in ICU 3.0 and clean it up/fix it/remove it. 375 if (U_SUCCESS(status) && (result != NULL) && *actualLoc.getName() != 0) { 376 U_LOCALE_BASED(locBased, *result); 377 locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName()); 378 } 379 return result; 380 } 381 else 382 #endif 383 { 384 return makeInstance(loc, kind, status); 385 } 386 } 387 388 // ------------------------------------- 389 enum { kKeyValueLenMax = 32 }; 390 391 BreakIterator* 392 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) 393 { 394 395 if (U_FAILURE(status)) { 396 return NULL; 397 } 398 char lbType[kKeyValueLenMax]; 399 400 BreakIterator *result = NULL; 401 switch (kind) { 402 case UBRK_CHARACTER: 403 result = BreakIterator::buildInstance(loc, "grapheme", kind, status); 404 break; 405 case UBRK_WORD: 406 result = BreakIterator::buildInstance(loc, "word", kind, status); 407 break; 408 case UBRK_LINE: 409 uprv_strcpy(lbType, "line"); 410 { 411 char lbKeyValue[kKeyValueLenMax] = {0}; 412 UErrorCode kvStatus = U_ZERO_ERROR; 413 int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); 414 if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { 415 uprv_strcat(lbType, "_"); 416 uprv_strcat(lbType, lbKeyValue); 417 } 418 } 419 result = BreakIterator::buildInstance(loc, lbType, kind, status); 420 break; 421 case UBRK_SENTENCE: 422 result = BreakIterator::buildInstance(loc, "sentence", kind, status); 423 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 424 { 425 char ssKeyValue[kKeyValueLenMax] = {0}; 426 UErrorCode kvStatus = U_ZERO_ERROR; 427 int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus); 428 if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) { 429 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus); 430 if (U_SUCCESS(kvStatus)) { 431 result = fbiBuilder->build(result, status); 432 delete fbiBuilder; 433 } 434 } 435 } 436 #endif 437 break; 438 case UBRK_TITLE: 439 result = BreakIterator::buildInstance(loc, "title", kind, status); 440 break; 441 default: 442 status = U_ILLEGAL_ARGUMENT_ERROR; 443 } 444 445 if (U_FAILURE(status)) { 446 return NULL; 447 } 448 449 return result; 450 } 451 452 Locale 453 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const { 454 U_LOCALE_BASED(locBased, *this); 455 return locBased.getLocale(type, status); 456 } 457 458 const char * 459 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const { 460 U_LOCALE_BASED(locBased, *this); 461 return locBased.getLocaleID(type, status); 462 } 463 464 465 // This implementation of getRuleStatus is a do-nothing stub, here to 466 // provide a default implementation for any derived BreakIterator classes that 467 // do not implement it themselves. 468 int32_t BreakIterator::getRuleStatus() const { 469 return 0; 470 } 471 472 // This implementation of getRuleStatusVec is a do-nothing stub, here to 473 // provide a default implementation for any derived BreakIterator classes that 474 // do not implement it themselves. 475 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) { 476 if (U_FAILURE(status)) { 477 return 0; 478 } 479 if (capacity < 1) { 480 status = U_BUFFER_OVERFLOW_ERROR; 481 return 1; 482 } 483 *fillInVec = 0; 484 return 1; 485 } 486 487 BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) { 488 U_LOCALE_BASED(locBased, (*this)); 489 locBased.setLocaleIDs(valid, actual); 490 } 491 492 U_NAMESPACE_END 493 494 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 495 496 //eof 497