Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 1997-2015, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 *
      9 * File brkiter.cpp
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   02/18/97    aliu        Converted from OpenClass.  Added DONE.
     15 *   01/13/2000  helena      Added UErrorCode parameter to createXXXInstance methods.
     16 *****************************************************************************************
     17 */
     18 
     19 // *****************************************************************************
     20 // This file was generated from the java source file BreakIterator.java
     21 // *****************************************************************************
     22 
     23 #include "unicode/utypes.h"
     24 
     25 #if !UCONFIG_NO_BREAK_ITERATION
     26 
     27 #include "unicode/rbbi.h"
     28 #include "unicode/brkiter.h"
     29 #include "unicode/udata.h"
     30 #include "unicode/ures.h"
     31 #include "unicode/ustring.h"
     32 #include "unicode/filteredbrk.h"
     33 #include "ucln_cmn.h"
     34 #include "cstring.h"
     35 #include "umutex.h"
     36 #include "servloc.h"
     37 #include "locbased.h"
     38 #include "uresimp.h"
     39 #include "uassert.h"
     40 #include "ubrkimpl.h"
     41 #include "charstr.h"
     42 
     43 // *****************************************************************************
     44 // class BreakIterator
     45 // This class implements methods for finding the location of boundaries in text.
     46 // Instances of BreakIterator maintain a current position and scan over text
     47 // returning the index of characters where boundaries occur.
     48 // *****************************************************************************
     49 
     50 U_NAMESPACE_BEGIN
     51 
     52 // -------------------------------------
     53 
     54 BreakIterator*
     55 BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
     56 {
     57     char fnbuff[256];
     58     char ext[4]={'\0'};
     59     CharString actualLocale;
     60     int32_t size;
     61     const UChar* brkfname = NULL;
     62     UResourceBundle brkRulesStack;
     63     UResourceBundle brkNameStack;
     64     UResourceBundle *brkRules = &brkRulesStack;
     65     UResourceBundle *brkName  = &brkNameStack;
     66     RuleBasedBreakIterator *result = NULL;
     67 
     68     if (U_FAILURE(status))
     69         return NULL;
     70 
     71     ures_initStackObject(brkRules);
     72     ures_initStackObject(brkName);
     73 
     74     // Get the locale
     75     UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
     76 
     77     // Get the "boundaries" array.
     78     if (U_SUCCESS(status)) {
     79         brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
     80         // Get the string object naming the rules file
     81         brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
     82         // Get the actual string
     83         brkfname = ures_getString(brkName, &size, &status);
     84         U_ASSERT((size_t)size<sizeof(fnbuff));
     85         if ((size_t)size>=sizeof(fnbuff)) {
     86             size=0;
     87             if (U_SUCCESS(status)) {
     88                 status = U_BUFFER_OVERFLOW_ERROR;
     89             }
     90         }
     91 
     92         // Use the string if we found it
     93         if (U_SUCCESS(status) && brkfname) {
     94             actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
     95 
     96             UChar* extStart=u_strchr(brkfname, 0x002e);
     97             int len = 0;
     98             if(extStart!=NULL){
     99                 len = (int)(extStart-brkfname);
    100                 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
    101                 u_UCharsToChars(brkfname, fnbuff, len);
    102             }
    103             fnbuff[len]=0; // nul terminate
    104         }
    105     }
    106 
    107     ures_close(brkRules);
    108     ures_close(brkName);
    109 
    110     UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
    111     if (U_FAILURE(status)) {
    112         ures_close(b);
    113         return NULL;
    114     }
    115 
    116     // Create a RuleBasedBreakIterator
    117     result = new RuleBasedBreakIterator(file, status);
    118 
    119     // If there is a result, set the valid locale and actual locale, and the kind
    120     if (U_SUCCESS(status) && result != NULL) {
    121         U_LOCALE_BASED(locBased, *(BreakIterator*)result);
    122         locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
    123                               actualLocale.data());
    124     }
    125 
    126     ures_close(b);
    127 
    128     if (U_FAILURE(status) && result != NULL) {  // Sometimes redundant check, but simple
    129         delete result;
    130         return NULL;
    131     }
    132 
    133     if (result == NULL) {
    134         udata_close(file);
    135         if (U_SUCCESS(status)) {
    136             status = U_MEMORY_ALLOCATION_ERROR;
    137         }
    138     }
    139 
    140     return result;
    141 }
    142 
    143 // Creates a break iterator for word breaks.
    144 BreakIterator* U_EXPORT2
    145 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
    146 {
    147     return createInstance(key, UBRK_WORD, status);
    148 }
    149 
    150 // -------------------------------------
    151 
    152 // Creates a break iterator  for line breaks.
    153 BreakIterator* U_EXPORT2
    154 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
    155 {
    156     return createInstance(key, UBRK_LINE, status);
    157 }
    158 
    159 // -------------------------------------
    160 
    161 // Creates a break iterator  for character breaks.
    162 BreakIterator* U_EXPORT2
    163 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
    164 {
    165     return createInstance(key, UBRK_CHARACTER, status);
    166 }
    167 
    168 // -------------------------------------
    169 
    170 // Creates a break iterator  for sentence breaks.
    171 BreakIterator* U_EXPORT2
    172 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
    173 {
    174     return createInstance(key, UBRK_SENTENCE, status);
    175 }
    176 
    177 // -------------------------------------
    178 
    179 // Creates a break iterator for title casing breaks.
    180 BreakIterator* U_EXPORT2
    181 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
    182 {
    183     return createInstance(key, UBRK_TITLE, status);
    184 }
    185 
    186 // -------------------------------------
    187 
    188 // Gets all the available locales that has localized text boundary data.
    189 const Locale* U_EXPORT2
    190 BreakIterator::getAvailableLocales(int32_t& count)
    191 {
    192     return Locale::getAvailableLocales(count);
    193 }
    194 
    195 // ------------------------------------------
    196 //
    197 // Constructors, destructor and assignment operator
    198 //
    199 //-------------------------------------------
    200 
    201 BreakIterator::BreakIterator()
    202 {
    203     *validLocale = *actualLocale = 0;
    204 }
    205 
    206 BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
    207     uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    208     uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    209 }
    210 
    211 BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    212     if (this != &other) {
    213         uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    214         uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    215     }
    216     return *this;
    217 }
    218 
    219 BreakIterator::~BreakIterator()
    220 {
    221 }
    222 
    223 // ------------------------------------------
    224 //
    225 // Registration
    226 //
    227 //-------------------------------------------
    228 #if !UCONFIG_NO_SERVICE
    229 
    230 // -------------------------------------
    231 
    232 class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
    233 public:
    234     virtual ~ICUBreakIteratorFactory();
    235 protected:
    236     virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const {
    237         return BreakIterator::makeInstance(loc, kind, status);
    238     }
    239 };
    240 
    241 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
    242 
    243 // -------------------------------------
    244 
    245 class ICUBreakIteratorService : public ICULocaleService {
    246 public:
    247     ICUBreakIteratorService()
    248         : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
    249     {
    250         UErrorCode status = U_ZERO_ERROR;
    251         registerFactory(new ICUBreakIteratorFactory(), status);
    252     }
    253 
    254     virtual ~ICUBreakIteratorService();
    255 
    256     virtual UObject* cloneInstance(UObject* instance) const {
    257         return ((BreakIterator*)instance)->clone();
    258     }
    259 
    260     virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const {
    261         LocaleKey& lkey = (LocaleKey&)key;
    262         int32_t kind = lkey.kind();
    263         Locale loc;
    264         lkey.currentLocale(loc);
    265         return BreakIterator::makeInstance(loc, kind, status);
    266     }
    267 
    268     virtual UBool isDefault() const {
    269         return countFactories() == 1;
    270     }
    271 };
    272 
    273 ICUBreakIteratorService::~ICUBreakIteratorService() {}
    274 
    275 // -------------------------------------
    276 
    277 // defined in ucln_cmn.h
    278 U_NAMESPACE_END
    279 
    280 static icu::UInitOnce gInitOnceBrkiter;
    281 static icu::ICULocaleService* gService = NULL;
    282 
    283 
    284 
    285 /**
    286  * Release all static memory held by breakiterator.
    287  */
    288 U_CDECL_BEGIN
    289 static UBool U_CALLCONV breakiterator_cleanup(void) {
    290 #if !UCONFIG_NO_SERVICE
    291     if (gService) {
    292         delete gService;
    293         gService = NULL;
    294     }
    295     gInitOnceBrkiter.reset();
    296 #endif
    297     return TRUE;
    298 }
    299 U_CDECL_END
    300 U_NAMESPACE_BEGIN
    301 
    302 static void U_CALLCONV
    303 initService(void) {
    304     gService = new ICUBreakIteratorService();
    305     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
    306 }
    307 
    308 static ICULocaleService*
    309 getService(void)
    310 {
    311     umtx_initOnce(gInitOnceBrkiter, &initService);
    312     return gService;
    313 }
    314 
    315 
    316 // -------------------------------------
    317 
    318 static inline UBool
    319 hasService(void)
    320 {
    321     return !gInitOnceBrkiter.isReset() && getService() != NULL;
    322 }
    323 
    324 // -------------------------------------
    325 
    326 URegistryKey U_EXPORT2
    327 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
    328 {
    329     ICULocaleService *service = getService();
    330     if (service == NULL) {
    331         status = U_MEMORY_ALLOCATION_ERROR;
    332         return NULL;
    333     }
    334     return service->registerInstance(toAdopt, locale, kind, status);
    335 }
    336 
    337 // -------------------------------------
    338 
    339 UBool U_EXPORT2
    340 BreakIterator::unregister(URegistryKey key, UErrorCode& status)
    341 {
    342     if (U_SUCCESS(status)) {
    343         if (hasService()) {
    344             return gService->unregister(key, status);
    345         }
    346         status = U_MEMORY_ALLOCATION_ERROR;
    347     }
    348     return FALSE;
    349 }
    350 
    351 // -------------------------------------
    352 
    353 StringEnumeration* U_EXPORT2
    354 BreakIterator::getAvailableLocales(void)
    355 {
    356     ICULocaleService *service = getService();
    357     if (service == NULL) {
    358         return NULL;
    359     }
    360     return service->getAvailableLocales();
    361 }
    362 #endif /* UCONFIG_NO_SERVICE */
    363 
    364 // -------------------------------------
    365 
    366 BreakIterator*
    367 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
    368 {
    369     if (U_FAILURE(status)) {
    370         return NULL;
    371     }
    372 
    373 #if !UCONFIG_NO_SERVICE
    374     if (hasService()) {
    375         Locale actualLoc("");
    376         BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
    377         // TODO: The way the service code works in ICU 2.8 is that if
    378         // there is a real registered break iterator, the actualLoc
    379         // will be populated, but if the handleDefault path is taken
    380         // (because nothing is registered that can handle the
    381         // requested locale) then the actualLoc comes back empty.  In
    382         // that case, the returned object already has its actual/valid
    383         // locale data populated (by makeInstance, which is what
    384         // handleDefault calls), so we don't touch it.  YES, A COMMENT
    385         // THIS LONG is a sign of bad code -- so the action item is to
    386         // revisit this in ICU 3.0 and clean it up/fix it/remove it.
    387         if (U_SUCCESS(status) && (result != NULL) && *actualLoc.getName() != 0) {
    388             U_LOCALE_BASED(locBased, *result);
    389             locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
    390         }
    391         return result;
    392     }
    393     else
    394 #endif
    395     {
    396         return makeInstance(loc, kind, status);
    397     }
    398 }
    399 
    400 // -------------------------------------
    401 enum { kKeyValueLenMax = 32 };
    402 
    403 BreakIterator*
    404 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
    405 {
    406 
    407     if (U_FAILURE(status)) {
    408         return NULL;
    409     }
    410     char lbType[kKeyValueLenMax];
    411 
    412     BreakIterator *result = NULL;
    413     switch (kind) {
    414     case UBRK_CHARACTER:
    415         result = BreakIterator::buildInstance(loc, "grapheme", status);
    416         break;
    417     case UBRK_WORD:
    418         result = BreakIterator::buildInstance(loc, "word", status);
    419         break;
    420     case UBRK_LINE:
    421         uprv_strcpy(lbType, "line");
    422         {
    423             char lbKeyValue[kKeyValueLenMax] = {0};
    424             UErrorCode kvStatus = U_ZERO_ERROR;
    425             int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
    426             if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
    427                 uprv_strcat(lbType, "_");
    428                 uprv_strcat(lbType, lbKeyValue);
    429             }
    430         }
    431         result = BreakIterator::buildInstance(loc, lbType, status);
    432         break;
    433     case UBRK_SENTENCE:
    434         result = BreakIterator::buildInstance(loc, "sentence", status);
    435 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
    436         {
    437             char ssKeyValue[kKeyValueLenMax] = {0};
    438             UErrorCode kvStatus = U_ZERO_ERROR;
    439             int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
    440             if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
    441                 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
    442                 if (U_SUCCESS(kvStatus)) {
    443                     result = fbiBuilder->build(result, status);
    444                     delete fbiBuilder;
    445                 }
    446             }
    447         }
    448 #endif
    449         break;
    450     case UBRK_TITLE:
    451         result = BreakIterator::buildInstance(loc, "title", status);
    452         break;
    453     default:
    454         status = U_ILLEGAL_ARGUMENT_ERROR;
    455     }
    456 
    457     if (U_FAILURE(status)) {
    458         return NULL;
    459     }
    460 
    461     return result;
    462 }
    463 
    464 Locale
    465 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    466     U_LOCALE_BASED(locBased, *this);
    467     return locBased.getLocale(type, status);
    468 }
    469 
    470 const char *
    471 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    472     U_LOCALE_BASED(locBased, *this);
    473     return locBased.getLocaleID(type, status);
    474 }
    475 
    476 
    477 // This implementation of getRuleStatus is a do-nothing stub, here to
    478 // provide a default implementation for any derived BreakIterator classes that
    479 // do not implement it themselves.
    480 int32_t BreakIterator::getRuleStatus() const {
    481     return 0;
    482 }
    483 
    484 // This implementation of getRuleStatusVec is a do-nothing stub, here to
    485 // provide a default implementation for any derived BreakIterator classes that
    486 // do not implement it themselves.
    487 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
    488     if (U_FAILURE(status)) {
    489         return 0;
    490     }
    491     if (capacity < 1) {
    492         status = U_BUFFER_OVERFLOW_ERROR;
    493         return 1;
    494     }
    495     *fillInVec = 0;
    496     return 1;
    497 }
    498 
    499 BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
    500   U_LOCALE_BASED(locBased, (*this));
    501   locBased.setLocaleIDs(valid, actual);
    502 }
    503 
    504 U_NAMESPACE_END
    505 
    506 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    507 
    508 //eof
    509