Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  ************************************************************************************
      5  * Copyright (C) 2006-2016, International Business Machines Corporation
      6  * and others. All Rights Reserved.
      7  ************************************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_BREAK_ITERATION
     13 
     14 #include "unicode/uchar.h"
     15 #include "unicode/uniset.h"
     16 #include "unicode/chariter.h"
     17 #include "unicode/ures.h"
     18 #include "unicode/udata.h"
     19 #include "unicode/putil.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/uscript.h"
     22 #include "unicode/ucharstrie.h"
     23 #include "unicode/bytestrie.h"
     24 
     25 #include "brkeng.h"
     26 #include "cmemory.h"
     27 #include "dictbe.h"
     28 #include "charstr.h"
     29 #include "dictionarydata.h"
     30 #include "mutex.h"
     31 #include "uvector.h"
     32 #include "umutex.h"
     33 #include "uresimp.h"
     34 #include "ubrkimpl.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 /*
     39  ******************************************************************
     40  */
     41 
     42 LanguageBreakEngine::LanguageBreakEngine() {
     43 }
     44 
     45 LanguageBreakEngine::~LanguageBreakEngine() {
     46 }
     47 
     48 /*
     49  ******************************************************************
     50  */
     51 
     52 LanguageBreakFactory::LanguageBreakFactory() {
     53 }
     54 
     55 LanguageBreakFactory::~LanguageBreakFactory() {
     56 }
     57 
     58 /*
     59  ******************************************************************
     60  */
     61 
     62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
     63     (void)status;
     64 }
     65 
     66 UnhandledEngine::~UnhandledEngine() {
     67     delete fHandled;
     68     fHandled = nullptr;
     69 }
     70 
     71 UBool
     72 UnhandledEngine::handles(UChar32 c) const {
     73     return fHandled && fHandled->contains(c);
     74 }
     75 
     76 int32_t
     77 UnhandledEngine::findBreaks( UText *text,
     78                              int32_t /* startPos */,
     79                              int32_t endPos,
     80                              UVector32 &/*foundBreaks*/ ) const {
     81     UChar32 c = utext_current32(text);
     82     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
     83         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
     84         c = utext_current32(text);
     85     }
     86     return 0;
     87 }
     88 
     89 void
     90 UnhandledEngine::handleCharacter(UChar32 c) {
     91     if (fHandled == nullptr) {
     92         fHandled = new UnicodeSet();
     93         if (fHandled == nullptr) {
     94             return;
     95         }
     96     }
     97     if (!fHandled->contains(c)) {
     98         UErrorCode status = U_ZERO_ERROR;
     99         // Apply the entire script of the character.
    100         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    101         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
    102     }
    103 }
    104 
    105 /*
    106  ******************************************************************
    107  */
    108 
    109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    110     fEngines = 0;
    111 }
    112 
    113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    114     if (fEngines != 0) {
    115         delete fEngines;
    116     }
    117 }
    118 
    119 U_NAMESPACE_END
    120 U_CDECL_BEGIN
    121 static void U_CALLCONV _deleteEngine(void *obj) {
    122     delete (const icu::LanguageBreakEngine *) obj;
    123 }
    124 U_CDECL_END
    125 U_NAMESPACE_BEGIN
    126 
// File-scope mutex held by ICULanguageBreakFactory::getEngineFor() while it
// searches or updates the factory's engine stack (fEngines).
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
    128 
    129 const LanguageBreakEngine *
    130 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    131     const LanguageBreakEngine *lbe = NULL;
    132     UErrorCode  status = U_ZERO_ERROR;
    133 
    134     Mutex m(&gBreakEngineMutex);
    135 
    136     if (fEngines == NULL) {
    137         UStack  *engines = new UStack(_deleteEngine, NULL, status);
    138         if (U_FAILURE(status) || engines == NULL) {
    139             // Note: no way to return error code to caller.
    140             delete engines;
    141             return NULL;
    142         }
    143         fEngines = engines;
    144     } else {
    145         int32_t i = fEngines->size();
    146         while (--i >= 0) {
    147             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
    148             if (lbe != NULL && lbe->handles(c)) {
    149                 return lbe;
    150             }
    151         }
    152     }
    153 
    154     // We didn't find an engine. Create one.
    155     lbe = loadEngineFor(c);
    156     if (lbe != NULL) {
    157         fEngines->push((void *)lbe, status);
    158     }
    159     return lbe;
    160 }
    161 
    162 const LanguageBreakEngine *
    163 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    164     UErrorCode status = U_ZERO_ERROR;
    165     UScriptCode code = uscript_getScript(c, &status);
    166     if (U_SUCCESS(status)) {
    167         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    168         if (m != NULL) {
    169             const LanguageBreakEngine *engine = NULL;
    170             switch(code) {
    171             case USCRIPT_THAI:
    172                 engine = new ThaiBreakEngine(m, status);
    173                 break;
    174             case USCRIPT_LAO:
    175                 engine = new LaoBreakEngine(m, status);
    176                 break;
    177             case USCRIPT_MYANMAR:
    178                 engine = new BurmeseBreakEngine(m, status);
    179                 break;
    180             case USCRIPT_KHMER:
    181                 engine = new KhmerBreakEngine(m, status);
    182                 break;
    183 
    184 #if !UCONFIG_NO_NORMALIZATION
    185                 // CJK not available w/o normalization
    186             case USCRIPT_HANGUL:
    187                 engine = new CjkBreakEngine(m, kKorean, status);
    188                 break;
    189 
    190             // use same BreakEngine and dictionary for both Chinese and Japanese
    191             case USCRIPT_HIRAGANA:
    192             case USCRIPT_KATAKANA:
    193             case USCRIPT_HAN:
    194                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
    195                 break;
    196 #if 0
    197             // TODO: Have to get some characters with script=common handled
    198             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    199             // them to CjkBreakEngine does not work. The engine has to
    200             // special-case them.
    201             case USCRIPT_COMMON:
    202             {
    203                 UBlockCode block = ublock_getCode(code);
    204                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
    205                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
    206                 break;
    207             }
    208 #endif
    209 #endif
    210 
    211             default:
    212                 break;
    213             }
    214             if (engine == NULL) {
    215                 delete m;
    216             }
    217             else if (U_FAILURE(status)) {
    218                 delete engine;
    219                 engine = NULL;
    220             }
    221             return engine;
    222         }
    223     }
    224     return NULL;
    225 }
    226 
    227 DictionaryMatcher *
    228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    229     UErrorCode status = U_ZERO_ERROR;
    230     // open root from brkitr tree.
    231     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    232     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    233     int32_t dictnlength = 0;
    234     const UChar *dictfname =
    235         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
    236     if (U_FAILURE(status)) {
    237         ures_close(b);
    238         return NULL;
    239     }
    240     CharString dictnbuf;
    241     CharString ext;
    242     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
    243     if (extStart != NULL) {
    244         int32_t len = (int32_t)(extStart - dictfname);
    245         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
    246         dictnlength = len;
    247     }
    248     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
    249     ures_close(b);
    250 
    251     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
    252     if (U_SUCCESS(status)) {
    253         // build trie
    254         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    255         const int32_t *indexes = (const int32_t *)data;
    256         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    257         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    258         DictionaryMatcher *m = NULL;
    259         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    260             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
    261             const char *characters = (const char *)(data + offset);
    262             m = new BytesDictionaryMatcher(characters, transform, file);
    263         }
    264         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    265             const UChar *characters = (const UChar *)(data + offset);
    266             m = new UCharsDictionaryMatcher(characters, file);
    267         }
    268         if (m == NULL) {
    269             // no matcher exists to take ownership - either we are an invalid
    270             // type or memory allocation failed
    271             udata_close(file);
    272         }
    273         return m;
    274     } else if (dictfname != NULL) {
    275         // we don't have a dictionary matcher.
    276         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
    277         status = U_ZERO_ERROR;
    278         return NULL;
    279     }
    280     return NULL;
    281 }
    282 
    283 U_NAMESPACE_END
    284 
    285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    286