Home | History | Annotate | Download | only in common
      1 /*
      2  ************************************************************************************
      3  * Copyright (C) 2006-2015, International Business Machines Corporation
      4  * and others. All Rights Reserved.
      5  ************************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "brkeng.h"
     13 #include "dictbe.h"
     14 #include "unicode/uchar.h"
     15 #include "unicode/uniset.h"
     16 #include "unicode/chariter.h"
     17 #include "unicode/ures.h"
     18 #include "unicode/udata.h"
     19 #include "unicode/putil.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/uscript.h"
     22 #include "unicode/ucharstrie.h"
     23 #include "unicode/bytestrie.h"
     24 #include "charstr.h"
     25 #include "dictionarydata.h"
     26 #include "mutex.h"
     27 #include "uvector.h"
     28 #include "umutex.h"
     29 #include "uresimp.h"
     30 #include "ubrkimpl.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 /*
     35  ******************************************************************
     36  */
     37 
     38 LanguageBreakEngine::LanguageBreakEngine() {
     39 }
     40 
     41 LanguageBreakEngine::~LanguageBreakEngine() {
     42 }
     43 
     44 /*
     45  ******************************************************************
     46  */
     47 
     48 LanguageBreakFactory::LanguageBreakFactory() {
     49 }
     50 
     51 LanguageBreakFactory::~LanguageBreakFactory() {
     52 }
     53 
     54 /*
     55  ******************************************************************
     56  */
     57 
     58 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
     59     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
     60         fHandled[i] = 0;
     61     }
     62 }
     63 
     64 UnhandledEngine::~UnhandledEngine() {
     65     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
     66         if (fHandled[i] != 0) {
     67             delete fHandled[i];
     68         }
     69     }
     70 }
     71 
     72 UBool
     73 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
     74     return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
     75         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
     76 }
     77 
     78 int32_t
     79 UnhandledEngine::findBreaks( UText *text,
     80                                  int32_t startPos,
     81                                  int32_t endPos,
     82                                  UBool reverse,
     83                                  int32_t breakType,
     84                                  UStack &/*foundBreaks*/ ) const {
     85     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
     86         UChar32 c = utext_current32(text);
     87         if (reverse) {
     88             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
     89                 c = utext_previous32(text);
     90             }
     91         }
     92         else {
     93             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
     94                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
     95                 c = utext_current32(text);
     96             }
     97         }
     98     }
     99     return 0;
    100 }
    101 
    102 void
    103 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
    104     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
    105         if (fHandled[breakType] == 0) {
    106             fHandled[breakType] = new UnicodeSet();
    107             if (fHandled[breakType] == 0) {
    108                 return;
    109             }
    110         }
    111         if (!fHandled[breakType]->contains(c)) {
    112             UErrorCode status = U_ZERO_ERROR;
    113             // Apply the entire script of the character.
    114             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    115             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
    116         }
    117     }
    118 }
    119 
    120 /*
    121  ******************************************************************
    122  */
    123 
    124 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    125     fEngines = 0;
    126 }
    127 
    128 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    129     if (fEngines != 0) {
    130         delete fEngines;
    131     }
    132 }
    133 
    134 U_NAMESPACE_END
    135 U_CDECL_BEGIN
    136 static void U_CALLCONV _deleteEngine(void *obj) {
    137     delete (const icu::LanguageBreakEngine *) obj;
    138 }
    139 U_CDECL_END
    140 U_NAMESPACE_BEGIN
    141 
    142 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
    143 
    144 const LanguageBreakEngine *
    145 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    146     const LanguageBreakEngine *lbe = NULL;
    147     UErrorCode  status = U_ZERO_ERROR;
    148 
    149     Mutex m(&gBreakEngineMutex);
    150 
    151     if (fEngines == NULL) {
    152         UStack  *engines = new UStack(_deleteEngine, NULL, status);
    153         if (U_FAILURE(status) || engines == NULL) {
    154             // Note: no way to return error code to caller.
    155             delete engines;
    156             return NULL;
    157         }
    158         fEngines = engines;
    159     } else {
    160         int32_t i = fEngines->size();
    161         while (--i >= 0) {
    162             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
    163             if (lbe != NULL && lbe->handles(c, breakType)) {
    164                 return lbe;
    165             }
    166         }
    167     }
    168 
    169     // We didn't find an engine. Create one.
    170     lbe = loadEngineFor(c, breakType);
    171     if (lbe != NULL) {
    172         fEngines->push((void *)lbe, status);
    173     }
    174     return lbe;
    175 }
    176 
    177 const LanguageBreakEngine *
    178 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
    179     UErrorCode status = U_ZERO_ERROR;
    180     UScriptCode code = uscript_getScript(c, &status);
    181     if (U_SUCCESS(status)) {
    182         DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
    183         if (m != NULL) {
    184             const LanguageBreakEngine *engine = NULL;
    185             switch(code) {
    186             case USCRIPT_THAI:
    187                 engine = new ThaiBreakEngine(m, status);
    188                 break;
    189             case USCRIPT_LAO:
    190                 engine = new LaoBreakEngine(m, status);
    191                 break;
    192             case USCRIPT_MYANMAR:
    193                 engine = new BurmeseBreakEngine(m, status);
    194                 break;
    195             case USCRIPT_KHMER:
    196                 engine = new KhmerBreakEngine(m, status);
    197                 break;
    198 
    199 #if !UCONFIG_NO_NORMALIZATION
    200                 // CJK not available w/o normalization
    201             case USCRIPT_HANGUL:
    202                 engine = new CjkBreakEngine(m, kKorean, status);
    203                 break;
    204 
    205             // use same BreakEngine and dictionary for both Chinese and Japanese
    206             case USCRIPT_HIRAGANA:
    207             case USCRIPT_KATAKANA:
    208             case USCRIPT_HAN:
    209                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
    210                 break;
    211 #if 0
    212             // TODO: Have to get some characters with script=common handled
    213             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    214             // them to CjkBreakEngine does not work. The engine has to
    215             // special-case them.
    216             case USCRIPT_COMMON:
    217             {
    218                 UBlockCode block = ublock_getCode(code);
    219                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
    220                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
    221                 break;
    222             }
    223 #endif
    224 #endif
    225 
    226             default:
    227                 break;
    228             }
    229             if (engine == NULL) {
    230                 delete m;
    231             }
    232             else if (U_FAILURE(status)) {
    233                 delete engine;
    234                 engine = NULL;
    235             }
    236             return engine;
    237         }
    238     }
    239     return NULL;
    240 }
    241 
    242 DictionaryMatcher *
    243 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
    244     UErrorCode status = U_ZERO_ERROR;
    245     // open root from brkitr tree.
    246     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    247     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    248     int32_t dictnlength = 0;
    249     const UChar *dictfname =
    250         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
    251     if (U_FAILURE(status)) {
    252         ures_close(b);
    253         return NULL;
    254     }
    255     CharString dictnbuf;
    256     CharString ext;
    257     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
    258     if (extStart != NULL) {
    259         int32_t len = (int32_t)(extStart - dictfname);
    260         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
    261         dictnlength = len;
    262     }
    263     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
    264     ures_close(b);
    265 
    266     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
    267     if (U_SUCCESS(status)) {
    268         // build trie
    269         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
    270         const int32_t *indexes = (const int32_t *)data;
    271         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    272         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    273         DictionaryMatcher *m = NULL;
    274         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    275             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
    276             const char *characters = (const char *)(data + offset);
    277             m = new BytesDictionaryMatcher(characters, transform, file);
    278         }
    279         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    280             const UChar *characters = (const UChar *)(data + offset);
    281             m = new UCharsDictionaryMatcher(characters, file);
    282         }
    283         if (m == NULL) {
    284             // no matcher exists to take ownership - either we are an invalid
    285             // type or memory allocation failed
    286             udata_close(file);
    287         }
    288         return m;
    289     } else if (dictfname != NULL) {
    290         // we don't have a dictionary matcher.
    291         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
    292         status = U_ZERO_ERROR;
    293         return NULL;
    294     }
    295     return NULL;
    296 }
    297 
    298 U_NAMESPACE_END
    299 
    300 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    301