Home | History | Annotate | Download | only in common
      1 /**
      2  ************************************************************************************
      3  * Copyright (C) 2006-2007, International Business Machines Corporation and others. *
      4  * All Rights Reserved.                                                             *
      5  ************************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "brkeng.h"
     13 #include "dictbe.h"
     14 #include "triedict.h"
     15 #include "unicode/uchar.h"
     16 #include "unicode/uniset.h"
     17 #include "unicode/chariter.h"
     18 #include "unicode/ures.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/putil.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/uscript.h"
     23 #include "uvector.h"
     24 #include "umutex.h"
     25 #include "uresimp.h"
     26 #include "ubrkimpl.h"
     27 #include <stdio.h>
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 /*
     32  ******************************************************************
     33  */
     34 
     35 LanguageBreakEngine::LanguageBreakEngine() {
     36 }
     37 
     38 LanguageBreakEngine::~LanguageBreakEngine() {
     39 }
     40 
     41 /*
     42  ******************************************************************
     43  */
     44 
     45 LanguageBreakFactory::LanguageBreakFactory() {
     46 }
     47 
     48 LanguageBreakFactory::~LanguageBreakFactory() {
     49 }
     50 
     51 /*
     52  ******************************************************************
     53  */
     54 
     55 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
     56     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
     57         fHandled[i] = 0;
     58     }
     59 }
     60 
     61 UnhandledEngine::~UnhandledEngine() {
     62     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
     63         if (fHandled[i] != 0) {
     64             delete fHandled[i];
     65         }
     66     }
     67 }
     68 
     69 UBool
     70 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
     71     return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
     72         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
     73 }
     74 
     75 int32_t
     76 UnhandledEngine::findBreaks( UText *text,
     77                                  int32_t startPos,
     78                                  int32_t endPos,
     79                                  UBool reverse,
     80                                  int32_t breakType,
     81                                  UStack &/*foundBreaks*/ ) const {
     82     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
     83         UChar32 c = utext_current32(text);
     84         if (reverse) {
     85             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
     86                 c = utext_previous32(text);
     87             }
     88         }
     89         else {
     90             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
     91                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
     92                 c = utext_current32(text);
     93             }
     94         }
     95     }
     96     return 0;
     97 }
     98 
     99 void
    100 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
    101     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
    102         if (fHandled[breakType] == 0) {
    103             fHandled[breakType] = new UnicodeSet();
    104             if (fHandled[breakType] == 0) {
    105                 return;
    106             }
    107         }
    108         if (!fHandled[breakType]->contains(c)) {
    109             UErrorCode status = U_ZERO_ERROR;
    110             // Apply the entire script of the character.
    111             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    112             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
    113         }
    114     }
    115 }
    116 
    117 /*
    118  ******************************************************************
    119  */
    120 
    121 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    122     fEngines = 0;
    123 }
    124 
    125 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    126     if (fEngines != 0) {
    127         delete fEngines;
    128     }
    129 }
    130 
    131 U_NAMESPACE_END
    132 U_CDECL_BEGIN
    133 static void U_CALLCONV _deleteEngine(void *obj) {
    134     delete (const U_NAMESPACE_QUALIFIER LanguageBreakEngine *) obj;
    135 }
    136 U_CDECL_END
    137 U_NAMESPACE_BEGIN
    138 
    139 const LanguageBreakEngine *
    140 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    141     UBool       needsInit;
    142     int32_t     i;
    143     const LanguageBreakEngine *lbe = NULL;
    144     UErrorCode  status = U_ZERO_ERROR;
    145 
    146     // TODO: The global mutex should not be used.
    147     // The global mutex should only be used for short periods.
    148     // A ICULanguageBreakFactory specific mutex should be used.
    149     umtx_lock(NULL);
    150     needsInit = (UBool)(fEngines == NULL);
    151     if (!needsInit) {
    152         i = fEngines->size();
    153         while (--i >= 0) {
    154             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
    155             if (lbe != NULL && lbe->handles(c, breakType)) {
    156                 break;
    157             }
    158             lbe = NULL;
    159         }
    160     }
    161     umtx_unlock(NULL);
    162 
    163     if (lbe != NULL) {
    164         return lbe;
    165     }
    166 
    167     if (needsInit) {
    168         UStack  *engines = new UStack(_deleteEngine, NULL, status);
    169         if (U_SUCCESS(status) && engines == NULL) {
    170             status = U_MEMORY_ALLOCATION_ERROR;
    171         }
    172         else if (U_FAILURE(status)) {
    173             delete engines;
    174             engines = NULL;
    175         }
    176         else {
    177             umtx_lock(NULL);
    178             if (fEngines == NULL) {
    179                 fEngines = engines;
    180                 engines = NULL;
    181             }
    182             umtx_unlock(NULL);
    183             delete engines;
    184         }
    185     }
    186 
    187     if (fEngines == NULL) {
    188         return NULL;
    189     }
    190 
    191     // We didn't find an engine the first time through, or there was no
    192     // stack. Create an engine.
    193     const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
    194 
    195     // Now get the lock, and see if someone else has created it in the
    196     // meantime
    197     umtx_lock(NULL);
    198     i = fEngines->size();
    199     while (--i >= 0) {
    200         lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
    201         if (lbe != NULL && lbe->handles(c, breakType)) {
    202             break;
    203         }
    204         lbe = NULL;
    205     }
    206     if (lbe == NULL && newlbe != NULL) {
    207         fEngines->push((void *)newlbe, status);
    208         lbe = newlbe;
    209         newlbe = NULL;
    210     }
    211     umtx_unlock(NULL);
    212 
    213     delete newlbe;
    214 
    215     return lbe;
    216 }
    217 
    218 const LanguageBreakEngine *
    219 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
    220     UErrorCode status = U_ZERO_ERROR;
    221     UScriptCode code = uscript_getScript(c, &status);
    222     if (U_SUCCESS(status)) {
    223         const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType);
    224         if (dict != NULL) {
    225             const LanguageBreakEngine *engine = NULL;
    226             switch(code) {
    227             case USCRIPT_THAI:
    228                 engine = new ThaiBreakEngine(dict, status);
    229                 break;
    230 
    231             case USCRIPT_HANGUL:
    232                 engine = new CjkBreakEngine(dict, kKorean, status);
    233                 break;
    234 
    235             // use same BreakEngine and dictionary for both Chinese and Japanese
    236             case USCRIPT_HIRAGANA:
    237             case USCRIPT_KATAKANA:
    238             case USCRIPT_HAN:
    239                 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
    240                 break;
    241 #if 0
    242             // TODO: Have to get some characters with script=common handled
    243             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    244             // them to CjkBreakEngine does not work. The engine has to
    245             // special-case them.
    246             case USCRIPT_COMMON:
    247             {
    248                 UBlockCode block = ublock_getCode(code);
    249                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
    250                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
    251                 break;
    252             }
    253 #endif
    254             default:
    255                 break;
    256             }
    257             if (engine == NULL) {
    258                 delete dict;
    259             }
    260             else if (U_FAILURE(status)) {
    261                 delete engine;
    262                 engine = NULL;
    263             }
    264             return engine;
    265         }
    266     }
    267     return NULL;
    268 }
    269 
    270 const CompactTrieDictionary *
    271 ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
    272     UErrorCode status = U_ZERO_ERROR;
    273     // Open root from brkitr tree.
    274     char dictnbuff[256];
    275     char ext[4]={'\0'};
    276 
    277     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
    278     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
    279     b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
    280     int32_t dictnlength = 0;
    281     const UChar *dictfname = ures_getString(b, &dictnlength, &status);
    282     if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
    283         dictnlength = 0;
    284         status = U_BUFFER_OVERFLOW_ERROR;
    285     }
    286     if (U_SUCCESS(status) && dictfname) {
    287         UChar* extStart=u_strchr(dictfname, 0x002e);
    288         int len = 0;
    289         if(extStart!=NULL){
    290             len = extStart-dictfname;
    291             u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
    292             u_UCharsToChars(dictfname, dictnbuff, len);
    293         }
    294         dictnbuff[len]=0; // nul terminate
    295     }
    296     ures_close(b);
    297     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
    298     if (U_SUCCESS(status)) {
    299         const CompactTrieDictionary *dict = new CompactTrieDictionary(
    300             file, status);
    301         if (U_SUCCESS(status) && dict == NULL) {
    302             status = U_MEMORY_ALLOCATION_ERROR;
    303         }
    304         if (U_FAILURE(status)) {
    305             delete dict;
    306             dict = NULL;
    307         }
    308         return dict;
    309     } else if (dictfname != NULL){
    310         //create dummy dict if dictionary filename not valid
    311         UChar c = 0x0020;
    312         status = U_ZERO_ERROR;
    313         MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
    314         mtd->addWord(&c, 1, status, 1);
    315         return new CompactTrieDictionary(*mtd, status);
    316     }
    317     return NULL;
    318 }
    319 
    320 U_NAMESPACE_END
    321 
    322 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    323