Home | History | Annotate | Download | only in patches
      1 --- source/common/brkeng.cpp	2009-11-11 07:47:22.000000000 -0800
      2 +++ source/common/brkeng.cpp	2011-01-21 14:12:45.479922000 -0800
      3 @@ -226,6 +226,30 @@
      4              case USCRIPT_THAI:
      5                  engine = new ThaiBreakEngine(dict, status);
      6                  break;
      7 +                
      8 +            case USCRIPT_HANGUL:
      9 +                engine = new CjkBreakEngine(dict, kKorean, status);
     10 +                break;
     11 +
     12 +            // use same BreakEngine and dictionary for both Chinese and Japanese
     13 +            case USCRIPT_HIRAGANA:
     14 +            case USCRIPT_KATAKANA:
     15 +            case USCRIPT_HAN:
     16 +                engine = new CjkBreakEngine(dict, kChineseJapanese, status);
     17 +                break;
     18 +#if 0
     19 +            // TODO: Have to get some characters with script=common handled
     20 +            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
     21 +            // them to CjkBreakEngine does not work. The engine has to
     22 +            // special-case them.
     23 +            case USCRIPT_COMMON:
     24 +            {
     25 +                UBlockCode block = ublock_getCode(code);
     26 +                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
     27 +                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
     28 +                break;
     29 +            }
     30 +#endif
     31              default:
     32                  break;
     33              }
     34 @@ -281,6 +305,13 @@
     35              dict = NULL;
     36          }
     37          return dict;
     38 +    } else if (dictfname != NULL){
     39 +        //create dummy dict if dictionary filename not valid
     40 +        UChar c = 0x0020;
     41 +        status = U_ZERO_ERROR;
     42 +        MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
     43 +        mtd->addWord(&c, 1, status, 1);
     44 +        return new CompactTrieDictionary(*mtd, status);  
     45      }
     46      return NULL;
     47  }
     48 --- source/common/dictbe.cpp	2008-06-13 12:21:12.000000000 -0700
     49 +++ source/common/dictbe.cpp	2011-01-21 14:12:45.468928000 -0800
     50 @@ -16,6 +16,9 @@
     51  #include "unicode/ubrk.h"
     52  #include "uvector.h"
     53  #include "triedict.h"
     54 +#include "uassert.h"
     55 +#include "unicode/normlzr.h"
     56 +#include "cmemory.h"
     57  
     58  U_NAMESPACE_BEGIN
     59  
     60 @@ -422,6 +425,294 @@
     61      return wordsFound;
     62  }
     63  
     64 +/*
     65 + ******************************************************************
     66 + * CjkBreakEngine
     67 + */
     68 +static const uint32_t kuint32max = 0xFFFFFFFF;
     69 +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
     70 +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
     71 +    if (!adoptDictionary->getValued()) {
     72 +        status = U_ILLEGAL_ARGUMENT_ERROR;
     73 +        return;
     74 +    }
     75 +
     76 +    // Korean dictionary only includes Hangul syllables
     77 +    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
     78 +    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
     79 +    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
     80 +    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
     81 +
     82 +    if (U_SUCCESS(status)) {
     83 +        // handle Korean and Japanese/Chinese using different dictionaries
     84 +        if (type == kKorean) {
     85 +            setCharacters(fHangulWordSet);
     86 +        } else { //Chinese and Japanese
     87 +            UnicodeSet cjSet;
     88 +            cjSet.addAll(fHanWordSet);
     89 +            cjSet.addAll(fKatakanaWordSet);
     90 +            cjSet.addAll(fHiraganaWordSet);
     91 +            cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));
     92 +            setCharacters(cjSet);
     93 +        }
     94 +    }
     95 +}
     96 +
     97 +CjkBreakEngine::~CjkBreakEngine(){
     98 +    delete fDictionary;
     99 +}
    100 +
    101 +// The katakanaCost values below are based on the length frequencies of all
    102 +// katakana phrases in the dictionary
    103 +static const int kMaxKatakanaLength = 8;
    104 +static const int kMaxKatakanaGroupLength = 20;
    105 +static const uint32_t maxSnlp = 255;
    106 +
    107 +static inline uint32_t getKatakanaCost(int wordLength){
    108 +    //TODO: fill array with actual values from dictionary!
    109 +    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
    110 +                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    111 +    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
    112 +}
    113 +
    114 +static inline bool isKatakana(uint16_t value) {
    115 +    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
    116 +            (value >= 0xFF66u && value <= 0xFF9fu);
    117 +}
    118 +
    119 +// A very simple helper class to streamline the buffer handling in
    120 +// divideUpDictionaryRange. 
    121 +template<class T, size_t N>
    122 +class AutoBuffer {
    123 + public:
    124 +  AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
    125 +    if (size > N) {
    126 +      buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    127 +      capacity = size;
    128 +    }
    129 +  }
    130 +  ~AutoBuffer() {
    131 +    if (buffer != stackBuffer)
    132 +      uprv_free(buffer);
    133 +  }
    134 +#if 0
    135 +  T* operator& () {
    136 +    return buffer;
    137 +  }
    138 +#endif
    139 +  T* elems() {
    140 +    return buffer;
    141 +  }
    142 +  const T& operator[] (size_t i) const {
    143 +    return buffer[i];
    144 +  }
    145 +  T& operator[] (size_t i) {
    146 +    return buffer[i];
    147 +  }
    148 +
    149 +  // resize without copy
    150 +  void resize(size_t size) {
    151 +    if (size <= capacity)
    152 +      return;
    153 +    if (buffer != stackBuffer)
    154 +      uprv_free(buffer);
    155 +    buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
    156 +    capacity = size;
    157 +  }
    158 + private:
    159 +  T stackBuffer[N];
    160 +  T* buffer;
    161 +  AutoBuffer();
    162 +  size_t capacity;
    163 +};
    164 +
    165 +
    166 +/*
    167 + * @param text A UText representing the text
    168 + * @param rangeStart The start of the range of dictionary characters
    169 + * @param rangeEnd The end of the range of dictionary characters
     170 + * @param foundBreaks Output stack onto which the break positions found are pushed
    171 + * @return The number of breaks found
    172 + */
    173 +int32_t 
    174 +CjkBreakEngine::divideUpDictionaryRange( UText *text,
    175 +        int32_t rangeStart,
    176 +        int32_t rangeEnd,
    177 +        UStack &foundBreaks ) const {
    178 +    if (rangeStart >= rangeEnd) {
    179 +        return 0;
    180 +    }
    181 +
    182 +    const size_t defaultInputLength = 80;
    183 +    size_t inputLength = rangeEnd - rangeStart;
    184 +    AutoBuffer<UChar, defaultInputLength> charString(inputLength);
    185 +
    186 +    // Normalize the input string and put it in normalizedText.
    187 +    // The map from the indices of the normalized input to the raw
    188 +    // input is kept in charPositions.
    189 +    UErrorCode status = U_ZERO_ERROR;
    190 +    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    191 +    if (U_FAILURE(status))
    192 +        return 0;
    193 +
    194 +    UnicodeString inputString(charString.elems(), inputLength);
    195 +    UNormalizationMode norm_mode = UNORM_NFKC;
    196 +    UBool isNormalized =
    197 +        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
    198 +        Normalizer::isNormalized(inputString, norm_mode, status);
    199 +
    200 +    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    201 +    int numChars = 0;
    202 +    UText normalizedText = UTEXT_INITIALIZER;
    203 +    // Needs to be declared here because normalizedText holds onto its buffer.
    204 +    UnicodeString normalizedString;
    205 +    if (isNormalized) {
    206 +        int32_t index = 0;
    207 +        charPositions[0] = 0;
    208 +        while(index < inputString.length()) {
    209 +            index = inputString.moveIndex32(index, 1);
    210 +            charPositions[++numChars] = index;
    211 +        }
    212 +        utext_openUnicodeString(&normalizedText, &inputString, &status);
    213 +    }
    214 +    else {
    215 +        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
    216 +        if (U_FAILURE(status))
    217 +            return 0;
    218 +        charPositions.resize(normalizedString.length() + 1);
    219 +        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
    220 +        int32_t index = 0;
    221 +        charPositions[0] = 0;
    222 +        while(index < normalizer.endIndex()){
    223 +            UChar32 uc = normalizer.next();
    224 +            charPositions[++numChars] = index = normalizer.getIndex();
    225 +        }
    226 +        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    227 +    }
    228 +
    229 +    if (U_FAILURE(status))
    230 +        return 0;
    231 +
    232 +    // From this point on, all the indices refer to the indices of
    233 +    // the normalized input string.
    234 +
    235 +    // bestSnlp[i] is the snlp of the best segmentation of the first i
    236 +    // characters in the range to be matched.
    237 +    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    238 +    bestSnlp[0] = 0;
    239 +    for(int i=1; i<=numChars; i++){
    240 +        bestSnlp[i] = kuint32max;
    241 +    }
    242 +
     243 +    // prev[i] is the start index of the last word (i.e. the end of the
     244 +    // previous word) in the best segmentation of the first i characters.
    245 +    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    246 +    for(int i=0; i<=numChars; i++){
    247 +        prev[i] = -1;
    248 +    }
    249 +
    250 +    const size_t maxWordSize = 20;
    251 +    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    252 +    AutoBuffer<int32_t, maxWordSize> lengths(numChars);
    253 +
    254 +    // Dynamic programming to find the best segmentation.
    255 +    bool is_prev_katakana = false;
    256 +    for (int i = 0; i < numChars; ++i) {
    257 +        //utext_setNativeIndex(text, rangeStart + i);
    258 +        utext_setNativeIndex(&normalizedText, i);
    259 +        if (bestSnlp[i] == kuint32max)
    260 +            continue;
    261 +
    262 +        int count;
    263 +        // limit maximum word length matched to size of current substring
    264 +        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i; 
    265 +
    266 +        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
    267 +
    268 +        // if there are no single character matches found in the dictionary 
     269 +        // starting with this character, treat character as a 1-character word
    270 +        // with the highest value possible, i.e. the least likely to occur.
    271 +        // Exclude Korean characters from this treatment, as they should be left
    272 +        // together by default.
    273 +        if((count == 0 || lengths[0] != 1) &&
    274 +                !fHangulWordSet.contains(utext_current32(&normalizedText))){
    275 +            values[count] = maxSnlp;
    276 +            lengths[count++] = 1;
    277 +        }
    278 +
    279 +        for (int j = 0; j < count; j++){
    280 +            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
    281 +            uint32_t newSnlp = bestSnlp[i] + values[j];
    282 +            if (newSnlp < bestSnlp[lengths[j] + i]) {
    283 +                bestSnlp[lengths[j] + i] = newSnlp;
    284 +                prev[lengths[j] + i] = i;
    285 +            }
    286 +        }
    287 +
    288 +        // In Japanese,
    289 +        // Katakana word in single character is pretty rare. So we apply
    290 +        // the following heuristic to Katakana: any continuous run of Katakana
    291 +        // characters is considered a candidate word with a default cost
    292 +        // specified in the katakanaCost table according to its length.
    293 +        //utext_setNativeIndex(text, rangeStart + i);
    294 +        utext_setNativeIndex(&normalizedText, i);
    295 +        bool is_katakana = isKatakana(utext_current32(&normalizedText));
    296 +        if (!is_prev_katakana && is_katakana) {
    297 +            int j = i + 1;
    298 +            utext_next32(&normalizedText);
    299 +            // Find the end of the continuous run of Katakana characters
    300 +            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
    301 +                    isKatakana(utext_current32(&normalizedText))) {
    302 +                utext_next32(&normalizedText);
    303 +                ++j;
    304 +            }
    305 +            if ((j - i) < kMaxKatakanaGroupLength) {
    306 +                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
    307 +                if (newSnlp < bestSnlp[j]) {
    308 +                    bestSnlp[j] = newSnlp;
    309 +                    prev[j] = i;
    310 +                }
    311 +            }
    312 +        }
    313 +        is_prev_katakana = is_katakana;
    314 +    }
    315 +
    316 +    // Start pushing the optimal offset index into t_boundary (t for tentative).
     317 +    // prev[numChars] is meaningful whenever a segmentation was found.
    318 +    // We'll first push in the reverse order, i.e.,
    319 +    // t_boundary[0] = numChars, and afterwards do a swap.
    320 +    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
    321 +
    322 +    int numBreaks = 0;
    323 +    // No segmentation found, set boundary to end of range
    324 +    if (bestSnlp[numChars] == kuint32max) {
    325 +        t_boundary[numBreaks++] = numChars;
    326 +    } else {
    327 +        for (int i = numChars; i > 0; i = prev[i]){
    328 +            t_boundary[numBreaks++] = i;
    329 +    
    330 +        }
    331 +        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    332 +    }
    333 +
    334 +    // Reverse offset index in t_boundary.
    335 +    // Don't add a break for the start of the dictionary range if there is one
    336 +    // there already.
    337 +    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
    338 +        t_boundary[numBreaks++] = 0;
    339 +    }
    340 +
     341 +    // Now that we're done, convert positions in t_boundary[] (indices in
     342 +    // the normalized input string) back to indices in the raw input string
     343 +    // while reversing t_boundary and pushing values to foundBreaks.
    344 +    for (int i = numBreaks-1; i >= 0; i--) {
    345 +        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    346 +    }
    347 +
    348 +    utext_close(&normalizedText);
    349 +    return numBreaks;
    350 +}
    351 +
    352  U_NAMESPACE_END
    353  
    354  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    355 --- source/common/dictbe.h	2006-09-29 17:37:45.000000000 -0700
    356 +++ source/common/dictbe.h	2011-01-21 14:12:45.492920000 -0800
    357 @@ -1,8 +1,8 @@
    358  /**
    359 - *******************************************************************************
    360 - * Copyright (C) 2006, International Business Machines Corporation and others. *
    361 - * All Rights Reserved.                                                        *
    362 - *******************************************************************************
    363 + **********************************************************************************
    364 + * Copyright (C) 2006-2010, International Business Machines Corporation and others.
    365 + * All Rights Reserved.                                                       
    366 + **********************************************************************************
    367   */
    368  
    369  #ifndef DICTBE_H
    370 @@ -65,31 +65,31 @@
    371     */
    372    virtual ~DictionaryBreakEngine();
    373  
    374 - /**
    375 -  * <p>Indicate whether this engine handles a particular character for
    376 -  * a particular kind of break.</p>
    377 -  *
    378 -  * @param c A character which begins a run that the engine might handle
    379 -  * @param breakType The type of text break which the caller wants to determine
    380 -  * @return TRUE if this engine handles the particular character and break
    381 -  * type.
    382 -  */
    383 +  /**
    384 +   * <p>Indicate whether this engine handles a particular character for
    385 +   * a particular kind of break.</p>
    386 +   *
    387 +   * @param c A character which begins a run that the engine might handle
    388 +   * @param breakType The type of text break which the caller wants to determine
    389 +   * @return TRUE if this engine handles the particular character and break
    390 +   * type.
    391 +   */
    392    virtual UBool handles( UChar32 c, int32_t breakType ) const;
    393  
    394 - /**
    395 -  * <p>Find any breaks within a run in the supplied text.</p>
    396 -  *
    397 -  * @param text A UText representing the text. The
    398 -  * iterator is left at the end of the run of characters which the engine
    399 -  * is capable of handling.
    400 -  * @param startPos The start of the run within the supplied text.
    401 -  * @param endPos The end of the run within the supplied text.
    402 -  * @param reverse Whether the caller is looking for breaks in a reverse
    403 -  * direction.
    404 -  * @param breakType The type of break desired, or -1.
    405 -  * @param foundBreaks An allocated C array of the breaks found, if any
    406 -  * @return The number of breaks found.
    407 -  */
    408 +  /**
    409 +   * <p>Find any breaks within a run in the supplied text.</p>
    410 +   *
    411 +   * @param text A UText representing the text. The iterator is left at
    412 +   * the end of the run of characters which the engine is capable of handling 
    413 +   * that starts from the first (or last) character in the range.
    414 +   * @param startPos The start of the run within the supplied text.
    415 +   * @param endPos The end of the run within the supplied text.
    416 +   * @param reverse Whether the caller is looking for breaks in a reverse
    417 +   * direction.
    418 +   * @param breakType The type of break desired, or -1.
    419 +   * @param foundBreaks An allocated C array of the breaks found, if any
    420 +   * @return The number of breaks found.
    421 +   */
    422    virtual int32_t findBreaks( UText *text,
    423                                int32_t startPos,
    424                                int32_t endPos,
    425 @@ -114,7 +114,7 @@
    426  //  virtual void setBreakTypes( uint32_t breakTypes );
    427  
    428   /**
    429 -  * <p>Divide up a range of known dictionary characters.</p>
    430 +  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    431    *
    432    * @param text A UText representing the text
    433    * @param rangeStart The start of the range of dictionary characters
    434 @@ -171,7 +171,7 @@
    435  
    436   protected:
    437   /**
    438 -  * <p>Divide up a range of known dictionary characters.</p>
    439 +  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    440    *
    441    * @param text A UText representing the text
    442    * @param rangeStart The start of the range of dictionary characters
    443 @@ -186,6 +186,66 @@
    444  
    445  };
    446  
    447 +/*******************************************************************
    448 + * CjkBreakEngine
    449 + */
    450 +
    451 +//indicates language/script that the CjkBreakEngine will handle
    452 +enum LanguageType {
    453 +    kKorean,
    454 +    kChineseJapanese
    455 +};
    456 +
    457 +/**
    458 + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
    459 + * TrieWordDictionary with costs associated with each word and
    460 + * Viterbi decoding to determine CJK-specific breaks.</p>
    461 + */
    462 +class CjkBreakEngine : public DictionaryBreakEngine {
    463 + protected:
    464 +    /**
    465 +     * The set of characters handled by this engine
    466 +     * @internal
    467 +     */
    468 +  UnicodeSet                fHangulWordSet;
    469 +  UnicodeSet                fHanWordSet;
    470 +  UnicodeSet                fKatakanaWordSet;
    471 +  UnicodeSet                fHiraganaWordSet;
    472 +
    473 +  const TrieWordDictionary  *fDictionary;
    474 +
    475 + public:
    476 +
    477 +    /**
     478 +     * <p>Constructs a break engine for the given language type.</p>
    479 +     *
    480 +     * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
    481 +     * engine is deleted. The TrieWordDictionary must contain costs for each word
    482 +     * in order for the dictionary to work properly.
    483 +     */
    484 +  CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);
    485 +
    486 +    /**
    487 +     * <p>Virtual destructor.</p>
    488 +     */
    489 +  virtual ~CjkBreakEngine();
    490 +
    491 + protected:
    492 +    /**
    493 +     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    494 +     *
    495 +     * @param text A UText representing the text
    496 +     * @param rangeStart The start of the range of dictionary characters
    497 +     * @param rangeEnd The end of the range of dictionary characters
     498 +     * @param foundBreaks Output stack onto which the break positions found are pushed
    499 +     * @return The number of breaks found
    500 +     */
    501 +  virtual int32_t divideUpDictionaryRange( UText *text,
    502 +          int32_t rangeStart,
    503 +          int32_t rangeEnd,
    504 +          UStack &foundBreaks ) const;
    505 +
    506 +};
    507  
    508  U_NAMESPACE_END
    509  
    510 --- source/common/rbbi.cpp	2010-07-22 17:15:37.000000000 -0700
    511 +++ source/common/rbbi.cpp	2011-01-21 14:12:45.457938000 -0800
    512 @@ -1555,10 +1555,12 @@
    513                              int32_t endPos,
    514                              UBool reverse) {
    515      // Reset the old break cache first.
    516 -    uint32_t dictionaryCount = fDictionaryCharCount;
    517      reset();
    518  
    519 -    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
    520 +    // note: code segment below assumes that dictionary chars are in the 
    521 +    // startPos-endPos range
    522 +    // value returned should be next character in sequence
    523 +    if ((endPos - startPos) <= 1) {
    524          return (reverse ? startPos : endPos);
    525      }
    526      
    527 @@ -1711,7 +1713,7 @@
    528              // proposed break by one of the breaks we found. Use following() and
    529              // preceding() to do the work. They should never recurse in this case.
    530              if (reverse) {
    531 -                return preceding(endPos - 1);
    532 +                return preceding(endPos);
    533              }
    534              else {
    535                  return following(startPos);
    536 --- source/common/triedict.cpp	2008-02-13 01:35:50.000000000 -0800
    537 +++ source/common/triedict.cpp	2011-01-21 14:12:45.271006000 -0800
    538 @@ -20,6 +20,7 @@
    539  #include "uvector.h"
    540  #include "uvectr32.h"
    541  #include "uarrsort.h"
    542 +#include "hash.h"
    543  
    544  //#define DEBUG_TRIE_DICT 1
    545  
    546 @@ -27,6 +28,11 @@
    547  #include <sys/times.h>
    548  #include <limits.h>
    549  #include <stdio.h>
    550 +#include <time.h>
    551 +#ifndef CLK_TCK
    552 +#define CLK_TCK      CLOCKS_PER_SEC
    553 +#endif
    554 +
    555  #endif
    556  
    557  U_NAMESPACE_BEGIN
    558 @@ -45,6 +51,11 @@
    559   * MutableTrieDictionary
    560   */
    561  
    562 +//#define MAX_VALUE 65535
    563 +
    564 +// forward declaration
    565 +inline uint16_t scaleLogProbabilities(double logprob);
    566 +
    567  // Node structure for the ternary, uncompressed trie
    568  struct TernaryNode : public UMemory {
    569      UChar       ch;         // UTF-16 code unit
    570 @@ -77,7 +88,8 @@
    571      delete high;
    572  }
    573  
    574 -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
    575 +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
    576 +                                              UBool containsValue /* = FALSE */  ) {
    577      // Start the trie off with something. Having the root node already present
    578      // cuts a special case out of the search/insertion functions.
    579      // Making it a median character cuts the worse case for searches from
    580 @@ -91,14 +103,19 @@
    581      if (U_SUCCESS(status) && fIter == NULL) {
    582          status = U_MEMORY_ALLOCATION_ERROR;
    583      }
    584 +
    585 +    fValued = containsValue;
    586  }
    587  
    588 -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
    589 +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, 
    590 +                                              UBool containsValue /* = false */ ) {
    591      fTrie = NULL;
    592      fIter = utext_openUChars(NULL, NULL, 0, &status);
    593      if (U_SUCCESS(status) && fIter == NULL) {
    594          status = U_MEMORY_ALLOCATION_ERROR;
    595      }
    596 +
    597 +    fValued = containsValue;
    598  }
    599  
    600  MutableTrieDictionary::~MutableTrieDictionary() {
    601 @@ -108,12 +125,13 @@
    602  
    603  int32_t
    604  MutableTrieDictionary::search( UText *text,
    605 -                                   int32_t maxLength,
    606 -                                   int32_t *lengths,
    607 -                                   int &count,
    608 -                                   int limit,
    609 -                                   TernaryNode *&parent,
    610 -                                   UBool &pMatched ) const {
    611 +                               int32_t maxLength,
    612 +                               int32_t *lengths,
    613 +                               int &count,
    614 +                               int limit,
    615 +                               TernaryNode *&parent,
    616 +                               UBool &pMatched,
    617 +                               uint16_t *values /*=NULL*/) const {
    618      // TODO: current implementation works in UTF-16 space
    619      const TernaryNode *up = NULL;
    620      const TernaryNode *p = fTrie;
    621 @@ -121,6 +139,10 @@
    622      pMatched = TRUE;
    623      int i;
    624  
    625 +    if (!fValued) {
    626 +        values = NULL;
    627 +    }
    628 +
    629      UChar uc = utext_current32(text);
    630      for (i = 0; i < maxLength && p != NULL; ++i) {
    631          while (p != NULL) {
    632 @@ -141,7 +163,11 @@
    633              break;
    634          }
    635          // Must be equal to get here
    636 -        if (limit > 0 && (p->flags & kEndsWord)) {
    637 +        if (limit > 0 && (p->flags > 0)) {
     638 +            //is there a more efficient way to add values, i.e. without this if statement?
    639 +            if(values != NULL) {
    640 +                values[mycount] = p->flags;
    641 +            }
    642              lengths[mycount++] = i+1;
    643              --limit;
    644          }
    645 @@ -161,13 +187,14 @@
    646  void
    647  MutableTrieDictionary::addWord( const UChar *word,
    648                                  int32_t length,
    649 -                                UErrorCode &status ) {
    650 -#if 0
    651 -    if (length <= 0) {
    652 +                                UErrorCode &status,
    653 +                                uint16_t value /* = 0 */ ) {
    654 +    // dictionary cannot store zero values, would interfere with flags
    655 +    if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
    656          status = U_ILLEGAL_ARGUMENT_ERROR;
    657          return;
    658      }
    659 -#endif
    660 +
    661      TernaryNode *parent;
    662      UBool pMatched;
    663      int count;
    664 @@ -177,7 +204,7 @@
    665      matched = search(fIter, length, NULL, count, 0, parent, pMatched);
    666      
    667      while (matched++ < length) {
    668 -        UChar32 uc = utext_next32(fIter);  // TODO:  supplemetary support?
    669 +        UChar32 uc = utext_next32(fIter);  // TODO:  supplementary support?
    670          U_ASSERT(uc != U_SENTINEL);
    671          TernaryNode *newNode = new TernaryNode(uc);
    672          if (newNode == NULL) {
    673 @@ -199,30 +226,23 @@
    674          parent = newNode;
    675      }
    676  
    677 -    parent->flags |= kEndsWord;
    678 -}
    679 -
    680 -#if 0
    681 -void
    682 -MutableTrieDictionary::addWords( UEnumeration *words,
    683 -                                  UErrorCode &status ) {
    684 -    int32_t length;
    685 -    const UChar *word;
    686 -    while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
    687 -        addWord(word, length, status);
    688 +    if(fValued && value > 0){
    689 +        parent->flags = value;
    690 +    } else {
    691 +        parent->flags |= kEndsWord;
    692      }
    693  }
    694 -#endif
    695  
    696  int32_t
    697  MutableTrieDictionary::matches( UText *text,
    698                                  int32_t maxLength,
    699                                  int32_t *lengths,
    700                                  int &count,
    701 -                                int limit ) const {
    702 +                                int limit,
    703 +                                uint16_t *values /*=NULL*/) const {
    704      TernaryNode *parent;
    705      UBool pMatched;
    706 -    return search(text, maxLength, lengths, count, limit, parent, pMatched);
    707 +    return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
    708  }
    709  
    710  // Implementation of iteration for MutableTrieDictionary
    711 @@ -277,7 +297,7 @@
    712                      break;
    713                  }
    714              case kEqual:
    715 -                emit = (node->flags & kEndsWord) != 0;
    716 +                emit = node->flags > 0;
    717                  equal = (node->equal != NULL);
    718                  // If this node should be part of the next emitted string, append
    719                  // the UChar to the string, and make sure we pop it when we come
    720 @@ -299,7 +319,7 @@
    721                  }
    722              case kGreaterThan:
    723                  // If this node's character is in the string, remove it.
    724 -                if (node->equal != NULL || (node->flags & kEndsWord)) {
    725 +                if (node->equal != NULL || node->flags > 0) {
    726                      unistr.truncate(unistr.length()-1);
    727                  }
    728                  if (node->high != NULL) {
    729 @@ -354,12 +374,75 @@
    730   * CompactTrieDictionary
    731   */
    732  
    733 +//TODO further optimization:
    734 +// minimise size of trie with logprobs by storing values
    735 +// for terminal nodes directly in offsets[]
    736 +// --> calculating from next offset *might* be simpler, but would have to add
    737 +// one last offset for logprob of last node
    738 +// --> if calculating from current offset, need to factor in possible overflow
    739 +// as well.
    740 +// idea: store in offset, set first bit to indicate logprob storage-->won't
    741 +// have to access additional node
    742 +
    743 +// {'Dic', 1}, version 1: uses old header, no values
    744 +#define COMPACT_TRIE_MAGIC_1 0x44696301
    745 +// version 2: uses new header (more than 2^16 nodes), no values
    746 +#define COMPACT_TRIE_MAGIC_2 0x44696302
    747 +// version 3: uses new header, includes values
    748 +#define COMPACT_TRIE_MAGIC_3 0x44696303
    749 +
    750  struct CompactTrieHeader {
    751      uint32_t        size;           // Size of the data in bytes
    752      uint32_t        magic;          // Magic number (including version)
    753 +    uint32_t        nodeCount;      // Number of entries in offsets[]
    754 +    uint32_t        root;           // Node number of the root node
    755 +    uint32_t        offsets[1];     // Offsets to nodes from start of data
    756 +};
    757 +
    758 +// old version of CompactTrieHeader kept for backwards compatibility
    759 +struct CompactTrieHeaderV1 {
    760 +    uint32_t        size;           // Size of the data in bytes
    761 +    uint32_t        magic;          // Magic number (including version)
    762      uint16_t        nodeCount;      // Number of entries in offsets[]
    763      uint16_t        root;           // Node number of the root node
    764 -    uint32_t        offsets[1];      // Offsets to nodes from start of data
    765 +    uint32_t        offsets[1];     // Offsets to nodes from start of data
    766 +};
    767 +
    768 +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
    769 +struct CompactTrieInfo {
    770 +    uint32_t        size;           // Size of the data in bytes
    771 +    uint32_t        magic;          // Magic number (including version)
    772 +    uint32_t        nodeCount;      // Number of entries in offsets[]
    773 +    uint32_t        root;           // Node number of the root node
    774 +    uint32_t        *offsets;       // Offsets to nodes from start of data
    775 +    uint8_t         *address;       // pointer to header bytes in memory
    776 +
    777 +    CompactTrieInfo(const void *data, UErrorCode &status){
    778 +        CompactTrieHeader *header = (CompactTrieHeader *) data;
    779 +        if (header->magic != COMPACT_TRIE_MAGIC_1 && 
    780 +                header->magic != COMPACT_TRIE_MAGIC_2 &&
    781 +                header->magic != COMPACT_TRIE_MAGIC_3) {
    782 +            status = U_ILLEGAL_ARGUMENT_ERROR;
    783 +        } else {
    784 +            size = header->size;
    785 +            magic = header->magic;
    786 +
    787 +            if (header->magic == COMPACT_TRIE_MAGIC_1) {
    788 +                CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
    789 +                nodeCount = headerV1->nodeCount;
    790 +                root = headerV1->root;
    791 +                offsets = &(headerV1->offsets[0]);
    792 +                address = (uint8_t *)headerV1;
    793 +            } else {
    794 +                nodeCount = header->nodeCount;
    795 +                root = header->root;
    796 +                offsets = &(header->offsets[0]);
    797 +                address = (uint8_t *)header;
    798 +            }
    799 +        }
    800 +    }
    801 +
    802 +    ~CompactTrieInfo(){}
    803  };
    804  
    805  // Note that to avoid platform-specific alignment issues, all members of the node
    806 @@ -375,10 +458,14 @@
    807  enum CompactTrieNodeFlags {
    808      kVerticalNode   = 0x1000,       // This is a vertical node
    809      kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
    810 -    kReservedFlag1  = 0x4000,
    811 -    kReservedFlag2  = 0x8000,
    812 +    kExceedsCount   = 0x4000,       // new MSB for count >= 4096, originally kReservedFlag1
    813 +    kEqualOverflows = 0x8000,       // Links to nodeIDs > 2^16, orig. kReservedFlag2
    814      kCountMask      = 0x0FFF,       // The count portion of flagscount
    815 -    kFlagMask       = 0xF000        // The flags portion of flagscount
    816 +    kFlagMask       = 0xF000,       // The flags portion of flagscount
    817 +    kRootCountMask  = 0x7FFF        // The count portion of flagscount in the root node
    818 +
    819 +    //offset flags:
    820 +    //kOffsetContainsValue = 0x80000000       // Offset contains value for parent node
    821  };
    822  
    823  // The two node types are distinguished by the kVerticalNode flag.
    824 @@ -402,63 +489,177 @@
    825      uint16_t        chars[1];       // Code units
    826  };
    827  
    828 -// {'Dic', 1}, version 1
    829 -#define COMPACT_TRIE_MAGIC_1 0x44696301
    830 -
    831  CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
    832                                                  UErrorCode &status )
    833  : fUData(dataObj)
    834  {
    835 -    fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
    836 +    fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    837 +    *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
    838      fOwnData = FALSE;
    839 -    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
    840 -        status = U_ILLEGAL_ARGUMENT_ERROR;
    841 -        fData = NULL;
    842 -    }
    843  }
    844 +
    845  CompactTrieDictionary::CompactTrieDictionary( const void *data,
    846                                                  UErrorCode &status )
    847  : fUData(NULL)
    848  {
    849 -    fData = (const CompactTrieHeader *) data;
    850 +    fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    851 +    *fInfo = CompactTrieInfo(data, status);
    852      fOwnData = FALSE;
    853 -    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
    854 -        status = U_ILLEGAL_ARGUMENT_ERROR;
    855 -        fData = NULL;
    856 -    }
    857  }
    858  
    859  CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
    860                                                  UErrorCode &status )
    861  : fUData(NULL)
    862  {
    863 -    fData = compactMutableTrieDictionary(dict, status);
    864 +    const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
    865 +    if (U_SUCCESS(status)) {
    866 +        fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
    867 +        *fInfo = CompactTrieInfo(header, status);
    868 +    }
    869 +
    870      fOwnData = !U_FAILURE(status);
    871  }
    872  
    873  CompactTrieDictionary::~CompactTrieDictionary() {
    874      if (fOwnData) {
    875 -        uprv_free((void *)fData);
    876 +        uprv_free((void *)(fInfo->address));
    877      }
    878 +    uprv_free((void *)fInfo);
    879 +
    880      if (fUData) {
    881          udata_close(fUData);
    882      }
    883  }
    884  
    885 +UBool CompactTrieDictionary::getValued() const{
    886 +    return fInfo->magic == COMPACT_TRIE_MAGIC_3;
    887 +}
    888 +
    889  uint32_t
    890  CompactTrieDictionary::dataSize() const {
    891 -    return fData->size;
    892 +    return fInfo->size;
    893  }
    894  
    895  const void *
    896  CompactTrieDictionary::data() const {
    897 -    return fData;
    898 +    return fInfo->address;
    899 +}
    900 +
    901 +//This function finds the address of a node for us, given its node ID
    902 +static inline const CompactTrieNode *
    903 +getCompactNode(const CompactTrieInfo *info, uint32_t node) {
    904 +    if(node < info->root-1) {
    905 +        return (const CompactTrieNode *)(&info->offsets[node]);
    906 +    } else {
    907 +        return (const CompactTrieNode *)(info->address + info->offsets[node]);
    908 +    }
    909  }
    910  
    911 -// This function finds the address of a node for us, given its node ID
    912 +//this version of getCompactNode is currently only used in compactMutableTrieDictionary()
    913  static inline const CompactTrieNode *
    914 -getCompactNode(const CompactTrieHeader *header, uint16_t node) {
    915 -    return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
    916 +getCompactNode(const CompactTrieHeader *header, uint32_t node) {
    917 +    if(node < header->root-1) {
    918 +        return (const CompactTrieNode *)(&header->offsets[node]);
    919 +    } else {
    920 +        return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
    921 +    }
    922 +}
    923 +
    924 +
    925 +/**
    926 + * Calculates the number of links in a node
    927 + * @param node The specified node
    928 + */
    929 +static inline const uint16_t
    930 +getCount(const CompactTrieNode *node){
    931 +    return (node->flagscount & kCountMask);
    932 +    //use the code below if number of links ever exceed 4096
    933 +    //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
    934 +}
    935 +
    936 +/**
    937 + * calculates the equal link node ID of a vertical node
    938 + * @param vnode The vertical node containing the equal link
    939 + * If kEqualOverflows is set, the upper 16 bits of the node ID are read
    940 + * from the uint16_t stored directly after vnode->chars[]
    941 + */
    942 +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
    943 +    if(vnode->flagscount & kEqualOverflows){
    944 +        // treat overflow bits as an extension of chars[]
    945 +        uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
    946 +        return vnode->equal + (((uint32_t)*overflow) << 16);
    947 +    }else{
    948 +        return vnode->equal;
    949 +    }
    950 +}
    951 +
    952 +/**
    953 + * calculates an equal link node ID of a horizontal node
    954 + * @param hnode The horizontal node containing the equal link
    955 + * @param index The index into hnode->entries[]
    956 + * @param nodeCount The length of hnode->entries[]
    957 + */
    958 +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
    959 +    if(hnode->flagscount & kEqualOverflows){
    960 +        //set overflow to point to the uint16_t containing the overflow bits 
    961 +        uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
    962 +        overflow += index/4;
    963 +        uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
    964 +        return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
    965 +    } else {
    966 +        return hnode->entries[index].equal;
    967 +    }
    968 +}
    969 +
    970 +/**
    971 + * Returns the value stored in the specified node which is associated with its
    972 + * parent node.
    973 + * TODO: how to tell whether the value is stored in the node or in the offset?
    974 + * Check whether node ID < fInfo->root!
    975 + */
    976 +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
    977 +    uint16_t count = getCount((CompactTrieNode *)hnode);
    978 +    uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
    979 +
    980 +    if(hnode->flagscount & kEqualOverflows)
    981 +        overflowSize = (count + 3) / 4 * sizeof(uint16_t);
    982 +    return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); 
    983 +}
    984 +
    985 +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
    986 +    // calculate size of total node ID overflow storage in bytes
    987 +    uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
    988 +    return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize)); 
    989 +}
    990 +
    991 +static inline uint16_t getValue(const CompactTrieNode *node){
    992 +    if(node->flagscount & kVerticalNode)
    993 +        return getValue((const CompactTrieVerticalNode *)node);
    994 +    else
    995 +        return getValue((const CompactTrieHorizontalNode *)node);
    996 +}
    997 +
    998 +//returns index of match in CompactTrieHorizontalNode.entries[] using binary search, or -1 if uc is not found
    999 +inline int16_t 
   1000 +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, 
   1001 +        UChar uc, uint16_t nodeCount){
   1002 +    int low = 0;
   1003 +    int high = nodeCount-1;
   1004 +    int middle;
   1005 +    while (high >= low) {
   1006 +        middle = (high+low)/2;
   1007 +        if (uc == entries[middle].ch) {
   1008 +            return middle;
   1009 +        }
   1010 +        else if (uc < entries[middle].ch) {
   1011 +            high = middle-1;
   1012 +        }
   1013 +        else {
   1014 +            low = middle+1;
   1015 +        }
   1016 +    }
   1017 +
   1018 +    return -1;
   1019  }
   1020  
   1021  int32_t
   1022 @@ -466,17 +667,38 @@
   1023                                  int32_t maxLength,
   1024                                  int32_t *lengths,
   1025                                  int &count,
   1026 -                                int limit ) const {
   1027 +                                int limit,
   1028 +                                uint16_t *values /*= NULL*/) const {
   1029 +    if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
   1030 +        values = NULL;
   1031 +
   1032      // TODO: current implementation works in UTF-16 space
   1033 -    const CompactTrieNode *node = getCompactNode(fData, fData->root);
   1034 +    const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
   1035      int mycount = 0;
   1036  
   1037      UChar uc = utext_current32(text);
   1038      int i = 0;
   1039  
   1040 +    // handle root node with only kEqualOverflows flag: assume horizontal node without parent
   1041 +    if(node != NULL){
   1042 +        const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
   1043 +        int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
   1044 +        if(index > -1){
   1045 +            node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
   1046 +            utext_next32(text);
   1047 +            uc = utext_current32(text);
   1048 +            ++i;
   1049 +        }else{
   1050 +            node = NULL;
   1051 +        }
   1052 +    }
   1053 +
   1054      while (node != NULL) {
   1055          // Check if the node we just exited ends a word
   1056          if (limit > 0 && (node->flagscount & kParentEndsWord)) {
   1057 +            if(values != NULL){
   1058 +                values[mycount] = getValue(node);
   1059 +            }
   1060              lengths[mycount++] = i;
   1061              --limit;
   1062          }
   1063 @@ -487,7 +709,7 @@
   1064              break;
   1065          }
   1066  
   1067 -        int nodeCount = (node->flagscount & kCountMask);
   1068 +        int nodeCount = getCount(node);
   1069          if (nodeCount == 0) {
   1070              // Special terminal node; return now
   1071              break;
   1072 @@ -507,35 +729,27 @@
   1073              // To get here we must have come through the whole list successfully;
   1074              // go on to the next node. Note that a word cannot end in the middle
   1075              // of a vertical node.
   1076 -            node = getCompactNode(fData, vnode->equal);
   1077 +            node = getCompactNode(fInfo, calcEqualLink(vnode));
   1078          }
   1079          else {
   1080              // Horizontal node; do binary search
   1081              const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
   1082 -            int low = 0;
   1083 -            int high = nodeCount-1;
   1084 -            int middle;
   1085 -            node = NULL;    // If we don't find a match, we'll fall out of the loop
   1086 -            while (high >= low) {
   1087 -                middle = (high+low)/2;
   1088 -                if (uc == hnode->entries[middle].ch) {
   1089 -                    // We hit a match; get the next node and next character
   1090 -                    node = getCompactNode(fData, hnode->entries[middle].equal);
   1091 -                    utext_next32(text);
   1092 -                    uc = utext_current32(text);
   1093 -                    ++i;
   1094 -                    break;
   1095 -                }
   1096 -                else if (uc < hnode->entries[middle].ch) {
   1097 -                    high = middle-1;
   1098 -                }
   1099 -                else {
   1100 -                    low = middle+1;
   1101 -                }
   1102 +            const CompactTrieHorizontalEntry *entries;
   1103 +            entries = hnode->entries;
   1104 +
   1105 +            int index = searchHorizontalEntries(entries, uc, nodeCount);
   1106 +            if(index > -1){  //
   1107 +                // We hit a match; get the next node and next character
   1108 +                node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
   1109 +                utext_next32(text);
   1110 +                uc = utext_current32(text);
   1111 +                ++i;
   1112 +            }else{
   1113 +                node = NULL;    // If we don't find a match, we'll fall out of the loop              
   1114              }
   1115          }
   1116      }
   1117 -exit:
   1118 +    exit:
   1119      count = mycount;
   1120      return i;
   1121  }
   1122 @@ -545,16 +759,16 @@
   1123  private:
   1124      UVector32               fNodeStack;     // Stack of nodes to process
   1125      UVector32               fIndexStack;    // Stack of where in node we are
   1126 -    const CompactTrieHeader *fHeader;       // Trie data
   1127 +    const CompactTrieInfo   *fInfo;         // Trie data
   1128  
   1129  public:
   1130      static UClassID U_EXPORT2 getStaticClassID(void);
   1131      virtual UClassID getDynamicClassID(void) const;
   1132  public:
   1133 -    CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) 
   1134 +    CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) 
   1135          : fNodeStack(status), fIndexStack(status) {
   1136 -        fHeader = header;
   1137 -        fNodeStack.push(header->root, status);
   1138 +        fInfo = info;
   1139 +        fNodeStack.push(info->root, status);
   1140          fIndexStack.push(0, status);
   1141          unistr.remove();
   1142      }
   1143 @@ -564,14 +778,14 @@
   1144      
   1145      virtual StringEnumeration *clone() const {
   1146          UErrorCode status = U_ZERO_ERROR;
   1147 -        return new CompactTrieEnumeration(fHeader, status);
   1148 +        return new CompactTrieEnumeration(fInfo, status);
   1149      }
   1150      
   1151      virtual const UnicodeString * snext(UErrorCode &status);
   1152  
   1153      // Very expensive, but this should never be used.
   1154      virtual int32_t count(UErrorCode &status) const {
   1155 -        CompactTrieEnumeration counter(fHeader, status);
   1156 +        CompactTrieEnumeration counter(fInfo, status);
   1157          int32_t result = 0;
   1158          while (counter.snext(status) != NULL && U_SUCCESS(status)) {
   1159              ++result;
   1160 @@ -582,7 +796,7 @@
   1161      virtual void reset(UErrorCode &status) {
   1162          fNodeStack.removeAllElements();
   1163          fIndexStack.removeAllElements();
   1164 -        fNodeStack.push(fHeader->root, status);
   1165 +        fNodeStack.push(fInfo->root, status);
   1166          fIndexStack.push(0, status);
   1167          unistr.remove();
   1168      }
   1169 @@ -595,26 +809,34 @@
   1170      if (fNodeStack.empty() || U_FAILURE(status)) {
   1171          return NULL;
   1172      }
   1173 -    const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
   1174 +    const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
   1175      int where = fIndexStack.peeki();
   1176      while (!fNodeStack.empty() && U_SUCCESS(status)) {
   1177 -        int nodeCount = (node->flagscount & kCountMask);
   1178 +        int nodeCount;
   1179 +
   1180 +        bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
   1181 +        if(isRoot){
   1182 +            nodeCount = node->flagscount & kRootCountMask;
   1183 +        } else {
   1184 +            nodeCount = getCount(node);
   1185 +        }
   1186 +
   1187          UBool goingDown = FALSE;
   1188          if (nodeCount == 0) {
   1189              // Terminal node; go up immediately
   1190              fNodeStack.popi();
   1191              fIndexStack.popi();
   1192 -            node = getCompactNode(fHeader, fNodeStack.peeki());
   1193 +            node = getCompactNode(fInfo, fNodeStack.peeki());
   1194              where = fIndexStack.peeki();
   1195          }
   1196 -        else if (node->flagscount & kVerticalNode) {
   1197 +        else if ((node->flagscount & kVerticalNode) && !isRoot) {
   1198              // Vertical node
   1199              const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
   1200              if (where == 0) {
   1201                  // Going down
   1202 -                unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
   1203 +                unistr.append((const UChar *)vnode->chars, nodeCount);
   1204                  fIndexStack.setElementAt(1, fIndexStack.size()-1);
   1205 -                node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
   1206 +                node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
   1207                  where = fIndexStack.push(0, status);
   1208                  goingDown = TRUE;
   1209              }
   1210 @@ -623,7 +845,7 @@
   1211                  unistr.truncate(unistr.length()-nodeCount);
   1212                  fNodeStack.popi();
   1213                  fIndexStack.popi();
   1214 -                node = getCompactNode(fHeader, fNodeStack.peeki());
   1215 +                node = getCompactNode(fInfo, fNodeStack.peeki());
   1216                  where = fIndexStack.peeki();
   1217              }
   1218          }
   1219 @@ -638,7 +860,7 @@
   1220                  // Push on next node
   1221                  unistr.append((UChar)hnode->entries[where].ch);
   1222                  fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
   1223 -                node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
   1224 +                node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
   1225                  where = fIndexStack.push(0, status);
   1226                  goingDown = TRUE;
   1227              }
   1228 @@ -646,12 +868,14 @@
   1229                  // Going up
   1230                  fNodeStack.popi();
   1231                  fIndexStack.popi();
   1232 -                node = getCompactNode(fHeader, fNodeStack.peeki());
   1233 +                node = getCompactNode(fInfo, fNodeStack.peeki());
   1234                  where = fIndexStack.peeki();
   1235              }
   1236          }
   1237 +
   1238          // Check if the parent of the node we've just gone down to ends a
   1239          // word. If so, return it.
   1240 +        // The root node should never end up here.
   1241          if (goingDown && (node->flagscount & kParentEndsWord)) {
   1242              return &unistr;
   1243          }
   1244 @@ -664,7 +888,7 @@
   1245      if (U_FAILURE(status)) {
   1246          return NULL;
   1247      }
   1248 -    return new CompactTrieEnumeration(fData, status);
   1249 +    return new CompactTrieEnumeration(fInfo, status);
   1250  }
   1251  
   1252  //
   1253 @@ -672,21 +896,36 @@
   1254  // and back again
   1255  //
   1256  
   1257 -// Helper classes to construct the compact trie
   1258 +enum CompactTrieNodeType {
   1259 +    kHorizontalType = 0,
   1260 +    kVerticalType = 1,
   1261 +    kValueType = 2
   1262 +};
   1263 +
   1264 +/**
   1265 + * The following classes (i.e. BuildCompactTrie*Node) are helper classes to 
   1266 + * construct the compact trie by storing information for each node and later 
   1267 + * writing the node to memory in a sequential format.
   1268 + */
   1269  class BuildCompactTrieNode: public UMemory {
   1270 - public:
   1271 +public:
   1272      UBool           fParentEndsWord;
   1273 -    UBool           fVertical;
   1274 +    CompactTrieNodeType fNodeType;
   1275      UBool           fHasDuplicate;
   1276 +    UBool           fEqualOverflows;
   1277      int32_t         fNodeID;
   1278      UnicodeString   fChars;
   1279 +    uint16_t        fValue;
   1280  
   1281 - public:
   1282 -    BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
   1283 +public:
   1284 +    BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, 
   1285 +            UStack &nodes, UErrorCode &status, uint16_t value = 0) {
   1286          fParentEndsWord = parentEndsWord;
   1287          fHasDuplicate = FALSE;
   1288 -        fVertical = vertical;
   1289 +        fNodeType = nodeType;
   1290 +        fEqualOverflows = FALSE;
   1291          fNodeID = nodes.size();
   1292 +        fValue = parentEndsWord? value : 0;
   1293          nodes.push(this, status);
   1294      }
   1295      
   1296 @@ -694,87 +933,225 @@
   1297      }
   1298      
   1299      virtual uint32_t size() {
   1300 -        return sizeof(uint16_t);
   1301 +        if(fValue > 0)
   1302 +            return sizeof(uint16_t) * 2;
   1303 +        else
   1304 +            return sizeof(uint16_t);
   1305      }
   1306      
   1307      virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
   1308          // Write flag/count
   1309 -        *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
   1310 -            | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
   1311 +
   1312 +        // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
   1313 +        // used as a 5th MSB.
   1314 +        U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
   1315 +
   1316 +        *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) | 
   1317 +        ((fNodeID == 2)? (fChars.length() & kRootCountMask): 
   1318 +            (
   1319 +                    (fChars.length() & kCountMask) | 
   1320 +                    //((fChars.length() << 2) & kExceedsCount) |
   1321 +                    (fNodeType == kVerticalType ? kVerticalNode : 0) | 
   1322 +                    (fParentEndsWord ? kParentEndsWord : 0 )
   1323 +            )
   1324 +        );
   1325          offset += sizeof(uint16_t);
   1326      }
   1327 +
   1328 +    virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
   1329 +        if(fValue > 0){
   1330 +            *((uint16_t *)(bytes+offset)) = fValue;
   1331 +            offset += sizeof(uint16_t);
   1332 +        }
   1333 +    }
   1334 +
   1335 +};
   1336 +
   1337 +/**
   1338 + * Stores value of parent terminating nodes that have no more subtries. 
   1339 + */
   1340 +class BuildCompactTrieValueNode: public BuildCompactTrieNode {
   1341 +public:
   1342 +    BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
   1343 +        : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
   1344 +    }
   1345 +
   1346 +    virtual ~BuildCompactTrieValueNode(){
   1347 +    }
   1348 +
   1349 +    virtual uint32_t size() {
   1350 +        return sizeof(uint16_t) * 2;
   1351 +    }
   1352 +
   1353 +    virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
   1354 +        // don't write value directly to memory but store it in offset to be written later
   1355 +        //offset = fValue & kOffsetContainsValue;
   1356 +        BuildCompactTrieNode::write(bytes, offset, translate);
   1357 +        BuildCompactTrieNode::writeValue(bytes, offset);
   1358 +    }
   1359  };
   1360  
   1361  class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
   1362   public:
   1363      UStack          fLinks;
   1364 +    UBool           fMayOverflow; //intermediate value for fEqualOverflows
   1365  
   1366   public:
   1367 -    BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
   1368 -        : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
   1369 +    BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
   1370 +    : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
   1371 +        fMayOverflow = FALSE;
   1372      }
   1373      
   1374      virtual ~BuildCompactTrieHorizontalNode() {
   1375      }
   1376      
   1377 +    // It is impossible to know beforehand exactly how much space the node will
   1378 +    // need in memory before being written, because the node IDs in the equal
   1379 +    // links may or may not overflow after node coalescing. Therefore, this method 
   1380 +    // returns the maximum size possible for the node.
   1381      virtual uint32_t size() {
   1382 -        return offsetof(CompactTrieHorizontalNode,entries) +
   1383 -                (fChars.length()*sizeof(CompactTrieHorizontalEntry));
   1384 +        uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
   1385 +        (fChars.length()*sizeof(CompactTrieHorizontalEntry));
   1386 +
   1387 +        if(fValue > 0)
   1388 +            estimatedSize += sizeof(uint16_t);
   1389 +
   1390 +        //estimate extra space needed to store overflow for node ID links
   1391 +        //may be more than what is actually needed
   1392 +        for(int i=0; i < fChars.length(); i++){
   1393 +            if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
   1394 +                fMayOverflow = TRUE;
   1395 +                break;
   1396 +            }          
   1397 +        }
   1398 +        if(fMayOverflow) // intended to equal ceil(fChars.length()/4) * sizeof(uint16_t); NOTE(review): the expression below is smaller for some lengths (e.g. 1 byte for length 1, where write() emits 2) - verify
   1399 +            estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
   1400 +
   1401 +        return estimatedSize;
   1402      }
   1403      
   1404      virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
   1405 -        BuildCompactTrieNode::write(bytes, offset, translate);
   1406          int32_t count = fChars.length();
   1407 +
   1408 +        //if any translated nodeID exceeds 0xFFFF (i.e. is >= 2^16), set flag
   1409 +        //large node IDs are more likely to be at the back of the array
   1410 +        for (int32_t i = count-1; i >= 0; --i) {
   1411 +            if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
   1412 +                fEqualOverflows = TRUE;
   1413 +                break;
   1414 +            }
   1415 +        }
   1416 +
   1417 +        BuildCompactTrieNode::write(bytes, offset, translate);
   1418 +
   1419 +        // write entries[] to memory
   1420          for (int32_t i = 0; i < count; ++i) {
   1421              CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
   1422              entry->ch = fChars[i];
   1423              entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1424  #ifdef DEBUG_TRIE_DICT
   1425 -            if (entry->equal == 0) {
   1426 +
   1427 +            if ((entry->equal == 0) && !fEqualOverflows) {
   1428                  fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
   1429                          i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1430              }
   1431  #endif
   1432              offset += sizeof(CompactTrieHorizontalEntry);
   1433          }
   1434 +
   1435 +        // append extra bits of equal nodes to end if fEqualOverflows
   1436 +        if (fEqualOverflows) {
   1437 +            uint16_t leftmostBits = 0;
   1438 +            for (int16_t i = 0; i < count; i++) {
   1439 +                leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
   1440 +
   1441 +                // write filled uint16_t to memory
   1442 +                if(i % 4 == 3){
   1443 +                    *((uint16_t *)(bytes+offset)) = leftmostBits;
   1444 +                    leftmostBits = 0;
   1445 +                    offset += sizeof(uint16_t);
   1446 +                }
   1447 +            }
   1448 +
   1449 +            // pad last uint16_t with zeroes if necessary
   1450 +            int remainder = count % 4;
   1451 +            if (remainder > 0) {
   1452 +                *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
   1453 +                offset += sizeof(uint16_t);
   1454 +            }
   1455 +        }
   1456 +
   1457 +        BuildCompactTrieNode::writeValue(bytes, offset);
   1458 +    }
   1459 +
   1460 +    // returns leftmost bits of physical node link
   1461 +    uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
   1462 +        uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
   1463 +#ifdef DEBUG_TRIE_DICT
   1464 +        if (leftmostBits > 0xF) {
   1465 +            fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
   1466 +                    i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
   1467 +        }
   1468 +#endif
   1469 +        return leftmostBits;
   1470      }
   1471      
   1472      void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
   1473          fChars.append(ch);
   1474          fLinks.push(link, status);
   1475      }
   1476 +
   1477  };
   1478  
   1479  class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
   1480 - public:
   1481 +public:
   1482      BuildCompactTrieNode    *fEqual;
   1483  
   1484 - public:
   1485 -    BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
   1486 -        : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
   1487 +public:
   1488 +    BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
   1489 +    : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
   1490          fEqual = NULL;
   1491      }
   1492      
   1493      virtual ~BuildCompactTrieVerticalNode() {
   1494      }
   1495      
   1496 +    // Returns the maximum possible size of this node. See comment in 
   1497 +    // BuildCompactTrieHorizontalNode::size() for more information.
   1498      virtual uint32_t size() {
   1499 -        return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
   1500 +        uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
   1501 +        if(fValue > 0){
   1502 +            estimatedSize += sizeof(uint16_t);
   1503 +        }
   1504 +
   1505 +        if(fEqual->fNodeID > 0xFFFF){
   1506 +            estimatedSize += sizeof(uint16_t);
   1507 +        }
   1508 +        return estimatedSize;
   1509      }
   1510      
   1511      virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
   1512          CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
   1513 +        fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
   1514          BuildCompactTrieNode::write(bytes, offset, translate);
   1515          node->equal = translate.elementAti(fEqual->fNodeID);
   1516          offset += sizeof(node->equal);
   1517  #ifdef DEBUG_TRIE_DICT
   1518 -        if (node->equal == 0) {
   1519 +        if ((node->equal == 0) && !fEqualOverflows) {
   1520              fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
   1521                      fEqual->fNodeID);
   1522          }
   1523  #endif
   1524          fChars.extract(0, fChars.length(), (UChar *)node->chars);
   1525 -        offset += sizeof(uint16_t)*fChars.length();
   1526 +        offset += sizeof(UChar)*fChars.length();
   1527 +
   1528 +        // append the upper 16 bits of the equal link to the end if fEqualOverflows
   1529 +        if (fEqualOverflows) {
   1530 +            *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
   1531 +            offset += sizeof(uint16_t);
   1532 +        }
   1533 +
   1534 +        BuildCompactTrieNode::writeValue(bytes, offset);
   1535      }
   1536      
   1537      void addChar(UChar ch) {
   1538 @@ -784,60 +1161,85 @@
   1539      void setLink(BuildCompactTrieNode *node) {
   1540          fEqual = node;
   1541      }
   1542 +    
   1543  };
   1544  
   1545  // Forward declaration
   1546  static void walkHorizontal(const TernaryNode *node,
   1547                              BuildCompactTrieHorizontalNode *building,
   1548                              UStack &nodes,
   1549 -                            UErrorCode &status);
   1550 +                            UErrorCode &status,
   1551 +                            Hashtable *values);
   1552  
   1553 -// Convert one node. Uses recursion.
   1554 +// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
   1555  
   1556  static BuildCompactTrieNode *
   1557 -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
   1558 +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, 
   1559 +        UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
   1560      if (U_FAILURE(status)) {
   1561          return NULL;
   1562      }
   1563      BuildCompactTrieNode *result = NULL;
   1564      UBool horizontal = (node->low != NULL || node->high != NULL);
   1565      if (horizontal) {
   1566 -        BuildCompactTrieHorizontalNode *hResult =
   1567 -                new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
   1568 +        BuildCompactTrieHorizontalNode *hResult;
   1569 +        if(values != NULL){
   1570 +            hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
   1571 +        } else {
   1572 +            hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
   1573 +        }
   1574 +
   1575          if (hResult == NULL) {
   1576              status = U_MEMORY_ALLOCATION_ERROR;
   1577              return NULL;
   1578          }
   1579          if (U_SUCCESS(status)) {
   1580 -            walkHorizontal(node, hResult, nodes, status);
   1581 +            walkHorizontal(node, hResult, nodes, status, values);
   1582              result = hResult;
   1583          }
   1584      }
   1585      else {
   1586 -        BuildCompactTrieVerticalNode *vResult =
   1587 -                new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
   1588 +        BuildCompactTrieVerticalNode *vResult;
   1589 +        if(values != NULL){
   1590 +            vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
   1591 +        } else { 
   1592 +            vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
   1593 +        }
   1594 +
   1595          if (vResult == NULL) {
   1596              status = U_MEMORY_ALLOCATION_ERROR;
   1597 +            return NULL;
   1598          }
   1599          else if (U_SUCCESS(status)) {
   1600 -            UBool   endsWord = FALSE;
   1601 +            uint16_t   value = 0;
   1602 +            UBool endsWord = FALSE;
   1603              // Take up nodes until we end a word, or hit a node with < or > links
   1604              do {
   1605                  vResult->addChar(node->ch);
   1606 -                endsWord = (node->flags & kEndsWord) != 0;
   1607 +                value = node->flags;
   1608 +                endsWord = value > 0;
   1609                  node = node->equal;
   1610              }
   1611              while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
   1612 +
   1613              if (node == NULL) {
   1614                  if (!endsWord) {
   1615                      status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
   1616                  }
   1617 -                else {
   1618 +                else if(values != NULL){
   1619 +                    UnicodeString key(value); //store value as a single-char UnicodeString
   1620 +                    BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
   1621 +                    if(link == NULL){
   1622 +                        link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
   1623 +                        values->put(key, link, status);
   1624 +                    }
   1625 +                    vResult->setLink(link);
   1626 +                } else {
   1627                      vResult->setLink((BuildCompactTrieNode *)nodes[1]);
   1628                  }
   1629              }
   1630              else {
   1631 -                vResult->setLink(compactOneNode(node, endsWord, nodes, status));
   1632 +                vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
   1633              }
   1634              result = vResult;
   1635          }
   1636 @@ -849,19 +1251,28 @@
   1637  // Uses recursion.
   1638  
   1639  static void walkHorizontal(const TernaryNode *node,
   1640 -                            BuildCompactTrieHorizontalNode *building,
   1641 -                            UStack &nodes,
   1642 -                            UErrorCode &status) {
   1643 +                           BuildCompactTrieHorizontalNode *building,
   1644 +                           UStack &nodes,
   1645 +                           UErrorCode &status, Hashtable *values = NULL) {
   1646      while (U_SUCCESS(status) && node != NULL) {
   1647          if (node->low != NULL) {
   1648 -            walkHorizontal(node->low, building, nodes, status);
   1649 +            walkHorizontal(node->low, building, nodes, status, values);
   1650          }
   1651          BuildCompactTrieNode *link = NULL;
   1652          if (node->equal != NULL) {
   1653 -            link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
   1654 +            link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
   1655          }
   1656 -        else if (node->flags & kEndsWord) {
   1657 -            link = (BuildCompactTrieNode *)nodes[1];
   1658 +        else if (node->flags > 0) {
   1659 +            if(values != NULL) {
   1660 +                UnicodeString key(node->flags); //store value as a single-char UnicodeString
   1661 +                link = (BuildCompactTrieValueNode *) values->get(key);
   1662 +                if(link == NULL) {
   1663 +                    link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
   1664 +                    values->put(key, link, status);
   1665 +                }
   1666 +            } else {
   1667 +                link = (BuildCompactTrieNode *)nodes[1];
   1668 +            }
   1669          }
   1670          if (U_SUCCESS(status) && link != NULL) {
   1671              building->addNode(node->ch, link, status);
   1672 @@ -881,13 +1292,15 @@
   1673  _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
   1674      BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
   1675      BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
   1676 +
   1677      // Check for comparing a node to itself, to avoid spurious duplicates
   1678      if (left == right) {
   1679          return 0;
   1680      }
   1681 +
   1682      // Most significant is type of node. Can never coalesce.
   1683 -    if (left->fVertical != right->fVertical) {
   1684 -        return left->fVertical - right->fVertical;
   1685 +    if (left->fNodeType != right->fNodeType) {
   1686 +        return left->fNodeType - right->fNodeType;
   1687      }
   1688      // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
   1689      if (left->fParentEndsWord != right->fParentEndsWord) {
   1690 @@ -898,12 +1311,19 @@
   1691      if (result != 0) {
   1692          return result;
   1693      }
   1694 +
   1695 +    // If the node value differs, we should not coalesce.
   1696 +    // If values aren't stored, all fValues should be 0.
   1697 +    if (left->fValue != right->fValue) {
   1698 +        return left->fValue - right->fValue;
   1699 +    }
   1700 +
   1701      // We know they're both the same node type, so branch for the two cases.
   1702 -    if (left->fVertical) {
   1703 +    if (left->fNodeType == kVerticalType) {
   1704          result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
   1705 -                            - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
   1706 +        - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
   1707      }
   1708 -    else {
   1709 +    else if(left->fChars.length() > 0 && right->fChars.length() > 0){
   1710          // We need to compare the links vectors. They should be the
   1711          // same size because the strings were equal.
   1712          // We compare the node IDs instead of the pointers, to handle
   1713 @@ -914,9 +1334,10 @@
   1714          int32_t count = hleft->fLinks.size();
   1715          for (int32_t i = 0; i < count && result == 0; ++i) {
   1716              result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
   1717 -                     ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
   1718 +            ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
   1719          }
   1720      }
   1721 +
   1722      // If they are equal to each other, mark them (speeds coalescing)
   1723      if (result == 0) {
   1724          left->fHasDuplicate = TRUE;
   1725 @@ -1031,20 +1452,25 @@
   1726      // Add node 0, used as the NULL pointer/sentinel.
   1727      nodes.addElement((int32_t)0, status);
   1728  
   1729 +    Hashtable *values = NULL;                           // Index of (unique) values
   1730 +    if (dict.fValued) {
   1731 +        values = new Hashtable(status);
   1732 +    }
   1733 +
   1734      // Start by creating the special empty node we use to indicate that the parent
   1735      // terminates a word. This must be node 1, because the builder assumes
   1736 -    // that.
   1737 +    // that. This node will never be used for tries storing numerical values.
   1738      if (U_FAILURE(status)) {
   1739          return NULL;
   1740      }
   1741 -    BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
   1742 +    BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
   1743      if (terminal == NULL) {
   1744          status = U_MEMORY_ALLOCATION_ERROR;
   1745      }
   1746  
   1747      // This call does all the work of building the new trie structure. The root
   1748 -    // will be node 2.
   1749 -    BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
   1750 +    // will have node ID 2 before writing to memory.
   1751 +    BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
   1752  #ifdef DEBUG_TRIE_DICT
   1753      (void) ::times(&timing);
   1754      fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
   1755 @@ -1077,21 +1503,37 @@
   1756          return NULL;
   1757      }
   1758  
   1759 +    //map terminal value nodes
   1760 +    int valueCount = 0;
   1761 +    UVector valueNodes(status);
   1762 +    if(values != NULL) {
   1763 +        valueCount = values->count(); //number of unique terminal value nodes
   1764 +    }
   1765 +     
   1766 +    // map non-terminal nodes
   1767 +    int valuePos = 1;//, nodePos = valueCount + valuePos;
   1768 +    nodeCount = valueCount + valuePos;
   1769      for (i = 1; i < count; ++i) {
   1770          node = (BuildCompactTrieNode *)nodes[i];
   1771          if (node->fNodeID == i) {
   1772              // Only one node out of each duplicate set is used
   1773 -            if (i >= translate.size()) {
   1774 +            if (node->fNodeID >= translate.size()) {
   1775                  // Logically extend the mapping table
   1776 -                translate.setSize(i+1);
   1777 +                translate.setSize(i + 1);
   1778 +            }
   1779 +            //translate.setElementAt(object, index)!
   1780 +            if(node->fNodeType == kValueType) {
   1781 +                valueNodes.addElement(node, status);
   1782 +               translate.setElementAt(valuePos++, i);
   1783 +             } else {
   1784 +                translate.setElementAt(nodeCount++, i);
   1785              }
   1786 -            translate.setElementAt(nodeCount++, i);
   1787              totalSize += node->size();
   1788          }
   1789      }
   1790 -    
   1791 -    // Check for overflowing 16 bits worth of nodes.
   1792 -    if (nodeCount > 0x10000) {
   1793 +
   1794 +    // Check for overflowing 20 bits worth of nodes.
   1795 +    if (nodeCount > 0x100000) {
   1796          status = U_ILLEGAL_ARGUMENT_ERROR;
   1797          return NULL;
   1798      }
   1799 @@ -1111,9 +1553,14 @@
   1800          status = U_MEMORY_ALLOCATION_ERROR;
   1801          return NULL;
   1802      }
   1803 -
   1804 +    
   1805      CompactTrieHeader *header = (CompactTrieHeader *)bytes;
   1806 -    header->size = totalSize;
   1807 +    //header->size = totalSize;
   1808 +    if(dict.fValued){
   1809 +        header->magic = COMPACT_TRIE_MAGIC_3;
   1810 +    } else {
   1811 +        header->magic = COMPACT_TRIE_MAGIC_2;
   1812 +    }
   1813      header->nodeCount = nodeCount;
   1814      header->offsets[0] = 0;                     // Sentinel
   1815      header->root = translate.elementAti(root->fNodeID);
   1816 @@ -1123,23 +1570,40 @@
   1817      }
   1818  #endif
   1819      uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
   1820 -    nodeCount = 1;
   1821 +    nodeCount = valueCount + 1;
   1822 +
   1823 +    // Write terminal value nodes to memory
   1824 +    for (i=0; i < valueNodes.size(); i++) {
   1825 +        //header->offsets[i + 1] = offset;
   1826 +        uint32_t tmpOffset = 0;
   1827 +        node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
   1828 +        //header->offsets[i + 1] = (uint32_t)node->fValue;
   1829 +        node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
   1830 +    }
   1831 +
   1832      // Now write the data
   1833      for (i = 1; i < count; ++i) {
   1834          node = (BuildCompactTrieNode *)nodes[i];
   1835 -        if (node->fNodeID == i) {
   1836 +        if (node->fNodeID == i && node->fNodeType != kValueType) {
   1837              header->offsets[nodeCount++] = offset;
   1838              node->write(bytes, offset, translate);
   1839          }
   1840      }
   1841 +
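   1842 +    //shrink the buffer to the bytes actually written; NOTE(review): the uprv_realloc() result is ignored, so bytes/header are stale if the block moves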
   1843 +    uprv_realloc(bytes, offset);
   1844 +    header->size = offset;
   1845 +
   1846  #ifdef DEBUG_TRIE_DICT
   1847 +    fprintf(stdout, "Space freed: %d\n", totalSize-offset);
   1848 +
   1849      (void) ::times(&timing);
   1850      fprintf(stderr, "Trie built, time user %f system %f\n",
   1851          (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
   1852          (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
   1853      previous = timing;
   1854      fprintf(stderr, "Final offset is %d\n", offset);
   1855 -    
   1856 +
   1857      // Collect statistics on node types and sizes
   1858      int hCount = 0;
   1859      int vCount = 0;
   1860 @@ -1148,68 +1612,85 @@
   1861      size_t hItemCount = 0;
   1862      size_t vItemCount = 0;
   1863      uint32_t previousOff = offset;
   1864 -    for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
   1865 +    uint32_t numOverflow = 0;
   1866 +    uint32_t valueSpace = 0;
   1867 +    for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
   1868          const CompactTrieNode *node = getCompactNode(header, nodeIdx);
   1869 -        if (node->flagscount & kVerticalNode) {
   1870 +        int itemCount;
   1871 +        if(nodeIdx == header->root)
   1872 +            itemCount = node->flagscount & kRootCountMask;
   1873 +        else
   1874 +            itemCount = getCount(node);
   1875 +        if(node->flagscount & kEqualOverflows){
   1876 +            numOverflow++;
   1877 +        }
   1878 +        if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
   1879              vCount += 1;
   1880 -            vItemCount += (node->flagscount & kCountMask);
   1881 +            vItemCount += itemCount;
   1882              vSize += previousOff-header->offsets[nodeIdx];
   1883          }
   1884          else {
   1885              hCount += 1;
   1886 -            hItemCount += (node->flagscount & kCountMask);
   1887 -            hSize += previousOff-header->offsets[nodeIdx];
   1888 +            hItemCount += itemCount;
   1889 +            if(nodeIdx >= header->root) {
   1890 +                hSize += previousOff-header->offsets[nodeIdx];
   1891 +            }
   1892          }
   1893 +        
   1894 +        if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
   1895 +            valueSpace += sizeof(uint16_t);
   1896          previousOff = header->offsets[nodeIdx];
   1897      }
   1898      fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
   1899                  (double)hSize/hCount, (double)hItemCount/hCount);
   1900      fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
   1901                  (double)vSize/vCount, (double)vItemCount/vCount);
   1902 +    fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
   1903 +    fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
   1904  #endif
   1905  
   1906      if (U_FAILURE(status)) {
   1907          uprv_free(bytes);
   1908          header = NULL;
   1909      }
   1910 -    else {
   1911 -        header->magic = COMPACT_TRIE_MAGIC_1;
   1912 -    }
   1913      return header;
   1914  }
   1915  
   1916  // Forward declaration
   1917  static TernaryNode *
   1918 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );
   1919 -
   1920 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
   1921  
   1922  // Convert a horizontal node (or subarray thereof) into a ternary subtrie
   1923  static TernaryNode *
   1924 -unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
   1925 -                            int low, int high, UErrorCode &status ) {
   1926 +unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
   1927 +        int low, int high, int nodeCount, UErrorCode &status) {
   1928      if (U_FAILURE(status) || low > high) {
   1929          return NULL;
   1930      }
   1931      int middle = (low+high)/2;
   1932 -    TernaryNode *result = new TernaryNode(array[middle].ch);
   1933 +    TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
   1934      if (result == NULL) {
   1935          status = U_MEMORY_ALLOCATION_ERROR;
   1936          return NULL;
   1937      }
   1938 -    const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
   1939 +    const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
   1940      if (equal->flagscount & kParentEndsWord) {
   1941 -        result->flags |= kEndsWord;
   1942 +        if(info->magic == COMPACT_TRIE_MAGIC_3){
   1943 +            result->flags = getValue(equal);
   1944 +        }else{
   1945 +            result->flags |= kEndsWord;
   1946 +        }
   1947      }
   1948 -    result->low = unpackHorizontalArray(header, array, low, middle-1, status);
   1949 -    result->high = unpackHorizontalArray(header, array, middle+1, high, status);
   1950 -    result->equal = unpackOneNode(header, equal, status);
   1951 +    result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
   1952 +    result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
   1953 +    result->equal = unpackOneNode(info, equal, status);
   1954      return result;
   1955  }                            
   1956  
   1957  // Convert one compact trie node into a ternary subtrie
   1958  static TernaryNode *
   1959 -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
   1960 -    int nodeCount = (node->flagscount & kCountMask);
   1961 +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
   1962 +    int nodeCount = getCount(node);
   1963      if (nodeCount == 0 || U_FAILURE(status)) {
   1964          // Failure, or terminal node
   1965          return NULL;
   1966 @@ -1234,29 +1715,41 @@
   1967              previous = latest;
   1968          }
   1969          if (latest != NULL) {
   1970 -            const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
   1971 +            const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
   1972              if (equal->flagscount & kParentEndsWord) {
   1973 -                latest->flags |= kEndsWord;
   1974 +                if(info->magic == COMPACT_TRIE_MAGIC_3){
   1975 +                    latest->flags = getValue(equal);
   1976 +                } else {
   1977 +                    latest->flags |= kEndsWord;
   1978 +                }
   1979              }
   1980 -            latest->equal = unpackOneNode(header, equal, status);
   1981 +            latest->equal = unpackOneNode(info, equal, status);
   1982          }
   1983          return head;
   1984      }
   1985      else {
   1986          // Horizontal node
   1987          const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
   1988 -        return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
   1989 +        return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
   1990      }
   1991  }
   1992  
   1993 +// returns a MutableTrieDictionary generated from the CompactTrieDictionary
   1994  MutableTrieDictionary *
   1995  CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
   1996 -    MutableTrieDictionary *result = new MutableTrieDictionary( status );
   1997 +    MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
   1998      if (result == NULL) {
   1999          status = U_MEMORY_ALLOCATION_ERROR;
   2000          return NULL;
   2001      }
   2002 -    TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
   2003 +    // treat root node as special case: bypass unpackOneNode() and call unpackHorizontalArray() directly
   2004 +    // because only kEqualOverflows flag should be checked in root's flagscount
   2005 +    const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *) 
   2006 +    getCompactNode(fInfo, fInfo->root);
   2007 +    uint16_t nodeCount = hnode->flagscount & kRootCountMask;
   2008 +    TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, 
   2009 +            nodeCount, status);
   2010 +
   2011      if (U_FAILURE(status)) {
   2012          delete root;    // Clean up
   2013          delete result;
   2014 @@ -1270,8 +1763,8 @@
   2015  
   2016  U_CAPI int32_t U_EXPORT2
   2017  triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
   2018 -           UErrorCode *status) {
   2019 -
   2020 +        UErrorCode *status) {
   2021 +    
   2022      if (status == NULL || U_FAILURE(*status)) {
   2023          return 0;
   2024      }
   2025 @@ -1286,14 +1779,14 @@
   2026      //
   2027      const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
   2028      if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
   2029 -           pInfo->dataFormat[1]==0x72 &&
   2030 -           pInfo->dataFormat[2]==0x44 &&
   2031 -           pInfo->dataFormat[3]==0x63 &&
   2032 -           pInfo->formatVersion[0]==1  )) {
   2033 +            pInfo->dataFormat[1]==0x72 &&
   2034 +            pInfo->dataFormat[2]==0x44 &&
   2035 +            pInfo->dataFormat[3]==0x63 &&
   2036 +            pInfo->formatVersion[0]==1  )) {
   2037          udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
   2038 -                         pInfo->dataFormat[0], pInfo->dataFormat[1],
   2039 -                         pInfo->dataFormat[2], pInfo->dataFormat[3],
   2040 -                         pInfo->formatVersion[0]);
   2041 +                pInfo->dataFormat[0], pInfo->dataFormat[1],
   2042 +                pInfo->dataFormat[2], pInfo->dataFormat[3],
   2043 +                pInfo->formatVersion[0]);
   2044          *status=U_UNSUPPORTED_ERROR;
   2045          return 0;
   2046      }
   2047 @@ -1311,8 +1804,10 @@
   2048      //
   2049      const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
   2050      const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
   2051 -    if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
   2052 -            || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
   2053 +    uint32_t magic = ds->readUInt32(header->magic);
   2054 +    if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
   2055 +            || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
   2056 +            || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
   2057      {
   2058          udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
   2059          *status=U_UNSUPPORTED_ERROR;
   2060 @@ -1333,10 +1828,10 @@
   2061      //
   2062      if (length < sizeWithUData) {
   2063          udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
   2064 -                            totalSize);
   2065 +                totalSize);
   2066          *status=U_INDEX_OUTOFBOUNDS_ERROR;
   2067          return 0;
   2068 -        }
   2069 +    }
   2070  
   2071      //
   2072      // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
   2073 @@ -1355,20 +1850,38 @@
   2074      }
   2075  
   2076      // We need to loop through all the nodes in the offset table, and swap each one.
   2077 -    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
   2078 +    uint32_t nodeCount, rootId;
   2079 +    if(header->magic == COMPACT_TRIE_MAGIC_1) {
   2080 +        nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
   2081 +        rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
   2082 +    } else {
   2083 +        nodeCount = ds->readUInt32(header->nodeCount);
   2084 +        rootId = ds->readUInt32(header->root);
   2085 +    }
   2086 +
   2087      // Skip node 0, which should always be 0.
   2088 -    for (int i = 1; i < nodeCount; ++i) {
   2089 +    for (uint32_t i = 1; i < nodeCount; ++i) {
   2090          uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
   2091          const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
   2092          CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
   2093          uint16_t flagscount = ds->readUInt16(inNode->flagscount);
   2094 -        uint16_t itemCount = flagscount & kCountMask;
   2095 +        uint16_t itemCount = getCount(inNode);
   2096 +        //uint16_t itemCount = flagscount & kCountMask;
   2097          ds->writeUInt16(&outNode->flagscount, flagscount);
   2098          if (itemCount > 0) {
   2099 -            if (flagscount & kVerticalNode) {
   2100 +            uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
   2101 +            if (flagscount & kVerticalNode && i != rootId) {
   2102 +                if(flagscount & kEqualOverflows){
   2103 +                    // include overflow bits
   2104 +                    overflow += 1;
   2105 +                }
   2106 +                if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {
   2107 +                    //include values
   2108 +                    overflow += 1;
   2109 +                }
   2110                  ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
   2111 -                                    itemCount*sizeof(uint16_t),
   2112 -                                    outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
   2113 +                        (itemCount + overflow)*sizeof(uint16_t),
   2114 +                        outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
   2115                  uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
   2116                  ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
   2117              }
   2118 @@ -1381,26 +1894,62 @@
   2119                      word = ds->readUInt16(inHNode->entries[j].equal);
   2120                      ds->writeUInt16(&outHNode->entries[j].equal, word);
   2121                  }
   2122 +
   2123 +                // swap overflow/value information
   2124 +                if(flagscount & kEqualOverflows){
   2125 +                    overflow += (itemCount + 3) / 4;
   2126 +                }
   2127 +
   2128 +                if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {
   2129 +                    //include values
   2130 +                    overflow += 1;
   2131 +                }
   2132 +
   2133 +                uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
   2134 +                uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
   2135 +                for(int j = 0; j<overflow; j++){
   2136 +                    uint16_t extraInfo = ds->readUInt16(*inOverflow);
   2137 +                    ds->writeUInt16(outOverflow, extraInfo);
   2138 +
   2139 +                    inOverflow++;
   2140 +                    outOverflow++;
   2141 +                }
   2142              }
   2143          }
   2144      }
   2145  #endif
   2146  
   2147 -    // All the data in all the nodes consist of 16 bit items. Swap them all at once.
   2148 -    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
   2149 -    uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
   2150 -    ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
   2151 -
   2152      // Swap the header
   2153      ds->writeUInt32(&outputHeader->size, totalSize);
   2154 -    uint32_t magic = ds->readUInt32(header->magic);
   2155      ds->writeUInt32(&outputHeader->magic, magic);
   2156 -    ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
   2157 -    uint16_t root = ds->readUInt16(header->root);
   2158 -    ds->writeUInt16(&outputHeader->root, root);
   2159 -    ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
   2160 -            sizeof(uint32_t)*(int32_t)nodeCount,
   2161 -            outBytes+offsetof(CompactTrieHeader,offsets), status);
   2162 +
   2163 +    uint32_t nodeCount;
   2164 +    uint32_t offsetPos;
   2165 +    if (header->magic == COMPACT_TRIE_MAGIC_1) {
   2166 +        CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
   2167 +        CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
   2168 +
   2169 +        nodeCount = ds->readUInt16(headerV1->nodeCount);
   2170 +        ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
   2171 +        uint16_t root = ds->readUInt16(headerV1->root);
   2172 +        ds->writeUInt16(&outputHeaderV1->root, root);
   2173 +        offsetPos = offsetof(CompactTrieHeaderV1,offsets);
   2174 +    } else {
   2175 +        nodeCount = ds->readUInt32(header->nodeCount);
   2176 +        ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
   2177 +        uint32_t root = ds->readUInt32(header->root);
   2178 +        ds->writeUInt32(&outputHeader->root, root);
   2179 +        offsetPos = offsetof(CompactTrieHeader,offsets);
   2180 +    }
   2181 +
   2182 +    // All the data in all the nodes consist of 16 bit items. Swap them all at once.
   2183 +    uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
   2184 +    ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
   2185 +
   2186 +    //swap offsets
   2187 +    ds->swapArray32(ds, inBytes+offsetPos,
   2188 +            sizeof(uint32_t)*(uint32_t)nodeCount,
   2189 +            outBytes+offsetPos, status);
   2190  
   2191      return sizeWithUData;
   2192  }
   2193 --- source/common/triedict.h	2006-06-06 15:38:49.000000000 -0700
   2194 +++ source/common/triedict.h	2011-01-21 14:12:45.496927000 -0800
   2195 @@ -47,7 +47,6 @@
   2196  U_NAMESPACE_BEGIN
   2197  
   2198  class StringEnumeration;
   2199 -struct CompactTrieHeader;
   2200  
   2201  /*******************************************************************
   2202   * TrieWordDictionary
   2203 @@ -72,23 +71,29 @@
   2204     */
   2205    virtual ~TrieWordDictionary();
   2206  
   2207 +  /**
   2208 +   * <p>Returns true if the dictionary contains values associated with each word.</p>
   2209 +   */
   2210 +  virtual UBool getValued() const = 0;
   2211 +
   2212   /**
   2213    * <p>Find dictionary words that match the text.</p>
   2214    *
   2215    * @param text A UText representing the text. The
   2216    * iterator is left after the longest prefix match in the dictionary.
   2217 -  * @param start The current position in text.
   2218    * @param maxLength The maximum number of code units to match.
   2219    * @param lengths An array that is filled with the lengths of words that matched.
   2220    * @param count Filled with the number of elements output in lengths.
   2221    * @param limit The size of the lengths array; this limits the number of words output.
   2222 +  * @param values An array that is filled with the values associated with the matched words.
   2223    * @return The number of characters in text that were matched.
   2224    */
   2225    virtual int32_t matches( UText *text,
   2226                                int32_t maxLength,
   2227                                int32_t *lengths,
   2228                                int &count,
   2229 -                              int limit ) const = 0;
   2230 +                              int limit,
   2231 +                              uint16_t *values = NULL) const = 0;
   2232  
   2233    /**
   2234     * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
   2235 @@ -128,6 +133,12 @@
   2236  
   2237    UText    *fIter;
   2238  
   2239 +    /**
   2240 +     * Whether this dictionary stores a value with each word
   2241 +     * @internal
   2242 +     */
   2243 +  UBool fValued;
   2244 +
   2245    friend class CompactTrieDictionary;   // For fast conversion
   2246  
   2247   public:
   2248 @@ -138,14 +149,29 @@
   2249    * @param median A UChar around which to balance the trie. Ideally, it should
   2250    * begin at least one word that is near the median of the set in the dictionary
   2251    * @param status A status code recording the success of the call.
   2252 +  * @param containsValue True if the dictionary stores values associated with each word.
   2253    */
   2254 -  MutableTrieDictionary( UChar median, UErrorCode &status );
   2255 +  MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );
   2256  
   2257    /**
   2258     * <p>Virtual destructor.</p>
   2259     */
   2260    virtual ~MutableTrieDictionary();
   2261  
   2262 +  /**
   2263 +   * Indicate whether the MutableTrieDictionary stores values associated with each word
   2264 +   */
   2265 +  void setValued(UBool valued){
   2266 +      fValued = valued;
   2267 +  }
   2268 +
   2269 +  /**
   2270 +   * <p>Returns true if the dictionary contains values associated with each word.</p>
   2271 +   */
   2272 +  virtual UBool getValued() const {
   2273 +      return fValued;
   2274 +  }
   2275 +
   2276   /**
   2277    * <p>Find dictionary words that match the text.</p>
   2278    *
   2279 @@ -155,13 +181,15 @@
   2280    * @param lengths An array that is filled with the lengths of words that matched.
   2281    * @param count Filled with the number of elements output in lengths.
   2282    * @param limit The size of the lengths array; this limits the number of words output.
   2283 +  * @param values An array that is filled with the values associated with the matched words.
   2284    * @return The number of characters in text that were matched.
   2285    */
   2286    virtual int32_t matches( UText *text,
   2287                                int32_t maxLength,
   2288                                int32_t *lengths,
   2289                                int &count,
   2290 -                              int limit ) const;
   2291 +                              int limit,
   2292 +                              uint16_t *values = NULL) const;
   2293  
   2294    /**
   2295     * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
   2296 @@ -173,15 +201,17 @@
   2297    virtual StringEnumeration *openWords( UErrorCode &status ) const;
   2298  
   2299   /**
   2300 -  * <p>Add one word to the dictionary.</p>
   2301 +  * <p>Add one word to the dictionary with an optional associated value.</p>
   2302    *
   2303    * @param word A UChar buffer containing the word.
   2304    * @param length The length of the word.
   2305 -  * @param status The resultant status
   2306 +  * @param status The resultant status.
   2307 +  * @param value The nonzero value associated with this word.
   2308    */
   2309    virtual void addWord( const UChar *word,
   2310                          int32_t length,
   2311 -                        UErrorCode &status);
   2312 +                        UErrorCode &status,
   2313 +                        uint16_t value = 0);
   2314  
   2315  #if 0
   2316   /**
   2317 @@ -203,8 +233,9 @@
   2318    * @param lengths An array that is filled with the lengths of words that matched.
   2319    * @param count Filled with the number of elements output in lengths.
   2320    * @param limit The size of the lengths array; this limits the number of words output.
   2321 -  * @param parent The parent of the current node
   2322 -  * @param pMatched The returned parent node matched the input
   2323 +  * @param parent The parent of the current node.
   2324 +  * @param pMatched The returned parent node matched the input.
   2325 +  * @param values An array that is filled with the values associated with the matched words.
   2326    * @return The number of characters in text that were matched.
   2327    */
   2328    virtual int32_t search( UText *text,
   2329 @@ -213,40 +244,46 @@
   2330                                int &count,
   2331                                int limit,
   2332                                TernaryNode *&parent,
   2333 -                              UBool &pMatched ) const;
   2334 +                              UBool &pMatched,
   2335 +                              uint16_t *values = NULL) const;
   2336  
   2337  private:
   2338   /**
   2339    * <p>Private constructor. The root node it not allocated.</p>
   2340    *
   2341    * @param status A status code recording the success of the call.
   2342 +  * @param containsValues True if the dictionary will store a value associated 
   2343 +  * with each word added.
   2344    */
   2345 -  MutableTrieDictionary( UErrorCode &status );
   2346 +  MutableTrieDictionary( UErrorCode &status, UBool containsValues = false );
   2347  };
   2348  
   2349  /*******************************************************************
   2350   * CompactTrieDictionary
   2351   */
   2352  
   2353 +//forward declarations
   2354 +struct CompactTrieHeader;
   2355 +struct CompactTrieInfo;
   2356 +
   2357  /**
   2358   * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
   2359   * to save space.</p>
   2360   */
   2361  class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
   2362   private:
   2363 -    /**
   2364 -     * The root node of the trie
   2365 -     */
   2366 +  /**
   2367 +   * The header of the CompactTrieDictionary which contains all info
   2368 +   */
   2369  
   2370 -  const CompactTrieHeader   *fData;
   2371 -
   2372 -    /**
   2373 -     * A UBool indicating whether or not we own the fData.
   2374 -     */
   2375 +  CompactTrieInfo                 *fInfo; 
   2376  
   2377 +  /**
   2378 +   * A UBool indicating whether or not we own the underlying trie data.
   2379 +   */
   2380    UBool                     fOwnData;
   2381  
   2382 -    UDataMemory              *fUData;
   2383 +  UDataMemory              *fUData;
   2384   public:
   2385    /**
   2386     * <p>Construct a dictionary from a UDataMemory.</p>
   2387 @@ -277,6 +314,11 @@
   2388     */
   2389    virtual ~CompactTrieDictionary();
   2390  
   2391 +  /**
   2392 +   * <p>Returns true if the dictionary contains values associated with each word.</p>
   2393 +   */
   2394 +  virtual UBool getValued() const;
   2395 +
   2396   /**
   2397    * <p>Find dictionary words that match the text.</p>
   2398    *
   2399 @@ -286,13 +328,15 @@
   2400    * @param lengths An array that is filled with the lengths of words that matched.
   2401    * @param count Filled with the number of elements output in lengths.
   2402    * @param limit The size of the lengths array; this limits the number of words output.
   2403 +  * @param values An array that is filled with the values associated with the matched words.
   2404    * @return The number of characters in text that were matched.
   2405    */
   2406    virtual int32_t matches( UText *text,
   2407 -                              int32_t rangeEnd,
   2408 +                              int32_t maxLength,
   2409                                int32_t *lengths,
   2410                                int &count,
   2411 -                              int limit ) const;
   2412 +                              int limit,
   2413 +                              uint16_t *values = NULL) const;
   2414  
   2415    /**
   2416     * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
   2417 @@ -311,7 +355,7 @@
   2418    virtual uint32_t dataSize() const;
   2419    
   2420   /**
   2421 -  * <p>Return a void * pointer to the compact data, platform-endian.</p>
   2422 +  * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p>
   2423    *
   2424    * @return The data for the compact dictionary, suitable for passing to the
   2425    * constructor.
   2426 @@ -342,5 +386,5 @@
   2427  
   2428  U_NAMESPACE_END
   2429  
   2430 -    /* TRIEDICT_H */
   2431 +/* TRIEDICT_H */
   2432  #endif
   2433 --- source/data/Makefile.in	2010-10-29 13:21:33.000000000 -0700
   2434 +++ source/data/Makefile.in	2011-01-26 16:24:24.856798000 -0800
   2435 @@ -509,8 +520,9 @@
   2436  ####################################################    CTD
   2437  # CTD FILES
   2438  
   2439 -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
   2440 -	$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
   2441 +# .ctd file now generated regardless of whether dictionary file exists
   2442 +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
   2443 +	$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt
   2444  
   2445  ####################################################    CFU
   2446  # CFU FILES
   2447 --- source/data/brkitr/root.txt	2010-07-28 17:18:28.000000000 -0700
   2448 +++ source/data/brkitr/root.txt	2011-01-21 14:12:45.653922000 -0800
   2449 @@ -17,5 +17,8 @@
   2450      }
   2451      dictionaries{
   2452          Thai:process(dependency){"thaidict.ctd"}
   2453 +        Hani:process(dependency){"cjdict.ctd"}
   2454 +        Hira:process(dependency){"cjdict.ctd"}
   2455 +        Kata:process(dependency){"cjdict.ctd"}
   2456      }
   2457  }
   2458 --- source/data/xml/brkitr/root.xml	2010-03-01 15:13:18.000000000 -0800
   2459 +++ source/data/xml/brkitr/root.xml	2011-01-21 14:12:45.735922000 -0800
   2460 @@ -25,6 +25,9 @@
   2461              </icu:boundaries>
   2462              <icu:dictionaries>
   2463                  <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
   2464 +                <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>
   2465 +                <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>
   2466 +                <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>
   2467              </icu:dictionaries>
   2468          </icu:breakIteratorData>
   2469      </special>
   2470 --- source/test/cintltst/creststn.c	2010-10-28 10:44:02.000000000 -0700
   2471 +++ source/test/cintltst/creststn.c	2011-01-21 14:12:44.995020000 -0800
   2472 @@ -2188,21 +2188,21 @@
   2473        
   2474  
   2475        {
   2476 -            UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
   2477 +            UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);
   2478              const UChar *got = NULL, *exp=NULL;
   2479              int32_t gotLen = 0, expLen=0;
   2480 -            ja = ures_getByKey(ja, "boundaries", ja, &status);
   2481 -            exp = tres_getString(ja, -1, "word", &expLen, &status);
   2482 +            th = ures_getByKey(th, "boundaries", th, &status);
   2483 +            exp = tres_getString(th, -1, "grapheme", &expLen, &status);
   2484                
   2485              tb = ures_getByKey(aliasB, "boundaries", tb, &status);
   2486 -            got = tres_getString(tb, -1, "word", &gotLen, &status);
   2487 +            got = tres_getString(tb, -1, "grapheme", &gotLen, &status);
   2488                  
   2489              if(U_FAILURE(status)) {
   2490                  log_err("%s trying to read str boundaries\n", u_errorName(status));
   2491              } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
   2492                  log_err("Referencing alias didn't get the right data\n");
   2493              }
   2494 -            ures_close(ja);
   2495 +            ures_close(th);
   2496              status = U_ZERO_ERROR;
   2497        }
   2498        /* simple alias */
   2499 --- source/test/intltest/rbbiapts.cpp	2010-07-12 11:03:29.000000000 -0700
   2500 +++ source/test/intltest/rbbiapts.cpp	2011-01-21 14:12:45.033014000 -0800
   2501 @@ -156,9 +156,13 @@
   2502      if(*a!=*b){
   2503          errln("Failed: boilerplate method operator!= does not return correct results");
   2504      }
   2505 -    BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
   2506 -    if(a && c){
   2507 -        if(*c==*a){
   2508 +    // Japanese word break iterator is identical to root with
   2509 +    // a dictionary-based break iterator, but Thai character break iterator
   2510 +    // is still different from Root. 
   2511 +    BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
   2512 +    BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status);
   2513 +    if(c && d){
   2514 +        if(*c==*d){
   2515              errln("Failed: boilerplate method opertator== does not return correct results");
   2516          }
   2517      }else{
   2518 @@ -167,6 +171,7 @@
   2519      delete a;
   2520      delete b;
   2521      delete c;
   2522 +    delete d;
   2523  }
   2524  
   2525  void RBBIAPITest::TestgetRules()
   2526 @@ -635,21 +640,21 @@
   2527  //
   2528  void RBBIAPITest::TestRuleStatus() {
   2529       UChar str[30];
   2530 -     u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
   2531 -              // 012345678901234567  8      9    0  1      2    3  4      5    6
   2532 -              //                    Ideographic    Katakana       Hiragana
   2533 +     //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
   2534 +     // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
   2535 +     u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
   2536 +              // 012345678901234567  8      9    0      
   2537 +              //                     Katakana      
   2538                  str, 30);
   2539       UnicodeString testString1(str);
   2540 -     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
   2541 +     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
   2542       int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
   2543                            UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
   2544 -                          UBRK_WORD_IDEO,     UBRK_WORD_IDEO,   UBRK_WORD_NONE,
   2545 -                          UBRK_WORD_KANA,     UBRK_WORD_NONE,   UBRK_WORD_KANA,    UBRK_WORD_KANA};
   2546 +                          UBRK_WORD_IDEO,     UBRK_WORD_NONE};
   2547  
   2548       int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
   2549                            UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
   2550 -                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
   2551 -                          UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
   2552 +                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
   2553  
   2554       UErrorCode status=U_ZERO_ERROR;
   2555  
   2556 @@ -888,9 +893,11 @@
   2557  
   2558      URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
   2559      {
   2560 +#if 0 // With a dictionary based word breaking, ja_word is identical to root.
   2561          if (ja_word && *ja_word == *root_word) {
   2562              errln("japan not different from root");
   2563          }
   2564 +#endif
   2565      }
   2566  
   2567      {
   2568 --- source/test/intltest/rbbitst.cpp	2010-10-08 18:23:28.000000000 -0700
   2569 +++ source/test/intltest/rbbitst.cpp	2011-01-21 14:12:45.180030000 -0800
   2570 @@ -35,6 +35,8 @@
   2571  #include <string.h>
   2572  #include <stdio.h>
   2573  #include <stdlib.h>
   2574 +#include "unicode/numfmt.h"
   2575 +#include "unicode/uscript.h"
   2576  
   2577  #define TEST_ASSERT(x) {if (!(x)) { \
   2578      errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
   2579 @@ -138,11 +140,13 @@
   2580              if (exec) TestThaiBreaks();                        break;
   2581          case 23: name = "TestTailoredBreaks";
   2582              if (exec) TestTailoredBreaks();                    break;
   2583 +        case 24: name = "TestTrieDictWithValue";
   2584 +            if(exec) TestTrieDictWithValue();                  break;
   2585  #else
   2586 -        case 21: case 22: case 23: name = "skip";
   2587 +        case 21: case 22: case 23: case 24: name = "skip";
   2588              break;
   2589  #endif
   2590 -        case 24: name = "TestDictRules";
   2591 +        case 25: name = "TestDictRules";
   2592              if (exec) TestDictRules();                         break;
   2593          case 25: name = "TestBug5532";
   2594              if (exec) TestBug5532();                           break;
   2595 @@ -607,6 +611,8 @@
   2596  
   2597  
   2598  void RBBITest::TestJapaneseWordBreak() {
   2599 +// TODO: Rewrite this test for a dictionary-based word breaking.
   2600 +#if 0
   2601      UErrorCode status = U_ZERO_ERROR;
   2602      BITestData   japaneseWordSelection(status);
   2603  
   2604 @@ -628,6 +634,7 @@
   2605  
   2606      generalIteratorTest(*e, japaneseWordSelection);
   2607      delete e;
   2608 +#endif
   2609  }
   2610  
   2611  void RBBITest::TestTrieDict() {
   2612 @@ -849,6 +856,372 @@
   2613      delete compact2;
   2614  }
   2615  
   2616 +/* TODO: delete later. Debug helper: dumps every word of enumer to filename as UTF-8, one word per line. */
   2617 +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
   2618 +    UErrorCode      status  = U_ZERO_ERROR;
   2619 +    UConverter *cvt = ucnv_open("UTF-8", &status);
   2620 +    if (U_FAILURE(status))
   2621 +        return;     // nothing else has been opened yet, so nothing leaks
   2622 +    FILE *outfile = fopen(filename,"w");   // opened only once the converter exists
   2623 +    if(outfile != NULL){
   2624 +        status = U_ZERO_ERROR;
   2625 +        const UnicodeString *word = enumer->snext(status);
   2626 +        while (word != NULL && U_SUCCESS(status)) {
   2627 +            char u8word[500];
   2628 +            status = U_ZERO_ERROR;
   2629 +            ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
   2630 +                    &status);
   2631 +            if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) fprintf(outfile,"%s\n", u8word);  // skip words that did not fit NUL-terminated
   2632 +            status = U_ZERO_ERROR;   // a word that failed to convert must not end the enumeration
   2633 +            word = enumer->snext(status);
   2634 +        }
   2635 +        fclose(outfile);
   2636 +    }
   2637 +    ucnv_close(cvt);
   2638 +}
   2639 +
   2640 +// A very simple helper class to streamline the buffer handling in
   2641 +// TestTrieDictWithValue: uses an inline array of N elements, or a heap array when more are requested.
   2642 +template<class T, size_t N>
   2643 +class AutoBuffer {
   2644 + public:
   2645 +  AutoBuffer(size_t size) : buffer(stackBuffer) {  // start out on the inline array
   2646 +    if (size > N)  // request does not fit inline: allocate an array instead
   2647 +      buffer = new T[size];
   2648 +  }
   2649 +  ~AutoBuffer() {
   2650 +    if (buffer != stackBuffer)   // release only what the constructor allocated
   2651 +      delete [] buffer;
   2652 +  }
   2653 +  T* elems() {  // raw pointer to the active storage
   2654 +    return buffer;
   2655 +  }
   2656 +  const T& operator[] (size_t i) const {  // unchecked element access (read-only)
   2657 +    return buffer[i];
   2658 +  }
   2659 +  T& operator[] (size_t i) {  // unchecked element access (mutable)
   2660 +    return buffer[i];
   2661 +  }
   2662 + private:  // NOTE(review): copying is not disabled here; a copy would alias the source's buffer
   2663 +  T stackBuffer[N];   // inline storage, used when the requested size is <= N
   2664 +  T* buffer;  // active storage: either stackBuffer or the new[] array
   2665 +  AutoBuffer();  // declared private with no definition in this class: no default construction
   2666 +};
   2667 +
   2668 +//----------------------------------------------------------------------------
   2669 +//
   2670 +// TestTrieDictWithValue    Test trie dictionaries with logprob values and 
   2671 +// more than 2^16 nodes after compaction.
   2672 +//
   2673 +//----------------------------------------------------------------------------
   2674 +void RBBITest::TestTrieDictWithValue() {
   2675 +    UErrorCode      status  = U_ZERO_ERROR;
   2676 +
   2677 +    //
   2678 +    //  Open and read the test data file.
   2679 +    //
   2680 +    const char *testDataDirectory = IntlTest::getSourceTestData(status);
   2681 +    const char *filename = "cjdict-truncated.txt";
   2682 +    char testFileName[1000];
   2683 +    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
   2684 +        errln("Can't open test data.  Path too long.");
   2685 +        return;
   2686 +    }
   2687 +    strcpy(testFileName, testDataDirectory);
   2688 +    strcat(testFileName, filename);
   2689 +
   2690 +    // Items needing deleting at the end
   2691 +    MutableTrieDictionary *mutableDict = NULL;
   2692 +    CompactTrieDictionary *compactDict = NULL;
   2693 +    UnicodeSet            *breaks      = NULL;
   2694 +    UChar                 *testFile    = NULL;
   2695 +    StringEnumeration     *enumer1     = NULL;
   2696 +    StringEnumeration     *enumer2     = NULL;
   2697 +    MutableTrieDictionary *mutable2    = NULL;
   2698 +    StringEnumeration     *cloneEnum   = NULL;
   2699 +    CompactTrieDictionary *compact2    = NULL;
   2700 +    NumberFormat          *nf           = NULL;
   2701 +    UText *originalText = NULL, *cloneText = NULL;
   2702 +
   2703 +    const UnicodeString *originalWord = NULL;
   2704 +    const UnicodeString *cloneWord    = NULL;
   2705 +    UChar *current;
   2706 +    UChar *word;
   2707 +    UChar uc;
   2708 +    int32_t wordLen;
   2709 +    int32_t wordCount;
   2710 +    int32_t testCount;
   2711 +    int32_t valueLen;
   2712 +    int counter = 0;
   2713 +
   2714 +    int    len;
   2715 +    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
   2716 +    if (U_FAILURE(status)) {
   2717 +        goto cleanup; /* something went wrong, error already output */
   2718 +    }
   2719 +
   2720 +    mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
   2721 +    if (U_FAILURE(status)) {
   2722 +        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
   2723 +        goto cleanup;
   2724 +    }
   2725 +
   2726 +    breaks = new UnicodeSet;
   2727 +    breaks->add(0x000A);     // Line Feed
   2728 +    breaks->add(0x000D);     // Carriage Return
   2729 +    breaks->add(0x2028);     // Line Separator
   2730 +    breaks->add(0x2029);     // Paragraph Separator
   2731 +    breaks->add(0x0009);     // Tab character
   2732 +
   2733 +    // Now add each non-comment line of the file as a word.
   2734 +    current = testFile;
   2735 +    word = current;
   2736 +    uc = *current++;
   2737 +    wordLen = 0;
   2738 +    wordCount = 0;
   2739 +    nf = NumberFormat::createInstance(status);
   2740 +
   2741 +    while (uc) {
   2742 +        UnicodeString ucharValue;
   2743 +        valueLen = 0;
   2744 +
   2745 +        if (uc == 0x0023) {     // #comment line, skip
   2746 +            while (uc && !breaks->contains(uc)) {
   2747 +                uc = *current++;
   2748 +            }
   2749 +        }
   2750 +        else{
   2751 +            while (uc && !breaks->contains(uc)) {
   2752 +                ++wordLen;
   2753 +                uc = *current++;
   2754 +            }
   2755 +            if(uc == 0x0009){ //separator is a tab char, read in num after tab
   2756 +                uc = *current++;
   2757 +                while (uc && !breaks->contains(uc)) {
   2758 +                    ucharValue.append(uc);
   2759 +                    uc = *current++;
   2760 +                }
   2761 +            }
   2762 +        }
   2763 +        if (wordLen > 0) {
   2764 +            Formattable value((int32_t)0);
   2765 +            nf->parse(ucharValue.getTerminatedBuffer(), value, status);
   2766 +            
   2767 +            if(U_FAILURE(status)){
   2768 +                errln("parsing of value failed when reading in dictionary\n");
   2769 +                goto cleanup;
   2770 +            }
   2771 +            mutableDict->addWord(word, wordLen, status, value.getLong());
   2772 +            if (U_FAILURE(status)) {
   2773 +                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
   2774 +                goto cleanup;
   2775 +            }
   2776 +            wordCount += 1;
   2777 +        }
   2778 +
   2779 +        // Find beginning of next line
   2780 +        while (uc && breaks->contains(uc)) {
   2781 +            uc = *current++;
   2782 +        }
   2783 +        word = current-1;
   2784 +        wordLen = 0;
   2785 +    }
   2786 +    
   2787 +    if (wordCount < 50) {
   2788 +        errln("Word count (%d) unreasonably small\n", wordCount);
   2789 +        goto cleanup;
   2790 +    }
   2791 +
   2792 +    enumer1 = mutableDict->openWords(status);
   2793 +    if (U_FAILURE(status)) {
   2794 +        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
   2795 +        goto cleanup;
   2796 +    }
   2797 +
   2798 +    testCount = 0;
   2799 +    if (wordCount != (testCount = enumer1->count(status))) {
   2800 +        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   2801 +                testCount, wordCount, u_errorName(status));
   2802 +        goto cleanup;
   2803 +    }
   2804 +    
   2805 +    // Now compact it
   2806 +    compactDict = new CompactTrieDictionary(*mutableDict, status);
   2807 +    if (U_FAILURE(status)) {
   2808 +        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
   2809 +        goto cleanup;
   2810 +    }
   2811 +    
   2812 +    enumer2 = compactDict->openWords(status);
   2813 +    if (U_FAILURE(status)) {
   2814 +        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
   2815 +        goto cleanup;
   2816 +    }
   2817 +
   2818 +
   2819 +    //delete later
   2820 +//    writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
   2821 +//    writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
   2822 +
   2823 +    enumer1->reset(status);
   2824 +    enumer2->reset(status);
   2825 +
   2826 +    originalWord = enumer1->snext(status);
   2827 +    cloneWord = enumer2->snext(status);
   2828 +    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   2829 +        if (*originalWord != *cloneWord) {
   2830 +            errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", 
   2831 +                    counter, originalWord->length(), cloneWord->length());
   2832 +            goto cleanup;
   2833 +        }
   2834 +        
   2835 +        // check if attached values of the same word in both dictionaries tally
   2836 +#if 0
   2837 +        int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
   2838 +        uint16_t values1[originalWord->length()], values2[cloneWord->length()];
   2839 +#endif
   2840 +        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   2841 +        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   2842 +        AutoBuffer<uint16_t, 20> values1(originalWord->length());
   2843 +        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   2844 +      
   2845 +        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   2846 +        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   2847 +        
   2848 +        int count1, count2;
   2849 +        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   2850 +        compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   2851 +        
   2852 +        if(values1[count1-1] != values2[count2-1]){
   2853 +            errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n", 
   2854 +                  counter, values1[count1-1], values2[count2-1]);
   2855 +            goto cleanup;
   2856 +        }
   2857 +        
   2858 +        counter++;
   2859 +        originalWord = enumer1->snext(status);
   2860 +        cloneWord = enumer2->snext(status);
   2861 +    }
   2862 +    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
   2863 +        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
   2864 +    }
   2865 +    
   2866 +    delete enumer1;
   2867 +    enumer1 = NULL;
   2868 +    delete enumer2;
   2869 +    enumer2 = NULL;
   2870 +
   2871 +    // Now un-compact it
   2872 +    mutable2 = compactDict->cloneMutable(status);
   2873 +    if (U_FAILURE(status)) {
   2874 +        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
   2875 +        goto cleanup;
   2876 +    }
   2877 +
   2878 +    cloneEnum = mutable2->openWords(status);
   2879 +    if (U_FAILURE(status)) {
   2880 +        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
   2881 +        goto cleanup;
   2882 +    }
   2883 +
   2884 +    if (wordCount != (testCount = cloneEnum->count(status))) {
   2885 +        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
   2886 +                testCount, wordCount, u_errorName(status));
   2887 +        goto cleanup;
   2888 +    }
   2889 +
   2890 +    // Compare original dictionary to clone. Note that we can only compare the same kind of
   2891 +    // dictionary as the order of the enumerators is not guaranteed to be the same between
   2892 +    // different kinds
   2893 +    enumer1 = mutableDict->openWords(status);
   2894 +    if (U_FAILURE(status)) {
   2895 +        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
   2896 +        goto cleanup;
   2897 +    }
   2898 +
   2899 +    counter = 0;
   2900 +    originalWord = enumer1->snext(status);
   2901 +    cloneWord = cloneEnum->snext(status);
   2902 +    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
   2903 +        if (*originalWord != *cloneWord) {
   2904 +            errln("Original and cloned MutableTrieDictionary word mismatch\n");
   2905 +            goto cleanup;
   2906 +        }
   2907 +
   2908 +        // check if attached values of the same word in both dictionaries tally
   2909 +        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
   2910 +        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
   2911 +        AutoBuffer<uint16_t, 20> values1(originalWord->length());
   2912 +        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
   2913 +        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
   2914 +        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
   2915 +        
   2916 +        int count1, count2;
   2917 +        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
   2918 +        mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
   2919 +        
   2920 +        if(values1[count1-1] != values2[count2-1]){
   2921 +            errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", 
   2922 +                  counter, values1[count1-1], values2[count2-1]);
   2923 +            goto cleanup;
   2924 +        }
   2925 +        
   2926 +        counter++;
   2927 +
   2928 +        originalWord = enumer1->snext(status);
   2929 +        cloneWord = cloneEnum->snext(status);
   2930 +    }
   2931 +
   2932 +    if (U_FAILURE(status)) {
   2933 +        errln("Enumeration failed: %s\n", u_errorName(status));
   2934 +        goto cleanup;
   2935 +    }
   2936 +
   2937 +    if (originalWord != cloneWord) {
   2938 +        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
   2939 +        goto cleanup;
   2940 +    }
   2941 +
   2942 +    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
   2943 +    compact2 = new CompactTrieDictionary(compactDict->data(), status);
   2944 +    if (U_FAILURE(status)) {
   2945 +        errln("CompactTrieDictionary(const void *,...) failed\n");
   2946 +        goto cleanup;
   2947 +    }
   2948 +
   2949 +    if (compact2->dataSize() == 0) {
   2950 +        errln("CompactTrieDictionary->dataSize() == 0\n");
   2951 +        goto cleanup;
   2952 +    }
   2953 +
   2954 +    // Now count the words via the second dictionary
   2955 +    delete enumer1;
   2956 +    enumer1 = compact2->openWords(status);
   2957 +    if (U_FAILURE(status)) {
   2958 +        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
   2959 +        goto cleanup;
   2960 +    }
   2961 +
   2962 +    if (wordCount != (testCount = enumer1->count(status))) {
   2963 +        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
   2964 +                testCount, wordCount, u_errorName(status));
   2965 +        goto cleanup;
   2966 +    }
   2967 +
   2968 +    cleanup:
   2969 +    delete compactDict;
   2970 +    delete mutableDict;
   2971 +    delete breaks;
   2972 +    delete[] testFile;
   2973 +    delete enumer1;
   2974 +    delete mutable2;
   2975 +    delete cloneEnum;
   2976 +    delete compact2;
   2977 +    utext_close(originalText);
   2978 +    utext_close(cloneText);
   2979 +
   2980 + 
   2981 +}
   2982  
   2983  //----------------------------------------------------------------------------
   2984  //
   2985 @@ -1870,8 +2243,15 @@
   2986  // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
   2987  static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
   2988                                          "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
   2989 +#if 0 
   2990  static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
   2991  static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
   2992 +#endif
   2993 +// There's no separate Japanese word break iterator. Root is the same as Japanese.
   2994 +// Our dictionary-based iterator has to be tweaked to better handle U+3005,
   2995 +// U+3007, U+303B and some other cases.
   2996 +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
   2997 +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
   2998  
   2999  // UBreakIteratorType UBRK_SENTENCE, Locale "el"
   3000  // Add break after Greek question mark (cldrbug #2069).
   3001 @@ -2672,6 +3052,8 @@
   3002      UnicodeSet  *fNewlineSet;
   3003      UnicodeSet  *fKatakanaSet;
   3004      UnicodeSet  *fALetterSet;
   3005 +    // TODO(jungshik): Do we still need this change? 
   3006 +    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
   3007      UnicodeSet  *fMidNumLetSet;
   3008      UnicodeSet  *fMidLetterSet;
   3009      UnicodeSet  *fMidNumSet;
   3010 @@ -2680,6 +3062,7 @@
   3011      UnicodeSet  *fOtherSet;
   3012      UnicodeSet  *fExtendSet;
   3013      UnicodeSet  *fExtendNumLetSet;
   3014 +    UnicodeSet  *fDictionaryCjkSet;
   3015  
   3016      RegexMatcher  *fMatcher;
   3017  
   3018 @@ -2696,12 +3079,24 @@
   3019      fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
   3020      fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
   3021      fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
   3022 -    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
   3023 +    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
   3024 +    // Exclude Hangul syllables from ALetterSet during testing.
   3025 +    // Leave CJK dictionary characters out from the monkey tests!
   3026 +#if 0 
   3027 +    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
   3028 +                                      "[\\p{Line_Break = Complex_Context}"
   3029 +                                      "-\\p{Grapheme_Cluster_Break = Extend}"
   3030 +                                      "-\\p{Grapheme_Cluster_Break = Control}"
   3031 +                                      "]]",
   3032 +                                      status);
   3033 +#endif
   3034 +    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
   3035 +    fALetterSet->removeAll(*fDictionaryCjkSet);
   3036      fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
   3037      fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
   3038      fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
   3039      fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
   3040 -    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
   3041 +    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"),      status);
   3042      fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
   3043      fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
   3044      fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
   3045 @@ -2725,13 +3120,14 @@
   3046      fOtherSet->removeAll(*fFormatSet);
   3047      fOtherSet->removeAll(*fExtendSet);
   3048      // Inhibit dictionary characters from being tested at all.
   3049 +    fOtherSet->removeAll(*fDictionaryCjkSet);
   3050      fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
   3051  
   3052      fSets->addElement(fCRSet,        status);
   3053      fSets->addElement(fLFSet,        status);
   3054      fSets->addElement(fNewlineSet,   status);
   3055      fSets->addElement(fALetterSet,   status);
   3056 -    fSets->addElement(fKatakanaSet,  status);
   3057 +    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
   3058      fSets->addElement(fMidLetterSet, status);
   3059      fSets->addElement(fMidNumLetSet, status);
   3060      fSets->addElement(fMidNumSet,    status);
   3061 @@ -3978,6 +4374,7 @@
   3062      for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
   3063          count --;
   3064          if (forward[count] != i) {
   3065 +            printStringBreaks(ustr, expected, expectedcount);
   3066              test->errln("happy break test previous() failed: expected %d but got %d",
   3067                          forward[count], i);
   3068              break;
   3069 @@ -4011,23 +4408,25 @@
   3070      UErrorCode    status = U_ZERO_ERROR;
   3071      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
   3072      BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
   3073 +    // Replaced any C+J characters in a row with a random sequence of characters
   3074 +    // of the same length to make our C+J segmentation not get in the way.
   3075      static const char *strlist[] =
   3076      {
   3077      "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
   3078 -    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
   3079 +    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
   3080      "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
   3081      "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
   3082 -    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
   3083 +    "\\uac00\\u3588\\u009c\\u0953\\u194b",
   3084      "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
   3085      "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
   3086 -    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
   3087 +    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
   3088      "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
   3089      "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
   3090      "\\u2027\\U000e0067\\u0a47\\u00b7",
   3091      "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
   3092      "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
   3093      "\\u0589\\U000e006e\\u0a42\\U000104a5",
   3094 -    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
   3095 +    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
   3096      "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
   3097      "\\u0027\\u11af\\U000e0057\\u0602",
   3098      "\\U0001d7f2\\U000e007\\u0004\\u0589",
   3099 @@ -4039,7 +4438,7 @@
   3100      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3101      "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3102      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3103 -    "\\u58f4\\U000e0049\\u20e7\\u2027",
   3104 +    "\\u18f4\\U000e0049\\u20e7\\u2027",
   3105      "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3106      "\\ua183\\u102d\\u0bec\\u003a",
   3107      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3108 @@ -4049,7 +4448,7 @@
   3109      "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
   3110      "\\u003a\\u0664\\u00b7\\u1fba",
   3111      "\\u003b\\u0027\\u00b7\\u47a3",
   3112 -    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
   3113 +    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
   3114      "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
   3115      "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
   3116      };
   3117 @@ -4104,12 +4503,12 @@
   3118      "\\U0001d7f2\\U000e007d\\u0004\\u0589",
   3119      "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
   3120      "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
   3121 -    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
   3122 +    "\\U000e0065\\u302c\\u09ee\\U000e0068",
   3123      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
   3124      "\\u0233\\U000e0020\\u0a69\\u0d6a",
   3125      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
   3126      "\\u58f4\\U000e0049\\u20e7\\u2027",
   3127 -    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3128 +    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
   3129      "\\ua183\\u102d\\u0bec\\u003a",
   3130      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
   3131      "\\u003a\\u0e57\\u0fad\\u002e",
   3132 --- source/test/intltest/rbbitst.h	2010-07-22 17:15:37.000000000 -0700
   3133 +++ source/test/intltest/rbbitst.h	2011-01-21 14:12:45.152007000 -0800
   3134 @@ -70,6 +70,7 @@
   3135      void TestBug5775();
   3136      void TestThaiBreaks();
   3137      void TestTailoredBreaks();
   3138 +    void TestTrieDictWithValue();
   3139      void TestDictRules();
   3140      void TestBug5532();
   3141  
   3142 --- source/test/testdata/rbbitst.txt	2010-07-28 17:18:28.000000000 -0700
   3143 +++ source/test/testdata/rbbitst.txt	2011-01-21 14:12:45.221011000 -0800
   3144 @@ -161,7 +161,23 @@
   3145  <data>abc<200>\U0001D800def<200>\U0001D3FF </data>
   3146  
   3147  # Hiragana & Katakana stay together, but separates from each other and Latin.
   3148 -<data>abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#</data>
   3149 +# *** what to do about theoretical combos of chars? i.e. hiragana + accent
   3150 +#<data>abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#</data>
   3151 +
   3152 +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
   3153 +<data><400><400></data>
   3154 +
   3155 +# more Japanese tests
   3156 +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
   3157 +# and the Katakana block are not treated correctly. Enable this later.
   3158 +#<data><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400></data>
   3159 +<data><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400></data>
   3160 +
   3161 +# Testing of word boundary for dictionary word containing both kanji and kana
   3162 +<data><400><400><400></data>
   3163 +
   3164 +# Testing of Chinese segmentation (taken from a Chinese news article)
   3165 +<data>400<100><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400>200<100><400><400><400><400><400>63<100><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400><400></data>
   3166  
   3167  # Words with interior formatting characters
   3168  <data>def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> </data>
   3169 @@ -169,6 +185,8 @@
   3170  # to test for bug #4097779
   3171  <data>aa\N{COMBINING GRAVE ACCENT}a<200> </data>
   3172  
   3173 +# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
   3174 +<data>'<200> <100><400></data>
   3175  
   3176  #      to test for bug #4098467
   3177  #      What follows is a string of Korean characters (I found it in the Yellow Pages
   3178 @@ -178,9 +196,15 @@
   3179  #      precomposed syllables...
   3180  <data>\uc0c1\ud56d<200> \ud55c\uc778<200> \uc5f0\ud569<200> \uc7a5\ub85c\uad50\ud68c<200> \u1109\u1161\u11bc\u1112\u1161\u11bc<200> \u1112\u1161\u11ab\u110b\u1175\u11ab<200> \u110b\u1167\u11ab\u1112\u1161\u11b8<200> \u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> </data>
   3181  
   3182 -<data>abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> </data>
   3183 +# more Korean tests (Jamo not tested here, not counted as dictionary characters)
   3184 +# Disable them now because we don't include a Korean dictionary.
   3185 +#<data>\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
   3186 +#<data>\ud604\uc7ac<200>\ub294<200> \uac80\ucc30<200>\uc774<200> \ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> \uc870\uc0ac<200>\ud560<200> \uac00\ub2a5\uc131<200>\uc740<200> \uc5c6\ub2e4<200>\u002e</data>
   3187 +
   3188 +<data>abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> </data>
   3189 +
   3190 +<data>\u06c9<200>\uc799<200>\ufffa</data>
   3191  
   3192 -<data>\u06c9\uc799\ufffa<200></data>
   3193  
   3194  #      
   3195  #      Try some words from other scripts.
   3196 @@ -491,8 +515,7 @@
   3197  <data>\uc0c1\ud56d \ud55c\uc778 \uc5f0\ud569 \uc7a5\ub85c\uad50\ud68c</data>
   3198  
   3199  #      conjoining jamo...
   3200 -#      TODO:  rules update needed
   3201 -#<data>\u1109\u1161\u11bc\u1112\u1161\u11bc \u1112\u1161\u11ab\u110b\u1175\u11ab #\u110b\u1167\u11ab\u1112\u1161\u11b8 \u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c</data>
   3202 +<data>\u1109\u1161\u11bc\u1112\u1161\u11bc \u1112\u1161\u11ab\u110b\u1175\u11ab \u110b\u1167\u11ab\u1112\u1161\u11b8 \u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c</data>
   3203  
   3204  #      to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
   3205  <data>\u4e01\uff0e\u4e02\uff01\u4e03\uff1f</data>
   3206 --- source/test/testdata/testaliases.txt	2009-11-12 13:53:42.000000000 -0800
   3207 +++ source/test/testdata/testaliases.txt	2011-01-21 14:12:45.204005000 -0800
   3208 @@ -28,7 +28,7 @@
   3209      LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
   3210  
   3211      // aliasing using position
   3212 -    boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle
   3213 +    boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle
   3214  
   3215      // aliasing arrays
   3216      zoneTests {
   3217 --- source/tools/genctd/genctd.cpp	2009-08-04 14:09:17.000000000 -0700
   3218 +++ source/tools/genctd/genctd.cpp	2011-01-21 14:12:45.564923000 -0800
   3219 @@ -1,6 +1,6 @@
   3220  /*
   3221  **********************************************************************
   3222 -*   Copyright (C) 2002-2009, International Business Machines
   3223 +*   Copyright (C) 2002-2010, International Business Machines
   3224  *   Corporation and others.  All Rights Reserved.
   3225  **********************************************************************
   3226  *
   3227 @@ -34,12 +34,15 @@
   3228  #include "unicode/udata.h"
   3229  #include "unicode/putil.h"
   3230  
   3231 +//#include "unicode/ustdio.h"
   3232 +
   3233  #include "uoptions.h"
   3234  #include "unewdata.h"
   3235  #include "ucmndata.h"
   3236  #include "rbbidata.h"
   3237  #include "triedict.h"
   3238  #include "cmemory.h"
   3239 +#include "uassert.h"
   3240  
   3241  #include <stdio.h>
   3242  #include <stdlib.h>
   3243 @@ -199,147 +202,191 @@
   3244      long        wordFileSize;
   3245      FILE        *file;
   3246      char        *wordBufferC;
   3247 -
   3248 +    MutableTrieDictionary *mtd = NULL;
   3249 +    
   3250      file = fopen(wordFileName, "rb");
   3251 -    if( file == 0 ) {
   3252 -        fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
   3253 -        exit(-1);
   3254 -    }
   3255 -    fseek(file, 0, SEEK_END);
   3256 -    wordFileSize = ftell(file);
   3257 -    fseek(file, 0, SEEK_SET);
   3258 -    wordBufferC = new char[wordFileSize+10];
   3259 -
   3260 -    result = (long)fread(wordBufferC, 1, wordFileSize, file);
   3261 -    if (result != wordFileSize)  {
   3262 -        fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
   3263 -        exit (-1);
   3264 -    }
   3265 -    wordBufferC[wordFileSize]=0;
   3266 -    fclose(file);
   3267 -
   3268 -    //
   3269 -    // Look for a Unicode Signature (BOM) on the word file
   3270 -    //
   3271 -    int32_t        signatureLength;
   3272 -    const char *   wordSourceC = wordBufferC;
   3273 -    const char*    encoding = ucnv_detectUnicodeSignature(
   3274 -                           wordSourceC, wordFileSize, &signatureLength, &status);
   3275 -    if (U_FAILURE(status)) {
   3276 -        exit(status);
   3277 -    }
   3278 -    if(encoding!=NULL ){
   3279 -        wordSourceC  += signatureLength;
   3280 -        wordFileSize -= signatureLength;
   3281 -    }
   3282 -
   3283 -    //
   3284 -    // Open a converter to take the rule file to UTF-16
   3285 -    //
   3286 -    UConverter* conv;
   3287 -    conv = ucnv_open(encoding, &status);
   3288 -    if (U_FAILURE(status)) {
   3289 -        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
   3290 -        exit(status);
   3291 -    }
   3292 -
   3293 -    //
   3294 -    // Convert the words to UChar.
   3295 -    //  Preflight first to determine required buffer size.
   3296 -    //
   3297 -    uint32_t destCap = ucnv_toUChars(conv,
   3298 -                       NULL,           //  dest,
   3299 -                       0,              //  destCapacity,
   3300 -                       wordSourceC,
   3301 -                       wordFileSize,
   3302 -                       &status);
   3303 -    if (status != U_BUFFER_OVERFLOW_ERROR) {
   3304 -        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3305 -        exit(status);
   3306 -    };
   3307 -
   3308 -    status = U_ZERO_ERROR;
   3309 -    UChar *wordSourceU = new UChar[destCap+1];
   3310 -    ucnv_toUChars(conv,
   3311 -                  wordSourceU,     //  dest,
   3312 -                  destCap+1,
   3313 -                  wordSourceC,
   3314 -                  wordFileSize,
   3315 -                  &status);
   3316 -    if (U_FAILURE(status)) {
   3317 -        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3318 -        exit(status);
   3319 -    };
   3320 -    ucnv_close(conv);
   3321 -
   3322 -    // Get rid of the original file buffer
   3323 -    delete[] wordBufferC;
   3324 -
   3325 -    // Create a MutableTrieDictionary, and loop through all the lines, inserting
   3326 -    // words.
   3327 -
   3328 -    // First, pick a median character.
   3329 -    UChar *current = wordSourceU + (destCap/2);
   3330 -    UChar uc = *current++;
   3331 -    UnicodeSet breaks;
   3332 -    breaks.add(0x000A);     // Line Feed
   3333 -    breaks.add(0x000D);     // Carriage Return
   3334 -    breaks.add(0x2028);     // Line Separator
   3335 -    breaks.add(0x2029);     // Paragraph Separator
   3336 -
   3337 -    do { 
   3338 -        // Look for line break
   3339 -        while (uc && !breaks.contains(uc)) {
   3340 -            uc = *current++;
   3341 -        }
   3342 -        // Now skip to first non-line-break
   3343 -        while (uc && breaks.contains(uc)) {
   3344 -            uc = *current++;
   3345 +    if( file == 0 ) { //cannot find file
   3346 +        //create 1-line dummy file: ie 1 char, 1 value
   3347 +        UNewDataMemory *pData;
   3348 +        char msg[1024];
   3349 +
   3350 +        /* write message with just the name */
   3351 +        sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
   3352 +        fprintf(stderr, "%s\n", msg);
   3353 +
   3354 +        UChar c = 0x0020;
   3355 +        mtd = new MutableTrieDictionary(c, status, TRUE);
   3356 +        mtd->addWord(&c, 1, status, 1);
   3357 +
   3358 +    } else { //read words in from input file
   3359 +        fseek(file, 0, SEEK_END);
   3360 +        wordFileSize = ftell(file);
   3361 +        fseek(file, 0, SEEK_SET);
   3362 +        wordBufferC = new char[wordFileSize+10];
   3363 +    
   3364 +        result = (long)fread(wordBufferC, 1, wordFileSize, file);
   3365 +        if (result != wordFileSize)  {
   3366 +            fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
   3367 +            exit (-1);
   3368          }
   3369 -    }
   3370 -    while (uc && (breaks.contains(uc) || u_isspace(uc)));
   3371 -
   3372 -    MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
   3373 +        wordBufferC[wordFileSize]=0;
   3374 +        fclose(file);
   3375      
   3376 -    if (U_FAILURE(status)) {
   3377 -        fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
   3378 -        exit(status);
   3379 -    }
   3380 +        //
   3381 +        // Look for a Unicode Signature (BOM) on the word file
   3382 +        //
   3383 +        int32_t        signatureLength;
   3384 +        const char *   wordSourceC = wordBufferC;
   3385 +        const char*    encoding = ucnv_detectUnicodeSignature(
   3386 +                               wordSourceC, wordFileSize, &signatureLength, &status);
   3387 +        if (U_FAILURE(status)) {
   3388 +            exit(status);
   3389 +        }
   3390 +        if(encoding!=NULL ){
   3391 +            wordSourceC  += signatureLength;
   3392 +            wordFileSize -= signatureLength;
   3393 +        }
   3394      
   3395 -    // Now add the words. Words are non-space characters at the beginning of
   3396 -    // lines, and must be at least one UChar.
   3397 -    current = wordSourceU;
   3398 -    UChar *candidate = current;
   3399 -    uc = *current++;
   3400 -    int32_t length = 0;
   3401 -
   3402 -    while (uc) {
   3403 -        while (uc && !u_isspace(uc)) {
   3404 -            ++length;
   3405 -            uc = *current++;
   3406 +        //
   3407 +        // Open a converter to take the rule file to UTF-16
   3408 +        //
   3409 +        UConverter* conv;
   3410 +        conv = ucnv_open(encoding, &status);
   3411 +        if (U_FAILURE(status)) {
   3412 +            fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
   3413 +            exit(status);
   3414          }
   3415 -        if (length > 0) {
   3416 -            mtd->addWord(candidate, length, status);
   3417 -            if (U_FAILURE(status)) {
   3418 -                fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
   3419 -                        u_errorName(status));
   3420 -                exit(status);
   3421 +    
   3422 +        //
   3423 +        // Convert the words to UChar.
   3424 +        //  Preflight first to determine required buffer size.
   3425 +        //
   3426 +        uint32_t destCap = ucnv_toUChars(conv,
   3427 +                           NULL,           //  dest,
   3428 +                           0,              //  destCapacity,
   3429 +                           wordSourceC,
   3430 +                           wordFileSize,
   3431 +                           &status);
   3432 +        if (status != U_BUFFER_OVERFLOW_ERROR) {
   3433 +            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3434 +            exit(status);
   3435 +        };
   3436 +    
   3437 +        status = U_ZERO_ERROR;
   3438 +        UChar *wordSourceU = new UChar[destCap+1];
   3439 +        ucnv_toUChars(conv,
   3440 +                      wordSourceU,     //  dest,
   3441 +                      destCap+1,
   3442 +                      wordSourceC,
   3443 +                      wordFileSize,
   3444 +                      &status);
   3445 +        if (U_FAILURE(status)) {
   3446 +            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3447 +            exit(status);
   3448 +        };
   3449 +        ucnv_close(conv);
   3450 +    
   3451 +        // Get rid of the original file buffer
   3452 +        delete[] wordBufferC;
   3453 +    
   3454 +        // Create a MutableTrieDictionary, and loop through all the lines, inserting
   3455 +        // words.
   3456 +    
   3457 +        // First, pick a median character.
   3458 +        UChar *current = wordSourceU + (destCap/2);
   3459 +        UChar uc = *current++;
   3460 +        UnicodeSet breaks;
   3461 +        breaks.add(0x000A);     // Line Feed
   3462 +        breaks.add(0x000D);     // Carriage Return
   3463 +        breaks.add(0x2028);     // Line Separator
   3464 +        breaks.add(0x2029);     // Paragraph Separator
   3465 +    
   3466 +        do { 
   3467 +            // Look for line break
   3468 +            while (uc && !breaks.contains(uc)) {
   3469 +                uc = *current++;
   3470 +            }
   3471 +            // Now skip to first non-line-break
   3472 +            while (uc && breaks.contains(uc)) {
   3473 +                uc = *current++;
   3474              }
   3475          }
   3476 -        // Find beginning of next line
   3477 -        while (uc && !breaks.contains(uc)) {
   3478 -            uc = *current++;
   3479 +        while (uc && (breaks.contains(uc) || u_isspace(uc)));
   3480 +    
   3481 +        mtd = new MutableTrieDictionary(uc, status);
   3482 +        
   3483 +        if (U_FAILURE(status)) {
   3484 +            fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
   3485 +            exit(status);
   3486          }
   3487 -        while (uc && breaks.contains(uc)) {
   3488 -            uc = *current++;
   3489 +        
   3490 +        // Now add the words. Words are non-space characters at the beginning of
   3491 +        // lines, and must be at least one UChar. If a word has an associated value,
   3492 +        // the value should follow the word on the same line after a tab character.
   3493 +        current = wordSourceU;
   3494 +        UChar *candidate = current;
   3495 +        uc = *current++;
   3496 +        int32_t length = 0;
   3497 +        int count = 0;
   3498 +                
   3499 +        while (uc) {
   3500 +            while (uc && !u_isspace(uc)) {
   3501 +                ++length;
   3502 +                uc = *current++;
   3503 +            }
   3504 +            
   3505 +            UnicodeString valueString;
   3506 +            UChar candidateValue;
   3507 +            if(uc == 0x0009){ //separator is a tab char, read in number after space
   3508 +            	while (uc && u_isspace(uc)) {
   3509 +            		uc = *current++;
   3510 +            	}
   3511 +                while (uc && !u_isspace(uc)) {
   3512 +                    valueString.append(uc);
   3513 +                    uc = *current++;
   3514 +                }
   3515 +            }
   3516 +            
   3517 +            if (length > 0) {
   3518 +                count++;
   3519 +                if(valueString.length() > 0){
   3520 +                    mtd->setValued(TRUE);
   3521 +    
   3522 +                    uint32_t value = 0;
   3523 +                    char* s = new char[valueString.length()];
   3524 +                    valueString.extract(0,valueString.length(), s, valueString.length());
   3525 +                    int n = sscanf(s, "%ud", &value);
   3526 +                    U_ASSERT(n == 1);
   3527 +                    U_ASSERT(value >= 0); 
   3528 +                    mtd->addWord(candidate, length, status, (uint16_t)value);
   3529 +                    delete[] s;
   3530 +                } else {
   3531 +                    mtd->addWord(candidate, length, status);
   3532 +                }
   3533 +    
   3534 +                if (U_FAILURE(status)) {
   3535 +                    fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
   3536 +                            u_errorName(status), count);
   3537 +                    exit(status);
   3538 +                }
   3539 +            }
   3540 +    
   3541 +            // Find beginning of next line
   3542 +            while (uc && !breaks.contains(uc)) {
   3543 +                uc = *current++;
   3544 +            }
   3545 +            // Find next non-line-breaking character
   3546 +            while (uc && breaks.contains(uc)) {
   3547 +                uc = *current++;
   3548 +            }
   3549 +            candidate = current-1;
   3550 +            length = 0;
   3551          }
   3552 -        candidate = current-1;
   3553 -        length = 0;
   3554 +    
   3555 +        // Get rid of the Unicode text buffer
   3556 +        delete[] wordSourceU;
   3557      }
   3558  
   3559 -    // Get rid of the Unicode text buffer
   3560 -    delete[] wordSourceU;
   3561 -
   3562      // Now, create a CompactTrieDictionary from the mutable dictionary
   3563      CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
   3564      if (U_FAILURE(status)) {
   3565 @@ -393,4 +440,3 @@
   3566  
   3567  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
   3568  }
   3569 -
   3570 --- source/tools/genctd/Makefile.in	2006-12-16 13:07:01.000000000 -0800
   3571 +++ source/tools/genctd/Makefile.in	2011-01-21 14:12:45.555920000 -0800
   3572 @@ -23,13 +23,13 @@
   3573  ## Extra files to remove for 'make clean'
   3574  CLEANFILES = *~ $(DEPS) $(MAN_FILES)
   3575  
   3576 -## Target information
   3577 +## Target information
   3578  TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
   3579  
   3580  ifneq ($(top_builddir),$(top_srcdir))
   3581  CPPFLAGS += -I$(top_builddir)/common
   3582  endif
   3583 -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
   3584 +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n
   3585  LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
   3586  
   3587  OBJECTS = genctd.o
   3588