Home | History | Annotate | Download | only in text
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 2012-2014, International Business Machines Corporation and         *
      4  * others. All Rights Reserved.                                                *
      5  *******************************************************************************
      6  */
      7 package com.ibm.icu.text;
      8 
      9 import static com.ibm.icu.impl.CharacterIteration.DONE32;
     10 import static com.ibm.icu.impl.CharacterIteration.current32;
     11 import static com.ibm.icu.impl.CharacterIteration.next32;
     12 
     13 import java.io.IOException;
     14 import java.text.CharacterIterator;
     15 
     16 import com.ibm.icu.impl.Assert;
     17 
     18 class CjkBreakEngine extends DictionaryBreakEngine {
     19     private static final UnicodeSet fHangulWordSet = new UnicodeSet();
     20     private static final UnicodeSet fHanWordSet = new UnicodeSet();
     21     private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
     22     private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
     23     static {
     24         fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
     25         fHanWordSet.applyPattern("[:Han:]");
     26         fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
     27         fHiraganaWordSet.applyPattern("[:Hiragana:]");
     28 
     29         // freeze them all
     30         fHangulWordSet.freeze();
     31         fHanWordSet.freeze();
     32         fKatakanaWordSet.freeze();
     33         fHiraganaWordSet.freeze();
     34     }
     35 
     36     private DictionaryMatcher fDictionary = null;
     37 
     38     public CjkBreakEngine(boolean korean) throws IOException {
     39         super(BreakIterator.KIND_WORD);
     40         fDictionary = DictionaryData.loadDictionaryFor("Hira");
     41         if (korean) {
     42             setCharacters(fHangulWordSet);
     43         } else { //Chinese and Japanese
     44             UnicodeSet cjSet = new UnicodeSet();
     45             cjSet = new UnicodeSet();
     46             cjSet.addAll(fHanWordSet);
     47             cjSet.addAll(fKatakanaWordSet);
     48             cjSet.addAll(fHiraganaWordSet);
     49             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
     50             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
     51             setCharacters(cjSet);
     52         }
     53     }
     54 
     55     public boolean equals(Object obj) {
     56         if (obj instanceof CjkBreakEngine) {
     57             CjkBreakEngine other = (CjkBreakEngine)obj;
     58             return this.fSet.equals(other.fSet);
     59         }
     60         return false;
     61     }
     62 
     63     public int hashCode() {
     64         return getClass().hashCode();
     65     }
     66 
     67     private static final int kMaxKatakanaLength = 8;
     68     private static final int kMaxKatakanaGroupLength = 20;
     69     private static final int maxSnlp = 255;
     70     private static final int kint32max = Integer.MAX_VALUE;
     71     private static int getKatakanaCost(int wordlength) {
     72         int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
     73         return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
     74     }
     75 
     76     private static boolean isKatakana(int value) {
     77         return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
     78                 (value >= 0xFF66 && value <= 0xFF9F);
     79     }
     80 
     81     public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
     82             DequeI foundBreaks) {
     83         if (startPos >= endPos) {
     84             return 0;
     85         }
     86 
     87         inText.setIndex(startPos);
     88 
     89         int inputLength = endPos - startPos;
     90         int[] charPositions = new int[inputLength + 1];
     91         StringBuffer s = new StringBuffer("");
     92         inText.setIndex(startPos);
     93         while (inText.getIndex() < endPos) {
     94             s.append(inText.current());
     95             inText.next();
     96         }
     97         String prenormstr = s.toString();
     98         boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
     99                                Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
    100         CharacterIterator text;
    101         int numChars = 0;
    102         if (isNormalized) {
    103             text = new java.text.StringCharacterIterator(prenormstr);
    104             int index = 0;
    105             charPositions[0] = 0;
    106             while (index < prenormstr.length()) {
    107                 int codepoint = prenormstr.codePointAt(index);
    108                 index += Character.charCount(codepoint);
    109                 numChars++;
    110                 charPositions[numChars] = index;
    111             }
    112         } else {
    113             String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
    114             text = new java.text.StringCharacterIterator(normStr);
    115             charPositions = new int[normStr.length() + 1];
    116             Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
    117             int index = 0;
    118             charPositions[0] = 0;
    119             while (index < normalizer.endIndex()) {
    120                 normalizer.next();
    121                 numChars++;
    122                 index = normalizer.getIndex();
    123                 charPositions[numChars] = index;
    124             }
    125         }
    126 
    127         // From here on out, do the algorithm. Note that our indices
    128         // refer to indices within the normalized string.
    129         int[] bestSnlp = new int[numChars + 1];
    130         bestSnlp[0] = 0;
    131         for (int i = 1; i <= numChars; i++) {
    132             bestSnlp[i] = kint32max;
    133         }
    134 
    135         int[] prev = new int[numChars + 1];
    136         for (int i = 0; i <= numChars; i++) {
    137             prev[i] = -1;
    138         }
    139 
    140         final int maxWordSize = 20;
    141         int values[] = new int[numChars];
    142         int lengths[] = new int[numChars];
    143         // dynamic programming to find the best segmentation
    144         boolean is_prev_katakana = false;
    145         for (int i = 0; i < numChars; i++) {
    146             text.setIndex(i);
    147             if (bestSnlp[i] == kint32max) {
    148                 continue;
    149             }
    150 
    151             int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
    152             int[] count_ = new int[1];
    153             fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
    154             int count = count_[0];
    155 
    156             // if there are no single character matches found in the dictionary
    157             // starting with this character, treat character as a 1-character word
    158             // with the highest value possible (i.e. the least likely to occur).
    159             // Exclude Korean characters from this treatment, as they should be
    160             // left together by default.
    161             if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
    162                 values[count] = maxSnlp;
    163                 lengths[count] = 1;
    164                 count++;
    165             }
    166 
    167             for (int j = 0; j < count; j++) {
    168                 int newSnlp = bestSnlp[i] + values[j];
    169                 if (newSnlp < bestSnlp[lengths[j] + i]) {
    170                     bestSnlp[lengths[j] + i] = newSnlp;
    171                     prev[lengths[j] + i] = i;
    172                 }
    173             }
    174 
    175             // In Japanese, single-character Katakana words are pretty rare.
    176             // So we apply the following heuristic to Katakana: any continuous
    177             // run of Katakana characters is considered a candidate word with
    178             // a default cost specified in the katakanaCost table according
    179             // to its length.
    180             text.setIndex(i);
    181             boolean is_katakana = isKatakana(current32(text));
    182             if (!is_prev_katakana && is_katakana) {
    183                 int j = i + 1;
    184                 next32(text);
    185                 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
    186                     next32(text);
    187                     ++j;
    188                 }
    189 
    190                 if ((j - i) < kMaxKatakanaGroupLength) {
    191                     int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
    192                     if (newSnlp < bestSnlp[j]) {
    193                         bestSnlp[j] = newSnlp;
    194                         prev[j] = i;
    195                     }
    196                 }
    197             }
    198             is_prev_katakana = is_katakana;
    199         }
    200 
    201         int t_boundary[] = new int[numChars + 1];
    202         int numBreaks = 0;
    203         if (bestSnlp[numChars] == kint32max) {
    204             t_boundary[numBreaks] = numChars;
    205             numBreaks++;
    206         } else {
    207             for (int i = numChars; i > 0; i = prev[i]) {
    208                 t_boundary[numBreaks] = i;
    209                 numBreaks++;
    210             }
    211             Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
    212         }
    213 
    214         if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
    215             t_boundary[numBreaks++] = 0;
    216         }
    217 
    218         int correctedNumBreaks = 0;
    219         for (int i = numBreaks - 1; i >= 0; i--) {
    220             int pos = charPositions[t_boundary[i]] + startPos;
    221             if (!(foundBreaks.contains(pos) || pos == startPos)) {
    222                 foundBreaks.push(charPositions[t_boundary[i]] + startPos);
    223                 correctedNumBreaks++;
    224             }
    225         }
    226 
    227         if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
    228             foundBreaks.pop();
    229             correctedNumBreaks--;
    230         }
    231         if (!foundBreaks.isEmpty())
    232             inText.setIndex(foundBreaks.peek());
    233         return correctedNumBreaks;
    234     }
    235 }
    236