#
#   Copyright (C) 2002-2009, International Business Machines Corporation
#   and others. All Rights Reserved.
#
#   file:  word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX-29 Revision 13 for Unicode 5.1
#
#   Note: Updates to word.txt will usually need to be merged into
#         word_POSIX.txt and word_ja.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Turn on rule chaining for the rule sets below (see the ICU break-rule
# documentation for the exact semantics of the !!chain option).
!!chain;


#
#  Character Class Definitions.
#
#  Most classes are taken directly from the Unicode Word_Break property.
#  $Hiragana and $Han are the exceptions: they use the bare [:Hiragana:] and
#  [:Han:] property shorthands instead of a Word_Break value.
#

$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Hiragana     = [:Hiragana:];
$Katakana     = [\p{Word_Break = Katakana}];
$Han          = [:Han:];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context and CJK. Note that this set only works
#   in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.
# Sets used to route text to dictionary-based (language-specific) segmentation.
#   $HangulSyllable  - the code point range U+AC00..U+D7A3.
#   $KanaKanji       - union of Han, Hiragana and Katakana.
#   $dictionaryCJK   - $KanaKanji plus $HangulSyllable.
#   $dictionary      - Complex_Context characters plus $dictionaryCJK.
$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
$dictionary     = [$ComplexContext $dictionaryCJK];

# leave CJK scripts out of ALetterPlus
# i.e. (ALetter minus the CJK dictionary characters), plus the Complex_Context
# characters that are neither Extend nor Control.
$ALetterPlus    = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
#  Rule 4    Ignore Format and Extend characters,
#            except when they appear at the beginning of a region of text.
#
#  Each "...Ex" variable is a base class followed by any number of trailing
#  Extend / Format characters, so the rules below can treat the whole run as
#  one unit.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void.
$KatakanaEx     = $Katakana     ($Extend | $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend | $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend | $Format)*;
$MidLetterEx    = $MidLetter    ($Extend | $Format)*;
$MidNumEx       = $MidNum       ($Extend | $Format)*;
$NumericEx      = $Numeric      ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

$Ideographic    = [\p{Ideographic}];
$HiraganaEx     = $Hiragana     ($Extend | $Format)*;
$IdeographicEx  = $Ideographic  ($Extend | $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories (and is not a
#          CJK dictionary character) followed by format char(s).
[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;

# Single-unit words.  The {nnn} suffix is the rule status tag attached to the
# rule; 100 / 200 / 400 line up with UBRK_WORD_NUMBER / UBRK_WORD_LETTER /
# UBRK_WORD_IDEO in ICU's ubrk.h (300 would be UBRK_WORD_KANA).
$NumericEx      {100};
$ALetterEx      {200};
$HangulSyllable {200};
$KatakanaEx     {400};       #originally 300
$HiraganaEx     {400};       #originally 300
$IdeographicEx  {400};       #

#
# rule 5
#    Do not break between most letters.
#
# (rule 5: a letter followed by a letter stays in the same word)
$ALetterEx $ALetterEx {200};

# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8

$NumericEx $NumericEx {100};

# rule 9

$ALetterEx $NumericEx {200};

# rule 10

$NumericEx $ALetterEx {200};

# rule 11 and 12

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13

# To be consistent with '$KanaKanji $KanaKanji', changed
# from 300 to 400.
# See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx $KatakanaEx {400};

# rule 13a/b

$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
$NumericEx      $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx $ALetterEx      {200};    #  (13b)
$ExtendNumLetEx $NumericEx      {100};    #  (13b)
$ExtendNumLetEx $KatakanaEx     {400};    #  (13b)

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found


## -------------------------------------------------

!!reverse;

# Mirror images of the forward "...Ex" variables: when scanning backwards the
# trailing Format / Extend characters are encountered before the base character.
$BackALetterEx      = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx    = ($Format | $Extend)* $MidNumLet;
$BackNumericEx      = ($Format | $Extend)* $Numeric;
$BackMidNumEx       = ($Format | $Extend)* $MidNum;
$BackMidLetterEx    = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx     = ($Format | $Extend)* $Katakana;
# NOTE(review): $BackHiraganaEx is not referenced by any of the reverse rules
# visible in this file -- confirm whether a rule using it is missing.
$BackHiraganaEx     = ($Extend | $Format)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;
# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found

## -------------------------------------------------

!!safe_reverse;

# rule 4
#   (Extend / Format handling; this is the same rule text that the
#    !!safe_forward section labels "rule 4".)
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;