#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file:  word.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
#
# Note:  Updates to word.txt will usually need to be merged into
#        word_POSIX.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Rule-set option.  NOTE(review): per the ICU break-rule documentation this
# turns on rule chaining for the rules in this file -- confirm against the
# ICU version in use.
!!chain;


#
#  Character Class Definitions.
#
#  Each variable below names a character set.  Most are selected by a single
#  Word_Break property value; $Hiragana and $Han are written with the
#  [:name:] form instead (presumably selecting by script -- TODO confirm).
#

$CR                 = [\p{Word_Break = CR}];
$LF                 = [\p{Word_Break = LF}];
$Newline            = [\p{Word_Break = Newline}];
$Extend             = [\p{Word_Break = Extend}];
$Format             = [\p{Word_Break = Format}];
$Hiragana           = [:Hiragana:];
$Katakana           = [\p{Word_Break = Katakana}];
$Han                = [:Han:];
$ALetter            = [\p{Word_Break = ALetter}];
$MidNumLet          = [\p{Word_Break = MidNumLet}];
$MidLetter          = [\p{Word_Break = MidLetter}];
$MidNum             = [\p{Word_Break = MidNum}];
$Numeric            = [\p{Word_Break = Numeric}];
$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.
#
#   NOTE(review): the $dictionary set defined after this comment is built from
#   $ComplexContext plus $dictionaryCJK (Han, Hiragana, Katakana and Hangul
#   syllables), so the "limited to LineBreak=Complex_Context" wording above
#   looks out of date.
# Sets used for dictionary-based breaking and for keeping CJK out of the
# ordinary letter class.
#   $HangulSyllable  the code point range U+AC00..U+D7A3
#   $KanaKanji       union of $Han, $Hiragana and $Katakana
#   $dictionaryCJK   $KanaKanji plus $HangulSyllable
#   $dictionary      $ComplexContext plus $dictionaryCJK

$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
$dictionary     = [$ComplexContext $dictionaryCJK];

# leave CJK scripts out of ALetterPlus
# ($ALetter minus the CJK dictionary set, plus the Complex_Context characters
#  that are neither Extend nor Control.)
$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
#  Rule 4     Ignore Format and Extend characters,
#             except when they appear at the beginning of a region of text.
#
#  Each "...Ex" variable is its base class followed by any number of
#  Extend or Format characters, so the rules below can use it wherever the
#  bare class would otherwise appear.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx           = $Katakana           ($Extend | $Format)*;
$ALetterEx            = $ALetterPlus        ($Extend | $Format)*;
$MidNumLetEx          = $MidNumLet          ($Extend | $Format)*;
$MidLetterEx          = $MidLetter          ($Extend | $Format)*;
$MidNumEx             = $MidNum             ($Extend | $Format)*;
$NumericEx            = $Numeric            ($Extend | $Format)*;
$ExtendNumLetEx       = $ExtendNumLet       ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;

$Ideographic          = [\p{Ideographic}];
$HiraganaEx           = $Hiragana           ($Extend | $Format)*;
$IdeographicEx        = $Ideographic        ($Extend | $Format)*;

## -------------------------------------------------

# Start of the forward rule section.
!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;

# Single-character rules.  The number in braces is the rule status value
# attached to a match of that rule.
# NOTE(review): 100/200/400 presumably correspond to the number / letter /
# ideographic word-status constants in ICU's ubrk.h -- confirm.
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$KatakanaEx {400};       # note:  these status values override those from rule 5
$HiraganaEx {400};       #        by virtue of being numerically larger.
$IdeographicEx {400};    #

#
# rule 5
#    Do not break between most letters.
#
$ALetterEx $ALetterEx {200};

# rule 6 and 7
#    Letter, then a single MidLetter or MidNumLet, then letter.
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8
#    Numeric followed by Numeric.

$NumericEx $NumericEx {100};

# rule 9
#    Letter followed by Numeric.

$ALetterEx $NumericEx {200};

# rule 10
#    Numeric followed by letter.

$NumericEx $ALetterEx {200};

# rule 11 and 12
#    Numeric, then a single MidNum or MidNumLet, then Numeric.

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13
#     to be consistent with $KanaKanji $KanaKanji, changed
#     from 300 to 400.
#     See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx $KatakanaEx {400};

# rule 13a/b
#    ExtendNumLet following (13a) or preceding (13b) a letter, number or
#    Katakana; 13a also covers ExtendNumLet followed by ExtendNumLet.

$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
$NumericEx       $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx  $ALetterEx  {200};    #  (13b)
$ExtendNumLetEx  $NumericEx  {100};    #  (13b)
$ExtendNumLetEx  $KatakanaEx {400};    #  (13b)

# rule 13c
#    Regional_Indicator followed by Regional_Indicator (no status value given).

$Regional_IndicatorEx $Regional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400};    # different rule status if both kana and kanji found


## -------------------------------------------------

# Start of the reverse rule section.  Patterns here are written in reverse
# text order: rule 3 is "$LF $CR" (the forward section has "$CR $LF"), and
# each "Back...Ex" variable puts the Format/Extend run before its base class.
!!reverse;

$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
$BackNumericEx            = ($Format | $Extend)* $Numeric;
$BackMidNumEx             = ($Format | $Extend)* $MidNum;
$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
# NOTE(review): $BackHiraganaEx is not referenced by any reverse rule visible
# in this file -- possibly unused.
$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;
# Reverse rules 5 - 13c.  Each pattern is written in reverse text order
# using the "Back...Ex" variables.

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

# rule 13c

$BackRegional_IndicatorEx $BackRegional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji;    # different rule status if both kanji and kana found
                          # NOTE(review): no status value is attached to this
                          # rule here; the remark appears to be carried over
                          # from the forward section.

## -------------------------------------------------

# Start of the safe-reverse rule section.
# NOTE(review): per the ICU break-rule documentation the "safe" rule sets are
# used to back up to a position from which the main rules can be applied
# reliably -- confirm against the ICU version in use.
!!safe_reverse;

# rule 4
#   NOTE(review): this pattern was labelled "rule 3"; the identical pattern in
#   the !!safe_forward section is labelled rule 4.
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

# Start of the safe-forward rule section (forward-order counterpart of the
# !!safe_reverse patterns; uses the forward "...Ex" variables).
!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;