#
# Copyright (C) 2002-2010, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX-29 Revision 16 for Unicode 6.0
#
# Note: Updates to word.txt will usually need to be merged into
#       word_POSIX.txt and word_ja.txt also.
#
# Layout note: '#' starts a comment that runs to the end of the line, so every
#              definition and rule below must stay on its own line.

##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################

# Enable rule chaining: a match may continue from the end of one rule into the
# start of another, which lets the pairwise "do not break" rules below cover
# runs of arbitrary length.
!!chain;


#
# Character Class Definitions.
#   One set per Word_Break property value used by the rules.
#

$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Katakana     = [\p{Word_Break = Katakana}];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];


# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.

$dictionary   = [:LineBreak = Complex_Context:];
$Control      = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note: default ALetter does not
                                                             #       include the dictionary characters.

#
# Rule 4 - Ignore Format and Extend characters,
#          except when they appear at the beginning of a region of text.
#          Each "Ex" variable is a base character followed by any number of
#          trailing Extend/Format characters.
#
$KatakanaEx     = $Katakana     ($Extend | $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend | $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend | $Format)*;
$MidLetterEx    = $MidLetter    ($Extend | $Format)*;
$MidNumEx       = $MidNum       ($Extend | $Format)*;
$NumericEx      = $Numeric      ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

$Hiragana       = [\p{script=Hiragana}];
$Ideographic    = [\p{Ideographic}];
$HiraganaEx     = $Hiragana     ($Extend | $Format)*;
$IdeographicEx  = $Ideographic  ($Extend | $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text. The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s).
[^$CR $LF $Newline]? ($Extend | $Format)+;

# Single-character words. The number in braces is the rule status value attached
# to the rule: 100 for numbers, 200 for letters, 300 for kana, 400 for ideographs.
# NOTE(review): these appear to line up with the UWordBreak tag ranges in ICU's
# ubrk.h (UBRK_WORD_NUMBER, _LETTER, _KANA, _IDEO) - confirm before changing them.
$NumericEx {100};
$ALetterEx {200};
$KatakanaEx {300};       # note: these status values override those from rule 5
$HiraganaEx {300};       #       by virtue of being numerically larger.
$IdeographicEx {400};    #

#
# rule 5
#    Do not break between most letters.
#
$ALetterEx $ALetterEx {200};

# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8

$NumericEx $NumericEx {100};

# rule 9

$ALetterEx $NumericEx {200};

# rule 10

$NumericEx $ALetterEx {200};

# rule 11 and 12

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13

$KatakanaEx $KatakanaEx {300};

# rule 13a/b

$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
$NumericEx      $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
$ExtendNumLetEx $NumericEx  {100};    #  (13b)
$ExtendNumLetEx $KatakanaEx {300};    #  (13b)



## -------------------------------------------------

!!reverse;

# Mirror images of the forward "Ex" variables: in the reverse rules the
# Format/Extend characters come before the base character they attach to.
$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;
$BackNumericEx     = ($Format | $Extend)* $Numeric;
$BackMidNumEx      = ($Format | $Extend)* $MidNum;
$BackMidLetterEx   = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx    = ($Format | $Extend)* $Katakana;
$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

## -------------------------------------------------

!!safe_reverse;

# rule 4
#    (same Extend/Format rule as in the safe_forward section below)
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;