#
# Copyright (C) 2002-2010, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file:  word_ja.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX-29 Revision 16 for Unicode 6.0
#
# Note:  Updates to word.txt will usually need to be merged into
#        word_POSIX.txt and word_ja.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Rule-compiler option; must appear before the rules it affects.
# NOTE(review): per the ICU break-rule documentation this enables chaining of
# rules (a match may continue from the end of one rule into another) - confirm
# against the ICU version this file is built with.
!!chain;


#
#  Character Class Definitions.
#  Each variable is the set of characters having the named Word_Break
#  property value.
#

$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Katakana     = [\p{Word_Break = Katakana}];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary   = [:LineBreak = Complex_Context:];
$Control      = [\p{Grapheme_Cluster_Break = Control}];

# $ALetterPlus is ALetter plus every dictionary character that is neither
# Extend nor Control.
$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
                                                             #  include the dictionary characters.

#
#  Rule 4:  Ignore Format and Extend characters,
#           except when they appear at the beginning of a region of text.
#
#  Each "...Ex" variable below is a base character class followed by any
#  number of trailing Extend or Format characters, so that those characters
#  are absorbed into the preceding character when the rules are matched.
#
$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
$MidNumEx       = $MidNum       ($Extend |  $Format)*;
$NumericEx      = $Numeric      ($Extend |  $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;

# Japanese-specific classes: Hiragana by script, and ideographs plus three
# explicitly listed characters (U+3005 ideographic iteration mark,
# U+3007 ideographic number zero, U+303B vertical ideographic iteration mark).
$Hiragana       = [\p{script=Hiragana}];
$Ideographic    = [\p{Ideographic} [\u3005 \u3007 \u303B]];
$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s).
[^$CR $LF $Newline]? ($Extend |  $Format)+;

# Single-character matches.  The number in braces is the rule status value
# reported for a boundary produced by that rule.
# NOTE(review): 100/200/300/400 presumably line up with ICU's
# UBRK_WORD_NUMBER / UBRK_WORD_LETTER / UBRK_WORD_KANA / UBRK_WORD_IDEO
# status ranges - confirm against ubrk.h.
$NumericEx {100};
$ALetterEx {200};
$KatakanaEx {300};       # note:  these status values override those from rule 5
$HiraganaEx {300};       #        by virtue of being numerically larger.
$IdeographicEx {400};    #

#
# rule 5
#    Do not break between most letters.
#
$ALetterEx $ALetterEx {200};                  # (rule 5) letter x letter

# rule 6 and 7
#    Letters may be joined across a single MidLetter / MidNumLet character.
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8

$NumericEx $NumericEx {100};

# rule 9

$ALetterEx $NumericEx {200};

# rule 10

$NumericEx $ALetterEx {200};

# rule 11 and 12
#    Digits may be joined across a single MidNum / MidNumLet character.

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13
#    Runs of the same Japanese class stay together.

$KatakanaEx  $KatakanaEx {300};
$HiraganaEx  $HiraganaEx {300};
$IdeographicEx  $IdeographicEx {400};


# rule 13a/b

$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
$NumericEx      $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
$ExtendNumLetEx $NumericEx  {100};    #  (13b)
$ExtendNumLetEx $KatakanaEx {300};    #  (13b)



## -------------------------------------------------

!!reverse;

# Reverse-direction counterparts of the "...Ex" classes: the run of
# Format / Extend characters is written first and the base character last,
# the mirror image of the forward definitions.
$BackALetterEx      = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx    = ($Format | $Extend)* $MidNumLet;
$BackNumericEx      = ($Format | $Extend)* $Numeric;
$BackMidNumEx       = ($Format | $Extend)* $MidNum;
$BackMidLetterEx    = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx     = ($Format | $Extend)* $Katakana;
$BackHiraganaEx     = ($Format | $Extend)* $Hiragana;
$BackIdeographicEx  = ($Format | $Extend)* $Ideographic;
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)*  [^$CR $LF $Newline]?;

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;          # (reverse rule 13) same-class runs stay together
$BackHiraganaEx $BackHiraganaEx;
$BackIdeographicEx $BackIdeographicEx;



# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

## -------------------------------------------------

!!safe_reverse;

# NOTE(review): the safe_reverse / safe_forward sections are presumably the
# rules ICU uses to find a safe position from which to resume normal matching
# after random access into the text - confirm against the ICU break-rule docs.

# rule 4
#    (Extend / Format handling; same rule number as in the other sections.)
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;