Home | History | Annotate | Download | only in brkitr
      1 #
      2 # Copyright (C) 2002-2010, International Business Machines Corporation 
      3 # and others. All Rights Reserved.
      4 #
      5 # file:  word_ja.txt
      6 #
      7 # ICU Word Break Rules
      8 #      See Unicode Standard Annex #29.
      9 #      These rules are based on UAX-29 Revision 16 for Unicode 6.0
     10 #
     11 # Note:  Updates to word.txt will usually need to be merged into
     12 #        word_POSIX.txt and word_ja.txt also.
     13 
     14 ##############################################################################
     15 #
     16 #  Character class definitions from TR 29
     17 #
     18 ##############################################################################
     19 
     20 !!chain;      # RBBI option: turns on rule chaining (see the ICU break-rules guide).
     21 
     22 
     23 #
     24 #  Character Class Definitions.
     25 #      Each variable below is the set of code points carrying one Word_Break property value.
     26 
     27 $CR           = [\p{Word_Break = CR}];
     28 $LF           = [\p{Word_Break = LF}];
     29 $Newline      = [\p{Word_Break = Newline}];
     30 $Extend       = [\p{Word_Break = Extend}];
     31 $Format       = [\p{Word_Break = Format}];
     32 $Katakana     = [\p{Word_Break = Katakana}];
     33 $ALetter      = [\p{Word_Break = ALetter}];
     34 $MidNumLet    = [\p{Word_Break = MidNumLet}];
     35 $MidLetter    = [\p{Word_Break = MidLetter}];
     36 $MidNum       = [\p{Word_Break = MidNum}];
     37 $Numeric      = [\p{Word_Break = Numeric}];
     38 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
     39 
     40 
     41 #   Dictionary character set, for triggering language-based break engines. Currently
     42 #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
     43 #   5.0 or later as the definition of Complex_Context was corrected to include all
     44 #   characters requiring dictionary break.
     45 #   $ALetterPlus is $ALetter plus the dictionary characters that are neither Extend nor Control.
     46 $dictionary   = [:LineBreak = Complex_Context:];
     47 $Control        = [\p{Grapheme_Cluster_Break = Control}];   # only used in the subtraction on the next line
     48 $ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
     49                                                              #  include the dictionary characters.
     50 
     51 #
     52 #  Rule 4     Ignore Format and Extend characters,
     53 #             except when they appear at the beginning of a region of text.
     54 #             Each ...Ex variable is one base character followed by any run of Extend/Format.
     55 $KatakanaEx     = $Katakana     ($Extend |  $Format)*;
     56 $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
     57 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
     58 $MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
     59 $MidNumEx       = $MidNum       ($Extend |  $Format)*;
     60 $NumericEx      = $Numeric      ($Extend |  $Format)*;
     61 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
     62 #  Classes selected by script / Ideographic property rather than Word_Break; $Ideographic also adds U+3005, U+3007, U+303B.
     63 $Hiragana       = [\p{script=Hiragana}];
     64 $Ideographic    = [\p{Ideographic} [\u3005 \u3007 \u303B]];
     65 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
     66 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
     67 
     68 ## -------------------------------------------------
     69 
     70 !!forward;    # Start of the forward rule section.
     71 
     72 
     73 # Rule 3 - CR x LF
     74 #          (a CR immediately followed by LF is matched as one unit)
     75 $CR $LF;
     76 
     77 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
     78 #          of a region of Text.   The rule here comes into play when the start of text
     79 #          begins with a group of Format chars, or with a "word" consisting of a single
     80 #          char that is not in any of the listed word break categories followed by
     81 #          format char(s).
     82 [^$CR $LF $Newline]? ($Extend |  $Format)+;
     83 # Single-character matches.  The {nnn} tag is the rule status value attached to a match.
     84 $NumericEx {100};
     85 $ALetterEx {200};
     86 $KatakanaEx {300};       # note:  these status values override those from rule 5
     87 $HiraganaEx {300};       #        by virtue of being numerically larger.
     88 $IdeographicEx {400};    #
     89 
     90 #
     91 # rule 5
     92 #    Do not break between most letters.
     93 #
     94 $ALetterEx $ALetterEx {200};
     95 
     96 # rule 6 and 7 - no break inside letter, (MidLetter | MidNumLet), letter
     97 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
     98 
     99 # rule 8 - no break between two Numeric characters
    100 
    101 $NumericEx $NumericEx {100};
    102 
    103 # rule 9 - no break between a letter and a following Numeric
    104 
    105 $ALetterEx $NumericEx {200};
    106 
    107 # rule 10 - no break between a Numeric and a following letter
    108 
    109 $NumericEx $ALetterEx {200};
    110 
    111 # rule 11 and 12 - no break inside Numeric, (MidNum | MidNumLet), Numeric
    112 
    113 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
    114 
    115 # rule 13 - no break between two Katakana, two Hiragana, or two Ideographic characters
    116 #           NOTE(review): the Hiragana/Ideographic lines look specific to word_ja - confirm against word.txt
    117 $KatakanaEx  $KatakanaEx {300};
    118 $HiraganaEx    $HiraganaEx {300};
    119 $IdeographicEx $IdeographicEx {400};
    120 
    121 
    122 # rule 13a/b - ExtendNumLet joins to what precedes it (13a) and to what follows it (13b)
    123 
    124 $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
    125 $NumericEx      $ExtendNumLetEx {100};    #  (13a)
    126 $KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
    127 $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
    128 
    129 $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
    130 $ExtendNumLetEx $NumericEx  {100};    #  (13b)
    131 $ExtendNumLetEx $KatakanaEx {300};    #  (13b)
    132  
    132  
    133 
    134 
    135 ## -------------------------------------------------
    136 
    137 !!reverse;    # Start of the reverse rule section; no {status} tags are used here.
    138 # Mirror images of the forward ...Ex definitions: the Format/Extend run is written before the base character.
    139 $BackALetterEx     = ($Format | $Extend)* $ALetterPlus;
    140 $BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;
    141 $BackNumericEx     = ($Format | $Extend)* $Numeric;
    142 $BackMidNumEx      = ($Format | $Extend)* $MidNum;
    143 $BackMidLetterEx   = ($Format | $Extend)* $MidLetter;
    144 $BackKatakanaEx    = ($Format | $Extend)* $Katakana;
    145 $BackHiraganaEx    = ($Format | $Extend)* $Hiragana;
    146 $BackIdeographicEx = ($Format | $Extend)* $Ideographic;
    147 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
    148 
    149 # rule 3 - CR LF, written in reverse order
    150 $LF $CR;
    151 
    152 # rule 4 - mirror of the forward rule: Format/Extend run, then at most one non-line-ending character
    153 ($Format | $Extend)*  [^$CR $LF $Newline]?;
    154 
    155 # rule 5 - letter next to letter
    156 
    157 $BackALetterEx $BackALetterEx;
    158 
    159 # rule 6 and 7 - letter, (MidLetter | MidNumLet), letter
    160 
    161 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
    162 
    163 
    164 # rule 8 - Numeric next to Numeric
    165 
    166 $BackNumericEx $BackNumericEx;
    167 
    168 # rule 9 - mirror of forward "letter Numeric"
    169 
    170 $BackNumericEx $BackALetterEx;
    171 
    172 # rule 10 - mirror of forward "Numeric letter"
    173 
    174 $BackALetterEx $BackNumericEx;
    175 
    176 # rule 11 and 12 - Numeric, (MidNum | MidNumLet), Numeric
    177 
    178 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
    179 
    180 # rule 13 - pairs of Katakana, Hiragana, or Ideographic characters
    181 
    182 $BackKatakanaEx $BackKatakanaEx;
    183 $BackHiraganaEx $BackHiraganaEx;
    184 $BackIdeographicEx $BackIdeographicEx;
    185 
    186 
    187 
    188 # rules 13 a/b
    189 #    First line mirrors the forward 13a rules, second line mirrors the forward 13b rules.
    190 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
    191 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; 
    192 
    193 ## -------------------------------------------------
    194 
    195 !!safe_reverse;    # Start of the safe-reverse rule section (see the ICU break-rules guide).
    196 
    197 # rule 4 - a run of Extend/Format characters, optionally followed by any one character
    198 ($Extend | $Format)+ .?;
    199 
    200 # rule 6 - (MidLetter | MidNumLet) followed by a reverse-form letter
    201 ($MidLetter | $MidNumLet) $BackALetterEx;
    202 
    203 # rule 11 - (MidNum | MidNumLet) followed by a reverse-form Numeric
    204 ($MidNum | $MidNumLet) $BackNumericEx;
    205 
    206 # For dictionary-based break
    207 $dictionary $dictionary;
    208 
    209 ## -------------------------------------------------
    210 
    211 !!safe_forward;    # Start of the safe-forward rule section (see the ICU break-rules guide).
    212 
    213 # rule 4 - a run of Extend/Format characters, optionally followed by any one character
    214 ($Extend | $Format)+ .?;
    215 
    216 # rule 6 - (MidLetter | MidNumLet) followed by a letter
    217 ($MidLetterEx | $MidNumLetEx) $ALetterEx;
    218 
    219 # rule 11 - (MidNum | MidNumLet) followed by a Numeric
    220 ($MidNumEx | $MidNumLetEx) $NumericEx;
    221 
    222 # For dictionary-based break
    223 $dictionary $dictionary;
    224