Home | History | Annotate | Download | only in brkitr
      1 #
      2 # Copyright (C) 2002-2009, International Business Machines Corporation 
      3 # and others. All Rights Reserved.
      4 #
      5 # file:  word.txt
      6 #
      7 # ICU Word Break Rules
      8 #      See Unicode Standard Annex #29.
      9 #      These rules are based on UAX-29 Revision 13 for Unicode 5.1
     10 #
     11 # Note:  Updates to word.txt will usually need to be merged into
     12 #        word_POSIX.txt and word_ja.txt also.
     13 
     14 ##############################################################################
     15 #
     16 #  Character class definitions from TR 29
     17 #
     18 ##############################################################################
     19 
     20 !!chain;   # ICU option; presumably turns on rule chaining (a match may continue from one rule into the next) - confirm in the ICU RBBI docs
     21 
     22 
     23 #
     24 #  Character Class Definitions.
     25 #  Each $Name is a set variable referenced by the rules further down. Unless noted
     26 #  otherwise, a set is exactly one value of the Unicode Word_Break property.
     27 $CR           = [\p{Word_Break = CR}];
     28 $LF           = [\p{Word_Break = LF}];
     29 $Newline      = [\p{Word_Break = Newline}];
     30 $Extend       = [\p{Word_Break = Extend}];
     31 $Format       = [\p{Word_Break = Format}];
     32 $Hiragana     = [:Hiragana:];                 # not a Word_Break value; named-property set (presumably Script=Hiragana - confirm)
     33 $Katakana     = [\p{Word_Break = Katakana}];
     34 $Han          = [:Han:];                      # not a Word_Break value; named-property set (presumably Script=Han - confirm)
     35 $ALetter      = [\p{Word_Break = ALetter}];
     36 $MidNumLet    = [\p{Word_Break = MidNumLet}];
     37 $MidLetter    = [\p{Word_Break = MidLetter}];
     38 $MidNum       = [\p{Word_Break = MidNum}];
     39 $Numeric      = [\p{Word_Break = Numeric}[\uff10-\uff19]]; # Word_Break=Numeric plus U+FF10..U+FF19 (fullwidth digits)
     40 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
     41 
     42 
     43 #   Dictionary character set, for triggering language-based break engines. Currently
     44 #   limited to LineBreak=Complex_Context and CJK. Note that this set only works
     45 #   in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all
     46 #   characters requiring dictionary break.
     47 #   The helper sets below are combined into $dictionary and $ALetterPlus.
     48 $Control        = [\p{Grapheme_Cluster_Break = Control}]; # only used to subtract from $ComplexContext in $ALetterPlus (in the rules visible here)
     49 $HangulSyllable = [\uac00-\ud7a3];                         # code points U+AC00..U+D7A3
     50 $ComplexContext = [:LineBreak = Complex_Context:];
     51 $KanaKanji      = [$Han $Hiragana $Katakana];              # union of the three sets
     52 $dictionaryCJK  = [$KanaKanji $HangulSyllable];            # kana/kanji plus Hangul syllables
     53 $dictionary     = [$ComplexContext $dictionaryCJK];        # every character covered by the dictionary comment above
     54 
     55 # leave CJK scripts out of ALetterPlus:
     56 $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # ALetter without CJK dictionary chars, plus Complex_Context chars that are neither Extend nor Control
     57 
     58 
     59 #
     60 #  Rule 4     Ignore Format and Extend characters,
     61 #             except when they appear at the beginning of a region of text.
     62 #  Each $XxxEx below is a base class followed by zero or more Extend/Format characters.
     63 # TODO: check if handling of katakana in dictionary makes rules incorrect/void.
     64 $KatakanaEx     = $Katakana     ($Extend |  $Format)*;
     65 $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;   # note: built on $ALetterPlus, not on $ALetter
     66 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
     67 $MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
     68 $MidNumEx       = $MidNum       ($Extend |  $Format)*;
     69 $NumericEx      = $Numeric      ($Extend |  $Format)*;
     70 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
     71 # Hiragana and ideographs get the same trailing Extend/Format treatment.
     72 $Ideographic    = [\p{Ideographic}];                     # binary property Ideographic; defined independently of $Han
     73 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
     74 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
     75 
     76 ## -------------------------------------------------
     77 
     78 !!forward;   # start of the forward rule set
     79 
     80 
     81 # Rule 3 - CR x LF
     82 #          (a CR immediately followed by an LF is matched as one unit)
     83 $CR $LF;
     84 
     85 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
     86 #          of a region of Text.   The rule here comes into play when the start of text
     87 #          begins with a group of Format chars, or with a "word" consisting of a single
     88 #          char that is not in any of the listed word break categories followed by
     89 #          format char(s). The optional leading char must also not be CR, LF, Newline,
     90 #          or a CJK dictionary character (see the negated set in the rule below).
     91 [^$CR $LF $Newline $dictionaryCJK]? ($Extend |  $Format)+;
     92 # Single-character matches with their {status} tags. NOTE(review): values appear to follow ICU's UWordBreak ranges (100 number, 200 letter, 300 kana, 400 ideo) - confirm in ubrk.h
     93 $NumericEx {100};
     94 $ALetterEx {200};
     95 $HangulSyllable {200};
     96 $KatakanaEx {400}; #originally 300
     97 $HiraganaEx {400}; #originally 300
     98 $IdeographicEx {400};    # same status value as the Katakana and Hiragana rules above
     99 
    100 #
    101 # rule 5
    102 #    Do not break between most letters.
    103 #    "Letter" here is $ALetterEx, i.e. $ALetterPlus plus any trailing Extend/Format.
    104 $ALetterEx $ALetterEx {200};
    105 
    106 # rule 6 and 7 - keep letter, (MidLetter | MidNumLet), letter together
    107 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
    108 
    109 # rule 8
    110 #    Do not break between two numeric characters.
    111 $NumericEx $NumericEx {100};
    112 
    113 # rule 9
    114 #    Do not break between a letter and a following numeric character.
    115 $ALetterEx $NumericEx {200};
    116 
    117 # rule 10
    118 #    Do not break between a numeric character and a following letter.
    119 $NumericEx $ALetterEx {200};
    120 
    121 # rule 11 and 12 - keep numeric, (MidNum | MidNumLet), numeric together
    122 
    123 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
    124 
    125 # rule 13
    126 #    Do not break between two Katakana characters.
    127 # To be consistent with '$KanaKanji $KanaKanji', changed 
    128 # from 300 to 400.
    129 # See also TestRuleStatus in intltest/rbbiapts.cpp
    130 $KatakanaEx  $KatakanaEx {400};
    131 
    132 # rule 13a/b
    133 #    13a: ExtendNumLet after a letter, numeric, Katakana or ExtendNumLet; 13b: ExtendNumLet before a letter, numeric or Katakana.
    134 $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
    135 $NumericEx      $ExtendNumLetEx {100};    #  (13a)
    136 $KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
    137 $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
    138 
    139 $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
    140 $ExtendNumLetEx $NumericEx  {100};    #  (13b)
    141 $ExtendNumLetEx $KatakanaEx {400};    #  (13b)
    142 
    143 # special handling for CJK characters: chain for later dictionary segmentation
    144 $HangulSyllable $HangulSyllable {200};   # two adjacent Hangul syllables match as a pair, status 200
    145 $KanaKanji $KanaKanji {400}; # any two adjacent Han/Hiragana/Katakana characters match as a pair, status 400. Original note: different rule status if both kanji and kana found
    146 
    147 
    148 ## -------------------------------------------------
    149 
    150 !!reverse;   # start of the reverse rule set
    151 # $BackXxxEx mirrors the forward $XxxEx definitions with the order swapped: the Extend/Format run comes first, then the base class.
    152 $BackALetterEx     = ($Format | $Extend)* $ALetterPlus;
    153 $BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;
    154 $BackNumericEx     = ($Format | $Extend)* $Numeric;
    155 $BackMidNumEx      = ($Format | $Extend)* $MidNum;
    156 $BackMidLetterEx   = ($Format | $Extend)* $MidLetter;
    157 $BackKatakanaEx    = ($Format | $Extend)* $Katakana;
    158 $BackHiraganaEx    = ($Extend | $Format)* $Hiragana;     # NOTE(review): not referenced by any rule visible in this file
    159 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
    160 
    161 # rule 3 - forward rule 3 with the operands swapped (LF then CR)
    162 $LF $CR;
    163 
    164 # rule 4 - NOTE(review): the Extend/Format run uses * here, whereas forward rule 4 uses +
    165 ($Format | $Extend)*  [^$CR $LF $Newline $dictionaryCJK]?;
    166 
    167 # rule 5
    168 #    None of the reverse rules carries a {status} tag.
    169 $BackALetterEx $BackALetterEx;
    170 
    171 # rule 6 and 7
    172 #    letter, (MidLetter | MidNumLet), letter
    173 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
    174 
    175 
    176 # rule 8
    177 #    numeric, numeric
    178 $BackNumericEx $BackNumericEx;
    179 
    180 # rule 9
    181 #    forward rule 9 (letter, numeric) with the operands swapped
    182 $BackNumericEx $BackALetterEx;
    183 
    184 # rule 10
    185 #    forward rule 10 (numeric, letter) with the operands swapped
    186 $BackALetterEx $BackNumericEx;
    187 
    188 # rule 11 and 12
    189 #    numeric, (MidNum | MidNumLet), numeric
    190 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
    191 
    192 # rule 13
    193 #    Katakana, Katakana
    194 $BackKatakanaEx $BackKatakanaEx;
    195 
    196 # rules 13 a/b
    197 #    The forward 13a/13b rules folded into two alternations, operands swapped.
    198 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
    199 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; 
    200 
    201 # special handling for CJK characters: chain for later dictionary segmentation
    202 $HangulSyllable $HangulSyllable;
    203 $KanaKanji $KanaKanji; # forward note was: different rule status if both kanji and kana found (no status tag is set here)
    204 
    205 ## -------------------------------------------------
    206 
    207 !!safe_reverse;   # start of the safe-reverse rule set (purpose is defined by ICU, not shown in this file - see the ICU RBBI docs)
    208 
    209 # rule 4 - an Extend/Format run plus at most one further character (same rule as under !!safe_forward)
    210 ($Extend | $Format)+ .?;
    211 
    212 # rule 6 - a MidLetter/MidNumLet followed by $BackALetterEx (letter, possibly preceded by Extend/Format)
    213 ($MidLetter | $MidNumLet) $BackALetterEx;
    214 
    215 # rule 11 - a MidNum/MidNumLet followed by $BackNumericEx (numeric, possibly preceded by Extend/Format)
    216 ($MidNum | $MidNumLet) $BackNumericEx;
    217 
    218 # For dictionary-based break: two adjacent $dictionary characters
    219 $dictionary $dictionary;
    220 
    221 ## -------------------------------------------------
    222 
    223 !!safe_forward;   # start of the safe-forward rule set (purpose is defined by ICU, not shown in this file - see the ICU RBBI docs)
    224 
    225 # rule 4 - an Extend/Format run plus at most one further character
    226 ($Extend | $Format)+ .?;
    227 
    228 # rule 6 - a MidLetter/MidNumLet (with trailing Extend/Format) followed by a letter
    229 ($MidLetterEx | $MidNumLetEx) $ALetterEx;
    230 
    231 # rule 11 - a MidNum/MidNumLet (with trailing Extend/Format) followed by a numeric
    232 ($MidNumEx | $MidNumLetEx) $NumericEx;
    233 
    234 # For dictionary-based break: two adjacent $dictionary characters
    235 $dictionary $dictionary;
    236