Home | History | Annotate | Download | only in brkitr
      1 #
      2 # Copyright (C) 2002-2013, International Business Machines Corporation 
      3 # and others. All Rights Reserved.
      4 #
      5 # file:  word.txt
      6 #
      7 # ICU Word Break Rules
      8 #      See Unicode Standard Annex #29.
      9 #      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
     10 #
     11 # Note:  Updates to word.txt will usually need to be merged into
     12 #        word_POSIX.txt also.
     13 
     14 ##############################################################################
     15 #
     16 #  Character class definitions from TR 29
     17 #
     18 ##############################################################################
     19 
     20 !!chain;   # option statement, not a break rule. NOTE(review): presumably enables rule chaining - confirm in the ICU break-rules documentation
     21 
     22 
     23 #
     24 #  Character Class Definitions.
     25 #  Each variable names a character set. Most are selected by Word_Break
     26 #  property value; $Hiragana and $Han use the [:name:] form instead.
     27 $CR           = [\p{Word_Break = CR}];
     28 $LF           = [\p{Word_Break = LF}];
     29 $Newline      = [\p{Word_Break = Newline}];
     30 $Extend       = [\p{Word_Break = Extend}];
     31 $Format       = [\p{Word_Break = Format}];
     32 $Hiragana     = [:Hiragana:];    # NOTE(review): presumably the Hiragana script - confirm
     33 $Katakana     = [\p{Word_Break = Katakana}];
     34 $Han          = [:Han:];    # NOTE(review): presumably the Han script - confirm
     35 $ALetter      = [\p{Word_Break = ALetter}];
     36 $MidNumLet    = [\p{Word_Break = MidNumLet}];
     37 $MidLetter    = [\p{Word_Break = MidLetter}];
     38 $MidNum       = [\p{Word_Break = MidNum}];
     39 $Numeric      = [\p{Word_Break = Numeric}];
     40 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
     41 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
     42 
     43 
     44 #   Dictionary character set, for triggering language-based break engines.
     45 #   $dictionary is the union of LineBreak=Complex_Context and the CJK sets
     46 #   ($Han, $Hiragana, $Katakana, $HangulSyllable). Note that the Complex_Context
     47 #   part only works in Unicode 5.0 or later, as the definition of Complex_Context
     48 #   was corrected to include all characters requiring dictionary break.
     49 $Control        = [\p{Grapheme_Cluster_Break = Control}]; # Grapheme_Cluster_Break (not Word_Break) property; subtracted in $ALetterPlus below
     50 $HangulSyllable = [\uac00-\ud7a3];    # explicit code point range U+AC00..U+D7A3
     51 $ComplexContext = [:LineBreak = Complex_Context:];
     52 $KanaKanji      = [$Han $Hiragana $Katakana];
     53 $dictionaryCJK  = [$KanaKanji $HangulSyllable];
     54 $dictionary     = [$ComplexContext $dictionaryCJK];
     55 # $ALetterPlus: ALetter with the CJK dictionary characters left out, together with
     56 # the Complex_Context characters that are neither Extend nor Control.
     57 $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
     58 
     59 
     60 #
     61 #  Rule 4     Ignore Format and Extend characters,
     62 #             except when they appear at the beginning of a region of text.
     63 #  Each ...Ex variable is its base class followed by zero or more Extend/Format characters.
     64 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
     65 $KatakanaEx     = $Katakana     ($Extend |  $Format)*;
     66 $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
     67 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
     68 $MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
     69 $MidNumEx       = $MidNum       ($Extend |  $Format)*;
     70 $NumericEx      = $Numeric      ($Extend |  $Format)*;
     71 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
     72 $Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
     73 # $Ideographic is selected by the Ideographic property rather than a Word_Break value.
     74 $Ideographic    = [\p{Ideographic}];
     75 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
     76 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
     77 
     78 ## -------------------------------------------------
     79 
     80 !!forward;
     81 # Forward rules. A trailing {nnn} is the rule status value for a match; in this
     82 # section 100 tags Numeric, 200 ALetter/Hangul and 400 Katakana/Hiragana/Ideographic.
     83 # Rule 3 - CR x LF
     84 #
     85 $CR $LF;
     86 
     87 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
     88 #          of a region of Text.   The rule here comes into play when the start of text
     89 #          begins with a group of Format chars, or with a "word" consisting of a single
     90 #          char that is not in any of the listed word break categories followed by
     91 #          format char(s), or is not a CJK dictionary character.
     92 [^$CR $LF $Newline]? ($Extend |  $Format)+;
     93 # One base character of a class (with any trailing Extend/Format for the ...Ex forms), tagged with its status value.
     94 $NumericEx {100};
     95 $ALetterEx {200};
     96 $HangulSyllable {200};
     97 $KatakanaEx {400};       # note:  these status values override those from rule 5
     98 $HiraganaEx {400};       #        by virtue of being numerically larger.
     99 $IdeographicEx {400};    #
    100 
    101 #
    102 # rule 5
    103 #    Do not break between most letters.
    104 #
    105 $ALetterEx $ALetterEx {200};
    106 
    107 # rule 6 and 7 - letter, (MidLetter | MidNumLet), letter stay together
    108 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
    109 
    110 # rule 8 - number followed by number
    111 
    112 $NumericEx $NumericEx {100};
    113 
    114 # rule 9 - letter followed by number
    115 
    116 $ALetterEx $NumericEx {200};
    117 
    118 # rule 10 - number followed by letter
    119 
    120 $NumericEx $ALetterEx {200};
    121 
    122 # rule 11 and 12 - number, (MidNum | MidNumLet), number stay together
    123 
    124 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
    125 
    126 # rule 13
    127 # to be consistent with $KanaKanji $KanaKanji, changed
    128 # from 300 to 400.
    129 # See also TestRuleStatus in intltest/rbbiapts.cpp
    130 $KatakanaEx  $KatakanaEx {400};
    131 
    132 # rule 13a/b - ExtendNumLet after (13a) or before (13b) a letter, number or Katakana
    133 
    134 $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
    135 $NumericEx      $ExtendNumLetEx {100};    #  (13a)
    136 $KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
    137 $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
    138 
    139 $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
    140 $ExtendNumLetEx $NumericEx  {100};    #  (13b)
    141 $ExtendNumLetEx $KatakanaEx {400};    #  (13b)
    142 
    143 # rule 13c - two Regional_Indicator characters; no status value is attached
    144 
    145 $Regional_IndicatorEx $Regional_IndicatorEx;
    146 
    147 # special handling for CJK characters: chain for later dictionary segmentation
    148 $HangulSyllable $HangulSyllable {200};
    149 $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 
    150 
    151 
    152 ## -------------------------------------------------
    153 
    154 !!reverse;
    155 # Reverse-direction counterparts of the ...Ex variables: the Extend/Format run comes first, the base class last.
    156 $BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
    157 $BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
    158 $BackNumericEx            = ($Format | $Extend)* $Numeric;
    159 $BackMidNumEx             = ($Format | $Extend)* $MidNum;
    160 $BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
    161 $BackKatakanaEx           = ($Format | $Extend)* $Katakana;
    162 $BackHiraganaEx           = ($Format | $Extend)* $Hiragana;    # NOTE(review): not referenced by any rule visible in this file
    163 $BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
    164 $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
    165 
    166 # rule 3 (operands are in the opposite order to the forward rule "$CR $LF")
    167 $LF $CR;
    168 
    169 # rule 4
    170 ($Format | $Extend)*  [^$CR $LF $Newline]?;
    171 
    172 # rule 5
    173 
    174 $BackALetterEx $BackALetterEx;
    175 
    176 # rule 6 and 7
    177 
    178 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
    179 
    180 
    181 # rule 8
    182 
    183 $BackNumericEx $BackNumericEx;
    184 
    185 # rule 9 (forward form is ALetter then Numeric; written here Numeric first)
    186 
    187 $BackNumericEx $BackALetterEx;
    188 
    189 # rule 10 (forward form is Numeric then ALetter; written here ALetter first)
    190 
    191 $BackALetterEx $BackNumericEx;
    192 
    193 # rule 11 and 12
    194 
    195 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
    196 
    197 # rule 13
    198 
    199 $BackKatakanaEx $BackKatakanaEx;
    200 
    201 # rules 13 a/b
    202 # (first line mirrors the forward 13a rules, second line the forward 13b rules)
    203 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
    204 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; 
    205 
    206 # rule 13c
    207 
    208 $BackRegional_IndicatorEx $BackRegional_IndicatorEx;
    209 
    210 # special handling for CJK characters: chain for later dictionary segmentation
    211 $HangulSyllable $HangulSyllable;
    212 $KanaKanji $KanaKanji; # no {status} is attached here; the forward form of this rule carries {400}
    213 
    214 ## -------------------------------------------------
    215 
    216 !!safe_reverse;
    217 
    218 # rule 4 (Extend/Format handling; the same pattern is labelled "rule 4" under !!safe_forward)
    219 ($Extend | $Format)+ .?;
    220 
    221 # rule 6
    222 ($MidLetter | $MidNumLet) $BackALetterEx;
    223 
    224 # rule 11
    225 ($MidNum | $MidNumLet) $BackNumericEx;
    226 
    227 # For dictionary-based break
    228 $dictionary $dictionary;
    229 
    230 ## -------------------------------------------------
    231 
    232 !!safe_forward;
    233 # Same rule labels as the !!safe_reverse section, written with the forward ...Ex variables.
    234 # rule 4
    235 ($Extend | $Format)+ .?;
    236 
    237 # rule 6
    238 ($MidLetterEx | $MidNumLetEx) $ALetterEx;
    239 
    240 # rule 11
    241 ($MidNumEx | $MidNumLetEx) $NumericEx;
    242 
    243 # For dictionary-based break
    244 $dictionary $dictionary;
    245