Home | History | Annotate | Download | only in brkitr
      1 #
      2 # Copyright (C) 2002-2010, International Business Machines Corporation 
      3 # and others. All Rights Reserved.
      4 #
      5 # file:  word_POSIX.txt
      6 #
      7 # ICU Word Break Rules, POSIX locale.
      8 #      See Unicode Standard Annex #29.
      9 #      These rules are based on UAX-29 Revision 16 for Unicode 6.0
     10 #
     11 # Note:  Updates to word.txt will usually need to be merged into
     12 #        word_POSIX.txt and word_ja.txt also.
     13 
     14 ##############################################################################
     15 #
     16 #  Character class definitions from TR 29
     17 #
     18 ##############################################################################
     19 
     20 !!chain;                 # rule-builder option; presumably enables rule chaining - see ICU break-rule docs
     21 
     22 
     23 #
     24 #  Character Class Definitions.
     25 #  Each class is a Unicode set built from a Word_Break property value.
     26 #  $MidNumLet, $MidLetter and $MidNum are additionally adjusted for '.' and ':'.
     27 $CR           = [\p{Word_Break = CR}];
     28 $LF           = [\p{Word_Break = LF}];
     29 $Newline      = [\p{Word_Break = Newline}];
     30 $Extend       = [\p{Word_Break = Extend}];
     31 $Format       = [\p{Word_Break = Format}];
     32 $Katakana     = [\p{Word_Break = Katakana}];
     33 $ALetter      = [\p{Word_Break = ALetter}];
     34 $MidNumLet    = [\p{Word_Break = MidNumLet} - [.]];    # '.' is removed here and added to $MidNum below
     35 $MidLetter    = [\p{Word_Break = MidLetter} - [\:]];   # ':' is removed and not re-added to any class in this file
     36 $MidNum       = [\p{Word_Break = MidNum} [.]];         # '.' added: it can join digits (rules 11/12) but not letters (rules 6/7)
     37 $Numeric      = [\p{Word_Break = Numeric}];
     38 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
     39 
     40 
     41 #   Dictionary character set, for triggering language-based break engines. Currently
     42 #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
     43 #   5.0 or later as the definition of Complex_Context was corrected to include all
     44 #   characters requiring dictionary break.
     45 
     46 $dictionary   = [:LineBreak = Complex_Context:];             # characters that trigger the language-based break engines
     47 $Control        = [\p{Grapheme_Cluster_Break = Control}];    # referenced only by $ALetterPlus on the next line
     48 $ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not include the dictionary
     49                                                              #  characters; those that are Extend or Control stay excluded.
     50 
     51 #
     52 #  Rule 4     Ignore Format and Extend characters,
     53 #             except when they appear at the beginning of a region of text.
     54 #
     55 $KatakanaEx     = $Katakana     ($Extend |  $Format)*;    # each *Ex is one base-class character followed by
     56 $ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;    # any number of Extend / Format characters.
     57 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;    # Note that $ALetterEx is built on $ALetterPlus,
     58 $MidLetterEx    = $MidLetter    ($Extend |  $Format)*;    # so it also covers the dictionary characters.
     59 $MidNumEx       = $MidNum       ($Extend |  $Format)*;
     60 $NumericEx      = $Numeric      ($Extend |  $Format)*;
     61 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
     62 # Hiragana and Ideographic come from the script / Ideographic properties rather than Word_Break.
     63 $Hiragana       = [\p{script=Hiragana}];
     64 $Ideographic    = [\p{Ideographic}];
     65 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;    # used only by the single-character {300} rule in !!forward
     66 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;    # used only by the single-character {400} rule in !!forward
     67 
     68 ## -------------------------------------------------
     69 
     70 !!forward;
     71 # A trailing {nnn} tags a rule with a status value.  NOTE(review): 100/200/300/400 look like
     72 # ICU's number / letter / kana / ideograph word-status ranges - confirm against ubrk.h.
     73 # Rule 3 - CR x LF
     74 #
     75 $CR $LF;
     76 
     77 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
     78 #          of a region of Text.   The rule here comes into play when the start of text
     79 #          begins with a group of Format chars, or with a "word" consisting of a single
     80 #          char that is not in any of the listed word break categories followed by
     81 #          format char(s).
     82 [^$CR $LF $Newline]? ($Extend |  $Format)+;
     83 
     84 $NumericEx {100};
     85 $ALetterEx {200};
     86 $KatakanaEx {300};       # note:  these status values override those from rule 5
     87 $HiraganaEx {300};       #        by virtue of being numerically larger.
     88 $IdeographicEx {400};    #
     89 
     90 #
     91 # rule 5
     92 #    Do not break between most letters.
     93 #
     94 $ALetterEx $ALetterEx {200};
     95 
     96 # rule 6 and 7
     97 $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};    # neither set contains '.' or ':' in this file
     98 
     99 # rule 8
    100 
    101 $NumericEx $NumericEx {100};
    102 
    103 # rule 9
    104 
    105 $ALetterEx $NumericEx {200};
    106 
    107 # rule 10
    108 
    109 $NumericEx $ALetterEx {200};
    110 
    111 # rule 11 and 12 
    112 
    113 $NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};       # $MidNum includes '.', so digit '.' digit does not break
    114 
    115 # rule 13
    116 
    117 $KatakanaEx  $KatakanaEx {300};
    118 
    119 # rule 13a/b
    120 
    121 $ALetterEx      $ExtendNumLetEx {200};    #  (13a)
    122 $NumericEx      $ExtendNumLetEx {100};    #  (13a)
    123 $KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
    124 $ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
    125 
    126 $ExtendNumLetEx $ALetterEx  {200};    #  (13b)
    127 $ExtendNumLetEx $NumericEx  {100};    #  (13b)
    128 $ExtendNumLetEx $KatakanaEx {300};    #  (13b)
    129  
    130 
    131 
    132 ## -------------------------------------------------
    133 
    134 !!reverse;
    135 # Back*Ex mirror the forward *Ex definitions: the Format / Extend run comes before the base character.
    136 $BackALetterEx     = ($Format | $Extend)* $ALetterPlus;
    137 $BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;
    138 $BackNumericEx     = ($Format | $Extend)* $Numeric;
    139 $BackMidNumEx      = ($Format | $Extend)* $MidNum;
    140 $BackMidLetterEx   = ($Format | $Extend)* $MidLetter;
    141 $BackKatakanaEx    = ($Format | $Extend)* $Katakana;
    142 $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
    143 # The rules below are the forward rules with their operands in mirrored order; none carries a {status} tag.
    144 # rule 3
    145 $LF $CR;
    146 
    147 # rule 4
    148 ($Format | $Extend)*  [^$CR $LF $Newline]?;
    149 
    150 # rule 5
    151 
    152 $BackALetterEx $BackALetterEx;
    153 
    154 # rule 6 and 7
    155 
    156 $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
    157 
    158 
    159 # rule 8
    160 
    161 $BackNumericEx $BackNumericEx;
    162 
    163 # rule 9
    164 
    165 $BackNumericEx $BackALetterEx;    # mirror of forward rule 9:  $ALetterEx $NumericEx
    166 
    167 # rule 10
    168 
    169 $BackALetterEx $BackNumericEx;    # mirror of forward rule 10:  $NumericEx $ALetterEx
    170 
    171 # rule 11 and 12
    172 
    173 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
    174 
    175 # rule 13
    176 
    177 $BackKatakanaEx $BackKatakanaEx;
    178 
    179 # rules 13 a/b
    180 #   first line: mirror of the four 13a rules;  second line: mirror of the three 13b rules.
    181 $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
    182 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; 
    183 
    184 ## -------------------------------------------------
    185 
    186 !!safe_reverse;
    187 
    188 # rule 4   (a run of Extend / Format characters, then at most one more character of any kind)
    189 ($Extend | $Format)+ .?;
    190 
    191 # rule 6   (mid-letter punctuation, then a letter with its Format / Extend run in mirrored order)
    192 ($MidLetter | $MidNumLet) $BackALetterEx;
    193 
    194 # rule 11  (mid-number punctuation, then a digit with its Format / Extend run in mirrored order)
    195 ($MidNum | $MidNumLet) $BackNumericEx;
    196 
    197 # For dictionary-based break: two adjacent dictionary characters
    198 $dictionary $dictionary;
    199 
    200 ## -------------------------------------------------
    201 
    202 !!safe_forward;
    203 
    204 # rule 4   (a run of Extend / Format characters, then at most one more character of any kind)
    205 ($Extend | $Format)+ .?;
    206 
    207 # rule 6   (mid-letter punctuation followed by a letter, each with trailing Extend / Format)
    208 ($MidLetterEx | $MidNumLetEx) $ALetterEx;
    209 
    210 # rule 11  (mid-number punctuation followed by a digit, each with trailing Extend / Format)
    211 ($MidNumEx | $MidNumLetEx) $NumericEx;
    212 
    213 # For dictionary-based break: two adjacent dictionary characters
    214 $dictionary $dictionary;
    215