Home | History | Annotate | Download | only in rules
      1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 # License & terms of use: http://www.unicode.org/copyright.html
      3 # Copyright (c) 2002-2016  International Business Machines Corporation and
      4 # others. All Rights Reserved.
      5 #
      6 #  file:  line_loose_cj.txt
      7 #
      8 #         Line Breaking Rules
      9 #         Implement default line breaking as defined by
     10 #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
     11 #         http://www.unicode.org/reports/tr14/, with the following modification:
     12 #
     13 #         Boundaries between hyphens and following letters are suppressed when
     14 #         there is a boundary preceding the hyphen. See rule LB 20.09 below.
     15 #
     16 #         This tailors the line break behavior to correspond to CSS
     17 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
     18 #         It sets characters of class CJ to behave like ID.
     19 #         In addition, it allows breaks:
     20 #         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
     21 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
     22 #         * between characters of LineBreak class IN such as 2026
     23 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
     24 #           FF65 (all NS) and FF01, FF1F (both EX).
     25 #         * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
     26 #           this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
     27 #         * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
     28 #           this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
     29 
     30 
     31 #
     32 #  Character Classes defined by TR 14.
     33 #
     34 
     35 !!chain;                  # NOTE(review): presumably enables rule chaining (the LB 30a note refers to "chain-out" behavior) - confirm against the ICU break-rule docs.
     36 !!quoted_literals_only;   # NOTE(review): presumably requires literal characters in rules to be quoted - confirm against the ICU break-rule docs.
     37 
     38 $AI = [:LineBreak =  Ambiguous:];
     39 $AL = [:LineBreak =  Alphabetic:];
     40 $BAX = [\u2010 \u2013];     # CSS loose: hyphens 2010 & 2013, carved out of BA so that a break is allowed before them.
     41 $BA = [[:LineBreak =  Break_After:] - $BAX];     # BA without the $BAX carve-out.
     42 $HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA. Referenced only by rule LB 20.09.
     43 $BB = [:LineBreak =  Break_Before:];
     44 $BK = [:LineBreak =  Mandatory_Break:];
     45 $B2 = [:LineBreak =  Break_Both:];
     46 $CB = [:LineBreak =  Contingent_Break:];
     47 $CJ = [:LineBreak =  Conditional_Japanese_Starter:];     # Only used to build $ID below.
     48 $CL = [:LineBreak =  Close_Punctuation:];
     49 # $CM = [:LineBreak =  Combining_Mark:];     # Superseded: $CM is defined after $ZWJ and $SA, and also includes them (SA marks only).
     50 $CP = [:LineBreak =  Close_Parenthesis:];
     51 $CR = [:LineBreak =  Carriage_Return:];
     52 $EB = [:LineBreak =  EB:];
     53 $EM = [:LineBreak =  EM:];
     54 $EXX = [\uFF01 \uFF1F];     # CSS loose: centered punctuation FF01 & FF1F, carved out of EX so that a break is allowed before them.
     55 $EX = [[:LineBreak =  Exclamation:] - $EXX];     # EX without the $EXX carve-out.
     56 $GL = [:LineBreak =  Glue:];
     57 $HL = [:LineBreak =  Hebrew_Letter:];
     58 $HY = [:LineBreak =  Hyphen:];
     59 $H2 = [:LineBreak =  H2:];
     60 $H3 = [:LineBreak =  H3:];
     61 # CSS Loose tailoring: CJ resolves to ID
     62 $ID = [[:LineBreak =  Ideographic:] $CJ];
     63 $IN = [:LineBreak =  Inseperable:];     # sic. NOTE(review): "Inseperable" is believed to be an accepted alias of Inseparable - confirm before respelling.
     64 $IS = [:LineBreak =  Infix_Numeric:];
     65 $JL = [:LineBreak =  JL:];
     66 $JV = [:LineBreak =  JV:];
     67 $JT = [:LineBreak =  JT:];
     68 $LF = [:LineBreak =  Line_Feed:];
     69 $NL = [:LineBreak =  Next_Line:];
     70 $NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];     # CSS loose: NS hyphens, iteration marks and centered punctuation; a break is allowed before them.
     71 $NS = [[:LineBreak =  Nonstarter:] - $NSX];     # NS without the $NSX carve-out.
     72 $NU = [:LineBreak =  Numeric:];
     73 $OP = [:LineBreak =  Open_Punctuation:];
     74 $POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];     # CSS loose: PO suffix characters with EastAsianWidth A,F,W; a break is allowed before them.
     75 $PO = [[:LineBreak =  Postfix_Numeric:] - $POX];     # PO without the $POX carve-out.
     76 $PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];     # CSS loose: PR prefix characters with EastAsianWidth A,F,W; a break is allowed after them.
     77 $PR = [[:LineBreak =  Prefix_Numeric:] - $PRX];     # PR without the $PRX carve-out.
     78 $QU = [:LineBreak =  Quotation:];
     79 $RI = [:LineBreak =  Regional_Indicator:];
     80 $SA = [:LineBreak =  Complex_Context:];
     81 $SG = [:LineBreak =  Surrogate:];
     82 $SP = [:LineBreak =  Space:];
     83 $SY = [:LineBreak =  Break_Symbols:];
     84 $WJ = [:LineBreak =  Word_Joiner:];
     85 $XX = [:LineBreak =  Unknown:];
     86 $ZW = [:LineBreak =  ZWSpace:];
     87 $ZWJ = [:LineBreak = ZWJ:];
     88 
     89 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
     90 #         list it in the numerous rules that use CM.
     91 # By LB1, SA characters with general category of Mn or Mc also resolve to CM.
     92 
     93 $CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];     # CM proper + ZWJ + SA marks (Mn, Mc).
     94 
     95 #   Dictionary character set, for triggering language-based break engines. Currently
     96 #   limited to LineBreak=Complex_Context (SA).
     97 
     98 $dictionary = [$SA];
     99 
    100 #
    101 #  Rule LB1.  By default, treat AI  (characters with ambiguous East Asian width),
    102 #                               SA  (Dictionary chars, excluding Mn and Mc)
    103 #                               SG  (Unpaired Surrogates)
    104 #                               XX  (Unknown, unassigned)
    105 #                         as $AL  (Alphabetic)
    106 #             The SA marks (Mn, Mc) are subtracted here because they are already part of $CM.
    107 $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
    108 
    109 
    110 ## -------------------------------------------------
    111 
    112 #
    113 # CAN_CM  is the set of characters that may combine with CM combining chars.
    114 #         Note that Linebreak UAX 14's concept of a combining char and the rules
    115 #         for what they can combine with are _very_ different from the rest of Unicode.
    116 #
    117 #         Note that $CM itself is left out of this set.  If CM is needed as a base
    118 #         it must be listed separately in the rule.
    119 #
    120 $CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
    121 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs (complement of $CAN_CM; no rule in this listing references it)
    122 
    123 #
    124 # AL_FOLLOW  set of chars that can unconditionally follow an AL
    125 #            Needed in rules where stand-alone $CM s are treated as AL.
    126 #            CSS loose: the carve-out sets $BAX, $EXX, $NSX and $PRX are not listed; $POX is. Referenced only by LB 14.
    127 $AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
    128 
    129 
    130 #
    131 #  Rule LB 4, 5    Mandatory (Hard) breaks.
    132 #
    133 $LB4Breaks    = [$BK $CR $LF $NL];          # The hard line-break classes.
    134 $LB4NonBreaks = [^$BK $CR $LF $NL $CM];     # Everything else except $CM; combining marks get their own rule variants.
    135 $CR $LF {100};                              # CR LF stays together. NOTE(review): {100} is presumably the hard-break status value - confirm.
    136 
    137 #
    138 #  LB 6    Do not break before hard line breaks.
    139 #
    140 $LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
    141 $CAN_CM $CM*    $LB4Breaks {100};    #       same, with combining marks attached to the preceding base.
    142 ^$CM+           $LB4Breaks {100};    #       same, for stand-alone combining marks.
    143 
    144 # LB 7         x SP      Do not break before spaces or zero width space.
    145 #              x ZW
    146 $LB4NonBreaks [$SP $ZW];
    147 $CAN_CM $CM*  [$SP $ZW];     # same, with combining marks attached to the preceding base.
    148 ^$CM+         [$SP $ZW];     # same, for stand-alone combining marks.
    149 
    150 #
    151 # LB 8         Break after zero width space, even if one or more spaces intervene.
    152 #              ZW SP* <break>
    153 #
    154 $LB8Breaks    = [$LB4Breaks $ZW];               # Hard-break classes plus ZW.
    155 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];      # Non-break classes minus ZW (still contains SP).
    156 $ZW $SP* / [^$SP $ZW $LB4Breaks];               # NOTE(review): '/' presumably marks the boundary position, with the following set as look-ahead context only - confirm.
    157 
    158 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
    159 #                               ($ZWJ followed by $CM is not listed here; $ZWJ is itself part of $CM.)
    160 $ZWJ [^$CM];
    161 
    162 # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK, $CR, $LF, $NL or $ZW
    163 #                                $CM not covered by the above needs to behave like $AL (referred to as "rule 10" in later comments)
    164 #                                See definition of $CAN_CM.
    165 
    166 $CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
    167 ^$CM+;                          #  Stand-alone combining marks stick to each other.
    168 
    169 #
    170 # LB 11  Do not break before or after WORD JOINER & related characters.
    171 #
    172 $CAN_CM $CM*  $WJ;     # x WJ, after a base with optional combining marks.
    173 $LB8NonBreaks $WJ;     # x WJ, after any non-break character (this set still contains SP).
    174 ^$CM+         $WJ;     # x WJ, after stand-alone combining marks.
    175 
    176 $WJ $CM* .;            # WJ x, whatever follows.
    177 
    178 #
    179 # LB 12  Do not break after NBSP and related characters.
    180 #         GL  x
    181 #
    182 $GL $CM* .;
    183 
    184 #
    185 # LB 12a  Do not break before NBSP and related characters ...
    186 #            [^SP BA HY] x GL
    187 #         $BAX is excluded together with $BA: those hyphens were carved out of BA for CSS loose.
    188 [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
    189 ^$CM+ $GL;     # stand-alone combining marks before GL.
    190 
    191 
    192 #
    193 # LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
    194 #         One group of three rules per class: CL, CP, EX, IS, SY.
    195 # Do not include $EXX here: CSS loose allows a break before FF01 and FF1F.
    196 $LB8NonBreaks $CL;
    197 $CAN_CM $CM*  $CL;
    198 ^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
    199 
    200 $LB8NonBreaks $CP;
    201 $CAN_CM $CM*  $CP;
    202 ^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
    203 
    204 $LB8NonBreaks $EX;
    205 $CAN_CM $CM*  $EX;
    206 ^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
    207 
    208 $LB8NonBreaks $IS;
    209 $CAN_CM $CM*  $IS;
    210 ^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
    211 
    212 $LB8NonBreaks $SY;
    213 $CAN_CM $CM*  $SY;
    214 ^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
    215 
    216 
    217 #
    218 # LB 14  Do not break after OP, even after spaces
    219 #
    220 $OP $CM* $SP* .;
    221 
    222 $OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
    223                                    # by rule 9, a CM following a SP is stand-alone (SP is not in $CAN_CM).
    224 
    225 # LB 15     QU SP* x OP    Do not break between a quotation mark and opening punctuation, even with intervening spaces.
    226 $QU $CM* $SP* $OP;
    227 
    228 # LB 16
    229 # Do not break between closing punctuation and $NS, even with intervening spaces
    230 # But DO allow a break between closing punctuation and $NSX, don't include it here
    231 ($CL | $CP) $CM* $SP* $NS;
    232 
    233 # LB 17     B2 SP* x B2    Do not break between two B2 characters, even with intervening spaces.
    234 $B2 $CM* $SP* $B2;
    235 
    236 #
    237 # LB 18  Break after spaces.
    238 #        No rule is needed for the break itself; only the helper sets are defined here.
    239 $LB18NonBreaks = [$LB8NonBreaks - [$SP]];     # Non-break classes minus SP.
    240 $LB18Breaks    = [$LB8Breaks $SP];            # No rule in this listing references this set.
    241 
    242 
    243 # LB 19  Do not break before or after quotation marks.
    244 #         x QU
    245 $LB18NonBreaks $CM* $QU;
    246 ^$CM+               $QU;     # stand-alone combining marks before QU.
    247 
    248 #         QU  x
    249 $QU $CM* .;
    250 
    251 # LB 20  Break before and after contingent-break characters (no rule needed; only the helper set is defined).
    252 #        <break>  $CB
    253 #        $CB   <break>
    254 #
    255 $LB20NonBreaks = [$LB18NonBreaks - $CB];     # Non-break classes minus SP and CB.
    256 
    257 # LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
    258 #             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
    259 #             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
    260 #             $HH (2010) is listed explicitly because it is BA, not HY.
    261 ^($HY | $HH) $CM* $ALPlus;     # NOTE(review): the leading '^' presumably limits the match to a hyphen that starts at a boundary - confirm.
    262 
    263 # LB 21        x   (BA | HY | NS)
    264 #           BB x
    265 #
    266 # DO allow breaks here before $BAX and $NSX, so don't include them
    267 $LB20NonBreaks $CM* ($BA | $HY | $NS);
    268 
    269 
    270 ^$CM+ ($BA | $HY | $NS);     # stand-alone combining marks before BA, HY or NS.
    271 
    272 $BB $CM* [^$CB];                                  #  $BB  x
    273 $BB $CM* $LB20NonBreaks;
    274 
    275 # LB 21a Don't break after Hebrew + Hyphen
    276 #   HL (HY | BA) x
    277 #   $BAX is listed next to $BA because those hyphens were carved out of BA for CSS loose.
    278 $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
    279 
    280 # LB 21b (forward) Don't break between SY and HL
    281 # (break between HL and SY already disallowed by LB 13 above)
    282 $SY $CM* $HL;
    283 
    284 # LB 22  Do not break before IN after letters, EX, ID/EB/EM or NU.
    285 ($ALPlus | $HL) $CM* $IN;
    286 ^$CM+    $IN;     #  by rule 10, any otherwise unattached CM behaves as AL
    287 $EX $CM*    $IN;     #  $EX here excludes the $EXX carve-out.
    288 ($ID | $EB | $EM) $CM*  $IN;
    289 # $IN $CM*    $IN;  # delete this rule for CSS loose (breaks between IN characters, such as 2026, are allowed)
    290 $NU $CM*    $IN;
    291 
    292 
    293 # LB 23  Do not break between letters and digits.
    294 #
    295 ($ALPlus | $HL) $CM* $NU;
    296 ^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
    297 $NU $CM* ($ALPlus | $HL);
    298 
    299 # LB 23a  PR x (ID | EB | EM),  (ID | EB | EM) x PO
    300 # Do not include $POX here ($PRX is likewise absent: $PR excludes it)
    301 #
    302 $PR $CM* ($ID | $EB | $EM);
    303 ($ID | $EB | $EM) $CM*  $PO;
    304 
    305 
    306 #
    307 # LB 24  Do not break between prefix/postfix characters and letters.
    308 #
    309 # Do not include $PRX here
    310 ($PR | $PO | $POX) $CM* ($ALPlus | $HL);
    311 ($ALPlus | $HL) $CM* ($PR | $PO | $POX);     # TODO: should this be ($PR | $PRX | $PO)
    312 ^$CM+ ($PR | $PO | $POX);       # Rule 10, any otherwise unattached CM behaves as AL
    313 
    314 #
    315 # LB 25   Numbers.
    316 #         One rule spanning two lines: optional prefix, optional OP or HY, digits with SY/IS, optional CL/CP, optional suffix.
    317 # Here do not include $PRX at the beginning or $POX at the end
    318 (($PR | $PO | $POX) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
    319     ($CM* ($CL | $CP))? ($CM* ($PR | $PRX | $PO))?;
    320 
    321 # LB 26  Do not break a Korean syllable
    322 #
    323 $JL $CM* ($JL | $JV | $H2 | $H3);
    324 ($JV | $H2) $CM* ($JV | $JT);
    325 ($JT | $H3) $CM* $JT;
    326 
    327 # LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
    328 # Do not include $POX or $PRX here
    329 ($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
    330 ($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
    331 $PR $CM* ($JL | $JV | $JT | $H2 | $H3);
    332 
    333 
    334 # LB 28   Do not break between alphabetics
    335 #
    336 ($ALPlus | $HL) $CM* ($ALPlus | $HL);
    337 ^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
    338 
    339 # LB 29   IS x (AL | HL)    Do not break between an infix numeric separator and a following letter.
    340 $IS $CM* ($ALPlus | $HL);
    341 
    342 # LB 30   (AL | HL | NU) x OP,  CP x (AL | HL | NU)
    343 ($ALPlus | $HL | $NU) $CM* $OP;
    344 ^$CM+ $OP;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
    345 $CP $CM* ($ALPlus | $HL | $NU);
    346 
    347 # LB 30a  Do not break between regional indicators. Break after pairs of them.
    348 #         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
    349 $RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
    350 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
    351 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
    352 # Note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
    353 #       because of the chain-out behavior difference. The rule must chain out only from the [set] characters,
    354 #       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
    355 
    356 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
    357 $EB $CM* $EM;
    358 
    359 # LB 31 Break everywhere else.
    360 #       Match a single code point if no other rule applies.
    361 .;
    362