Home | History | Annotate | Download | only in rules
      1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 # License & terms of use: http://www.unicode.org/copyright.html
      3 #
      4 # Copyright (c) 2002-2016  International Business Machines Corporation and
      5 # others. All Rights Reserved.
      6 #
      7 #  file:  line_loose.txt
      8 #
      9 #         Line Breaking Rules
     10 #         Implement default line breaking as defined by
     11 #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
     12 #         http://www.unicode.org/reports/tr14/, with the following modification:
     13 #
     14 #         Boundaries between hyphens and following letters are suppressed when
     15 #         there is a boundary preceding the hyphen. See rule LB 20.09.
     16 #
     17 #         This tailors the line break behavior to correspond to CSS
     18 #         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
     19 #         Chinese & Japanese.
     20 #         It sets characters of class CJ to behave like ID.
     21 #         In addition, it allows breaks:
     22 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
     23 #         * between characters of LineBreak class IN
     24 
     25 #
     26 #  Character Classes defined by TR 14.
     27 #
     28 
     29 !!chain;                   # Turn on rule chaining (see the chain-out note at LB 30a).
     30 !!quoted_literals_only;    # NOTE(review): presumably requires literal characters in rules to be quoted - confirm against the ICU rule syntax docs.
     31 
     32 $AI = [:LineBreak =  Ambiguous:];
     33 $AL = [:LineBreak =  Alphabetic:];
     34 $BA = [:LineBreak =  Break_After:];
     35 $HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA. Used by LB 20.09.
     36 $BB = [:LineBreak =  Break_Before:];
     37 $BK = [:LineBreak =  Mandatory_Break:];
     38 $B2 = [:LineBreak =  Break_Both:];
     39 $CB = [:LineBreak =  Contingent_Break:];
     40 $CJ = [:LineBreak =  Conditional_Japanese_Starter:];    # Folded into $ID below (CSS loose tailoring).
     41 $CL = [:LineBreak =  Close_Punctuation:];
     42 # $CM = [:LineBreak =  Combining_Mark:];    (not defined here: $CM is defined after $ZWJ, with ZWJ and the SA marks folded in)
     43 $CP = [:LineBreak =  Close_Parenthesis:];
     44 $CR = [:LineBreak =  Carriage_Return:];
     45 $EB = [:LineBreak =  EB:];    # Emoji base (see LB 30b).
     46 $EM = [:LineBreak =  EM:];    # Emoji modifier (see LB 30b).
     47 $EX = [:LineBreak =  Exclamation:];
     48 $GL = [:LineBreak =  Glue:];
     49 $HL = [:LineBreak =  Hebrew_Letter:];
     50 $HY = [:LineBreak =  Hyphen:];
     51 $H2 = [:LineBreak =  H2:];    # Korean syllable class (see LB 26).
     52 $H3 = [:LineBreak =  H3:];    # Korean syllable class (see LB 26).
     53 # CSS Loose tailoring: CJ resolves to ID
     54 $ID = [[:LineBreak =  Ideographic:] $CJ];
     55 $IN = [:LineBreak =  Inseperable:];    # NOTE(review): "Inseperable" looks misspelled but is presumably the accepted property-value alias - confirm before changing.
     56 $IS = [:LineBreak =  Infix_Numeric:];
     57 $JL = [:LineBreak =  JL:];    # Korean syllable class (see LB 26).
     58 $JV = [:LineBreak =  JV:];    # Korean syllable class (see LB 26).
     59 $JT = [:LineBreak =  JT:];    # Korean syllable class (see LB 26).
     60 $LF = [:LineBreak =  Line_Feed:];
     61 $NL = [:LineBreak =  Next_Line:];
     62 $NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];    # Iteration marks (LineBreak=NS) that the loose tailoring allows a break before.
     63 $NS = [[:LineBreak =  Nonstarter:] - $NSX];             # Nonstarters minus the iteration marks in $NSX.
     64 $NU = [:LineBreak =  Numeric:];
     65 $OP = [:LineBreak =  Open_Punctuation:];
     66 $PO = [:LineBreak =  Postfix_Numeric:];
     67 $PR = [:LineBreak =  Prefix_Numeric:];
     68 $QU = [:LineBreak =  Quotation:];
     69 $RI = [:LineBreak =  Regional_Indicator:];
     70 $SA = [:LineBreak =  Complex_Context:];
     71 $SG = [:LineBreak =  Surrogate:];
     72 $SP = [:LineBreak =  Space:];
     73 $SY = [:LineBreak =  Break_Symbols:];
     74 $WJ = [:LineBreak =  Word_Joiner:];
     75 $XX = [:LineBreak =  Unknown:];
     76 $ZW = [:LineBreak =  ZWSpace:];
     77 $ZWJ = [:LineBreak = ZWJ:];
     78 
     79 # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
     80 #         list it in the numerous rules that use CM.
     81 # By LB1, SA characters with general category of Mn or Mc also resolve to CM.
     82 
     83 $CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];    # Combining marks + ZWJ + the SA characters that are Mn or Mc.
     84 
     85 #   Dictionary character set, for triggering language-based break engines. Currently
     86 #   limited to LineBreak=Complex_Context (SA).
     87 
     88 $dictionary = [$SA];    # All of $SA, including the Mn/Mc members that are also part of $CM.
     89 
     90 #
     91 #  Rule LB1.  By default, treat AI  (characters with ambiguous East Asian width),
     92 #                               SA  (Dictionary chars, excluding Mn and Mc)
     93 #                               SG  (Unpaired Surrogates)
     94 #                               XX  (Unknown, unassigned)
     95 #                         as $AL  (Alphabetic)
     96 #
     97 $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];    # The SA marks (Mn/Mc) are excluded here because they are part of $CM.
     98 
     99 
    100 ## -------------------------------------------------
    101 
    102 #
    103 # CAN_CM  is the set of characters that may combine with CM combining chars.
    104 #         Note that Linebreak UAX 14's concept of a combining char and the rules
    105 #         for what they can combine with are _very_ different from the rest of Unicode.
    106 #
    107 #         Note that $CM itself is left out of this set.  If CM is needed as a base
    108 #         it must be listed separately in the rule.
    109 #
    110 $CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can take CMs (complement of $CANT_CM)
    111 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs (not referenced by any rule visible in this file)
    112 
    113 #
    114 # AL_FOLLOW  set of chars that can unconditionally follow an AL
    115 #            Needed in rules where stand-alone $CM s are treated as AL.
    116 #            Note that $NS here is the tailored set, i.e. without the iteration marks in $NSX.
    117 $AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
    118 
    119 
    120 #
    121 #  Rule LB 4, 5    Mandatory (Hard) breaks.
    122 #                  NOTE(review): the {100} tag is presumably the hard-break rule status (UBRK_LINE_HARD) - confirm in ubrk.h.
    123 $LB4Breaks    = [$BK $CR $LF $NL];
    124 $LB4NonBreaks = [^$BK $CR $LF $NL $CM];
    125 $CR $LF {100};                       # CR LF is matched as a single unit.
    126 
    127 #
    128 #  LB 6    Do not break before hard line breaks.
    129 #
    130 $LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
    131 $CAN_CM $CM*    $LB4Breaks {100};    # base + attached CMs, then the hard break
    132 ^$CM+           $LB4Breaks {100};    # stand-alone CM run, then the hard break
    133 
    134 # LB 7         x SP
    135 #              x ZW
    136 $LB4NonBreaks [$SP $ZW];    # any char that is neither a hard break nor a CM, then SP or ZW
    137 $CAN_CM $CM*  [$SP $ZW];    # base + attached CMs, then SP or ZW
    138 ^$CM+         [$SP $ZW];    # stand-alone CM run, then SP or ZW (the leading ^ presumably blocks chaining into this rule - confirm)
    139 
    140 #
    141 # LB 8         Break after zero width space
    142 #              ZW SP* ÷
    143 #
    144 $LB8Breaks    = [$LB4Breaks $ZW];
    145 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];    # still contains $SP; only hard breaks, CM and ZW are removed
    146 $ZW $SP* / [^$SP $ZW $LB4Breaks];             # '/' marks the break position: after ZW SP*, before a char that is not SP, ZW or a hard break
    147 
    148 # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
    149 #
    150 $ZWJ [^$CM];                    #  ZWJ followed by any non-CM character. ($CM itself contains $ZWJ.)
    151 
    152 # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL $ZW
    153 #                                $CM not covered by the above needs to behave like $AL
    154 #                                See definition of $CAN_CM.
    155 
    156 $CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
    157 ^$CM+;                          #  A CM run with no base to attach to is kept together as well.
    158 
    159 #
    160 # LB 11  Do not break before or after WORD JOINER & related characters.
    161 #
    162 $CAN_CM $CM*  $WJ;    # x WJ after a base with attached CMs
    163 $LB8NonBreaks $WJ;    # x WJ after any other char that is not a hard break, CM or ZW (includes SP)
    164 ^$CM+         $WJ;    # x WJ after a stand-alone CM run
    165 
    166 $WJ $CM* .;           # WJ x : keep WJ (plus attached CMs) together with whatever single char follows
    167 
    168 #
    169 # LB 12  Do not break after NBSP and related characters.
    170 #         GL  x
    171 #
    172 $GL $CM* .;           # GL (plus attached CMs) stays with whatever single char follows
    173 
    174 #
    175 # LB 12a  Do not break before NBSP and related characters ...
    176 #            [^SP BA HY] x GL
    177 #
    178 [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
    179 ^$CM+ $GL;            # stand-alone CM run (treated as AL) x GL
    180 
    181 
    182 
    183 #
    184 # LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
    185 #         ($LB8NonBreaks still contains $SP, so the first rule of each group below also covers the "after spaces" case.)
    186 $LB8NonBreaks $CL;
    187 $CAN_CM $CM*  $CL;
    188 ^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
    189 
    190 $LB8NonBreaks $CP;
    191 $CAN_CM $CM*  $CP;
    192 ^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
    193 
    194 $LB8NonBreaks $EX;
    195 $CAN_CM $CM*  $EX;
    196 ^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
    197 
    198 $LB8NonBreaks $IS;
    199 $CAN_CM $CM*  $IS;
    200 ^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
    201 
    202 $LB8NonBreaks $SY;
    203 $CAN_CM $CM*  $SY;
    204 ^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
    205 
    206 
    207 #
    208 # LB 14  Do not break after OP, even after spaces
    209 #        OP SP* x
    210 $OP $CM* $SP* .;
    211 
    212 $OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
    213                                    # by rule 8, CM following a SP is stand-alone.
    214 
    215 # LB 15  QU SP* x OP
    216 $QU $CM* $SP* $OP;
    217 
    218 # LB 16  (CL | CP) SP* x NS
    219 # Do not break between closing punctuation and $NS, even with intervening spaces
    220 # But DO allow a break between closing punctuation and $NSX, don't include it here
    221 ($CL | $CP) $CM* $SP* $NS;
    222 
    223 # LB 17  B2 SP* x B2
    224 $B2 $CM* $SP* $B2;
    225 
    226 #
    227 # LB 18  Break after spaces.
    228 #        From here on the NonBreaks set no longer contains $SP, so rules built on it do not match across a space.
    229 $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
    230 $LB18Breaks    = [$LB8Breaks $SP];          # not referenced by any rule visible in this file
    231 
    232 
    233 # LB 19
    234 #         x QU
    235 $LB18NonBreaks $CM* $QU;
    236 ^$CM+               $QU;                    # stand-alone CM run (treated as AL) x QU
    237 
    238 #         QU  x
    239 $QU $CM* .;
    240 
    241 # LB 20
    242 #        <break>  $CB
    243 #        $CB   <break>
    244 #        Implemented by removing $CB from the NonBreaks set used by the rules that follow.
    245 $LB20NonBreaks = [$LB18NonBreaks - $CB];
    246 
    247 # LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
    248 #             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
    249 #             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
    250 #             NOTE(review): the leading ^ presumably limits this rule to a hyphen that starts a new segment - confirm.
    251 ^($HY | $HH) $CM* $ALPlus;
    252 
    253 # LB 21        x   (BA | HY | NS)
    254 #           BB x
    255 #
    256 # DO allow breaks here before NSX, so don't include it
    257 $LB20NonBreaks $CM* ($BA | $HY | $NS);
    258 
    259 
    260 ^$CM+ ($BA | $HY | $NS);                          #  stand-alone CM run (treated as AL) x (BA | HY | NS)
    261 
    262 $BB $CM* [^$CB];                                  #  $BB  x
    263 $BB $CM* $LB20NonBreaks;
    264 
    265 # LB 21a Don't break after Hebrew + Hyphen
    266 #   HL (HY | BA) x
    267 #   The character after the hyphen is optional and may be anything except CB.
    268 $HL $CM* ($HY | $BA) $CM* [^$CB]?;
    269 
    270 # LB 21b (forward) Don't break between SY and HL
    271 # (break between HL and SY already disallowed by LB 13 above)
    272 $SY $CM* $HL;
    273 
    274 # LB 22   (AL | HL | EX | ID | EB | EM | NU) x IN
    275 ($ALPlus | $HL) $CM* $IN;
    276 ^$CM+    $IN;     #  by rule 10, any otherwise unattached CM behaves as AL
    277 $EX $CM*    $IN;
    278 ($ID | $EB | $EM) $CM*  $IN;
    279 # $IN $CM*    $IN;  # disabled for CSS loose, which allows breaks between characters of class IN
    280 $NU $CM*    $IN;
    281 
    282 
    283 # LB 23   (AL | HL) x NU;  NU x (AL | HL)
    284 #
    285 ($ALPlus | $HL) $CM* $NU;
    286 ^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
    287 $NU $CM* ($ALPlus | $HL);
    288 
    289 # LB 23a
    290 #   PR x (ID | EB | EM);  (ID | EB | EM) x PO
    291 $PR $CM* ($ID | $EB | $EM);
    292 ($ID | $EB | $EM) $CM*  $PO;
    293 
    294 
    295 #
    296 # LB 24
    297 #   (PR | PO) x (AL | HL);  (AL | HL) x (PR | PO)
    298 ($PR | $PO) $CM* ($ALPlus | $HL);
    299 ($ALPlus | $HL) $CM* ($PR | $PO);
    300 ^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
    301 
    302 #
    303 # LB 25   Numbers.
    304 #   (PR | PO)? (OP | HY)? NU (NU | SY | IS)* (CL | CP)? (PR | PO)?   with $CM* allowed between the elements
    305 (($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
    306     ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
    307 
    308 # LB 26  Do not break a Korean syllable
    309 #   JL x (JL | JV | H2 | H3);  (JV | H2) x (JV | JT);  (JT | H3) x JT
    310 $JL $CM* ($JL | $JV | $H2 | $H3);
    311 ($JV | $H2) $CM* ($JV | $JT);
    312 ($JT | $H3) $CM* $JT;
    313 
    314 # LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
    315 ($JL | $JV | $JT | $H2 | $H3) $CM* $IN;    # syllable x IN
    316 ($JL | $JV | $JT | $H2 | $H3) $CM* $PO;    # syllable x PO
    317 $PR $CM* ($JL | $JV | $JT | $H2 | $H3);    # PR x syllable
    318 
    319 
    320 # LB 28   Do not break between alphabetics
    321 #   (AL | HL) x (AL | HL)
    322 ($ALPlus | $HL) $CM* ($ALPlus | $HL);
    323 ^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
    324 
    325 # LB 29   IS x (AL | HL)
    326 $IS $CM* ($ALPlus | $HL);
    327 
    328 # LB 30   (AL | HL | NU) x OP;  CP x (AL | HL | NU)
    329 ($ALPlus | $HL | $NU) $CM* $OP;
    330 ^$CM+ $OP;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
    331 $CP $CM* ($ALPlus | $HL | $NU);
    332 
    333 # LB 30a  Do not break between regional indicators. Break after pairs of them.
    334 #         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
    335 $RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];    # RI pair, break position right after the pair
    336 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];    # RI pair + CMs ending in a non-ZWJ CM, break position after the CMs
    337 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];    # RI pair followed by one of the listed classes, ZWJ or end of input: no break position marked
    338 # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
    339 #       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
    340 #       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
    341 
    342 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
    343 $EB $CM* $EM;
    344 
    345 # LB 31 Break everywhere else.
    346 #       Match a single code point if no other rule applies.
    347 .;    # catch-all rule, listed last
    348