Home | History | Annotate | Download | only in rules
      1 #
      2 #   Copyright (C) 2016 and later: Unicode, Inc. and others.
      3 #   License & terms of use: http://www.unicode.org/copyright.html
      4 #   Copyright (C) 2002-2016, International Business Machines Corporation and others.
      5 #       All Rights Reserved.
      6 #
      7 #   file:  char.txt
      8 #
      9 #   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
     10 #      See Unicode Standard Annex #29.
     11 #      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
     12 #      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
     13 #      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
     14 
     15 !!quoted_literals_only;   # rule-builder option; per the ICU break-rule docs, literal characters in rules must be quoted or escaped
     16 
     17 #
     18 #  Character Class Definitions.
     19 #  Each variable is the set of code points with the named Grapheme_Cluster_Break property value.
     20 $CR          = [\p{Grapheme_Cluster_Break = CR}];
     21 $LF          = [\p{Grapheme_Cluster_Break = LF}];
     22 $Control     = [[\p{Grapheme_Cluster_Break = Control}]];
     23 # TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
     24 #$Virama      = [[\p{Grapheme_Cluster_Break = Virama}]];
     25 #$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
     26 $Extend      = [[\p{Grapheme_Cluster_Break = Extend}]];
     27 $ZWJ         = [\p{Grapheme_Cluster_Break = ZWJ}];
     28 $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
     29 $Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
     30 $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
     31 
     32 #
     33 # Korean Syllable Definitions
     34 #   Sets selected by the Grapheme_Cluster_Break values L, V, T, LV and LVT; combined by the Hangul rules below.
     35 $L           = [\p{Grapheme_Cluster_Break = L}];
     36 $V           = [\p{Grapheme_Cluster_Break = V}];
     37 $T           = [\p{Grapheme_Cluster_Break = T}];
     38 
     39 $LV          = [\p{Grapheme_Cluster_Break = LV}];
     40 $LVT         = [\p{Grapheme_Cluster_Break = LVT}];
     41 
     42 # Emoji definitions
     43 
     44 $Extended_Pict = [:ExtPict:];   # used by the GB 11 rule; ExtPict is presumably the Extended_Pictographic property alias - TODO confirm
     45 
     46 ## -------------------------------------------------
     47 !!chain;                 # rule-builder option: enables rule chaining (see ICU break-rule docs)
     48 !!lookAheadHardBreak;    # rule-builder option governing how '/' look-ahead rules end a match; used by the GB 12-13 rules - confirm exact semantics in ICU docs
     49 
     50 $CR $LF;                      # UAX #29 GB 3: do not break between CR and LF
     51 
     52 $L ($L | $V | $LV | $LVT);    # UAX #29 GB 6: L stays with a following L, V, LV or LVT
     53 ($LV | $V) ($V | $T);         # UAX #29 GB 7: LV or V stays with a following V or T
     54 ($LVT | $T) $T;               # UAX #29 GB 8: LVT or T stays with a following T
     55 
     56 # GB 9   Do not break before Extend or ZWJ, unless the preceding character is Control, CR or LF.
     57 [^$Control $CR $LF] ($Extend | $ZWJ);
     58 
     59 # GB 9a (only for extended grapheme clusters)  Do not break before SpacingMark, same exclusions as GB 9.
     60 [^$Control $CR $LF] $SpacingMark;
     61 
     62 # GB 9b  Do not break after Prepend, unless the following character is Control, CR or LF.
     63 $Prepend [^$Control $CR $LF];
     64 
     65 # GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
     66 $Extended_Pict $Extend* $ZWJ $Extended_Pict;
     67 
     68 # GB 12-13. Keep pairs of regional indicators together
     69 #           Note that hard break '/' rule triggers only if there are three or more initial RIs.
     70 
     71 ^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;
     72 ^$Prepend* $Regional_Indicator $Regional_Indicator;
     73 
     74 # GB 999 Match a single code point if no other rule applies, so every position not joined by a rule above is a boundary.
     75 .;
     76 
     77