#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#     See Unicode Standard Annex #29.
#     These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
#     Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
#     Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
# Layout of this file: option directives ("!!name;"), then character class
# definitions ("$Name = [set];"), then the break rules themselves. Each rule
# describes a sequence that is kept together; the final rule matches any
# single code point so that every position is covered.

# NOTE(review): per the ICU break-rule syntax this option requires literal
# characters in rules to be quoted - confirm against the RBBI documentation.
!!quoted_literals_only;

#
# Character Class Definitions.
# Each set is taken directly from the Grapheme_Cluster_Break property value
# of the same name.
#
$CR                 = [\p{Grapheme_Cluster_Break = CR}];
$LF                 = [\p{Grapheme_Cluster_Break = LF}];
$Control            = [[\p{Grapheme_Cluster_Break = Control}]];
# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
#$Virama            = [[\p{Grapheme_Cluster_Break = Virama}]];
#$LinkingConsonant  = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
$Extend             = [[\p{Grapheme_Cluster_Break = Extend}]];
$ZWJ                = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend            = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark        = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# Korean Syllable Definitions
# (leading consonant, vowel, trailing consonant, and the precomposed
# LV / LVT syllable types)
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];

$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];

# Emoji definitions

$Extended_Pict = [:ExtPict:];

## -------------------------------------------------
# Rule options.
# NOTE(review): "!!chain" presumably enables chaining of one rule match into
# the next, and "!!lookAheadHardBreak" presumably makes the '/' look-ahead
# position in the GB 12-13 rule below act as a forced boundary - confirm both
# against the ICU RBBI documentation for the ICU version in use.
!!chain;
!!lookAheadHardBreak;

# GB 3  Keep CR LF together.
$CR $LF;

# GB 6, 7, 8  Do not break Hangul syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# GB 9  Do not break before extending characters or ZWJ.
# The negated set keeps Control, CR and LF from picking up a following Extend/ZWJ.
[^$Control $CR $LF] ($Extend | $ZWJ);

# GB 9a (only for extended grapheme clusters)
[^$Control $CR $LF] $SpacingMark;

# GB 9b
$Prepend [^$Control $CR $LF];

# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;

# GB 12-13. Keep pairs of regional indicators together.
#   Note that the hard break '/' rule triggers only if there are three or more initial RIs.
#   The first rule matches a pair followed by a third RI and breaks after the
#   pair; the second handles a pair with no further RI following.
#   NOTE(review): the leading '^' presumably prevents these rules from being
#   chained into from a preceding match, so RI pairing always starts at a
#   boundary - confirm against the ICU RBBI documentation.

^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;
^$Prepend* $Regional_Indicator $Regional_Indicator;

# GB 999 Match a single code point if no other rule applies.
.;