#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
#
#   Layout of this file:
#      1. Character-class variables, one per Sentence_Break property value.
#      2. "Ex" variants of those classes that absorb trailing Extend/Format characters.
#      3. Forward rules (!!forward), numbered after the rules in UAX #29.
#      4. Reverse rules (!!reverse) and an explanation of how the reverse rule works.
#
#   Syntax reminder: a '#' starts a comment that runs to the end of the line, so every
#   definition and rule below must stay on its own line.
#


#
# Character categories as defined in TR 29
#
$CR        = [\p{Sentence_Break = CR}];
$LF        = [\p{Sentence_Break = LF}];
$Extend    = [\p{Sentence_Break = Extend}];
$Sep       = [\p{Sentence_Break = Sep}];
$Format    = [\p{Sentence_Break = Format}];
$Sp        = [\p{Sentence_Break = Sp}];
$Lower     = [\p{Sentence_Break = Lower}];
$Upper     = [\p{Sentence_Break = Upper}];
$OLetter   = [\p{Sentence_Break = OLetter}];
$Numeric   = [\p{Sentence_Break = Numeric}];
$ATerm     = [\p{Sentence_Break = ATerm}];
$SContinue = [\p{Sentence_Break = SContinue}];
$STerm     = [\p{Sentence_Break = STerm}];
$Close     = [\p{Sentence_Break = Close}];

#
# Define extended forms of the character classes,
#   incorporate trailing Extend or Format chars.
#   Rules 4 and 5.
#
# Each "Ex" variable is the base class followed by any number of Extend/Format
# characters, so the rules below can ignore those characters without repeating
# the ($Extend | $Format)* suffix everywhere.

$SpEx        = $Sp        ($Extend | $Format)*;
$LowerEx     = $Lower     ($Extend | $Format)*;
$UpperEx     = $Upper     ($Extend | $Format)*;
$OLetterEx   = $OLetter   ($Extend | $Format)*;
$NumericEx   = $Numeric   ($Extend | $Format)*;
$ATermEx     = $ATerm     ($Extend | $Format)*;
$SContinueEx = $SContinue ($Extend | $Format)*;
$STermEx     = $STerm     ($Extend | $Format)*;
$CloseEx     = $Close     ($Extend | $Format)*;


## -------------------------------------------------

# Rule chaining is enabled for the whole file; the forward rules follow.
!!chain;
!!forward;

# Rule 3 - break after separators.  Keep CR/LF together.
#
$CR $LF;


# Rule 4 - Break after $Sep.
# Rule 5 - Ignore $Format and $Extend
#
#   At most one non-separator character, followed by any run of Extend/Format.
#   Separators are excluded from the leading class, so this rule never carries
#   a match across a $Sep, $CR or $LF.
#
[^$Sep $CR $LF]? ($Extend | $Format)*;


# Rule 6
#   ATerm directly followed by a digit.
$ATermEx $NumericEx;

# Rule 7
#   Upper-or-lower letter, ATerm, Upper letter.
($UpperEx | $LowerEx) $ATermEx $UpperEx;

# Rule 8
#   ATerm, optional Close and Sp runs, then any run of characters that are not
#   letters, separators or terminators, ending at a Lower letter.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 8a
#   A terminator (plus Close/Sp runs) followed by SContinue or another terminator.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);

# Rule 9, 10, 11
#   A terminator with its trailing Close and Sp runs and at most one following
#   separator.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

# Rule 12
#   Everything else.  Both rules start from a character that is not a
#   terminator, Close, Sp, separator, Format or Extend (or from the beginning
#   of the text, {bof}).
#   The second form ends at a separator or at the end of the text ({eof}) and
#   tags the resulting boundary with rule status 100, which distinguishes it
#   from the terminator-ended boundaries produced by the rules above.
#   NOTE(review): 100 is expected to line up with UBRK_SENTENCE_SEP in ubrk.h;
#   confirm before changing the value.
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};

## -------------------------------------------------

!!reverse;

# Reverse-direction forms of the "Ex" classes.  Reverse rules are written in
# the order the text is encountered when moving backwards, so the trailing
# Extend/Format characters come first and the base character comes last.
$SpEx_R    = ($Extend | $Format)* $Sp;
$ATermEx_R = ($Extend | $Format)* $ATerm;
$STermEx_R = ($Extend | $Format)* $STerm;
$CloseEx_R = ($Extend | $Format)* $Close;

#
#  Reverse rules.
#     For now, use the old style inexact reverse rules, which are easier
#     to write, but less efficient.
#     TODO:  exact reverse rules.  It appears that exact reverse rules
#            may require improving support for look-ahead breaks in the
#            builder.  Needs more investigation.
#

[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;

# Explanation for this rule:
#
#    It needs to back over
#        The $Sep at which we probably begin
#        All of the non $Sep chars leading to the preceding $Sep
#        The preceding $Sep, which will be the second one that the rule matches.
#        Any immediately preceding STerm or ATerm sequences.  We need to see these
#              to get the correct rule status when moving forwards again.
#
#    [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
#                      the entire string.
#
#    (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
#                      at the beginning of the string at this point, and we don't want to fail.
#                      Can only use {eof} once, and it is used later.
#                      ($LF $CR is a CR LF pair as seen when moving backwards.)
#