#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License

# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: sentence.txt

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

type   = sentence;    # one of grapheme | word | line | sentence
locale = en;

# ---------------------------------------------------------------------------
# Character classes: one per Sentence_Break property value.
# ---------------------------------------------------------------------------

CR        = [\p{Sentence_Break = CR}];
LF        = [\p{Sentence_Break = LF}];
Extend    = [\p{Sentence_Break = Extend}];
Sep       = [\p{Sentence_Break = Sep}];
Format    = [\p{Sentence_Break = Format}];
Sp        = [\p{Sentence_Break = Sp}];
Lower     = [\p{Sentence_Break = Lower}];
Upper     = [\p{Sentence_Break = Upper}];
OLetter   = [\p{Sentence_Break = OLetter}];
Numeric   = [\p{Sentence_Break = Numeric}];
ATerm     = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm     = [\p{Sentence_Break = STerm}];
Close     = [\p{Sentence_Break = Close}];

# ---------------------------------------------------------------------------
# Composite classes, built as unions of the classes defined above.
# ---------------------------------------------------------------------------

ParaSep   = [Sep CR LF];
SATerm    = [STerm ATerm];
ExtFmt    = [Extend Format];

# ---------------------------------------------------------------------------
# Rules. Names follow the SBn numbering (presumably the sentence boundary
# rules of UAX #29 -- confirm against the spec when editing).
# ---------------------------------------------------------------------------

# SB2:  end of text.
#       A conventional regular expression '$' would also match at a line
#       separator that immediately precedes the physical end of text, so it is
#       not used here. End of text is instead expressed with a look-ahead
#       assertion that no character follows.
SB2:  . (?!.);

SB3:  CR LF;
SB4:  ParaSep ;

# SB5:  Format and Extend characters are ignored.

SB6:  ATerm ExtFmt* Numeric;
SB7:  (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
SB8:  ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);

# SB9:  SB10 and SB11 are handled by this rule as well.
SB9:  SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ;

SB12: . ExtFmt* [^ExtFmt]?;
