Home | History | Annotate | Download | only in break_rules
      1 #
      2 # Copyright (C) 2016 and later: Unicode, Inc. and others.
      3 # License & terms of use: http://www.unicode.org/copyright.html#License
      4 
      5 # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
      6 # file: sentence.txt
      7 
      8 type = sentence;      # one of grapheme | word | line | sentence
      9 locale = en;
     10 
     11 CR        = [\p{Sentence_Break = CR}];          # one set per Sentence_Break property value, CR through Close
     12 LF        = [\p{Sentence_Break = LF}];
     13 Extend    = [\p{Sentence_Break = Extend}];
     14 Sep       = [\p{Sentence_Break = Sep}];
     15 Format    = [\p{Sentence_Break = Format}];
     16 Sp        = [\p{Sentence_Break = Sp}];
     17 Lower     = [\p{Sentence_Break = Lower}];
     18 Upper     = [\p{Sentence_Break = Upper}];
     19 OLetter   = [\p{Sentence_Break = OLetter}];
     20 Numeric   = [\p{Sentence_Break = Numeric}];
     21 ATerm     = [\p{Sentence_Break = ATerm}];
     22 SContinue = [\p{Sentence_Break = SContinue}];
     23 STerm     = [\p{Sentence_Break = STerm}];
     24 Close     = [\p{Sentence_Break = Close}];
     25 
     26 ParaSep   = [Sep CR LF];          # union of Sep, CR and LF
     27 SATerm    = [STerm ATerm];        # union of STerm and ATerm
     28 ExtFmt    = [Extend Format];      # union of Extend and Format, the characters SB5 says to ignore
     29 
     30 # SB2:    eot
     31 #       Conventional regular expression matching for '$' as end-of-text also matches
     32 #       at a line separator just preceding the physical end of text.
     33 #       Instead, use a look-ahead assertion that there is no following character.
     34 SB2:    . ÷ (?!.);      # break after the final character of the text
     35 
     36 SB3:    CR LF;          # no break inside a CR LF pair
     37 SB4:    ParaSep ÷;      # break after a paragraph separator
     38 
     39 # SB5: ignore Format and Extend characters.
     40 
     41 SB6:    ATerm ExtFmt* Numeric;
     42 SB7:    (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
     43 SB8:    ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
     44 SB8a:   SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
     45 
     46 SB9:    SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
     47         # Also covers SB10, SB11.
     48 
     49 SB12:   . ExtFmt* [^ExtFmt]?;      # anything else, no break
     50 
     51