#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.

# file: word_POSIX.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
#       They are expected to change with review and the addition of support for rule tailoring.

# Kind of break iterator these rules describe, and the locale they apply to.
type   = word;             # one of grapheme | word | line | sentence
locale = en_US_POSIX;


# Character classes, one per Word_Break property value used by the rules.
CR                 = [\p{Word_Break = CR}];
LF                 = [\p{Word_Break = LF}];
Newline            = [\p{Word_Break = Newline}];
Extend             = [\p{Word_Break = Extend}];
ZWJ                = [\p{Word_Break = ZWJ}];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format             = [\p{Word_Break = Format}];
Katakana           = [\p{Word_Break = Katakana}];
Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
ALetter            = [\p{Word_Break = ALetter}];
Single_Quote       = [\p{Word_Break = Single_Quote}];
Double_Quote       = [\p{Word_Break = Double_Quote}];

# The three Mid* classes are where this file departs from the plain property values:
#   '.' is taken out of MidNumLet and added to MidNum,
#   ':' is taken out of MidLetter.
MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];
MidLetter          = [\p{Word_Break = MidLetter} - [\:]];
MidNum             = [\p{Word_Break = MidNum} [.]];

Numeric            = [\p{Word_Break = Numeric}];
ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
WSegSpace          = [\p{Word_Break = WSegSpace}];
Extended_Pict      = [:ExtPict:];

#define dictionary, with the effect being that those characters don't appear in test data.

# Component sets that are combined into `dictionary` below.
Han            = [:Han:];
Hiragana       = [:Hiragana:];

Control        = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji      = [Han Hiragana Katakana];
dictionaryCJK  = [KanaKanji HangulSyllable];
dictionary     = [ComplexContext dictionaryCJK];

# leave dictionary scripts out of ALetter

ALetter        = [ALetter - dictionary];

# Convenience unions used by the rules.
AHLetter       = [ALetter Hebrew_Letter];
MidNumLetQ     = [MidNumLet Single_Quote];
ExtFmt         = [Extend Format ZWJ];

# Rules. Names follow the UAX #29 word boundary rule numbers.
# The break operator (U+00F7, the division sign) marks the position of a boundary;
# a rule written without it describes a sequence that is kept together.

WB3:   CR LF;
WB3a:  (Newline | CR | LF) ÷;
WB3b:  . ÷ (Newline | CR | LF);      # actually redundant? No other rule combines.
                                     # (but needed with UAX treat-as scheme.)
WB3c:  ZWJ Extended_Pict;
WB3d:  WSegSpace WSegSpace;

WB5:   AHLetter ExtFmt* AHLetter;

# includes both WB6 and WB7
WB6:   AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;

WB7a:  Hebrew_Letter ExtFmt* Single_Quote;
WB7b:  Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # Include WB7c

WB8:   Numeric ExtFmt* Numeric;
WB9:   AHLetter ExtFmt* Numeric;
WB10:  Numeric ExtFmt* AHLetter;

WB11:  Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;    # includes WB12
WB13:  Katakana ExtFmt* Katakana;

WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);

# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
#                  Interacts with WB3c.
# WB17 places a break after the pair; WB15 is the variant where the pair is
# followed by ZWJ Extended_Pict, which stays attached.
WB15:   Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;
WB17:   Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;

# Rule WB 999   Any ÷ Any
#               Interacts with WB3c, do not break between ZWJ and Extended_Pict.
WB999.1:  . ExtFmt* ZWJ Extended_Pict;
WB999.2:  . ExtFmt* ÷;
