1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2016 International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.dev.test.rbbi; 11 12 13 // Monkey testing of RuleBasedBreakIterator. 14 // The old, original monkey test. TODO: remove 15 // The new monkey test is class RBBIMonkeyTest. 16 17 import java.util.ArrayList; 18 import java.util.Arrays; 19 import java.util.List; 20 import java.util.Locale; 21 22 import org.junit.Test; 23 import org.junit.runner.RunWith; 24 import org.junit.runners.JUnit4; 25 26 import android.icu.dev.test.TestFmwk; 27 import android.icu.lang.UCharacter; 28 import android.icu.lang.UProperty; 29 import android.icu.text.BreakIterator; 30 import android.icu.text.RuleBasedBreakIterator; 31 import android.icu.text.UTF16; 32 import android.icu.text.UnicodeSet; 33 import android.icu.testsharding.MainTestShard; 34 35 36 /** 37 * Monkey tests for RBBI. These tests have independent implementations of 38 * the Unicode TR boundary rules, and compare results between these and ICU's 39 * implementation, using random data. 40 * 41 * Tests cover Grapheme Cluster (char), Word and Line breaks 42 * 43 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp 44 * 45 */ 46 @MainTestShard 47 @RunWith(JUnit4.class) 48 public class RBBITestMonkey extends TestFmwk { 49 // 50 // class RBBIMonkeyKind 51 // 52 // Monkey Test for Break Iteration 53 // Abstract interface class. Concrete derived classes independently 54 // implement the break rules for different iterator types. 
55 // 56 // The Monkey Test itself uses doesn't know which type of break iterator it is 57 // testing, but works purely in terms of the interface defined here. 58 // 59 abstract static class RBBIMonkeyKind { 60 61 // Return a List of UnicodeSets, representing the character classes used 62 // for this type of iterator. 63 abstract List charClasses(); 64 65 // Set the test text on which subsequent calls to next() will operate 66 abstract void setText(StringBuffer text); 67 68 // Find the next break position, starting from the specified position. 69 // Return -1 after reaching end of string. 70 abstract int next(int i); 71 72 // A Character Property, one of the constants defined in class UProperty. 73 // The value of this property will be displayed for the characters 74 // near any test failure. 75 int fCharProperty; 76 } 77 78 // 79 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, 13267 80 // 81 static String gExtended_Pict = "[" + 82 "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767" + 83 "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" + 84 "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F" + 85 "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F" + 86 "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6" + 87 "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586" + 88 "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7" + 89 "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB" + 90 
"\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" + 91 "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C" + 92 "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637" + 93 "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A" + 94 "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9" + 95 "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD" + 96 "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF" + 97 "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5" + 98 "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F" + 99 "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F" + 100 "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F" + 101 "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF" + 102 "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8" + 103 "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF" + 104 "]"; 105 106 107 /** 108 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries. 
109 * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets 110 */ 111 static class RBBICharMonkey extends RBBIMonkeyKind { 112 List fSets; 113 114 UnicodeSet fCRLFSet; 115 UnicodeSet fControlSet; 116 UnicodeSet fExtendSet; 117 UnicodeSet fRegionalIndicatorSet; 118 UnicodeSet fPrependSet; 119 UnicodeSet fSpacingSet; 120 UnicodeSet fLSet; 121 UnicodeSet fVSet; 122 UnicodeSet fTSet; 123 UnicodeSet fLVSet; 124 UnicodeSet fLVTSet; 125 UnicodeSet fHangulSet; 126 UnicodeSet fEmojiModifierSet; 127 UnicodeSet fEmojiBaseSet; 128 UnicodeSet fZWJSet; 129 UnicodeSet fExtendedPictSet; 130 UnicodeSet fEBGSet; 131 UnicodeSet fEmojiNRKSet; 132 UnicodeSet fAnySet; 133 134 135 StringBuffer fText; 136 137 138 RBBICharMonkey() { 139 fText = null; 140 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK; 141 fCRLFSet = new UnicodeSet("[\\r\\n]"); 142 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]"); 143 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); 144 fZWJSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]"); 145 fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"); 146 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]"); 147 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]"); 148 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]"); 149 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]"); 150 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]"); 151 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]"); 152 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]"); 153 fHangulSet = new UnicodeSet(); 154 fHangulSet.addAll(fLSet); 155 fHangulSet.addAll(fVSet); 156 fHangulSet.addAll(fTSet); 157 fHangulSet.addAll(fLVSet); 158 fHangulSet.addAll(fLVTSet); 159 160 fEmojiBaseSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}]"); 161 fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = 
EM}]"); 162 fExtendedPictSet = new UnicodeSet(gExtended_Pict); 163 fEBGSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]"); 164 fEmojiNRKSet = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]"); 165 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); 166 167 168 fSets = new ArrayList(); 169 fSets.add(fCRLFSet); 170 fSets.add(fControlSet); 171 fSets.add(fExtendSet); 172 fSets.add(fRegionalIndicatorSet); 173 if (!fPrependSet.isEmpty()) { 174 fSets.add(fPrependSet); 175 } 176 fSets.add(fSpacingSet); 177 fSets.add(fHangulSet); 178 fSets.add(fAnySet); 179 fSets.add(fEmojiBaseSet); 180 fSets.add(fEmojiModifierSet); 181 fSets.add(fZWJSet); 182 fSets.add(fExtendedPictSet); 183 fSets.add(fEBGSet); 184 fSets.add(fEmojiNRKSet); 185 } 186 187 188 @Override 189 void setText(StringBuffer s) { 190 fText = s; 191 } 192 193 @Override 194 List charClasses() { 195 return fSets; 196 } 197 198 @Override 199 int next(int prevPos) { 200 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 201 // break position being tested. The candidate break 202 // location is before p2. 203 204 int breakPos = -1; 205 206 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 207 int cBase; // for (X Extend*) patterns, the X character. 208 209 // Previous break at end of string. return DONE. 210 if (prevPos >= fText.length()) { 211 return -1; 212 } 213 /* p0 = */ p1 = p2 = p3 = prevPos; 214 c3 = UTF16.charAt(fText, prevPos); 215 c0 = c1 = c2 = cBase = 0; 216 217 // Loop runs once per "significant" character position in the input text. 218 for (;;) { 219 // Move all of the positions forward in the input string. 220 /* p0 = p1;*/ c0 = c1; 221 p1 = p2; c1 = c2; 222 p2 = p3; c2 = c3; 223 224 // Advance p3 by one codepoint 225 p3 = moveIndex32(fText, p3, 1); 226 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3); 227 228 if (p1 == p2) { 229 // Still warming up the loop. 
(won't work with zero length strings, but we don't care) 230 continue; 231 } 232 if (p2 == fText.length()) { 233 // Reached end of string. Always a break position. 234 break; 235 } 236 237 // Rule GB3 CR x LF 238 // No Extend or Format characters may appear between the CR and LF, 239 // which requires the additional check for p2 immediately following p1. 240 // 241 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 242 continue; 243 } 244 245 // Rule (GB4). ( Control | CR | LF ) <break> 246 if (fControlSet.contains(c1) || 247 c1 == 0x0D || 248 c1 == 0x0A) { 249 break; 250 } 251 252 // Rule (GB5) <break> ( Control | CR | LF ) 253 // 254 if (fControlSet.contains(c2) || 255 c2 == 0x0D || 256 c2 == 0x0A) { 257 break; 258 } 259 260 261 // Rule (GB6) L x ( L | V | LV | LVT ) 262 if (fLSet.contains(c1) && 263 (fLSet.contains(c2) || 264 fVSet.contains(c2) || 265 fLVSet.contains(c2) || 266 fLVTSet.contains(c2))) { 267 continue; 268 } 269 270 // Rule (GB7) ( LV | V ) x ( V | T ) 271 if ((fLVSet.contains(c1) || fVSet.contains(c1)) && 272 (fVSet.contains(c2) || fTSet.contains(c2))) { 273 continue; 274 } 275 276 // Rule (GB8) ( LVT | T) x T 277 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && 278 fTSet.contains(c2)) { 279 continue; 280 } 281 282 // Rule (GB9) x (Extend | ZWJ) 283 if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { 284 if (!fExtendSet.contains(c1)) { 285 cBase = c1; 286 } 287 continue; 288 } 289 290 // Rule (GB9a) x SpacingMark 291 if (fSpacingSet.contains(c2)) { 292 continue; 293 } 294 295 // Rule (GB9b) Prepend x 296 if (fPrependSet.contains(c1)) { 297 continue; 298 } 299 // Rule (GB10) (Emoji_Base | EBG) Extend* x Emoji_Modifier 300 if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) { 301 continue; 302 } 303 if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) && 304 fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) { 305 continue; 306 } 307 308 // Rule (GB11) (Extended_Pictographic | Emoji) ZWJ x 
(Extended_Pictographic | Emoji) 309 if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) && 310 (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 311 continue; 312 } 313 if ((fExtendedPictSet.contains(cBase) || fEmojiNRKSet.contains(cBase)) && fExtendSet.contains(c0) && fZWJSet.contains(c1) && 314 (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 315 continue; 316 } 317 318 // Rule (GB12-13) Regional_Indicator x Regional_Indicator 319 // Note: The first if condition is a little tricky. We only need to force 320 // a break if there are three or more contiguous RIs. If there are 321 // only two, a break following will occur via other rules, and will include 322 // any trailing extend characters, which is needed behavior. 323 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) 324 && fRegionalIndicatorSet.contains(c2)) { 325 break; 326 } 327 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 328 continue; 329 } 330 331 // Rule (GB999) Any <break> Any 332 break; 333 } 334 335 breakPos = p2; 336 return breakPos; 337 } 338 } 339 340 341 /** 342 * 343 * Word Monkey Test Class 344 * 345 * 346 * 347 */ 348 static class RBBIWordMonkey extends RBBIMonkeyKind { 349 List fSets; 350 StringBuffer fText; 351 352 UnicodeSet fCRSet; 353 UnicodeSet fLFSet; 354 UnicodeSet fNewlineSet; 355 UnicodeSet fRegionalIndicatorSet; 356 UnicodeSet fKatakanaSet; 357 UnicodeSet fHebrew_LetterSet; 358 UnicodeSet fALetterSet; 359 UnicodeSet fSingle_QuoteSet; 360 UnicodeSet fDouble_QuoteSet; 361 UnicodeSet fMidNumLetSet; 362 UnicodeSet fMidLetterSet; 363 UnicodeSet fMidNumSet; 364 UnicodeSet fNumericSet; 365 UnicodeSet fFormatSet; 366 UnicodeSet fExtendSet; 367 UnicodeSet fExtendNumLetSet; 368 UnicodeSet fOtherSet; 369 UnicodeSet fDictionarySet; 370 UnicodeSet fEBaseSet; 371 UnicodeSet fEBGSet; 372 UnicodeSet fEModifierSet; 373 UnicodeSet fZWJSet; 374 UnicodeSet fExtendedPictSet; 375 
UnicodeSet fEmojiNRKSet; 376 377 378 RBBIWordMonkey() { 379 fCharProperty = UProperty.WORD_BREAK; 380 381 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]"); 382 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]"); 383 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]"); 384 fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); 385 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); 386 fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); 387 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); 388 fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); 389 fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); 390 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); 391 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); 392 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); 393 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); 394 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); 395 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]"); 396 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]"); 397 fEBaseSet = new UnicodeSet("[\\p{Word_Break = EB}]"); 398 fEBGSet = new UnicodeSet("[\\p{Word_Break = EBG}]"); 399 fEModifierSet = new UnicodeSet("[\\p{Word_Break = EM}]"); 400 fZWJSet = new UnicodeSet("[\\p{Word_Break = ZWJ}]"); 401 fExtendedPictSet = new UnicodeSet(gExtended_Pict); 402 fEmojiNRKSet = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]"); 403 404 fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"); 405 fDictionarySet.addAll(fKatakanaSet); 406 fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]")); 407 408 fALetterSet.removeAll(fDictionarySet); 409 410 fOtherSet = new UnicodeSet(); 411 fOtherSet.complement(); 412 fOtherSet.removeAll(fCRSet); 413 fOtherSet.removeAll(fLFSet); 414 
fOtherSet.removeAll(fNewlineSet); 415 fOtherSet.removeAll(fALetterSet); 416 fOtherSet.removeAll(fSingle_QuoteSet); 417 fOtherSet.removeAll(fDouble_QuoteSet); 418 fOtherSet.removeAll(fKatakanaSet); 419 fOtherSet.removeAll(fHebrew_LetterSet); 420 fOtherSet.removeAll(fMidLetterSet); 421 fOtherSet.removeAll(fMidNumSet); 422 fOtherSet.removeAll(fNumericSet); 423 fOtherSet.removeAll(fFormatSet); 424 fOtherSet.removeAll(fExtendSet); 425 fOtherSet.removeAll(fExtendNumLetSet); 426 fOtherSet.removeAll(fRegionalIndicatorSet); 427 fOtherSet.removeAll(fEBaseSet); 428 fOtherSet.removeAll(fEBGSet); 429 fOtherSet.removeAll(fEModifierSet); 430 fOtherSet.removeAll(fZWJSet); 431 fOtherSet.removeAll(fExtendedPictSet); 432 fOtherSet.removeAll(fEmojiNRKSet); 433 434 // Inhibit dictionary characters from being tested at all. 435 // remove surrogates so as to not generate higher CJK characters 436 fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]")); 437 fOtherSet.removeAll(fDictionarySet); 438 439 fSets = new ArrayList(); 440 fSets.add(fCRSet); 441 fSets.add(fLFSet); 442 fSets.add(fNewlineSet); 443 fSets.add(fRegionalIndicatorSet); 444 fSets.add(fHebrew_LetterSet); 445 fSets.add(fALetterSet); 446 //fSets.add(fKatakanaSet); // Omit Katakana from fSets, which omits Katakana characters 447 // from the test data. They are all in the dictionary set, 448 // which this (old, to be retired) monkey test cannot handle. 
449 fSets.add(fSingle_QuoteSet); 450 fSets.add(fDouble_QuoteSet); 451 fSets.add(fMidLetterSet); 452 fSets.add(fMidNumLetSet); 453 fSets.add(fMidNumSet); 454 fSets.add(fNumericSet); 455 fSets.add(fFormatSet); 456 fSets.add(fExtendSet); 457 fSets.add(fExtendNumLetSet); 458 fSets.add(fRegionalIndicatorSet); 459 fSets.add(fEBaseSet); 460 fSets.add(fEBGSet); 461 fSets.add(fEModifierSet); 462 fSets.add(fZWJSet); 463 fSets.add(fExtendedPictSet); 464 fSets.add(fEmojiNRKSet); 465 fSets.add(fOtherSet); 466 } 467 468 469 @Override 470 List charClasses() { 471 return fSets; 472 } 473 474 @Override 475 void setText(StringBuffer s) { 476 fText = s; 477 } 478 479 @Override 480 int next(int prevPos) { 481 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 482 // break position being tested. The candidate break 483 // location is before p2. 484 int breakPos = -1; 485 486 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 487 488 // Previous break at end of string. return DONE. 489 if (prevPos >= fText.length()) { 490 return -1; 491 } 492 /*p0 =*/ p1 = p2 = p3 = prevPos; 493 c3 = UTF16.charAt(fText, prevPos); 494 c0 = c1 = c2 = 0; 495 496 497 498 // Loop runs once per "significant" character position in the input text. 499 for (;;) { 500 // Move all of the positions forward in the input string. 501 /*p0 = p1;*/ c0 = c1; 502 p1 = p2; c1 = c2; 503 p2 = p3; c2 = c3; 504 505 // Advance p3 by X(Extend | Format)* Rule 4 506 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 507 do { 508 p3 = moveIndex32(fText, p3, 1); 509 c3 = -1; 510 if (p3>=fText.length()) { 511 break; 512 } 513 c3 = UTF16.charAt(fText, p3); 514 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 515 break; 516 } 517 } 518 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3)); 519 520 if (p1 == p2) { 521 // Still warming up the loop. 
(won't work with zero length strings, but we don't care) 522 continue; 523 } 524 if (p2 == fText.length()) { 525 // Reached end of string. Always a break position. 526 break; 527 } 528 529 // Rule (3) CR x LF 530 // No Extend or Format characters may appear between the CR and LF, 531 // which requires the additional check for p2 immediately following p1. 532 // 533 if (c1==0x0D && c2==0x0A) { 534 continue; 535 } 536 537 // Rule (3a) Break before and after newlines (including CR and LF) 538 // 539 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) { 540 break; 541 } 542 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 543 break; 544 } 545 546 // Rule (3c) ZWJ x (Extended_Pictographic | Emoji). 547 // Not ignoring extend chars, so peek into input text to 548 // get the potential ZWJ, the character immediately preceding c2. 549 if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 550 continue; 551 } 552 553 // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 554 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 555 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 556 continue; 557 } 558 559 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 560 // 561 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 562 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 563 (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) { 564 continue; 565 } 566 567 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 568 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) && 569 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 570 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 571 continue; 572 } 573 574 // Rule (7a) Hebrew_Letter x Single_Quote 575 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) { 576 continue; 577 } 578 579 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 580 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) { 581 continue; 582 } 583 584 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 585 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) { 586 continue; 587 } 588 589 // Rule (8) Numeric x Numeric 590 if (fNumericSet.contains(c1) && 591 fNumericSet.contains(c2)) { 592 continue; 593 } 594 595 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 596 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 597 fNumericSet.contains(c2)) { 598 continue; 599 } 600 601 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 602 if (fNumericSet.contains(c1) && 603 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 604 continue; 605 } 606 
607 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 608 if (fNumericSet.contains(c0) && 609 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 610 fNumericSet.contains(c2)) { 611 continue; 612 } 613 614 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 615 if (fNumericSet.contains(c1) && 616 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 617 setContains(fNumericSet, c3)) { 618 continue; 619 } 620 621 // Rule (13) Katakana x Katakana 622 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 623 // all Katakana are handled by the dictionary breaker. 624 if (fKatakanaSet.contains(c1) && 625 fKatakanaSet.contains(c2)) { 626 continue; 627 } 628 629 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 630 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) || 631 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) && 632 fExtendNumLetSet.contains(c2)) { 633 continue; 634 } 635 636 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 637 if (fExtendNumLetSet.contains(c1) && 638 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) || 639 fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) { 640 continue; 641 } 642 643 644 // Rule 14 (E_Base | EBG) x E_Modifier 645 if ((fEBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) { 646 continue; 647 } 648 649 // Rule 15 - 17 Group piars of Regional Indicators 650 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) { 651 break; 652 } 653 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 654 continue; 655 } 656 657 // Rule 999. Break found here. 
658 break; 659 } 660 661 breakPos = p2; 662 return breakPos; 663 } 664 665 } 666 667 668 static class RBBILineMonkey extends RBBIMonkeyKind { 669 670 List fSets; 671 672 // UnicodeSets for each of the Line Breaking character classes. 673 // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier 674 // to verify that they are all accounted for. 675 676 UnicodeSet fBK; 677 UnicodeSet fCR; 678 UnicodeSet fLF; 679 UnicodeSet fCM; 680 UnicodeSet fNL; 681 UnicodeSet fSG; 682 UnicodeSet fWJ; 683 UnicodeSet fZW; 684 UnicodeSet fGL; 685 UnicodeSet fSP; 686 UnicodeSet fB2; 687 UnicodeSet fBA; 688 UnicodeSet fBB; 689 UnicodeSet fHY; 690 UnicodeSet fCB; 691 UnicodeSet fCL; 692 UnicodeSet fCP; 693 UnicodeSet fEX; 694 UnicodeSet fIN; 695 UnicodeSet fNS; 696 UnicodeSet fOP; 697 UnicodeSet fQU; 698 UnicodeSet fIS; 699 UnicodeSet fNU; 700 UnicodeSet fPO; 701 UnicodeSet fPR; 702 UnicodeSet fSY; 703 UnicodeSet fAI; 704 UnicodeSet fAL; 705 UnicodeSet fCJ; 706 UnicodeSet fH2; 707 UnicodeSet fH3; 708 UnicodeSet fHL; 709 UnicodeSet fID; 710 UnicodeSet fJL; 711 UnicodeSet fJV; 712 UnicodeSet fJT; 713 UnicodeSet fRI; 714 UnicodeSet fXX; 715 UnicodeSet fEB; 716 UnicodeSet fEM; 717 UnicodeSet fZWJ; 718 UnicodeSet fExtendedPict; 719 UnicodeSet fEmojiNRK; 720 721 StringBuffer fText; 722 int fOrigPositions; 723 724 725 726 RBBILineMonkey() 727 { 728 fCharProperty = UProperty.LINE_BREAK; 729 fSets = new ArrayList(); 730 731 fBK = new UnicodeSet("[\\p{Line_Break=BK}]"); 732 fCR = new UnicodeSet("[\\p{Line_break=CR}]"); 733 fLF = new UnicodeSet("[\\p{Line_break=LF}]"); 734 fCM = new UnicodeSet("[\\p{Line_break=CM}]"); 735 fNL = new UnicodeSet("[\\p{Line_break=NL}]"); 736 fSG = new UnicodeSet("[\\ud800-\\udfff]"); 737 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]"); 738 fZW = new UnicodeSet("[\\p{Line_break=ZW}]"); 739 fGL = new UnicodeSet("[\\p{Line_break=GL}]"); 740 fSP = new UnicodeSet("[\\p{Line_break=SP}]"); 741 fB2 = new UnicodeSet("[\\p{Line_break=B2}]"); 742 fBA = new 
UnicodeSet("[\\p{Line_break=BA}]"); 743 fBB = new UnicodeSet("[\\p{Line_break=BB}]"); 744 fHY = new UnicodeSet("[\\p{Line_break=HY}]"); 745 fCB = new UnicodeSet("[\\p{Line_break=CB}]"); 746 fCL = new UnicodeSet("[\\p{Line_break=CL}]"); 747 fCP = new UnicodeSet("[\\p{Line_break=CP}]"); 748 fEX = new UnicodeSet("[\\p{Line_break=EX}]"); 749 fIN = new UnicodeSet("[\\p{Line_break=IN}]"); 750 fNS = new UnicodeSet("[\\p{Line_break=NS}]"); 751 fOP = new UnicodeSet("[\\p{Line_break=OP}]"); 752 fQU = new UnicodeSet("[\\p{Line_break=QU}]"); 753 fIS = new UnicodeSet("[\\p{Line_break=IS}]"); 754 fNU = new UnicodeSet("[\\p{Line_break=NU}]"); 755 fPO = new UnicodeSet("[\\p{Line_break=PO}]"); 756 fPR = new UnicodeSet("[\\p{Line_break=PR}]"); 757 fSY = new UnicodeSet("[\\p{Line_break=SY}]"); 758 fAI = new UnicodeSet("[\\p{Line_break=AI}]"); 759 fAL = new UnicodeSet("[\\p{Line_break=AL}]"); 760 fCJ = new UnicodeSet("[\\p{Line_break=CJ}]"); 761 fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); 762 fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); 763 fHL = new UnicodeSet("[\\p{Line_break=HL}]"); 764 fID = new UnicodeSet("[\\p{Line_break=ID}]"); 765 fJL = new UnicodeSet("[\\p{Line_break=JL}]"); 766 fJV = new UnicodeSet("[\\p{Line_break=JV}]"); 767 fJT = new UnicodeSet("[\\p{Line_break=JT}]"); 768 fRI = new UnicodeSet("[\\p{Line_break=RI}]"); 769 fXX = new UnicodeSet("[\\p{Line_break=XX}]"); 770 fEB = new UnicodeSet("[\\p{Line_break=EB}]"); 771 fEM = new UnicodeSet("[\\p{Line_break=EM}]"); 772 fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]"); 773 fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9]]"); 774 fExtendedPict = new UnicodeSet(gExtended_Pict); 775 776 777 // Remove dictionary characters. 778 // The monkey test reference implementation of line break does not replicate the dictionary behavior, 779 // so dictionary characters are omitted from the monkey test data. 
780 @SuppressWarnings("unused") 781 UnicodeSet dictionarySet = new UnicodeSet( 782 "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); 783 784 fAL.addAll(fXX); // Default behavior for XX is identical to AL 785 fAL.addAll(fAI); // Default behavior for AI is identical to AL 786 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL 787 788 fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. 789 fCM.addAll(fZWJ); // ZWJ behaves as a CM. 790 791 fSets.add(fBK); 792 fSets.add(fCR); 793 fSets.add(fLF); 794 fSets.add(fCM); 795 fSets.add(fNL); 796 fSets.add(fWJ); 797 fSets.add(fZW); 798 fSets.add(fGL); 799 fSets.add(fSP); 800 fSets.add(fB2); 801 fSets.add(fBA); 802 fSets.add(fBB); 803 fSets.add(fHY); 804 fSets.add(fCB); 805 fSets.add(fCL); 806 fSets.add(fCP); 807 fSets.add(fEX); 808 fSets.add(fIN); 809 fSets.add(fJL); 810 fSets.add(fJT); 811 fSets.add(fJV); 812 fSets.add(fNS); 813 fSets.add(fOP); 814 fSets.add(fQU); 815 fSets.add(fIS); 816 fSets.add(fNU); 817 fSets.add(fPO); 818 fSets.add(fPR); 819 fSets.add(fSY); 820 fSets.add(fAI); 821 fSets.add(fAL); 822 fSets.add(fH2); 823 fSets.add(fH3); 824 fSets.add(fHL); 825 fSets.add(fID); 826 fSets.add(fWJ); 827 fSets.add(fRI); 828 fSets.add(fSG); 829 fSets.add(fEB); 830 fSets.add(fEM); 831 fSets.add(fZWJ); 832 fSets.add(fExtendedPict); 833 fSets.add(fEmojiNRK); 834 } 835 836 @Override 837 void setText(StringBuffer s) { 838 fText = s; 839 } 840 841 842 843 844 @Override 845 int next(int startPos) { 846 int pos; // Index of the char following a potential break position 847 int thisChar; // Character at above position "pos" 848 849 int prevPos; // Index of the char preceding a potential break position 850 int prevChar; // Character at above position. Note that prevChar 851 // and thisChar may not be adjacent because combining 852 // characters between them will be ignored. 
853 int prevCharX2; // Character before prevChar, more contex for LB 21a 854 855 int nextPos; // Index of the next character following pos. 856 // Usually skips over combining marks. 857 int tPos; // temp value. 858 int matchVals[] = null; // Number Expression Match Results 859 860 861 if (startPos >= fText.length()) { 862 return -1; 863 } 864 865 866 // Initial values for loop. Loop will run the first time without finding breaks, 867 // while the invalid values shift out and the "this" and 868 // "prev" positions are filled in with good values. 869 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 870 thisChar = prevChar = prevCharX2 = 0; 871 nextPos = startPos; 872 873 874 // Loop runs once per position in the test text, until a break position 875 // is found. In each iteration, we are testing for a possible break 876 // just preceding the character at index "pos". The character preceding 877 // this char is at postion "prevPos"; because of combining sequences, 878 // "prevPos" can be arbitrarily far before "pos". 879 for (;;) { 880 // Advance to the next position to be tested. 881 prevCharX2 = prevChar; 882 prevPos = pos; 883 prevChar = thisChar; 884 pos = nextPos; 885 nextPos = moveIndex32(fText, pos, 1); 886 887 // Rule LB2 - Break at end of text. 888 if (pos >= fText.length()) { 889 break; 890 } 891 892 // Rule LB 9 - adjust for combining sequences. 893 // We do this rule out-of-order because the adjustment does 894 // not effect the way that rules LB 3 through LB 6 match, 895 // and doing it here rather than after LB 6 is substantially 896 // simpler when combining sequences do occur. 897 898 899 // LB 9 Keep combining sequences together. 900 // advance over any CM class chars at "pos", 901 // result is "nextPos" for the following loop iteration. 
902 thisChar = UTF16.charAt(fText, pos); 903 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || 904 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { 905 for (;;) { 906 if (nextPos == fText.length()) { 907 break; 908 } 909 int nextChar = UTF16.charAt(fText, nextPos); 910 if (!fCM.contains(nextChar)) { 911 break; 912 } 913 nextPos = moveIndex32(fText, nextPos, 1); 914 } 915 } 916 917 // LB 9 Treat X CM* as if it were X 918 // No explicit action required. 919 920 // LB 10 Treat any remaining combining mark as AL 921 if (fCM.contains(thisChar)) { 922 thisChar = 'A'; 923 } 924 925 926 // If the loop is still warming up - if we haven't shifted the initial 927 // -1 positions out of prevPos yet - loop back to advance the 928 // position in the input without any further looking for breaks. 929 if (prevPos == -1) { 930 continue; 931 } 932 933 // LB 4 Always break after hard line breaks, 934 if (fBK.contains(prevChar)) { 935 break; 936 } 937 938 // LB 5 Break after CR, LF, NL, but not inside CR LF 939 if (fCR.contains(prevChar) && fLF.contains(thisChar)) { 940 continue; 941 } 942 if (fCR.contains(prevChar) || 943 fLF.contains(prevChar) || 944 fNL.contains(prevChar)) { 945 break; 946 } 947 948 // LB 6 Don't break before hard line breaks 949 if (fBK.contains(thisChar) || fCR.contains(thisChar) || 950 fLF.contains(thisChar) || fNL.contains(thisChar) ) { 951 continue; 952 } 953 954 955 // LB 7 Don't break before spaces or zero-width space. 956 if (fSP.contains(thisChar)) { 957 continue; 958 } 959 960 if (fZW.contains(thisChar)) { 961 continue; 962 } 963 964 // LB 8 Break after zero width space 965 if (fZW.contains(prevChar)) { 966 break; 967 } 968 969 // LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji) 970 // The monkey test's way of ignoring combining characters doesn't work 971 // for this rule. ZWJ is also a CM. Need to get the actual character 972 // preceding "thisChar", not ignoring combining marks, possibly ZWJ. 
973 { 974 int prevC = fText.codePointBefore(pos); 975 if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) { 976 continue; 977 } 978 } 979 980 // LB 9, 10 Already done, at top of loop. 981 // 982 983 984 // LB 11 985 // x WJ 986 // WJ x 987 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { 988 continue; 989 } 990 991 992 // LB 12 993 // GL x 994 if (fGL.contains(prevChar)) { 995 continue; 996 } 997 998 // LB 12a 999 // [^SP BA HY] x GL 1000 if (!(fSP.contains(prevChar) || 1001 fBA.contains(prevChar) || 1002 fHY.contains(prevChar) ) && fGL.contains(thisChar)) { 1003 continue; 1004 } 1005 1006 1007 1008 // LB 13 Don't break before closings. 1009 // NU x CL, NU x CP and NU x IS are not matched here so that they will 1010 // fall into LB 17 and the more general number regular expression. 1011 // 1012 if (!fNU.contains(prevChar) && fCL.contains(thisChar) || 1013 !fNU.contains(prevChar) && fCP.contains(thisChar) || 1014 fEX.contains(thisChar) || 1015 !fNU.contains(prevChar) && fIS.contains(thisChar) || 1016 !fNU.contains(prevChar) && fSY.contains(thisChar)) { 1017 continue; 1018 } 1019 1020 // LB 14 Don't break after OP SP* 1021 // Scan backwards, checking for this sequence. 
1022 // The OP char could include combining marks, so we actually check for 1023 // OP CM* SP* x 1024 tPos = prevPos; 1025 if (fSP.contains(prevChar)) { 1026 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1027 tPos=moveIndex32(fText, tPos, -1); 1028 } 1029 } 1030 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1031 tPos=moveIndex32(fText, tPos, -1); 1032 } 1033 if (fOP.contains(UTF16.charAt(fText, tPos))) { 1034 continue; 1035 } 1036 1037 // LB 15 Do not break within "[ 1038 // QU CM* SP* x OP 1039 if (fOP.contains(thisChar)) { 1040 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 1041 tPos = prevPos; 1042 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1043 tPos = moveIndex32(fText, tPos, -1); 1044 } 1045 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1046 tPos = moveIndex32(fText, tPos, -1); 1047 } 1048 if (fQU.contains(UTF16.charAt(fText, tPos))) { 1049 continue; 1050 } 1051 } 1052 1053 // LB 16 (CL | CP) SP* x NS 1054 if (fNS.contains(thisChar)) { 1055 tPos = prevPos; 1056 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1057 tPos = moveIndex32(fText, tPos, -1); 1058 } 1059 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1060 tPos = moveIndex32(fText, tPos, -1); 1061 } 1062 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { 1063 continue; 1064 } 1065 } 1066 1067 1068 // LB 17 B2 SP* x B2 1069 if (fB2.contains(thisChar)) { 1070 tPos = prevPos; 1071 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1072 tPos = moveIndex32(fText, tPos, -1); 1073 } 1074 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1075 tPos = moveIndex32(fText, tPos, -1); 1076 } 1077 if (fB2.contains(UTF16.charAt(fText, tPos))) { 1078 continue; 1079 } 1080 } 1081 1082 // LB 18 break after space 1083 if (fSP.contains(prevChar)) { 1084 break; 1085 } 1086 1087 // LB 19 1088 // x QU 1089 // QU x 1090 if (fQU.contains(thisChar) || 
fQU.contains(prevChar)) { 1091 continue; 1092 } 1093 1094 // LB 20 Break around a CB 1095 if (fCB.contains(thisChar) || fCB.contains(prevChar)) { 1096 break; 1097 } 1098 1099 // LB 21 1100 if (fBA.contains(thisChar) || 1101 fHY.contains(thisChar) || 1102 fNS.contains(thisChar) || 1103 fBB.contains(prevChar) ) { 1104 continue; 1105 } 1106 1107 // LB 21a, HL (HY | BA) x 1108 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { 1109 continue; 1110 } 1111 1112 // LB 21b, SY x HL 1113 if (fSY.contains(prevChar) && fHL.contains(thisChar)) { 1114 continue; 1115 } 1116 1117 // LB 22 1118 if (fAL.contains(prevChar) && fIN.contains(thisChar) || 1119 fEX.contains(prevChar) && fIN.contains(thisChar) || 1120 fHL.contains(prevChar) && fIN.contains(thisChar) || 1121 (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) || 1122 fIN.contains(prevChar) && fIN.contains(thisChar) || 1123 fNU.contains(prevChar) && fIN.contains(thisChar) ) { 1124 continue; 1125 } 1126 1127 // LB 23 (AL | HL) x NU 1128 // NU x (AL | HL) 1129 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { 1130 continue; 1131 } 1132 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1133 continue; 1134 } 1135 1136 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 1137 // PR x (ID | EB | EM) 1138 // (ID | EB | EM) x PO 1139 if (fPR.contains(prevChar) && 1140 (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { 1141 continue; 1142 } 1143 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && 1144 fPO.contains(thisChar)) { 1145 continue; 1146 } 1147 1148 // LB 24 Do not break between prefix and letters or ideographs. 
1149 // (PR | PO) x (AL | HL) 1150 // (AL | HL) x (PR | PO) 1151 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && 1152 (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1153 continue; 1154 } 1155 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && 1156 (fPR.contains(thisChar) || fPO.contains(thisChar))) { 1157 continue; 1158 } 1159 1160 1161 // LB 25 Numbers 1162 matchVals = LBNumberCheck(fText, prevPos, matchVals); 1163 if (matchVals[0] != -1) { 1164 // Matched a number. But could have been just a single digit, which would 1165 // not represent a "no break here" between prevChar and thisChar 1166 int numEndIdx = matchVals[1]; // idx of first char following num 1167 if (numEndIdx > pos) { 1168 // Number match includes at least the two chars being checked 1169 if (numEndIdx > nextPos) { 1170 // Number match includes additional chars. Update pos and nextPos 1171 // so that next loop iteration will continue at the end of the number, 1172 // checking for breaks between last char in number & whatever follows. 
1173 nextPos = numEndIdx; 1174 pos = numEndIdx; 1175 do { 1176 pos = moveIndex32(fText, pos, -1); 1177 thisChar = UTF16.charAt(fText, pos); 1178 } 1179 while (fCM.contains(thisChar)); 1180 } 1181 continue; 1182 } 1183 } 1184 1185 1186 // LB 26 Do not break Korean Syllables 1187 if (fJL.contains(prevChar) && (fJL.contains(thisChar) || 1188 fJV.contains(thisChar) || 1189 fH2.contains(thisChar) || 1190 fH3.contains(thisChar))) { 1191 continue; 1192 } 1193 1194 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && 1195 (fJV.contains(thisChar) || fJT.contains(thisChar))) { 1196 continue; 1197 } 1198 1199 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && 1200 fJT.contains(thisChar)) { 1201 continue; 1202 } 1203 1204 // LB 27 Treat a Korean Syllable Block the same as ID 1205 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1206 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1207 fIN.contains(thisChar)) { 1208 continue; 1209 } 1210 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1211 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1212 fPO.contains(thisChar)) { 1213 continue; 1214 } 1215 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || 1216 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { 1217 continue; 1218 } 1219 1220 1221 1222 // LB 28 Do not break between alphabetics 1223 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1224 continue; 1225 } 1226 1227 // LB 29 Do not break between numeric punctuation and alphabetics 1228 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1229 continue; 1230 } 1231 1232 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
// Tail of the line-break monkey's next() loop, re-flowed from the collapsed source; code unchanged.
            // LB 30 (continued)
            //          (AL | NU) x OP
            //          CP x (AL | NU)
            if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                continue;
            }
            if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
                continue;
            }

            // LB 30a  Break between pairs of Regional Indicators.
            //             RI RI <break> RI
            //             RI    x    RI
            // The three-RI test must come first: it is the only case that breaks.
            if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
                break;
            }
            if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
                continue;
            }

            // LB 30b  Emoji Base x Emoji Modifier
            if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
                continue;
            }

            // LB 31    Break everywhere else
            break;
        }

        return pos;
    }



    // Match the following regular expression in the input text, starting at startIdx:
    //
    //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*)* ((CL | CP) CM*)? ((PR | PO) CM*)?
    //
    // The matcher is a small state machine.  State values used by the code below:
    //     0   start, nothing consumed yet
    //     1   consumed the leading (PR | PO), and any CM / ZWJ after it
    //     4   consumed (OP | HY), and any CM / ZWJ after it
    //     7   consumed at least one NU, plus any run of NU | IS | SY | CM | ZWJ
    //     9   consumed (CL | CP), and any CM / ZWJ after it
    //    11   consumed the trailing (PR | PO), and any CM / ZWJ after it
    // A match is reported only when the final state is greater than 4, that is,
    // when at least one NU was consumed.
    //
    // retVals array  [0]  index of the start of the match (startIdx), or -1 if no match
    //                [1]  index of first char following the match.
    //                     Written only when a match is found.
    // If retVals is null a new two-element array is allocated; the array is returned.
    //
    // Can not use Java regex because need supplementary character support,
    // and because Unicode char properties version must be the same as in
    // the version of ICU being tested.
    private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
        if (retVals == null) {
            retVals = new int[2];
        }
        retVals[0]     = -1;  // Indicates no match.
        int matchState = 0;
        int idx        = startIdx;

        // One iteration per code point.  "break" inside the switch means "character
        // consumed, keep going"; "break matchLoop" stops the scan with idx left at
        // the first character that is not part of the match.
        matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
            int c = UTF16.charAt(s, idx);
            int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
            switch (matchState) {
                case 0:
                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
                        cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                        matchState = 1;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                        matchState = 4;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
                        matchState = 4;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
                        matchState = 7;
                        break;
                    }
                    break matchLoop;   /* No Match  */

                case 1:
                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                        matchState = 1;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                        matchState = 4;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
                        matchState = 4;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
                        matchState = 7;
                        break;
                    }
                    break matchLoop;   /* No Match  */


                case 4:
                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                        matchState = 4;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
                        matchState = 7;
                        break;
                    }
                    break matchLoop;   /* No Match  */

                // From state 7 onward a NU has been seen, so leaving the loop
                // reports a match that ends just before the current character.
                case 7:
                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                        matchState = 7;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
                        matchState = 7;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
                        matchState = 7;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
                        matchState = 7;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
                        matchState = 9;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
                        matchState = 9;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                        matchState = 11;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                        matchState = 11;
                        break;
                    }

                    break matchLoop;    // Match Complete.
                case 9:
                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                        matchState = 9;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                        matchState = 11;
                        break;
                    }
                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                        matchState = 11;
                        break;
                    }
                    break matchLoop;    // Match Complete.
                case 11:
                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                        matchState = 11;
                        break;
                    }
                    break matchLoop;    // Match Complete.
            }
        }
        // States 7, 9 and 11 are the accepting states.  If the loop ran off the end
        // of the text, idx equals s.length().
        if (matchState > 4) {
            retVals[0] = startIdx;
            retVals[1] = idx;
        }
        return retVals;
    }


    // Return the list of character classes that was built for this monkey.
    @Override
    List  charClasses() {
        return fSets;
    }



}


/**
 *
 * Sentence Monkey Test Class
 *
 * Independent implementation of the sentence break rules.  Each call to
 * next() walks forward from a known boundary, testing the rules in order
 * at every "significant" character position (Extend and Format characters
 * are skipped over) until one of them requires a break.
 */
static class RBBISentenceMonkey extends RBBIMonkeyKind {
    List                 fSets;       // All of the sets below, returned by charClasses().
    StringBuffer         fText;       // Text under test, installed by setText().

    // One set per Sentence_Break property value, plus fOtherSet for everything else.
    UnicodeSet           fSepSet;
    UnicodeSet           fFormatSet;
    UnicodeSet           fSpSet;
    UnicodeSet           fLowerSet;
    UnicodeSet           fUpperSet;
    UnicodeSet           fOLetterSet;
    UnicodeSet           fNumericSet;
    UnicodeSet           fATermSet;
    UnicodeSet           fSContinueSet;
    UnicodeSet           fSTermSet;
    UnicodeSet           fCloseSet;
    UnicodeSet           fOtherSet;
    UnicodeSet           fExtendSet;



    RBBISentenceMonkey() {
        fCharProperty  = UProperty.SENTENCE_BREAK;

        fSets            = new ArrayList();

        //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
        //                       set and made into character classes of their own.  For the monkey impl,
        //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
        fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
        fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
        fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
        fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
        fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
        fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
        fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
        fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
        fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
        fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
        fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
        fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
        fOtherSet        = new UnicodeSet();


        // "Other" is whatever remains after every named class is removed from
        // the full code point range.
        fOtherSet.complement();
        fOtherSet.removeAll(fSepSet);
        fOtherSet.removeAll(fFormatSet);
        fOtherSet.removeAll(fSpSet);
        fOtherSet.removeAll(fLowerSet);
        fOtherSet.removeAll(fUpperSet);
        fOtherSet.removeAll(fOLetterSet);
        fOtherSet.removeAll(fNumericSet);
        fOtherSet.removeAll(fATermSet);
        fOtherSet.removeAll(fSContinueSet);
        fOtherSet.removeAll(fSTermSet);
        fOtherSet.removeAll(fCloseSet);
        fOtherSet.removeAll(fExtendSet);

        fSets.add(fSepSet);
        fSets.add(fFormatSet);

        fSets.add(fSpSet);
        fSets.add(fLowerSet);
        fSets.add(fUpperSet);
        fSets.add(fOLetterSet);
        fSets.add(fNumericSet);
        fSets.add(fATermSet);
        fSets.add(fSContinueSet);
        fSets.add(fSTermSet);
        fSets.add(fCloseSet);
        fSets.add(fOtherSet);
        fSets.add(fExtendSet);
    }


    // Return the list of UnicodeSets built by the constructor.
    @Override
    List  charClasses() {
        return fSets;
    }

    // Install the text that subsequent next() calls operate on.  The buffer is
    // kept by reference, not copied.
    @Override
    void   setText(StringBuffer s) {
        fText = s;
    }


    //      moveBack()   Find the "significant" code point preceding the index i.
    //      Skips over ($Extend | $Format)*
    //      Returns -1 when i is at or before the start of the text.  The scan also
    //      stops on reaching index 0, even if the character there is Extend or Format.
    //
    private int moveBack(int i) {

        if (i <= 0) {
            return -1;
        }

        int      c;
        int      j = i;
        do {
            j = moveIndex32(fText, j, -1);
            c = UTF16.charAt(fText, j);
        }
        while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
        return j;
    }


    //      moveForward()   Advance from index i past one code point and then past any
    //      ($Extend | $Format)* that follow it.  Returns the text length when i is
    //      already at or beyond the end.
    int moveForward(int i) {
        if (i>=fText.length()) {
            return fText.length();
        }
        int   c;
        int   j = i;
        do {
            j = moveIndex32(fText, j, 1);
            c = cAt(j);       // -1 at end of text, which ends the loop.
        }
        while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
        return j;

    }

    //      cAt()   The code point at index pos, or -1 when pos is outside of the text.
    int cAt(int pos) {
        if (pos<0 || pos>=fText.length()) {
            return -1;
        }
        return UTF16.charAt(fText, pos);
    }

    // Find the next sentence break after prevPos, which is taken to be a boundary.
    // Returns -1 when prevPos is already at or beyond the end of the text.
    @Override
    int   next(int prevPos) {
        int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
                                      //   break position being tested.  The candidate break
                                      //   location is before p2.
        int     breakPos = -1;

        int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
        int c;

        // Prev break at end of string.  return DONE.
        if (prevPos >= fText.length()) {
            return -1;
        }
        /*p0 =*/ p1 = p2 = p3 = prevPos;
        c3 = UTF16.charAt(fText, prevPos);
        c0 = c1 = c2 = 0;

        // Loop runs once per "significant" character position in the input text.
        for (;;) {
            // Move all of the positions forward in the input string.
            /*p0 = p1;*/  c0 = c1;
            p1 = p2;  c1 = c2;
            p2 = p3;  c2 = c3;

            // Advance p3 by  X(Extend | Format)*   Rule 4
            p3 = moveForward(p3);
            c3 = cAt(p3);

            // Rule (3) CR x LF
            //   The position test makes sure nothing was skipped between the two.
            if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
                continue;
            }

            // Rule (4)    Sep  <break>
            if (fSepSet.contains(c1)) {
                p2 = p1+1;   // Separators don't combine with Extend or Format
                break;
            }

            if (p2 >= fText.length()) {
                // Reached end of string.  Always a break position.
                break;
            }

            if (p2 == prevPos) {
                // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                continue;
            }

            // Rule (6).   ATerm x Numeric
            if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
                continue;
            }

            // Rule (7).  (Upper | Lower) ATerm  x  Upper
            if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
                    fATermSet.contains(c1) && fUpperSet.contains(c2)) {
                continue;
            }

            // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep))* Lower
            //           Note:  STerm | ATerm are added to the negated part of the expression by a
            //                  note to the Unicode 5.0 documents.
            // Scan back over Sp* then Close* looking for the ATerm, then scan forward
            // from the candidate break for the first character that stops the negated run.
            int p8 = p1;
            while (p8>0 && fSpSet.contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (p8>0 && fCloseSet.contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            if (fATermSet.contains(cAt(p8))) {
                p8=p2;
                for (;;) {
                    c = cAt(p8);
                    if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
                        fLowerSet.contains(c) || fSepSet.contains(c) ||
                        fATermSet.contains(c) || fSTermSet.contains(c))
                     {
                        break;
                     }
                    p8 = moveForward(p8);
                }
                if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
                    continue;
                }
            }

            // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)
            //   These loops have no p8>0 guard; setContains() returns false for the -1
            //   that cAt() yields once moveBack() has run off the start of the text.
            if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
                p8 = p1;
                while (setContains(fSpSet, cAt(p8))) {
                    p8 = moveBack(p8);
                }
                while (setContains(fCloseSet, cAt(p8))) {
                    p8 = moveBack(p8);
                }
                c = cAt(p8);
                if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
                    continue;
                }
            }


            // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
            int p9 = p1;
            while (p9>0 && fCloseSet.contains(cAt(p9))) {
                p9 = moveBack(p9);
            }
            c = cAt(p9);
            if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
                if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
                    continue;
                }
            }

            // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
            int p10 = p1;
            while (p10>0 && fSpSet.contains(cAt(p10))) {
                p10 = moveBack(p10);
            }
            while (p10>0 && fCloseSet.contains(cAt(p10))) {
                p10 = moveBack(p10);
            }
            if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
                if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
                    continue;
                }
            }

            // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
            //   At most one separator is stepped over before the Sp* / Close* scan.
            int p11 = p1;
            if (p11>0 && fSepSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            while (p11>0 && fSpSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            while (p11>0 && fCloseSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
                break;
            }

            //  Rule (12)  Any x Any
            continue;
        }
        breakPos = p2;
        return breakPos;
    }



}
1713 */ 1714 static int moveIndex32(StringBuffer s, int pos, int amt) { 1715 int i; 1716 char c; 1717 if (amt>0) { 1718 for (i=0; i<amt; i++) { 1719 if (pos >= s.length()) { 1720 return s.length(); 1721 } 1722 c = s.charAt(pos); 1723 pos++; 1724 if (UTF16.isLeadSurrogate(c) && pos < s.length()) { 1725 c = s.charAt(pos); 1726 if (UTF16.isTrailSurrogate(c)) { 1727 pos++; 1728 } 1729 } 1730 } 1731 } else { 1732 for (i=0; i>amt; i--) { 1733 if (pos <= 0) { 1734 return 0; 1735 } 1736 pos--; 1737 c = s.charAt(pos); 1738 if (UTF16.isTrailSurrogate(c) && pos >= 0) { 1739 c = s.charAt(pos); 1740 if (UTF16.isLeadSurrogate(c)) { 1741 pos--; 1742 } 1743 } 1744 } 1745 } 1746 return pos; 1747 } 1748 1749 /** 1750 * No-exceptions form of UnicodeSet.contains(c). 1751 * Simplifies loops that terminate with an end-of-input character value. 1752 * @param s A unicode set 1753 * @param c A code point value 1754 * @return true if the set contains c. 1755 */ 1756 static boolean setContains(UnicodeSet s, int c) { 1757 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) { 1758 return false; 1759 } 1760 return s.contains(c); 1761 } 1762 1763 1764 /** 1765 * return the index of the next code point in the input text. 1766 * @param i the preceding index 1767 */ 1768 static int nextCP(StringBuffer s, int i) { 1769 if (i == -1) { 1770 // End of Input indication. Continue to return end value. 1771 return -1; 1772 } 1773 int retVal = i + 1; 1774 if (retVal > s.length()) { 1775 return -1; 1776 } 1777 int c = UTF16.charAt(s, i); 1778 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) { 1779 retVal++; 1780 } 1781 return retVal; 1782 } 1783 1784 1785 /** 1786 * random number generator. Not using Java's built-in Randoms for two reasons: 1787 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. 1788 * 2. We need to get and restore the seed from values occurring in the middle 1789 * of a long sequence, to more easily reproduce failing cases. 
1790 */ 1791 private static int m_seed = 1; 1792 private static int m_rand() 1793 { 1794 m_seed = m_seed * 1103515245 + 12345; 1795 return (m_seed >>> 16) % 32768; 1796 } 1797 1798 // Helper function for formatting error output. 1799 // Append a string into a fixed-size field in a StringBuffer. 1800 // Blank-pad the string if it is shorter than the field. 1801 // Truncate the source string if it is too long. 1802 // 1803 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) { 1804 int appendLen = src.length(); 1805 if (appendLen >= fieldLen) { 1806 dest.append(src.substring(0, fieldLen)); 1807 } else { 1808 dest.append(src); 1809 while (appendLen < fieldLen) { 1810 dest.append(' '); 1811 appendLen++; 1812 } 1813 } 1814 } 1815 1816 // Helper function for formatting error output. 1817 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format 1818 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) { 1819 String hexChars = "0123456789abcdef"; 1820 if (c < 0x10000) { 1821 dest.append("\\u"); 1822 for (int bn=12; bn>=0; bn-=4) { 1823 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1824 } 1825 appendToBuf(dest, " ", fieldLen-6); 1826 } else { 1827 dest.append("\\U"); 1828 for (int bn=28; bn>=0; bn-=4) { 1829 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1830 } 1831 appendToBuf(dest, " ", fieldLen-10); 1832 1833 } 1834 } 1835 1836 /** 1837 * Run a RBBI monkey test. Common routine, for all break iterator types. 1838 * Parameters: 1839 * bi - the break iterator to use 1840 * mk - MonkeyKind, abstraction for obtaining expected results 1841 * name - Name of test (char, word, etc.) 
for use in error messages 1842 * seed - Seed for starting random number generator (parameter from user) 1843 * numIterations 1844 */ 1845 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { 1846 int TESTSTRINGLEN = 500; 1847 StringBuffer testText = new StringBuffer(); 1848 int numCharClasses; 1849 List chClasses; 1850 int[] expected = new int[TESTSTRINGLEN*2 + 1]; 1851 int expectedCount = 0; 1852 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1853 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1854 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1855 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1856 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1857 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1858 int i; 1859 int loopCount = 0; 1860 boolean printTestData = false; 1861 boolean printBreaksFromBI = false; 1862 1863 m_seed = seed; 1864 1865 numCharClasses = mk.charClasses().size(); 1866 chClasses = mk.charClasses(); 1867 1868 // Verify that the character classes all have at least one member. 1869 for (i=0; i<numCharClasses; i++) { 1870 UnicodeSet s = (UnicodeSet)chClasses.get(i); 1871 if (s == null || s.size() == 0) { 1872 errln("Character Class " + i + " is null or of zero size."); 1873 return; 1874 } 1875 } 1876 1877 //-------------------------------------------------------------------------------------------- 1878 // 1879 // Debugging settings. Comment out everything in the following block for normal operation 1880 // 1881 //-------------------------------------------------------------------------------------------- 1882 // numIterations = -1; 1883 // numIterations = 10000; // Same as exhaustive. 
1884 // RuleBasedBreakIterator_New.fTrace = true; 1885 // m_seed = 859056465; 1886 // TESTSTRINGLEN = 50; 1887 // printTestData = true; 1888 // printBreaksFromBI = true; 1889 // ((RuleBasedBreakIterator_New)bi).dump(); 1890 1891 //-------------------------------------------------------------------------------------------- 1892 // 1893 // End of Debugging settings. 1894 // 1895 //-------------------------------------------------------------------------------------------- 1896 1897 int dotsOnLine = 0; 1898 while (loopCount < numIterations || numIterations == -1) { 1899 if (numIterations == -1 && loopCount % 10 == 0) { 1900 // If test is running in an infinite loop, display a periodic tic so 1901 // we can tell that it is making progress. 1902 System.out.print("."); 1903 if (dotsOnLine++ >= 80){ 1904 System.out.println(); 1905 dotsOnLine = 0; 1906 } 1907 } 1908 // Save current random number seed, so that we can recreate the random numbers 1909 // for this loop iteration in event of an error. 1910 seed = m_seed; 1911 1912 testText.setLength(0); 1913 // Populate a test string with data. 1914 if (printTestData) { 1915 System.out.println("Test Data string ..."); 1916 } 1917 for (i=0; i<TESTSTRINGLEN; i++) { 1918 int aClassNum = m_rand() % numCharClasses; 1919 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); 1920 int charIdx = m_rand() % classSet.size(); 1921 int c = classSet.charAt(charIdx); 1922 if (c < 0) { // TODO: deal with sets containing strings. 
1923 errln("c < 0"); 1924 } 1925 UTF16.appendCodePoint(testText, c); 1926 if (printTestData) { 1927 System.out.print(Integer.toHexString(c) + " "); 1928 } 1929 } 1930 if (printTestData) { 1931 System.out.println(); 1932 } 1933 1934 Arrays.fill(expected, 0); 1935 Arrays.fill(expectedBreaks, false); 1936 Arrays.fill(forwardBreaks, false); 1937 Arrays.fill(reverseBreaks, false); 1938 Arrays.fill(isBoundaryBreaks, false); 1939 Arrays.fill(followingBreaks, false); 1940 Arrays.fill(precedingBreaks, false); 1941 1942 // Calculate the expected results for this test string. 1943 mk.setText(testText); 1944 expectedCount = 0; 1945 expectedBreaks[0] = true; 1946 expected[expectedCount ++] = 0; 1947 int breakPos = 0; 1948 int lastBreakPos = -1; 1949 for (;;) { 1950 lastBreakPos = breakPos; 1951 breakPos = mk.next(breakPos); 1952 if (breakPos == -1) { 1953 break; 1954 } 1955 if (breakPos > testText.length()) { 1956 errln("breakPos > testText.length()"); 1957 } 1958 if (lastBreakPos >= breakPos) { 1959 errln("Next() not increasing."); 1960 // break; 1961 } 1962 expectedBreaks[breakPos] = true; 1963 expected[expectedCount ++] = breakPos; 1964 } 1965 1966 // Find the break positions using forward iteration 1967 if (printBreaksFromBI) { 1968 System.out.println("Breaks from BI..."); 1969 } 1970 bi.setText(testText.toString()); 1971 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { 1972 if (i < 0 || i > testText.length()) { 1973 errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); 1974 break; 1975 } 1976 if (printBreaksFromBI) { 1977 System.out.print(Integer.toHexString(i) + " "); 1978 } 1979 forwardBreaks[i] = true; 1980 } 1981 if (printBreaksFromBI) { 1982 System.out.println(); 1983 } 1984 1985 // Find the break positions using reverse iteration 1986 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { 1987 if (i < 0 || i > testText.length()) { 1988 errln(name + " break monkey test: Out of range value returned by 
breakIterator.next()" + name); 1989 break; 1990 } 1991 reverseBreaks[i] = true; 1992 } 1993 1994 // Find the break positions using isBoundary() tests. 1995 for (i=0; i<=testText.length(); i++) { 1996 isBoundaryBreaks[i] = bi.isBoundary(i); 1997 } 1998 1999 // Find the break positions using the following() function. 2000 lastBreakPos = 0; 2001 followingBreaks[0] = true; 2002 for (i=0; i<testText.length(); i++) { 2003 breakPos = bi.following(i); 2004 if (breakPos <= i || 2005 breakPos < lastBreakPos || 2006 breakPos > testText.length() || 2007 breakPos > lastBreakPos && lastBreakPos > i ) { 2008 errln(name + " break monkey test: " + 2009 "Out of range value returned by BreakIterator::following().\n" + 2010 "index=" + i + "following returned=" + breakPos + 2011 "lastBreak=" + lastBreakPos); 2012 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2013 } else { 2014 followingBreaks[breakPos] = true; 2015 lastBreakPos = breakPos; 2016 } 2017 } 2018 2019 // Find the break positions using the preceding() function. 2020 lastBreakPos = testText.length(); 2021 precedingBreaks[testText.length()] = true; 2022 for (i=testText.length(); i>0; i--) { 2023 breakPos = bi.preceding(i); 2024 if (breakPos >= i || 2025 breakPos > lastBreakPos || 2026 breakPos < 0 || 2027 breakPos < lastBreakPos && lastBreakPos < i ) { 2028 errln(name + " break monkey test: " + 2029 "Out of range value returned by BreakIterator::preceding().\n" + 2030 "index=" + i + "preceding returned=" + breakPos + 2031 "lastBreak=" + lastBreakPos); 2032 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2033 } else { 2034 precedingBreaks[breakPos] = true; 2035 lastBreakPos = breakPos; 2036 } 2037 } 2038 2039 2040 2041 // Compare the expected and actual results. 
2042 for (i=0; i<=testText.length(); i++) { 2043 String errorType = null; 2044 if (forwardBreaks[i] != expectedBreaks[i]) { 2045 errorType = "next()"; 2046 } else if (reverseBreaks[i] != forwardBreaks[i]) { 2047 errorType = "previous()"; 2048 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 2049 errorType = "isBoundary()"; 2050 } else if (followingBreaks[i] != expectedBreaks[i]) { 2051 errorType = "following()"; 2052 } else if (precedingBreaks[i] != expectedBreaks[i]) { 2053 errorType = "preceding()"; 2054 } 2055 2056 if (errorType != null) { 2057 // Format a range of the test text that includes the failure as 2058 // a data item that can be included in the rbbi test data file. 2059 2060 // Start of the range is the last point where expected and actual results 2061 // both agreed that there was a break position. 2062 int startContext = i; 2063 int count = 0; 2064 for (;;) { 2065 if (startContext==0) { break; } 2066 startContext --; 2067 if (expectedBreaks[startContext]) { 2068 if (count == 2) break; 2069 count ++; 2070 } 2071 } 2072 2073 // End of range is two expected breaks past the start position. 2074 int endContext = i + 1; 2075 int ci; 2076 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 2077 for (;;) { 2078 if (endContext >= testText.length()) {break;} 2079 if (expectedBreaks[endContext-1]) { 2080 if (count == 0) break; 2081 count --; 2082 } 2083 endContext ++; 2084 } 2085 } 2086 2087 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>" 2088 StringBuffer errorText = new StringBuffer(); 2089 2090 int c; // Char from test data 2091 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { 2092 if (ci == i) { 2093 // This is the location of the error. 2094 errorText.append("<?>---------------------------------\n"); 2095 } else if (expectedBreaks[ci]) { 2096 // This a non-error expected break position. 
errorText.append("------------------------------------\n");
                        }
                        if (ci < testText.length()) {
                            // One row per code point: the character (via appendCharToBuf), its short
                            // General_Category name, the long value name of the monkey's
                            // fCharProperty, and the extended character name.
                            c = UTF16.charAt(testText, ci);
                            appendCharToBuf(errorText, c, 11);
                            String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
                            appendToBuf(errorText, gc, 8);
                            int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
                            String extraPropValue =
                                UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
                            appendToBuf(errorText, extraPropValue, 20);

                            String charName = UCharacter.getExtendedName(c);
                            appendToBuf(errorText, charName, 40);
                            errorText.append('\n');
                        }
                    }
                    // Mark the end of the text when the context walk stopped exactly there.
                    if (ci == testText.length() && ci != -1) {
                        errorText.append("<>");
                    }
                    errorText.append("</data>\n");

                    // Output the error
                    errln(name + " break monkey test error. " +
                          (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
                          "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
                          errorText);
                    // Only the first mismatch in this test string is reported.
                    break;
                }
            }

            loopCount++;
        }
    }

    /**
     * Monkey test for the character (grapheme cluster) break iterator.
     *
     * Runs RunMonkey with the Locale.US character instance and an RBBICharMonkey,
     * using seed 1 and 500 iterations, or 10000 iterations when
     * TestFmwk.getExhaustiveness() is 9 or higher.
     */
    @Test
    public void TestCharMonkey() {

        int loopCount = 500;
        int seed = 1;

        // Longer run only at the highest exhaustiveness settings.
        if (TestFmwk.getExhaustiveness() >= 9) {
            loopCount = 10000;
        }

        RBBICharMonkey m = new RBBICharMonkey();
        BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
        RunMonkey(bi, m, "char", seed, loopCount);
    }

    /**
     * Monkey test for the word break iterator.
     *
     * Runs RunMonkey with the Locale.US word instance and an RBBIWordMonkey,
     * using seed 1 and 500 iterations, or 10000 iterations when
     * TestFmwk.getExhaustiveness() is 9 or higher.
     */
    @Test
    public void TestWordMonkey() {

        int loopCount = 500;
        int seed = 1;

        // Longer run only at the highest exhaustiveness settings.
        if (TestFmwk.getExhaustiveness() >= 9) {
            loopCount = 10000;
        }

        logln("Word Break Monkey Test");
        RBBIWordMonkey m = new RBBIWordMonkey();
        BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
        RunMonkey(bi, m, "word", seed, loopCount);
    }

    /**
     * Monkey test for the line break iterator.
     *
     * Runs RunMonkey with the Locale.US line instance and an RBBILineMonkey,
     * using seed 1 and 500 iterations, or 10000 iterations when
     * TestFmwk.getExhaustiveness() is 9 or higher.
     */
    @Test
    public void TestLineMonkey() {
        int loopCount = 500;
        int seed = 1;

        // Longer run only at the highest exhaustiveness settings.
        if (TestFmwk.getExhaustiveness() >= 9) {
            loopCount = 10000;
        }

        logln("Line Break Monkey Test");
        RBBILineMonkey m = new RBBILineMonkey();
        BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
        RunMonkey(bi, m, "line", seed, loopCount);
    }

    /**
     * Monkey test for the sentence break iterator.
     *
     * Runs RunMonkey with the Locale.US sentence instance and an RBBISentenceMonkey,
     * using seed 1 and 500 iterations, or 3000 iterations when
     * TestFmwk.getExhaustiveness() is 9 or higher.
     */
    @Test
    public void TestSentMonkey() {

        int loopCount = 500;
        int seed = 1;

        // Longer run only at the highest exhaustiveness settings; note the
        // sentence test uses a smaller exhaustive count than the other kinds.
        if (TestFmwk.getExhaustiveness() >= 9) {
            loopCount = 3000;
        }

        logln("Sentence Break Monkey Test");
        RBBISentenceMonkey m = new RBBISentenceMonkey();
        BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
        RunMonkey(bi, m, "sent", seed, loopCount);
    }
    //
    //  Round-trip monkey tests.
    //  Verify that break iterators created from the rule source from the default
    //    break iterators still pass the monkey test for the iterator type.
    //
    //  This is a major test for the Rule Compiler.  The default break iterators are built
    //  from pre-compiled binary rule data that was created using ICU4C; these
    //  round-trip rule recompile tests verify that the Java rule compiler can
    //  rebuild break iterators from the original source rules.
2202 // 2203 @Test 2204 public void TestRTCharMonkey() { 2205 2206 int loopCount = 200; 2207 int seed = 1; 2208 2209 if (TestFmwk.getExhaustiveness() >= 9) { 2210 loopCount = 2000; 2211 } 2212 2213 RBBICharMonkey m = new RBBICharMonkey(); 2214 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2215 String rules = bi.toString(); 2216 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2217 RunMonkey(rtbi, m, "char", seed, loopCount); 2218 } 2219 2220 @Test 2221 public void TestRTWordMonkey() { 2222 2223 int loopCount = 200; 2224 int seed = 1; 2225 2226 if (TestFmwk.getExhaustiveness() >= 9) { 2227 loopCount = 2000; 2228 } 2229 logln("Word Break Monkey Test"); 2230 RBBIWordMonkey m = new RBBIWordMonkey(); 2231 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2232 String rules = bi.toString(); 2233 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2234 RunMonkey(rtbi, m, "word", seed, loopCount); 2235 } 2236 2237 @Test 2238 public void TestRTLineMonkey() { 2239 int loopCount = 200; 2240 int seed = 1; 2241 2242 if (TestFmwk.getExhaustiveness() >= 9) { 2243 loopCount = 2000; 2244 } 2245 2246 logln("Line Break Monkey Test"); 2247 RBBILineMonkey m = new RBBILineMonkey(); 2248 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2249 String rules = bi.toString(); 2250 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2251 RunMonkey(rtbi, m, "line", seed, loopCount); 2252 } 2253 2254 @Test 2255 public void TestRTSentMonkey() { 2256 2257 int loopCount = 200; 2258 int seed = 1; 2259 2260 if (TestFmwk.getExhaustiveness() >= 9) { 2261 loopCount = 1000; 2262 } 2263 2264 logln("Sentence Break Monkey Test"); 2265 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2266 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2267 String rules = bi.toString(); 2268 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2269 RunMonkey(rtbi, m, "sent", seed, loopCount); 2270 } 2271 } 2272 2273