1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.rbbi; 10 11 12 // Monkey testing of RuleBasedBreakIterator. 13 // The old, original monkey test. TODO: remove 14 // The new monkey test is class RBBIMonkeyTest. 15 16 import java.util.ArrayList; 17 import java.util.Arrays; 18 import java.util.List; 19 import java.util.Locale; 20 21 import org.junit.Test; 22 import org.junit.runner.RunWith; 23 import org.junit.runners.JUnit4; 24 25 import com.ibm.icu.dev.test.TestFmwk; 26 import com.ibm.icu.lang.UCharacter; 27 import com.ibm.icu.lang.UProperty; 28 import com.ibm.icu.text.BreakIterator; 29 import com.ibm.icu.text.RuleBasedBreakIterator; 30 import com.ibm.icu.text.UTF16; 31 import com.ibm.icu.text.UnicodeSet; 32 33 34 /** 35 * Monkey tests for RBBI. These tests have independent implementations of 36 * the Unicode TR boundary rules, and compare results between these and ICU's 37 * implementation, using random data. 38 * 39 * Tests cover Grapheme Cluster (char), Word and Line breaks 40 * 41 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp 42 * 43 */ 44 @RunWith(JUnit4.class) 45 public class RBBITestMonkey extends TestFmwk { 46 // 47 // class RBBIMonkeyKind 48 // 49 // Monkey Test for Break Iteration 50 // Abstract interface class. Concrete derived classes independently 51 // implement the break rules for different iterator types. 52 // 53 // The Monkey Test itself uses doesn't know which type of break iterator it is 54 // testing, but works purely in terms of the interface defined here. 
55 // 56 abstract static class RBBIMonkeyKind { 57 58 // Return a List of UnicodeSets, representing the character classes used 59 // for this type of iterator. 60 abstract List charClasses(); 61 62 // Set the test text on which subsequent calls to next() will operate 63 abstract void setText(StringBuffer text); 64 65 // Find the next break position, starting from the specified position. 66 // Return -1 after reaching end of string. 67 abstract int next(int i); 68 69 // A Character Property, one of the constants defined in class UProperty. 70 // The value of this property will be displayed for the characters 71 // near any test failure. 72 int fCharProperty; 73 } 74 75 // 76 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, 13267 77 // 78 static String gExtended_Pict = "[" + 79 "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767" + 80 "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" + 81 "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F" + 82 "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F" + 83 "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6" + 84 "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586" + 85 "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7" + 86 "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB" + 87 "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" + 88 
"\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C" + 89 "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637" + 90 "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A" + 91 "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9" + 92 "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD" + 93 "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF" + 94 "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5" + 95 "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F" + 96 "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F" + 97 "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F" + 98 "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF" + 99 "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8" + 100 "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF" + 101 "]"; 102 103 104 /** 105 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries. 
106 * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets 107 */ 108 static class RBBICharMonkey extends RBBIMonkeyKind { 109 List fSets; 110 111 UnicodeSet fCRLFSet; 112 UnicodeSet fControlSet; 113 UnicodeSet fExtendSet; 114 UnicodeSet fRegionalIndicatorSet; 115 UnicodeSet fPrependSet; 116 UnicodeSet fSpacingSet; 117 UnicodeSet fLSet; 118 UnicodeSet fVSet; 119 UnicodeSet fTSet; 120 UnicodeSet fLVSet; 121 UnicodeSet fLVTSet; 122 UnicodeSet fHangulSet; 123 UnicodeSet fEmojiModifierSet; 124 UnicodeSet fEmojiBaseSet; 125 UnicodeSet fZWJSet; 126 UnicodeSet fExtendedPictSet; 127 UnicodeSet fEBGSet; 128 UnicodeSet fEmojiNRKSet; 129 UnicodeSet fAnySet; 130 131 132 StringBuffer fText; 133 134 135 RBBICharMonkey() { 136 fText = null; 137 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK; 138 fCRLFSet = new UnicodeSet("[\\r\\n]"); 139 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]"); 140 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); 141 fZWJSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]"); 142 fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"); 143 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]"); 144 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]"); 145 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]"); 146 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]"); 147 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]"); 148 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]"); 149 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]"); 150 fHangulSet = new UnicodeSet(); 151 fHangulSet.addAll(fLSet); 152 fHangulSet.addAll(fVSet); 153 fHangulSet.addAll(fTSet); 154 fHangulSet.addAll(fLVSet); 155 fHangulSet.addAll(fLVTSet); 156 157 fEmojiBaseSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}]"); 158 fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = 
EM}]"); 159 fExtendedPictSet = new UnicodeSet(gExtended_Pict); 160 fEBGSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]"); 161 fEmojiNRKSet = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]"); 162 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); 163 164 165 fSets = new ArrayList(); 166 fSets.add(fCRLFSet); 167 fSets.add(fControlSet); 168 fSets.add(fExtendSet); 169 fSets.add(fRegionalIndicatorSet); 170 if (!fPrependSet.isEmpty()) { 171 fSets.add(fPrependSet); 172 } 173 fSets.add(fSpacingSet); 174 fSets.add(fHangulSet); 175 fSets.add(fAnySet); 176 fSets.add(fEmojiBaseSet); 177 fSets.add(fEmojiModifierSet); 178 fSets.add(fZWJSet); 179 fSets.add(fExtendedPictSet); 180 fSets.add(fEBGSet); 181 fSets.add(fEmojiNRKSet); 182 } 183 184 185 @Override 186 void setText(StringBuffer s) { 187 fText = s; 188 } 189 190 @Override 191 List charClasses() { 192 return fSets; 193 } 194 195 @Override 196 int next(int prevPos) { 197 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 198 // break position being tested. The candidate break 199 // location is before p2. 200 201 int breakPos = -1; 202 203 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 204 int cBase; // for (X Extend*) patterns, the X character. 205 206 // Previous break at end of string. return DONE. 207 if (prevPos >= fText.length()) { 208 return -1; 209 } 210 /* p0 = */ p1 = p2 = p3 = prevPos; 211 c3 = UTF16.charAt(fText, prevPos); 212 c0 = c1 = c2 = cBase = 0; 213 214 // Loop runs once per "significant" character position in the input text. 215 for (;;) { 216 // Move all of the positions forward in the input string. 217 /* p0 = p1;*/ c0 = c1; 218 p1 = p2; c1 = c2; 219 p2 = p3; c2 = c3; 220 221 // Advance p3 by one codepoint 222 p3 = moveIndex32(fText, p3, 1); 223 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3); 224 225 if (p1 == p2) { 226 // Still warming up the loop. 
(won't work with zero length strings, but we don't care) 227 continue; 228 } 229 if (p2 == fText.length()) { 230 // Reached end of string. Always a break position. 231 break; 232 } 233 234 // Rule GB3 CR x LF 235 // No Extend or Format characters may appear between the CR and LF, 236 // which requires the additional check for p2 immediately following p1. 237 // 238 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 239 continue; 240 } 241 242 // Rule (GB4). ( Control | CR | LF ) <break> 243 if (fControlSet.contains(c1) || 244 c1 == 0x0D || 245 c1 == 0x0A) { 246 break; 247 } 248 249 // Rule (GB5) <break> ( Control | CR | LF ) 250 // 251 if (fControlSet.contains(c2) || 252 c2 == 0x0D || 253 c2 == 0x0A) { 254 break; 255 } 256 257 258 // Rule (GB6) L x ( L | V | LV | LVT ) 259 if (fLSet.contains(c1) && 260 (fLSet.contains(c2) || 261 fVSet.contains(c2) || 262 fLVSet.contains(c2) || 263 fLVTSet.contains(c2))) { 264 continue; 265 } 266 267 // Rule (GB7) ( LV | V ) x ( V | T ) 268 if ((fLVSet.contains(c1) || fVSet.contains(c1)) && 269 (fVSet.contains(c2) || fTSet.contains(c2))) { 270 continue; 271 } 272 273 // Rule (GB8) ( LVT | T) x T 274 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && 275 fTSet.contains(c2)) { 276 continue; 277 } 278 279 // Rule (GB9) x (Extend | ZWJ) 280 if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { 281 if (!fExtendSet.contains(c1)) { 282 cBase = c1; 283 } 284 continue; 285 } 286 287 // Rule (GB9a) x SpacingMark 288 if (fSpacingSet.contains(c2)) { 289 continue; 290 } 291 292 // Rule (GB9b) Prepend x 293 if (fPrependSet.contains(c1)) { 294 continue; 295 } 296 // Rule (GB10) (Emoji_Base | EBG) Extend* x Emoji_Modifier 297 if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) { 298 continue; 299 } 300 if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) && 301 fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) { 302 continue; 303 } 304 305 // Rule (GB11) (Extended_Pictographic | Emoji) ZWJ x 
(Extended_Pictographic | Emoji) 306 if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) && 307 (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 308 continue; 309 } 310 if ((fExtendedPictSet.contains(cBase) || fEmojiNRKSet.contains(cBase)) && fExtendSet.contains(c0) && fZWJSet.contains(c1) && 311 (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 312 continue; 313 } 314 315 // Rule (GB12-13) Regional_Indicator x Regional_Indicator 316 // Note: The first if condition is a little tricky. We only need to force 317 // a break if there are three or more contiguous RIs. If there are 318 // only two, a break following will occur via other rules, and will include 319 // any trailing extend characters, which is needed behavior. 320 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) 321 && fRegionalIndicatorSet.contains(c2)) { 322 break; 323 } 324 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 325 continue; 326 } 327 328 // Rule (GB999) Any <break> Any 329 break; 330 } 331 332 breakPos = p2; 333 return breakPos; 334 } 335 } 336 337 338 /** 339 * 340 * Word Monkey Test Class 341 * 342 * 343 * 344 */ 345 static class RBBIWordMonkey extends RBBIMonkeyKind { 346 List fSets; 347 StringBuffer fText; 348 349 UnicodeSet fCRSet; 350 UnicodeSet fLFSet; 351 UnicodeSet fNewlineSet; 352 UnicodeSet fRegionalIndicatorSet; 353 UnicodeSet fKatakanaSet; 354 UnicodeSet fHebrew_LetterSet; 355 UnicodeSet fALetterSet; 356 UnicodeSet fSingle_QuoteSet; 357 UnicodeSet fDouble_QuoteSet; 358 UnicodeSet fMidNumLetSet; 359 UnicodeSet fMidLetterSet; 360 UnicodeSet fMidNumSet; 361 UnicodeSet fNumericSet; 362 UnicodeSet fFormatSet; 363 UnicodeSet fExtendSet; 364 UnicodeSet fExtendNumLetSet; 365 UnicodeSet fOtherSet; 366 UnicodeSet fDictionarySet; 367 UnicodeSet fEBaseSet; 368 UnicodeSet fEBGSet; 369 UnicodeSet fEModifierSet; 370 UnicodeSet fZWJSet; 371 UnicodeSet fExtendedPictSet; 372 
UnicodeSet fEmojiNRKSet; 373 374 375 RBBIWordMonkey() { 376 fCharProperty = UProperty.WORD_BREAK; 377 378 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]"); 379 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]"); 380 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]"); 381 fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); 382 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); 383 fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); 384 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); 385 fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); 386 fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); 387 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); 388 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); 389 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); 390 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); 391 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); 392 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]"); 393 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]"); 394 fEBaseSet = new UnicodeSet("[\\p{Word_Break = EB}]"); 395 fEBGSet = new UnicodeSet("[\\p{Word_Break = EBG}]"); 396 fEModifierSet = new UnicodeSet("[\\p{Word_Break = EM}]"); 397 fZWJSet = new UnicodeSet("[\\p{Word_Break = ZWJ}]"); 398 fExtendedPictSet = new UnicodeSet(gExtended_Pict); 399 fEmojiNRKSet = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9]]"); 400 401 fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"); 402 fDictionarySet.addAll(fKatakanaSet); 403 fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]")); 404 405 fALetterSet.removeAll(fDictionarySet); 406 407 fOtherSet = new UnicodeSet(); 408 fOtherSet.complement(); 409 fOtherSet.removeAll(fCRSet); 410 fOtherSet.removeAll(fLFSet); 411 
fOtherSet.removeAll(fNewlineSet); 412 fOtherSet.removeAll(fALetterSet); 413 fOtherSet.removeAll(fSingle_QuoteSet); 414 fOtherSet.removeAll(fDouble_QuoteSet); 415 fOtherSet.removeAll(fKatakanaSet); 416 fOtherSet.removeAll(fHebrew_LetterSet); 417 fOtherSet.removeAll(fMidLetterSet); 418 fOtherSet.removeAll(fMidNumSet); 419 fOtherSet.removeAll(fNumericSet); 420 fOtherSet.removeAll(fFormatSet); 421 fOtherSet.removeAll(fExtendSet); 422 fOtherSet.removeAll(fExtendNumLetSet); 423 fOtherSet.removeAll(fRegionalIndicatorSet); 424 fOtherSet.removeAll(fEBaseSet); 425 fOtherSet.removeAll(fEBGSet); 426 fOtherSet.removeAll(fEModifierSet); 427 fOtherSet.removeAll(fZWJSet); 428 fOtherSet.removeAll(fExtendedPictSet); 429 fOtherSet.removeAll(fEmojiNRKSet); 430 431 // Inhibit dictionary characters from being tested at all. 432 // remove surrogates so as to not generate higher CJK characters 433 fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]")); 434 fOtherSet.removeAll(fDictionarySet); 435 436 fSets = new ArrayList(); 437 fSets.add(fCRSet); 438 fSets.add(fLFSet); 439 fSets.add(fNewlineSet); 440 fSets.add(fRegionalIndicatorSet); 441 fSets.add(fHebrew_LetterSet); 442 fSets.add(fALetterSet); 443 //fSets.add(fKatakanaSet); // Omit Katakana from fSets, which omits Katakana characters 444 // from the test data. They are all in the dictionary set, 445 // which this (old, to be retired) monkey test cannot handle. 
446 fSets.add(fSingle_QuoteSet); 447 fSets.add(fDouble_QuoteSet); 448 fSets.add(fMidLetterSet); 449 fSets.add(fMidNumLetSet); 450 fSets.add(fMidNumSet); 451 fSets.add(fNumericSet); 452 fSets.add(fFormatSet); 453 fSets.add(fExtendSet); 454 fSets.add(fExtendNumLetSet); 455 fSets.add(fRegionalIndicatorSet); 456 fSets.add(fEBaseSet); 457 fSets.add(fEBGSet); 458 fSets.add(fEModifierSet); 459 fSets.add(fZWJSet); 460 fSets.add(fExtendedPictSet); 461 fSets.add(fEmojiNRKSet); 462 fSets.add(fOtherSet); 463 } 464 465 466 @Override 467 List charClasses() { 468 return fSets; 469 } 470 471 @Override 472 void setText(StringBuffer s) { 473 fText = s; 474 } 475 476 @Override 477 int next(int prevPos) { 478 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 479 // break position being tested. The candidate break 480 // location is before p2. 481 int breakPos = -1; 482 483 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 484 485 // Previous break at end of string. return DONE. 486 if (prevPos >= fText.length()) { 487 return -1; 488 } 489 /*p0 =*/ p1 = p2 = p3 = prevPos; 490 c3 = UTF16.charAt(fText, prevPos); 491 c0 = c1 = c2 = 0; 492 493 494 495 // Loop runs once per "significant" character position in the input text. 496 for (;;) { 497 // Move all of the positions forward in the input string. 498 /*p0 = p1;*/ c0 = c1; 499 p1 = p2; c1 = c2; 500 p2 = p3; c2 = c3; 501 502 // Advance p3 by X(Extend | Format)* Rule 4 503 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 504 do { 505 p3 = moveIndex32(fText, p3, 1); 506 c3 = -1; 507 if (p3>=fText.length()) { 508 break; 509 } 510 c3 = UTF16.charAt(fText, p3); 511 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 512 break; 513 } 514 } 515 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3)); 516 517 if (p1 == p2) { 518 // Still warming up the loop. 
(won't work with zero length strings, but we don't care) 519 continue; 520 } 521 if (p2 == fText.length()) { 522 // Reached end of string. Always a break position. 523 break; 524 } 525 526 // Rule (3) CR x LF 527 // No Extend or Format characters may appear between the CR and LF, 528 // which requires the additional check for p2 immediately following p1. 529 // 530 if (c1==0x0D && c2==0x0A) { 531 continue; 532 } 533 534 // Rule (3a) Break before and after newlines (including CR and LF) 535 // 536 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) { 537 break; 538 } 539 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 540 break; 541 } 542 543 // Rule (3c) ZWJ x (Extended_Pictographic | Emoji). 544 // Not ignoring extend chars, so peek into input text to 545 // get the potential ZWJ, the character immediately preceding c2. 546 if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 547 continue; 548 } 549 550 // Rule (5). 
(ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 551 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 552 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 553 continue; 554 } 555 556 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 557 // 558 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 559 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 560 (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) { 561 continue; 562 } 563 564 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 565 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) && 566 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 567 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 568 continue; 569 } 570 571 // Rule (7a) Hebrew_Letter x Single_Quote 572 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) { 573 continue; 574 } 575 576 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 577 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) { 578 continue; 579 } 580 581 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 582 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) { 583 continue; 584 } 585 586 // Rule (8) Numeric x Numeric 587 if (fNumericSet.contains(c1) && 588 fNumericSet.contains(c2)) { 589 continue; 590 } 591 592 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 593 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 594 fNumericSet.contains(c2)) { 595 continue; 596 } 597 598 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 599 if (fNumericSet.contains(c1) && 600 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 601 continue; 602 } 603 
604 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 605 if (fNumericSet.contains(c0) && 606 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 607 fNumericSet.contains(c2)) { 608 continue; 609 } 610 611 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 612 if (fNumericSet.contains(c1) && 613 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 614 setContains(fNumericSet, c3)) { 615 continue; 616 } 617 618 // Rule (13) Katakana x Katakana 619 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 620 // all Katakana are handled by the dictionary breaker. 621 if (fKatakanaSet.contains(c1) && 622 fKatakanaSet.contains(c2)) { 623 continue; 624 } 625 626 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 627 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) || 628 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) && 629 fExtendNumLetSet.contains(c2)) { 630 continue; 631 } 632 633 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 634 if (fExtendNumLetSet.contains(c1) && 635 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) || 636 fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) { 637 continue; 638 } 639 640 641 // Rule 14 (E_Base | EBG) x E_Modifier 642 if ((fEBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) { 643 continue; 644 } 645 646 // Rule 15 - 17 Group piars of Regional Indicators 647 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) { 648 break; 649 } 650 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 651 continue; 652 } 653 654 // Rule 999. Break found here. 
655 break; 656 } 657 658 breakPos = p2; 659 return breakPos; 660 } 661 662 } 663 664 665 static class RBBILineMonkey extends RBBIMonkeyKind { 666 667 List fSets; 668 669 // UnicodeSets for each of the Line Breaking character classes. 670 // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier 671 // to verify that they are all accounted for. 672 673 UnicodeSet fBK; 674 UnicodeSet fCR; 675 UnicodeSet fLF; 676 UnicodeSet fCM; 677 UnicodeSet fNL; 678 UnicodeSet fSG; 679 UnicodeSet fWJ; 680 UnicodeSet fZW; 681 UnicodeSet fGL; 682 UnicodeSet fSP; 683 UnicodeSet fB2; 684 UnicodeSet fBA; 685 UnicodeSet fBB; 686 UnicodeSet fHY; 687 UnicodeSet fCB; 688 UnicodeSet fCL; 689 UnicodeSet fCP; 690 UnicodeSet fEX; 691 UnicodeSet fIN; 692 UnicodeSet fNS; 693 UnicodeSet fOP; 694 UnicodeSet fQU; 695 UnicodeSet fIS; 696 UnicodeSet fNU; 697 UnicodeSet fPO; 698 UnicodeSet fPR; 699 UnicodeSet fSY; 700 UnicodeSet fAI; 701 UnicodeSet fAL; 702 UnicodeSet fCJ; 703 UnicodeSet fH2; 704 UnicodeSet fH3; 705 UnicodeSet fHL; 706 UnicodeSet fID; 707 UnicodeSet fJL; 708 UnicodeSet fJV; 709 UnicodeSet fJT; 710 UnicodeSet fRI; 711 UnicodeSet fXX; 712 UnicodeSet fEB; 713 UnicodeSet fEM; 714 UnicodeSet fZWJ; 715 UnicodeSet fExtendedPict; 716 UnicodeSet fEmojiNRK; 717 718 StringBuffer fText; 719 int fOrigPositions; 720 721 722 723 RBBILineMonkey() 724 { 725 fCharProperty = UProperty.LINE_BREAK; 726 fSets = new ArrayList(); 727 728 fBK = new UnicodeSet("[\\p{Line_Break=BK}]"); 729 fCR = new UnicodeSet("[\\p{Line_break=CR}]"); 730 fLF = new UnicodeSet("[\\p{Line_break=LF}]"); 731 fCM = new UnicodeSet("[\\p{Line_break=CM}]"); 732 fNL = new UnicodeSet("[\\p{Line_break=NL}]"); 733 fSG = new UnicodeSet("[\\ud800-\\udfff]"); 734 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]"); 735 fZW = new UnicodeSet("[\\p{Line_break=ZW}]"); 736 fGL = new UnicodeSet("[\\p{Line_break=GL}]"); 737 fSP = new UnicodeSet("[\\p{Line_break=SP}]"); 738 fB2 = new UnicodeSet("[\\p{Line_break=B2}]"); 739 fBA = new 
UnicodeSet("[\\p{Line_break=BA}]"); 740 fBB = new UnicodeSet("[\\p{Line_break=BB}]"); 741 fHY = new UnicodeSet("[\\p{Line_break=HY}]"); 742 fCB = new UnicodeSet("[\\p{Line_break=CB}]"); 743 fCL = new UnicodeSet("[\\p{Line_break=CL}]"); 744 fCP = new UnicodeSet("[\\p{Line_break=CP}]"); 745 fEX = new UnicodeSet("[\\p{Line_break=EX}]"); 746 fIN = new UnicodeSet("[\\p{Line_break=IN}]"); 747 fNS = new UnicodeSet("[\\p{Line_break=NS}]"); 748 fOP = new UnicodeSet("[\\p{Line_break=OP}]"); 749 fQU = new UnicodeSet("[\\p{Line_break=QU}]"); 750 fIS = new UnicodeSet("[\\p{Line_break=IS}]"); 751 fNU = new UnicodeSet("[\\p{Line_break=NU}]"); 752 fPO = new UnicodeSet("[\\p{Line_break=PO}]"); 753 fPR = new UnicodeSet("[\\p{Line_break=PR}]"); 754 fSY = new UnicodeSet("[\\p{Line_break=SY}]"); 755 fAI = new UnicodeSet("[\\p{Line_break=AI}]"); 756 fAL = new UnicodeSet("[\\p{Line_break=AL}]"); 757 fCJ = new UnicodeSet("[\\p{Line_break=CJ}]"); 758 fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); 759 fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); 760 fHL = new UnicodeSet("[\\p{Line_break=HL}]"); 761 fID = new UnicodeSet("[\\p{Line_break=ID}]"); 762 fJL = new UnicodeSet("[\\p{Line_break=JL}]"); 763 fJV = new UnicodeSet("[\\p{Line_break=JV}]"); 764 fJT = new UnicodeSet("[\\p{Line_break=JT}]"); 765 fRI = new UnicodeSet("[\\p{Line_break=RI}]"); 766 fXX = new UnicodeSet("[\\p{Line_break=XX}]"); 767 fEB = new UnicodeSet("[\\p{Line_break=EB}]"); 768 fEM = new UnicodeSet("[\\p{Line_break=EM}]"); 769 fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]"); 770 fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9]]"); 771 fExtendedPict = new UnicodeSet(gExtended_Pict); 772 773 774 // Remove dictionary characters. 775 // The monkey test reference implementation of line break does not replicate the dictionary behavior, 776 // so dictionary characters are omitted from the monkey test data. 
777 @SuppressWarnings("unused") 778 UnicodeSet dictionarySet = new UnicodeSet( 779 "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); 780 781 fAL.addAll(fXX); // Default behavior for XX is identical to AL 782 fAL.addAll(fAI); // Default behavior for AI is identical to AL 783 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL 784 785 fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. 786 fCM.addAll(fZWJ); // ZWJ behaves as a CM. 787 788 fSets.add(fBK); 789 fSets.add(fCR); 790 fSets.add(fLF); 791 fSets.add(fCM); 792 fSets.add(fNL); 793 fSets.add(fWJ); 794 fSets.add(fZW); 795 fSets.add(fGL); 796 fSets.add(fSP); 797 fSets.add(fB2); 798 fSets.add(fBA); 799 fSets.add(fBB); 800 fSets.add(fHY); 801 fSets.add(fCB); 802 fSets.add(fCL); 803 fSets.add(fCP); 804 fSets.add(fEX); 805 fSets.add(fIN); 806 fSets.add(fJL); 807 fSets.add(fJT); 808 fSets.add(fJV); 809 fSets.add(fNS); 810 fSets.add(fOP); 811 fSets.add(fQU); 812 fSets.add(fIS); 813 fSets.add(fNU); 814 fSets.add(fPO); 815 fSets.add(fPR); 816 fSets.add(fSY); 817 fSets.add(fAI); 818 fSets.add(fAL); 819 fSets.add(fH2); 820 fSets.add(fH3); 821 fSets.add(fHL); 822 fSets.add(fID); 823 fSets.add(fWJ); 824 fSets.add(fRI); 825 fSets.add(fSG); 826 fSets.add(fEB); 827 fSets.add(fEM); 828 fSets.add(fZWJ); 829 fSets.add(fExtendedPict); 830 fSets.add(fEmojiNRK); 831 } 832 833 @Override 834 void setText(StringBuffer s) { 835 fText = s; 836 } 837 838 839 840 841 @Override 842 int next(int startPos) { 843 int pos; // Index of the char following a potential break position 844 int thisChar; // Character at above position "pos" 845 846 int prevPos; // Index of the char preceding a potential break position 847 int prevChar; // Character at above position. Note that prevChar 848 // and thisChar may not be adjacent because combining 849 // characters between them will be ignored. 
850 int prevCharX2; // Character before prevChar, more contex for LB 21a 851 852 int nextPos; // Index of the next character following pos. 853 // Usually skips over combining marks. 854 int tPos; // temp value. 855 int matchVals[] = null; // Number Expression Match Results 856 857 858 if (startPos >= fText.length()) { 859 return -1; 860 } 861 862 863 // Initial values for loop. Loop will run the first time without finding breaks, 864 // while the invalid values shift out and the "this" and 865 // "prev" positions are filled in with good values. 866 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 867 thisChar = prevChar = prevCharX2 = 0; 868 nextPos = startPos; 869 870 871 // Loop runs once per position in the test text, until a break position 872 // is found. In each iteration, we are testing for a possible break 873 // just preceding the character at index "pos". The character preceding 874 // this char is at postion "prevPos"; because of combining sequences, 875 // "prevPos" can be arbitrarily far before "pos". 876 for (;;) { 877 // Advance to the next position to be tested. 878 prevCharX2 = prevChar; 879 prevPos = pos; 880 prevChar = thisChar; 881 pos = nextPos; 882 nextPos = moveIndex32(fText, pos, 1); 883 884 // Rule LB2 - Break at end of text. 885 if (pos >= fText.length()) { 886 break; 887 } 888 889 // Rule LB 9 - adjust for combining sequences. 890 // We do this rule out-of-order because the adjustment does 891 // not effect the way that rules LB 3 through LB 6 match, 892 // and doing it here rather than after LB 6 is substantially 893 // simpler when combining sequences do occur. 894 895 896 // LB 9 Keep combining sequences together. 897 // advance over any CM class chars at "pos", 898 // result is "nextPos" for the following loop iteration. 
899 thisChar = UTF16.charAt(fText, pos); 900 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || 901 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { 902 for (;;) { 903 if (nextPos == fText.length()) { 904 break; 905 } 906 int nextChar = UTF16.charAt(fText, nextPos); 907 if (!fCM.contains(nextChar)) { 908 break; 909 } 910 nextPos = moveIndex32(fText, nextPos, 1); 911 } 912 } 913 914 // LB 9 Treat X CM* as if it were X 915 // No explicit action required. 916 917 // LB 10 Treat any remaining combining mark as AL 918 if (fCM.contains(thisChar)) { 919 thisChar = 'A'; 920 } 921 922 923 // If the loop is still warming up - if we haven't shifted the initial 924 // -1 positions out of prevPos yet - loop back to advance the 925 // position in the input without any further looking for breaks. 926 if (prevPos == -1) { 927 continue; 928 } 929 930 // LB 4 Always break after hard line breaks, 931 if (fBK.contains(prevChar)) { 932 break; 933 } 934 935 // LB 5 Break after CR, LF, NL, but not inside CR LF 936 if (fCR.contains(prevChar) && fLF.contains(thisChar)) { 937 continue; 938 } 939 if (fCR.contains(prevChar) || 940 fLF.contains(prevChar) || 941 fNL.contains(prevChar)) { 942 break; 943 } 944 945 // LB 6 Don't break before hard line breaks 946 if (fBK.contains(thisChar) || fCR.contains(thisChar) || 947 fLF.contains(thisChar) || fNL.contains(thisChar) ) { 948 continue; 949 } 950 951 952 // LB 7 Don't break before spaces or zero-width space. 953 if (fSP.contains(thisChar)) { 954 continue; 955 } 956 957 if (fZW.contains(thisChar)) { 958 continue; 959 } 960 961 // LB 8 Break after zero width space 962 if (fZW.contains(prevChar)) { 963 break; 964 } 965 966 // LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji) 967 // The monkey test's way of ignoring combining characters doesn't work 968 // for this rule. ZWJ is also a CM. Need to get the actual character 969 // preceding "thisChar", not ignoring combining marks, possibly ZWJ. 
970 { 971 int prevC = fText.codePointBefore(pos); 972 if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) { 973 continue; 974 } 975 } 976 977 // LB 9, 10 Already done, at top of loop. 978 // 979 980 981 // LB 11 982 // x WJ 983 // WJ x 984 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { 985 continue; 986 } 987 988 989 // LB 12 990 // GL x 991 if (fGL.contains(prevChar)) { 992 continue; 993 } 994 995 // LB 12a 996 // [^SP BA HY] x GL 997 if (!(fSP.contains(prevChar) || 998 fBA.contains(prevChar) || 999 fHY.contains(prevChar) ) && fGL.contains(thisChar)) { 1000 continue; 1001 } 1002 1003 1004 1005 // LB 13 Don't break before closings. 1006 // NU x CL, NU x CP and NU x IS are not matched here so that they will 1007 // fall into LB 17 and the more general number regular expression. 1008 // 1009 if (!fNU.contains(prevChar) && fCL.contains(thisChar) || 1010 !fNU.contains(prevChar) && fCP.contains(thisChar) || 1011 fEX.contains(thisChar) || 1012 !fNU.contains(prevChar) && fIS.contains(thisChar) || 1013 !fNU.contains(prevChar) && fSY.contains(thisChar)) { 1014 continue; 1015 } 1016 1017 // LB 14 Don't break after OP SP* 1018 // Scan backwards, checking for this sequence. 
1019 // The OP char could include combining marks, so we actually check for 1020 // OP CM* SP* x 1021 tPos = prevPos; 1022 if (fSP.contains(prevChar)) { 1023 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1024 tPos=moveIndex32(fText, tPos, -1); 1025 } 1026 } 1027 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1028 tPos=moveIndex32(fText, tPos, -1); 1029 } 1030 if (fOP.contains(UTF16.charAt(fText, tPos))) { 1031 continue; 1032 } 1033 1034 // LB 15 Do not break within "[ 1035 // QU CM* SP* x OP 1036 if (fOP.contains(thisChar)) { 1037 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 1038 tPos = prevPos; 1039 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1040 tPos = moveIndex32(fText, tPos, -1); 1041 } 1042 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1043 tPos = moveIndex32(fText, tPos, -1); 1044 } 1045 if (fQU.contains(UTF16.charAt(fText, tPos))) { 1046 continue; 1047 } 1048 } 1049 1050 // LB 16 (CL | CP) SP* x NS 1051 if (fNS.contains(thisChar)) { 1052 tPos = prevPos; 1053 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1054 tPos = moveIndex32(fText, tPos, -1); 1055 } 1056 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1057 tPos = moveIndex32(fText, tPos, -1); 1058 } 1059 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { 1060 continue; 1061 } 1062 } 1063 1064 1065 // LB 17 B2 SP* x B2 1066 if (fB2.contains(thisChar)) { 1067 tPos = prevPos; 1068 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1069 tPos = moveIndex32(fText, tPos, -1); 1070 } 1071 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1072 tPos = moveIndex32(fText, tPos, -1); 1073 } 1074 if (fB2.contains(UTF16.charAt(fText, tPos))) { 1075 continue; 1076 } 1077 } 1078 1079 // LB 18 break after space 1080 if (fSP.contains(prevChar)) { 1081 break; 1082 } 1083 1084 // LB 19 1085 // x QU 1086 // QU x 1087 if (fQU.contains(thisChar) || 
fQU.contains(prevChar)) { 1088 continue; 1089 } 1090 1091 // LB 20 Break around a CB 1092 if (fCB.contains(thisChar) || fCB.contains(prevChar)) { 1093 break; 1094 } 1095 1096 // LB 21 1097 if (fBA.contains(thisChar) || 1098 fHY.contains(thisChar) || 1099 fNS.contains(thisChar) || 1100 fBB.contains(prevChar) ) { 1101 continue; 1102 } 1103 1104 // LB 21a, HL (HY | BA) x 1105 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { 1106 continue; 1107 } 1108 1109 // LB 21b, SY x HL 1110 if (fSY.contains(prevChar) && fHL.contains(thisChar)) { 1111 continue; 1112 } 1113 1114 // LB 22 1115 if (fAL.contains(prevChar) && fIN.contains(thisChar) || 1116 fEX.contains(prevChar) && fIN.contains(thisChar) || 1117 fHL.contains(prevChar) && fIN.contains(thisChar) || 1118 (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) || 1119 fIN.contains(prevChar) && fIN.contains(thisChar) || 1120 fNU.contains(prevChar) && fIN.contains(thisChar) ) { 1121 continue; 1122 } 1123 1124 // LB 23 (AL | HL) x NU 1125 // NU x (AL | HL) 1126 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { 1127 continue; 1128 } 1129 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1130 continue; 1131 } 1132 1133 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 1134 // PR x (ID | EB | EM) 1135 // (ID | EB | EM) x PO 1136 if (fPR.contains(prevChar) && 1137 (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { 1138 continue; 1139 } 1140 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && 1141 fPO.contains(thisChar)) { 1142 continue; 1143 } 1144 1145 // LB 24 Do not break between prefix and letters or ideographs. 
1146 // (PR | PO) x (AL | HL) 1147 // (AL | HL) x (PR | PO) 1148 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && 1149 (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1150 continue; 1151 } 1152 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && 1153 (fPR.contains(thisChar) || fPO.contains(thisChar))) { 1154 continue; 1155 } 1156 1157 1158 // LB 25 Numbers 1159 matchVals = LBNumberCheck(fText, prevPos, matchVals); 1160 if (matchVals[0] != -1) { 1161 // Matched a number. But could have been just a single digit, which would 1162 // not represent a "no break here" between prevChar and thisChar 1163 int numEndIdx = matchVals[1]; // idx of first char following num 1164 if (numEndIdx > pos) { 1165 // Number match includes at least the two chars being checked 1166 if (numEndIdx > nextPos) { 1167 // Number match includes additional chars. Update pos and nextPos 1168 // so that next loop iteration will continue at the end of the number, 1169 // checking for breaks between last char in number & whatever follows. 
1170 nextPos = numEndIdx; 1171 pos = numEndIdx; 1172 do { 1173 pos = moveIndex32(fText, pos, -1); 1174 thisChar = UTF16.charAt(fText, pos); 1175 } 1176 while (fCM.contains(thisChar)); 1177 } 1178 continue; 1179 } 1180 } 1181 1182 1183 // LB 26 Do not break Korean Syllables 1184 if (fJL.contains(prevChar) && (fJL.contains(thisChar) || 1185 fJV.contains(thisChar) || 1186 fH2.contains(thisChar) || 1187 fH3.contains(thisChar))) { 1188 continue; 1189 } 1190 1191 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && 1192 (fJV.contains(thisChar) || fJT.contains(thisChar))) { 1193 continue; 1194 } 1195 1196 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && 1197 fJT.contains(thisChar)) { 1198 continue; 1199 } 1200 1201 // LB 27 Treat a Korean Syllable Block the same as ID 1202 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1203 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1204 fIN.contains(thisChar)) { 1205 continue; 1206 } 1207 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1208 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1209 fPO.contains(thisChar)) { 1210 continue; 1211 } 1212 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || 1213 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { 1214 continue; 1215 } 1216 1217 1218 1219 // LB 28 Do not break between alphabetics 1220 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1221 continue; 1222 } 1223 1224 // LB 29 Do not break between numeric punctuation and alphabetics 1225 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1226 continue; 1227 } 1228 1229 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
            //    (AL | HL | NU) x OP
            //    CP x (AL | HL | NU)
            if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                continue;
            }
            if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
                continue;
            }

            // LB 30a  Break between pairs of Regional Indicators.
            //    RI RI <break> RI
            //    RI x RI
            // The three-RI test must come first so that a third RI starts a new pair.
            if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
                break;
            }
            if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
                continue;
            }

            // LB 30b  Emoji Base x Emoji Modifier
            if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
                continue;
            }
            // LB 31    Break everywhere else
            break;
        }

        return pos;
    }



    // Match the following regular expression in the input text, starting at startIdx.
    //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*)* ((CL | CP) CM*)? ((PR | PO) CM*)?
    //
    // Match states used by the hand-written matcher below:
    //     0   nothing consumed yet
    //     1   consumed the leading (PR | PO) and any CM that follow it
    //     4   consumed (OP | HY) and any CM that follow it
    //     7   consumed at least one NU; also loops on NU, IS, SY and CM
    //     9   consumed (CL | CP) and any CM that follow it
    //    11   consumed the trailing (PR | PO) and any CM that follow it
    // ZWJ is accepted everywhere that CM is accepted.
    //
    //  retVals array  [0]  index of the start of the match, or -1 if no match
    //                 [1]  index of first char following the match.
    //                      Only written when there is a match.
    //  A null retVals causes a new two-element array to be allocated; the array
    //  that was filled in is returned.
    //
    //  Can not use Java regex because need supplementary character support,
    //     and because Unicode char properties version must be the same as in
    //     the version of ICU being tested.
    private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
        if (retVals == null) {
            retVals = new int[2];
        }
        retVals[0]     = -1;  // Indicates no match.
        int matchState = 0;
        int idx        = startIdx;

        // One code point is consumed per iteration. Leaving the loop by "break matchLoop"
        // leaves idx on the first code point that is not part of the match.
        matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
            int c = UTF16.charAt(s, idx);
            int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
            switch (matchState) {
            case 0:
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
                    cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 1;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.HYPHEN) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */

            case 1:
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                    matchState = 1;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.HYPHEN) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */


            case 4:
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */

            // From state 7 onward at least one NU has been consumed, so running out of
            // matching characters ends a successful match rather than a failed one.
            case 7:
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }

                break matchLoop;    // Match Complete.
            case 9:
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                break matchLoop;    // Match Complete.
            case 11:
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
                    matchState = 11;
                    break;
                }
                break matchLoop;    // Match Complete.
            }
        }
        // States 7, 9 and 11 are the only ones reached after consuming a NU.
        if (matchState > 4) {
            retVals[0] = startIdx;
            retVals[1] = idx;
        }
        return retVals;
    }


    // Return the list of character classes (UnicodeSets) used by this monkey.
    @Override
    List charClasses() {
        return fSets;
    }



}


/**
 *
 * Sentence Monkey Test Class
 *
 * Reference implementation of the sentence break rules, written directly
 * against the Sentence_Break property sets built in the constructor.
 * Its next() results are the "expected" values for the sentence monkey test.
 *
 */
static class RBBISentenceMonkey extends RBBIMonkeyKind {
    List                 fSets;    // All of the sets below, in the order added by the constructor.
    StringBuffer         fText;    // The text being tested, as given to setText().

    UnicodeSet           fSepSet;
    UnicodeSet           fFormatSet;
    UnicodeSet           fSpSet;
    UnicodeSet           fLowerSet;
    UnicodeSet           fUpperSet;
    UnicodeSet           fOLetterSet;
    UnicodeSet           fNumericSet;
    UnicodeSet           fATermSet;
    UnicodeSet           fSContinueSet;
    UnicodeSet           fSTermSet;
    UnicodeSet           fCloseSet;
    UnicodeSet           fOtherSet;    // Everything not in one of the other sets.
    UnicodeSet           fExtendSet;



    RBBISentenceMonkey() {
        fCharProperty  = UProperty.SENTENCE_BREAK;

        fSets            = new ArrayList();

        //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
        //                       set and made into character classes of their own.  For the monkey impl,
        //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
        fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
        fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
        fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
        fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
        fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
        fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
        fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
        fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
        fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
        fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
        fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
        fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
        fOtherSet        = new UnicodeSet();


        // Other = complement of the union of all of the sets above.
        fOtherSet.complement();
        fOtherSet.removeAll(fSepSet);
        fOtherSet.removeAll(fFormatSet);
        fOtherSet.removeAll(fSpSet);
        fOtherSet.removeAll(fLowerSet);
        fOtherSet.removeAll(fUpperSet);
        fOtherSet.removeAll(fOLetterSet);
        fOtherSet.removeAll(fNumericSet);
        fOtherSet.removeAll(fATermSet);
        fOtherSet.removeAll(fSContinueSet);
        fOtherSet.removeAll(fSTermSet);
        fOtherSet.removeAll(fCloseSet);
        fOtherSet.removeAll(fExtendSet);

        fSets.add(fSepSet);
        fSets.add(fFormatSet);

        fSets.add(fSpSet);
        fSets.add(fLowerSet);
        fSets.add(fUpperSet);
        fSets.add(fOLetterSet);
        fSets.add(fNumericSet);
        fSets.add(fATermSet);
        fSets.add(fSContinueSet);
        fSets.add(fSTermSet);
        fSets.add(fCloseSet);
        fSets.add(fOtherSet);
        fSets.add(fExtendSet);
    }


    // Return the list of character classes (UnicodeSets) built by the constructor.
    @Override
    List charClasses() {
        return fSets;
    }

    // Set the text on which subsequent calls to next() operate. The buffer is not copied.
    @Override
    void   setText(StringBuffer s) {
        fText = s;
    }


    //  moveBack()   Find the "significant" code point preceding the index i.
    //               Skips over ($Extend | $Format)*
    //               Returns -1 if i is at or before the start of the text.
    //
    private int moveBack(int i) {

        if (i <= 0) {
            return -1;
        }

        int      c;
        int      j = i;
        do {
            j = moveIndex32(fText, j, -1);
            c = UTF16.charAt(fText, j);
        }
        while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
        return j;
    }


    //  moveForward()  Advance from index i past one code point and then past any
    //                 ($Extend | $Format)* that follow it.
    //                 Returns the text length if i is already at or past the end.
    int moveForward(int i) {
        if (i>=fText.length()) {
            return fText.length();
        }
        int   c;
        int   j = i;
        do {
            j = moveIndex32(fText, j, 1);
            c = cAt(j);
        }
        while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
        return j;

    }

    //  cAt()   Code point at index pos, or -1 if pos is outside of the text.
    int cAt(int pos) {
        if (pos<0 || pos>=fText.length()) {
            return -1;
        }
        return UTF16.charAt(fText, pos);
    }

    //  next()  Find the first sentence break following prevPos.
    //          Returns -1 if prevPos is already at or past the end of the text.
    @Override
    int   next(int prevPos) {
        int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
        //   break position being tested.  The candidate break
        //   location is before p2.
        int     breakPos = -1;

        int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
        int c;

        // Prev break at end of string.  return DONE.
        if (prevPos >= fText.length()) {
            return -1;
        }
        /*p0 =*/ p1 = p2 = p3 = prevPos;
        c3 = UTF16.charAt(fText, prevPos);
        c0 = c1 = c2 = 0;

        // Loop runs once per "significant" character position in the input text.
        for (;;) {
            // Move all of the positions forward in the input string.
            /*p0 = p1;*/  c0 = c1;
            p1 = p2;  c1 = c2;
            p2 = p3;  c2 = c3;

            // Advance p3 by  X(Extend | Format)*   Rule 4
            p3 = moveForward(p3);
            c3 = cAt(p3);

            // Rule (3) CR x LF
            //   The p2==(p1+1) test requires the LF to follow the CR directly.
            if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
                continue;
            }

            // Rule (4)    Sep  <break>
            if (fSepSet.contains(c1)) {
                p2 = p1+1;   // Separators don't combine with Extend or Format
                break;
            }

            if (p2 >= fText.length()) {
                // Reached end of string.  Always a break position.
                break;
            }

            if (p2 == prevPos) {
                // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                continue;
            }

            // Rule (6).   ATerm x Numeric
            if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
                continue;
            }

            // Rule (7).  (Upper | Lower) ATerm  x  Upper
            if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
                    fATermSet.contains(c1) && fUpperSet.contains(c2)) {
                continue;
            }

            // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep))* Lower
            //           Note:  STerm | ATerm are added to the negated part of the expression by a
            //                  note to the Unicode 5.0 documents.
            int p8 = p1;
            while (p8>0 && fSpSet.contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (p8>0 && fCloseSet.contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            if (fATermSet.contains(cAt(p8))) {
                // Look ahead from p2 for the first character that stops the negated run.
                p8=p2;
                for (;;) {
                    c = cAt(p8);
                    if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
                        fLowerSet.contains(c) || fSepSet.contains(c) ||
                        fATermSet.contains(c) || fSTermSet.contains(c))
                     {
                        break;
                     }
                    p8 = moveForward(p8);
                }
                if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
                    continue;
                }
            }

            // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)
            //   setContains() is used here because the backward scans have no p8>0 guard,
            //   so cAt() may be handed an out-of-range index and return -1.
            if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
                p8 = p1;
                while (setContains(fSpSet, cAt(p8))) {
                    p8 = moveBack(p8);
                }
                while (setContains(fCloseSet, cAt(p8))) {
                    p8 = moveBack(p8);
                }
                c = cAt(p8);
                if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
                    continue;
                }
            }


            // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
            int p9 = p1;
            while (p9>0 && fCloseSet.contains(cAt(p9))) {
                p9 = moveBack(p9);
            }
            c = cAt(p9);
            if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
                if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
                    continue;
                }
            }

            // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
            int p10 = p1;
            while (p10>0 && fSpSet.contains(cAt(p10))) {
                p10 = moveBack(p10);
            }
            while (p10>0 && fCloseSet.contains(cAt(p10))) {
                p10 = moveBack(p10);
            }
            if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
                if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
                    continue;
                }
            }

            // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
            //   A single Sep immediately before the candidate position is skipped first.
            int p11 = p1;
            if (p11>0 && fSepSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            while (p11>0 && fSpSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            while (p11>0 && fCloseSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
                break;
            }

            //  Rule (12)  Any x Any
            continue;
        }
        breakPos = p2;
        return breakPos;
    }



}


/**
 * Move an index into a string by n code points.
 *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
 *   complicating usage.
 * @param s   a Text string
 * @param pos The starting code unit index into the text string
 * @param amt The amount to adjust the string by.
 * @return    The adjusted code unit index, pinned to the string's length, or
 *            unchanged if input index was outside of the string.
1710 */ 1711 static int moveIndex32(StringBuffer s, int pos, int amt) { 1712 int i; 1713 char c; 1714 if (amt>0) { 1715 for (i=0; i<amt; i++) { 1716 if (pos >= s.length()) { 1717 return s.length(); 1718 } 1719 c = s.charAt(pos); 1720 pos++; 1721 if (UTF16.isLeadSurrogate(c) && pos < s.length()) { 1722 c = s.charAt(pos); 1723 if (UTF16.isTrailSurrogate(c)) { 1724 pos++; 1725 } 1726 } 1727 } 1728 } else { 1729 for (i=0; i>amt; i--) { 1730 if (pos <= 0) { 1731 return 0; 1732 } 1733 pos--; 1734 c = s.charAt(pos); 1735 if (UTF16.isTrailSurrogate(c) && pos >= 0) { 1736 c = s.charAt(pos); 1737 if (UTF16.isLeadSurrogate(c)) { 1738 pos--; 1739 } 1740 } 1741 } 1742 } 1743 return pos; 1744 } 1745 1746 /** 1747 * No-exceptions form of UnicodeSet.contains(c). 1748 * Simplifies loops that terminate with an end-of-input character value. 1749 * @param s A unicode set 1750 * @param c A code point value 1751 * @return true if the set contains c. 1752 */ 1753 static boolean setContains(UnicodeSet s, int c) { 1754 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) { 1755 return false; 1756 } 1757 return s.contains(c); 1758 } 1759 1760 1761 /** 1762 * return the index of the next code point in the input text. 1763 * @param i the preceding index 1764 */ 1765 static int nextCP(StringBuffer s, int i) { 1766 if (i == -1) { 1767 // End of Input indication. Continue to return end value. 1768 return -1; 1769 } 1770 int retVal = i + 1; 1771 if (retVal > s.length()) { 1772 return -1; 1773 } 1774 int c = UTF16.charAt(s, i); 1775 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) { 1776 retVal++; 1777 } 1778 return retVal; 1779 } 1780 1781 1782 /** 1783 * random number generator. Not using Java's built-in Randoms for two reasons: 1784 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. 1785 * 2. We need to get and restore the seed from values occurring in the middle 1786 * of a long sequence, to more easily reproduce failing cases. 
1787 */ 1788 private static int m_seed = 1; 1789 private static int m_rand() 1790 { 1791 m_seed = m_seed * 1103515245 + 12345; 1792 return (m_seed >>> 16) % 32768; 1793 } 1794 1795 // Helper function for formatting error output. 1796 // Append a string into a fixed-size field in a StringBuffer. 1797 // Blank-pad the string if it is shorter than the field. 1798 // Truncate the source string if it is too long. 1799 // 1800 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) { 1801 int appendLen = src.length(); 1802 if (appendLen >= fieldLen) { 1803 dest.append(src.substring(0, fieldLen)); 1804 } else { 1805 dest.append(src); 1806 while (appendLen < fieldLen) { 1807 dest.append(' '); 1808 appendLen++; 1809 } 1810 } 1811 } 1812 1813 // Helper function for formatting error output. 1814 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format 1815 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) { 1816 String hexChars = "0123456789abcdef"; 1817 if (c < 0x10000) { 1818 dest.append("\\u"); 1819 for (int bn=12; bn>=0; bn-=4) { 1820 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1821 } 1822 appendToBuf(dest, " ", fieldLen-6); 1823 } else { 1824 dest.append("\\U"); 1825 for (int bn=28; bn>=0; bn-=4) { 1826 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1827 } 1828 appendToBuf(dest, " ", fieldLen-10); 1829 1830 } 1831 } 1832 1833 /** 1834 * Run a RBBI monkey test. Common routine, for all break iterator types. 1835 * Parameters: 1836 * bi - the break iterator to use 1837 * mk - MonkeyKind, abstraction for obtaining expected results 1838 * name - Name of test (char, word, etc.) 
for use in error messages 1839 * seed - Seed for starting random number generator (parameter from user) 1840 * numIterations 1841 */ 1842 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { 1843 int TESTSTRINGLEN = 500; 1844 StringBuffer testText = new StringBuffer(); 1845 int numCharClasses; 1846 List chClasses; 1847 int[] expected = new int[TESTSTRINGLEN*2 + 1]; 1848 int expectedCount = 0; 1849 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1850 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1851 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1852 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1853 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1854 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1855 int i; 1856 int loopCount = 0; 1857 boolean printTestData = false; 1858 boolean printBreaksFromBI = false; 1859 1860 m_seed = seed; 1861 1862 numCharClasses = mk.charClasses().size(); 1863 chClasses = mk.charClasses(); 1864 1865 // Verify that the character classes all have at least one member. 1866 for (i=0; i<numCharClasses; i++) { 1867 UnicodeSet s = (UnicodeSet)chClasses.get(i); 1868 if (s == null || s.size() == 0) { 1869 errln("Character Class " + i + " is null or of zero size."); 1870 return; 1871 } 1872 } 1873 1874 //-------------------------------------------------------------------------------------------- 1875 // 1876 // Debugging settings. Comment out everything in the following block for normal operation 1877 // 1878 //-------------------------------------------------------------------------------------------- 1879 // numIterations = -1; 1880 // numIterations = 10000; // Same as exhaustive. 
1881 // RuleBasedBreakIterator_New.fTrace = true; 1882 // m_seed = 859056465; 1883 // TESTSTRINGLEN = 50; 1884 // printTestData = true; 1885 // printBreaksFromBI = true; 1886 // ((RuleBasedBreakIterator_New)bi).dump(); 1887 1888 //-------------------------------------------------------------------------------------------- 1889 // 1890 // End of Debugging settings. 1891 // 1892 //-------------------------------------------------------------------------------------------- 1893 1894 int dotsOnLine = 0; 1895 while (loopCount < numIterations || numIterations == -1) { 1896 if (numIterations == -1 && loopCount % 10 == 0) { 1897 // If test is running in an infinite loop, display a periodic tic so 1898 // we can tell that it is making progress. 1899 System.out.print("."); 1900 if (dotsOnLine++ >= 80){ 1901 System.out.println(); 1902 dotsOnLine = 0; 1903 } 1904 } 1905 // Save current random number seed, so that we can recreate the random numbers 1906 // for this loop iteration in event of an error. 1907 seed = m_seed; 1908 1909 testText.setLength(0); 1910 // Populate a test string with data. 1911 if (printTestData) { 1912 System.out.println("Test Data string ..."); 1913 } 1914 for (i=0; i<TESTSTRINGLEN; i++) { 1915 int aClassNum = m_rand() % numCharClasses; 1916 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); 1917 int charIdx = m_rand() % classSet.size(); 1918 int c = classSet.charAt(charIdx); 1919 if (c < 0) { // TODO: deal with sets containing strings. 
1920 errln("c < 0"); 1921 } 1922 UTF16.appendCodePoint(testText, c); 1923 if (printTestData) { 1924 System.out.print(Integer.toHexString(c) + " "); 1925 } 1926 } 1927 if (printTestData) { 1928 System.out.println(); 1929 } 1930 1931 Arrays.fill(expected, 0); 1932 Arrays.fill(expectedBreaks, false); 1933 Arrays.fill(forwardBreaks, false); 1934 Arrays.fill(reverseBreaks, false); 1935 Arrays.fill(isBoundaryBreaks, false); 1936 Arrays.fill(followingBreaks, false); 1937 Arrays.fill(precedingBreaks, false); 1938 1939 // Calculate the expected results for this test string. 1940 mk.setText(testText); 1941 expectedCount = 0; 1942 expectedBreaks[0] = true; 1943 expected[expectedCount ++] = 0; 1944 int breakPos = 0; 1945 int lastBreakPos = -1; 1946 for (;;) { 1947 lastBreakPos = breakPos; 1948 breakPos = mk.next(breakPos); 1949 if (breakPos == -1) { 1950 break; 1951 } 1952 if (breakPos > testText.length()) { 1953 errln("breakPos > testText.length()"); 1954 } 1955 if (lastBreakPos >= breakPos) { 1956 errln("Next() not increasing."); 1957 // break; 1958 } 1959 expectedBreaks[breakPos] = true; 1960 expected[expectedCount ++] = breakPos; 1961 } 1962 1963 // Find the break positions using forward iteration 1964 if (printBreaksFromBI) { 1965 System.out.println("Breaks from BI..."); 1966 } 1967 bi.setText(testText.toString()); 1968 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { 1969 if (i < 0 || i > testText.length()) { 1970 errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); 1971 break; 1972 } 1973 if (printBreaksFromBI) { 1974 System.out.print(Integer.toHexString(i) + " "); 1975 } 1976 forwardBreaks[i] = true; 1977 } 1978 if (printBreaksFromBI) { 1979 System.out.println(); 1980 } 1981 1982 // Find the break positions using reverse iteration 1983 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { 1984 if (i < 0 || i > testText.length()) { 1985 errln(name + " break monkey test: Out of range value returned by 
breakIterator.next()" + name); 1986 break; 1987 } 1988 reverseBreaks[i] = true; 1989 } 1990 1991 // Find the break positions using isBoundary() tests. 1992 for (i=0; i<=testText.length(); i++) { 1993 isBoundaryBreaks[i] = bi.isBoundary(i); 1994 } 1995 1996 // Find the break positions using the following() function. 1997 lastBreakPos = 0; 1998 followingBreaks[0] = true; 1999 for (i=0; i<testText.length(); i++) { 2000 breakPos = bi.following(i); 2001 if (breakPos <= i || 2002 breakPos < lastBreakPos || 2003 breakPos > testText.length() || 2004 breakPos > lastBreakPos && lastBreakPos > i ) { 2005 errln(name + " break monkey test: " + 2006 "Out of range value returned by BreakIterator::following().\n" + 2007 "index=" + i + "following returned=" + breakPos + 2008 "lastBreak=" + lastBreakPos); 2009 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2010 } else { 2011 followingBreaks[breakPos] = true; 2012 lastBreakPos = breakPos; 2013 } 2014 } 2015 2016 // Find the break positions using the preceding() function. 2017 lastBreakPos = testText.length(); 2018 precedingBreaks[testText.length()] = true; 2019 for (i=testText.length(); i>0; i--) { 2020 breakPos = bi.preceding(i); 2021 if (breakPos >= i || 2022 breakPos > lastBreakPos || 2023 breakPos < 0 || 2024 breakPos < lastBreakPos && lastBreakPos < i ) { 2025 errln(name + " break monkey test: " + 2026 "Out of range value returned by BreakIterator::preceding().\n" + 2027 "index=" + i + "preceding returned=" + breakPos + 2028 "lastBreak=" + lastBreakPos); 2029 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2030 } else { 2031 precedingBreaks[breakPos] = true; 2032 lastBreakPos = breakPos; 2033 } 2034 } 2035 2036 2037 2038 // Compare the expected and actual results. 
2039 for (i=0; i<=testText.length(); i++) { 2040 String errorType = null; 2041 if (forwardBreaks[i] != expectedBreaks[i]) { 2042 errorType = "next()"; 2043 } else if (reverseBreaks[i] != forwardBreaks[i]) { 2044 errorType = "previous()"; 2045 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 2046 errorType = "isBoundary()"; 2047 } else if (followingBreaks[i] != expectedBreaks[i]) { 2048 errorType = "following()"; 2049 } else if (precedingBreaks[i] != expectedBreaks[i]) { 2050 errorType = "preceding()"; 2051 } 2052 2053 if (errorType != null) { 2054 // Format a range of the test text that includes the failure as 2055 // a data item that can be included in the rbbi test data file. 2056 2057 // Start of the range is the last point where expected and actual results 2058 // both agreed that there was a break position. 2059 int startContext = i; 2060 int count = 0; 2061 for (;;) { 2062 if (startContext==0) { break; } 2063 startContext --; 2064 if (expectedBreaks[startContext]) { 2065 if (count == 2) break; 2066 count ++; 2067 } 2068 } 2069 2070 // End of range is two expected breaks past the start position. 2071 int endContext = i + 1; 2072 int ci; 2073 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 2074 for (;;) { 2075 if (endContext >= testText.length()) {break;} 2076 if (expectedBreaks[endContext-1]) { 2077 if (count == 0) break; 2078 count --; 2079 } 2080 endContext ++; 2081 } 2082 } 2083 2084 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>" 2085 StringBuffer errorText = new StringBuffer(); 2086 2087 int c; // Char from test data 2088 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { 2089 if (ci == i) { 2090 // This is the location of the error. 2091 errorText.append("<?>---------------------------------\n"); 2092 } else if (expectedBreaks[ci]) { 2093 // This a non-error expected break position. 
2094 errorText.append("------------------------------------\n"); 2095 } 2096 if (ci < testText.length()) { 2097 c = UTF16.charAt(testText, ci); 2098 appendCharToBuf(errorText, c, 11); 2099 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT); 2100 appendToBuf(errorText, gc, 8); 2101 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty); 2102 String extraPropValue = 2103 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG); 2104 appendToBuf(errorText, extraPropValue, 20); 2105 2106 String charName = UCharacter.getExtendedName(c); 2107 appendToBuf(errorText, charName, 40); 2108 errorText.append('\n'); 2109 } 2110 } 2111 if (ci == testText.length() && ci != -1) { 2112 errorText.append("<>"); 2113 } 2114 errorText.append("</data>\n"); 2115 2116 // Output the error 2117 errln(name + " break monkey test error. " + 2118 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") + 2119 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" + 2120 errorText); 2121 break; 2122 } 2123 } 2124 2125 loopCount++; 2126 } 2127 } 2128 2129 @Test 2130 public void TestCharMonkey() { 2131 2132 int loopCount = 500; 2133 int seed = 1; 2134 2135 if (TestFmwk.getExhaustiveness() >= 9) { 2136 loopCount = 10000; 2137 } 2138 2139 RBBICharMonkey m = new RBBICharMonkey(); 2140 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2141 RunMonkey(bi, m, "char", seed, loopCount); 2142 } 2143 2144 @Test 2145 public void TestWordMonkey() { 2146 2147 int loopCount = 500; 2148 int seed = 1; 2149 2150 if (TestFmwk.getExhaustiveness() >= 9) { 2151 loopCount = 10000; 2152 } 2153 2154 logln("Word Break Monkey Test"); 2155 RBBIWordMonkey m = new RBBIWordMonkey(); 2156 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2157 RunMonkey(bi, m, "word", seed, loopCount); 2158 } 2159 2160 @Test 2161 public void 
TestLineMonkey() { 2162 int loopCount = 500; 2163 int seed = 1; 2164 2165 if (TestFmwk.getExhaustiveness() >= 9) { 2166 loopCount = 10000; 2167 } 2168 2169 logln("Line Break Monkey Test"); 2170 RBBILineMonkey m = new RBBILineMonkey(); 2171 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2172 RunMonkey(bi, m, "line", seed, loopCount); 2173 } 2174 2175 @Test 2176 public void TestSentMonkey() { 2177 2178 int loopCount = 500; 2179 int seed = 1; 2180 2181 if (TestFmwk.getExhaustiveness() >= 9) { 2182 loopCount = 3000; 2183 } 2184 2185 logln("Sentence Break Monkey Test"); 2186 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2187 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2188 RunMonkey(bi, m, "sent", seed, loopCount); 2189 } 2190 // 2191 // Round-trip monkey tests. 2192 // Verify that break iterators created from the rule source from the default 2193 // break iterators still pass the monkey test for the iterator type. 2194 // 2195 // This is a major test for the Rule Compiler. The default break iterators are built 2196 // from pre-compiled binary rule data that was created using ICU4C; these 2197 // round-trip rule recompile tests verify that the Java rule compiler can 2198 // rebuild break iterators from the original source rules. 
2199 // 2200 @Test 2201 public void TestRTCharMonkey() { 2202 2203 int loopCount = 200; 2204 int seed = 1; 2205 2206 if (TestFmwk.getExhaustiveness() >= 9) { 2207 loopCount = 2000; 2208 } 2209 2210 RBBICharMonkey m = new RBBICharMonkey(); 2211 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2212 String rules = bi.toString(); 2213 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2214 RunMonkey(rtbi, m, "char", seed, loopCount); 2215 } 2216 2217 @Test 2218 public void TestRTWordMonkey() { 2219 2220 int loopCount = 200; 2221 int seed = 1; 2222 2223 if (TestFmwk.getExhaustiveness() >= 9) { 2224 loopCount = 2000; 2225 } 2226 logln("Word Break Monkey Test"); 2227 RBBIWordMonkey m = new RBBIWordMonkey(); 2228 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2229 String rules = bi.toString(); 2230 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2231 RunMonkey(rtbi, m, "word", seed, loopCount); 2232 } 2233 2234 @Test 2235 public void TestRTLineMonkey() { 2236 int loopCount = 200; 2237 int seed = 1; 2238 2239 if (TestFmwk.getExhaustiveness() >= 9) { 2240 loopCount = 2000; 2241 } 2242 2243 logln("Line Break Monkey Test"); 2244 RBBILineMonkey m = new RBBILineMonkey(); 2245 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2246 String rules = bi.toString(); 2247 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2248 RunMonkey(rtbi, m, "line", seed, loopCount); 2249 } 2250 2251 @Test 2252 public void TestRTSentMonkey() { 2253 2254 int loopCount = 200; 2255 int seed = 1; 2256 2257 if (TestFmwk.getExhaustiveness() >= 9) { 2258 loopCount = 1000; 2259 } 2260 2261 logln("Sentence Break Monkey Test"); 2262 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2263 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2264 String rules = bi.toString(); 2265 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2266 RunMonkey(rtbi, m, "sent", seed, loopCount); 2267 } 2268 } 2269 2270