1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package java.text; 19 20 import java.util.Locale; 21 import libcore.icu.ICU; 22 import libcore.icu.NativeBreakIterator; 23 24 /** 25 * Locates boundaries in text. This class defines a protocol for objects that 26 * break up a piece of natural-language text according to a set of criteria. 27 * Instances or subclasses of {@code BreakIterator} can be provided, for 28 * example, to break a piece of text into words, sentences, or logical 29 * characters according to the conventions of some language or group of 30 * languages. We provide four built-in types of {@code BreakIterator}: 31 * <ul> 32 * <li>{@link #getSentenceInstance()} returns a {@code BreakIterator} that 33 * locates boundaries between sentences. This is useful for triple-click 34 * selection, for example.</li> 35 * <li>{@link #getWordInstance()} returns a {@code BreakIterator} that locates 36 * boundaries between words. This is useful for double-click selection or "find 37 * whole words" searches. This type of {@code BreakIterator} makes sure there is 38 * a boundary position at the beginning and end of each legal word (numbers 39 * count as words, too). Whitespace and punctuation are kept separate from real 40 * words.</li> 41 * <li>{@code getLineInstance()} returns a {@code BreakIterator} that locates 42 * positions where it is legal for a text editor to wrap lines. This is similar 43 * to word breaking, but not the same: punctuation and whitespace are generally 44 * kept with words (you don't want a line to start with whitespace, for 45 * example), and some special characters can force a position to be considered a 46 * line break position or prevent a position from being a line break position.</li> 47 * <li>{@code getCharacterInstance()} returns a {@code BreakIterator} that 48 * locates boundaries between logical characters. Because of the structure of 49 * the Unicode encoding, a logical character may be stored internally as more 50 * than one Unicode code point. (A with an umlaut may be stored as an a followed 51 * by a separate combining umlaut character, for example, but the user still 52 * thinks of it as one character.) This iterator allows various processes 53 * (especially text editors) to treat as characters the units of text that a 54 * user would think of as characters, rather than the units of text that the 55 * computer sees as "characters".</li> 56 * </ul> {@code BreakIterator}'s interface follows an "iterator" model (hence 57 * the name), meaning it has a concept of a "current position" and methods like 58 * {@code first()}, {@code last()}, {@code next()}, and {@code previous()} that 59 * update the current position. All {@code BreakIterator}s uphold the following 60 * invariants: 61 * <ul> 62 * <li>The beginning and end of the text are always treated as boundary 63 * positions.</li> 64 * <li>The current position of the iterator is always a boundary position 65 * (random- access methods move the iterator to the nearest boundary position 66 * before or after the specified position, not <i>to</i> the specified 67 * position).</li> 68 * <li>{@code DONE} is used as a flag to indicate when iteration has stopped. 69 * {@code DONE} is only returned when the current position is the end of the 70 * text and the user calls {@code next()}, or when the current position is the 71 * beginning of the text and the user calls {@code previous()}.</li> 72 * <li>Break positions are numbered by the positions of the characters that 73 * follow them. Thus, under normal circumstances, the position before the first 74 * character is 0, the position after the first character is 1, and the position 75 * after the last character is 1 plus the length of the string.</li> 76 * <li>The client can change the position of an iterator, or the text it 77 * analyzes, at will, but cannot change the behavior. If the user wants 78 * different behavior, he must instantiate a new iterator.</li> 79 * </ul> 80 * <p> 81 * {@code BreakIterator} accesses the text it analyzes through a 82 * {@link CharacterIterator}, which makes it possible to use {@code 83 * BreakIterator} to analyze text in any text-storage vehicle that provides a 84 * {@code CharacterIterator} interface. 85 * <p> 86 * <em>Note:</em> Some types of {@code BreakIterator} can take a long time to 87 * create, and instances of {@code BreakIterator} are not currently cached by 88 * the system. For optimal performance, keep instances of {@code BreakIterator} 89 * around as long as it makes sense. For example, when word-wrapping a document, 90 * don't create and destroy a new {@code BreakIterator} for each line. Create 91 * one break iterator for the whole document (or whatever stretch of text you're 92 * wrapping) and use it to do the whole job of wrapping the text. 93 * <p> 94 * <em>Examples</em>: 95 * <p> 96 * Creating and using text boundaries: 97 * <blockquote> 98 * 99 * <pre> 100 * public static void main(String args[]) { 101 * if (args.length == 1) { 102 * String stringToExamine = args[0]; 103 * //print each word in order 104 * BreakIterator boundary = BreakIterator.getWordInstance(); 105 * boundary.setText(stringToExamine); 106 * printEachForward(boundary, stringToExamine); 107 * //print each sentence in reverse order 108 * boundary = BreakIterator.getSentenceInstance(Locale.US); 109 * boundary.setText(stringToExamine); 110 * printEachBackward(boundary, stringToExamine); 111 * printFirst(boundary, stringToExamine); 112 * printLast(boundary, stringToExamine); 113 * } 114 * } 115 * </pre> 116 * 117 * </blockquote> 118 * <p> 119 * Print each element in order: 120 * <blockquote> 121 * 122 * <pre> 123 * public static void printEachForward(BreakIterator boundary, String source) { 124 * int start = boundary.first(); 125 * for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { 126 * System.out.println(source.substring(start, end)); 127 * } 128 * } 129 * </pre> 130 * 131 * </blockquote> 132 * <p> 133 * Print each element in reverse order: 134 * <blockquote> 135 * 136 * <pre> 137 * public static void printEachBackward(BreakIterator boundary, String source) { 138 * int end = boundary.last(); 139 * for (int start = boundary.previous(); start != BreakIterator.DONE; end = start, start = boundary 140 * .previous()) { 141 * System.out.println(source.substring(start, end)); 142 * } 143 * } 144 * </pre> 145 * 146 * </blockquote> 147 * <p> 148 * Print the first element: 149 * <blockquote> 150 * 151 * <pre> 152 * public static void printFirst(BreakIterator boundary, String source) { 153 * int start = boundary.first(); 154 * int end = boundary.next(); 155 * System.out.println(source.substring(start, end)); 156 * } 157 * </pre> 158 * 159 * </blockquote> 160 * <p> 161 * Print the last element: 162 * <blockquote> 163 * 164 * <pre> 165 * public static void printLast(BreakIterator boundary, String source) { 166 * int end = boundary.last(); 167 * int start = boundary.previous(); 168 * System.out.println(source.substring(start, end)); 169 * } 170 * </pre> 171 * 172 * </blockquote> 173 * <p> 174 * Print the element at a specified position: 175 * <blockquote> 176 * 177 * <pre> 178 * public static void printAt(BreakIterator boundary, int pos, String source) { 179 * int end = boundary.following(pos); 180 * int start = boundary.previous(); 181 * System.out.println(source.substring(start, end)); 182 * } 183 * </pre> 184 * 185 * </blockquote> 186 * <p> 187 * Find the next word: 188 * <blockquote> 189 * 190 * <pre> 191 * public static int nextWordStartAfter(int pos, String text) { 192 * BreakIterator wb = BreakIterator.getWordInstance(); 193 * wb.setText(text); 194 * int last = wb.following(pos); 195 * int current = wb.next(); 196 * while (current != BreakIterator.DONE) { 197 * for (int p = last; p < current; p++) { 198 * if (Character.isLetter(text.charAt(p))) 199 * return last; 200 * } 201 * last = current; 202 * current = wb.next(); 203 * } 204 * return BreakIterator.DONE; 205 * } 206 * </pre> 207 * 208 * </blockquote> 209 * <p> 210 * The iterator returned by {@code BreakIterator.getWordInstance()} is unique in 211 * that the break positions it returns don't represent both the start and end of 212 * the thing being iterated over. That is, a sentence-break iterator returns 213 * breaks that each represent the end of one sentence and the beginning of the 214 * next. With the word-break iterator, the characters between two boundaries 215 * might be a word, or they might be the punctuation or whitespace between two 216 * words. The above code uses a simple heuristic to determine which boundary is 217 * the beginning of a word: If the characters between this boundary and the next 218 * boundary include at least one letter (this can be an alphabetical letter, a 219 * CJK ideograph, a Hangul syllable, a Kana character, etc.), then the text 220 * between this boundary and the next is a word; otherwise, it's the material 221 * between words.) 222 * 223 * @see CharacterIterator 224 */ 225 public abstract class BreakIterator implements Cloneable { 226 227 /** 228 * This constant is returned by iterate methods like {@code previous()} or 229 * {@code next()} if they have returned all valid boundaries. 230 */ 231 public static final int DONE = -1; 232 233 // the wrapped ICU implementation 234 NativeBreakIterator wrapped; 235 236 /** 237 * Default constructor, for use by subclasses. 238 */ 239 protected BreakIterator() { 240 } 241 242 /* 243 * wrapping constructor 244 */ 245 BreakIterator(NativeBreakIterator iterator) { 246 wrapped = iterator; 247 } 248 249 /** 250 * Returns an array of locales for which custom {@code BreakIterator} instances 251 * are available. 252 * <p>Note that Android does not support user-supplied locale service providers. 253 */ 254 public static Locale[] getAvailableLocales() { 255 return ICU.getAvailableBreakIteratorLocales(); 256 } 257 258 /** 259 * Returns a new instance of {@code BreakIterator} to iterate over 260 * characters using the user's default locale. 261 * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>". 262 * @return a new instance of {@code BreakIterator} using the default locale. 263 */ 264 public static BreakIterator getCharacterInstance() { 265 return getCharacterInstance(Locale.getDefault()); 266 } 267 268 /** 269 * Returns a new instance of {@code BreakIterator} to iterate over 270 * characters using the given locale. 271 * 272 * @param where 273 * the given locale. 274 * @return a new instance of {@code BreakIterator} using the given locale. 275 */ 276 public static BreakIterator getCharacterInstance(Locale where) { 277 return new RuleBasedBreakIterator(NativeBreakIterator.getCharacterInstance(where)); 278 } 279 280 /** 281 * Returns a new instance of {{@code BreakIterator} to iterate over 282 * line breaks using the user's default locale. 283 * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>". 284 * @return a new instance of {@code BreakIterator} using the default locale. 285 */ 286 public static BreakIterator getLineInstance() { 287 return getLineInstance(Locale.getDefault()); 288 } 289 290 /** 291 * Returns a new instance of {@code BreakIterator} to iterate over 292 * line breaks using the given locale. 293 * 294 * @param where 295 * the given locale. 296 * @return a new instance of {@code BreakIterator} using the given locale. 297 * @throws NullPointerException if {@code where} is {@code null}. 298 */ 299 public static BreakIterator getLineInstance(Locale where) { 300 return new RuleBasedBreakIterator(NativeBreakIterator.getLineInstance(where)); 301 } 302 303 /** 304 * Returns a new instance of {@code BreakIterator} to iterate over 305 * sentence-breaks using the default locale. 306 * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>". 307 * @return a new instance of {@code BreakIterator} using the default locale. 308 */ 309 public static BreakIterator getSentenceInstance() { 310 return getSentenceInstance(Locale.getDefault()); 311 } 312 313 /** 314 * Returns a new instance of {@code BreakIterator} to iterate over 315 * sentence-breaks using the given locale. 316 * 317 * @param where 318 * the given locale. 319 * @return a new instance of {@code BreakIterator} using the given locale. 320 * @throws NullPointerException if {@code where} is {@code null}. 321 */ 322 public static BreakIterator getSentenceInstance(Locale where) { 323 return new RuleBasedBreakIterator(NativeBreakIterator.getSentenceInstance(where)); 324 } 325 326 /** 327 * Returns a new instance of {@code BreakIterator} to iterate over 328 * word-breaks using the default locale. 329 * See "<a href="../util/Locale.html#default_locale">Be wary of the default locale</a>". 330 * @return a new instance of {@code BreakIterator} using the default locale. 331 */ 332 public static BreakIterator getWordInstance() { 333 return getWordInstance(Locale.getDefault()); 334 } 335 336 /** 337 * Returns a new instance of {@code BreakIterator} to iterate over 338 * word-breaks using the given locale. 339 * 340 * @param where 341 * the given locale. 342 * @return a new instance of {@code BreakIterator} using the given locale. 343 * @throws NullPointerException if {@code where} is {@code null}. 344 */ 345 public static BreakIterator getWordInstance(Locale where) { 346 return new RuleBasedBreakIterator(NativeBreakIterator.getWordInstance(where)); 347 } 348 349 /** 350 * Indicates whether the given offset is a boundary position. If this method 351 * returns true, the current iteration position is set to the given 352 * position; if the function returns false, the current iteration position 353 * is set as though {@link #following(int)} had been called. 354 * 355 * @param offset 356 * the given offset to check. 357 * @return {@code true} if the given offset is a boundary position; {@code 358 * false} otherwise. 359 */ 360 public boolean isBoundary(int offset) { 361 return wrapped.isBoundary(offset); 362 } 363 364 /** 365 * Returns the position of last boundary preceding the given offset, and 366 * sets the current position to the returned value, or {@code DONE} if the 367 * given offset specifies the starting position. 368 * 369 * @param offset 370 * the given start position to be searched for. 371 * @return the position of the last boundary preceding the given offset. 372 * @throws IllegalArgumentException 373 * if the offset is invalid. 374 */ 375 public int preceding(int offset) { 376 return wrapped.preceding(offset); 377 } 378 379 /** 380 * Sets the new text string to be analyzed, the current position will be 381 * reset to the beginning of this new string, and the old string will be 382 * lost. 383 * 384 * @param newText 385 * the new text string to be analyzed. 386 */ 387 public void setText(String newText) { 388 if (newText == null) { 389 throw new NullPointerException("newText == null"); 390 } 391 wrapped.setText(newText); 392 } 393 394 /** 395 * Returns this iterator's current position. 396 * 397 * @return this iterator's current position. 398 */ 399 public abstract int current(); 400 401 /** 402 * Sets this iterator's current position to the first boundary and returns 403 * that position. 404 * 405 * @return the position of the first boundary. 406 */ 407 public abstract int first(); 408 409 /** 410 * Sets the position of the first boundary to the one following the given 411 * offset and returns this position. Returns {@code DONE} if there is no 412 * boundary after the given offset. 413 * 414 * @param offset 415 * the given position to be searched for. 416 * @return the position of the first boundary following the given offset. 417 * @throws IllegalArgumentException 418 * if the offset is invalid. 419 */ 420 public abstract int following(int offset); 421 422 /** 423 * Returns a {@code CharacterIterator} which represents the text being 424 * analyzed. Please note that the returned value is probably the internal 425 * iterator used by this object. If the invoker wants to modify the status 426 * of the returned iterator, it is recommended to first create a clone of 427 * the iterator returned. 428 * 429 * @return a {@code CharacterIterator} which represents the text being 430 * analyzed. 431 */ 432 public abstract CharacterIterator getText(); 433 434 /** 435 * Sets this iterator's current position to the last boundary and returns 436 * that position. 437 * 438 * @return the position of last boundary. 439 */ 440 public abstract int last(); 441 442 /** 443 * Sets this iterator's current position to the next boundary after the 444 * current position, and returns this position. Returns {@code DONE} if no 445 * boundary was found after the current position. 446 * 447 * @return the position of last boundary. 448 */ 449 public abstract int next(); 450 451 /** 452 * Sets this iterator's current position to the next boundary after the 453 * given position, and returns that position. Returns {@code DONE} if no 454 * boundary was found after the given position. 455 * 456 * @param n 457 * the given position. 458 * @return the position of last boundary. 459 */ 460 public abstract int next(int n); 461 462 /** 463 * Sets this iterator's current position to the previous boundary before the 464 * current position and returns that position. Returns {@code DONE} if 465 * no boundary was found before the current position. 466 * 467 * @return the position of last boundary. 468 */ 469 public abstract int previous(); 470 471 /** 472 * Sets the new text to be analyzed by the given {@code CharacterIterator}. 473 * The position will be reset to the beginning of the new text, and other 474 * status information of this iterator will be kept. 475 * 476 * @param newText 477 * the {@code CharacterIterator} referring to the text to be 478 * analyzed. 479 */ 480 public abstract void setText(CharacterIterator newText); 481 482 /** 483 * Returns a copy of this iterator. 484 */ 485 @Override 486 public Object clone() { 487 try { 488 BreakIterator cloned = (BreakIterator) super.clone(); 489 cloned.wrapped = (NativeBreakIterator) wrapped.clone(); 490 return cloned; 491 } catch (CloneNotSupportedException e) { 492 throw new AssertionError(e); 493 } 494 } 495 } 496