1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 * ******************************************************************************** 5 * Copyright (C) 2007-2011, International Business Machines Corporation and others. 6 * All Rights Reserved. 7 * ******************************************************************************** 8 */ 9 package com.ibm.icu.impl; 10 11 import java.util.Iterator; 12 import java.util.LinkedList; 13 import java.util.List; 14 import java.util.ListIterator; 15 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.UTF16; 18 19 /** 20 * TextTrieMap is a trie implementation for supporting 21 * fast prefix match for the key. 22 */ 23 public class TextTrieMap<V> { 24 25 private Node _root = new Node(); 26 boolean _ignoreCase; 27 28 /** 29 * Constructs a TextTrieMap object. 30 * 31 * @param ignoreCase true to use simple case insensitive match 32 */ 33 public TextTrieMap(boolean ignoreCase) { 34 _ignoreCase = ignoreCase; 35 } 36 37 /** 38 * Adds the text key and its associated object in this object. 39 * 40 * @param text The text. 41 * @param val The value object associated with the text. 42 */ 43 public TextTrieMap<V> put(CharSequence text, V val) { 44 CharIterator chitr = new CharIterator(text, 0, _ignoreCase); 45 _root.add(chitr, val); 46 return this; 47 } 48 49 /** 50 * Gets an iterator of the objects associated with the 51 * longest prefix matching string key. 52 * 53 * @param text The text to be matched with prefixes. 54 * @return An iterator of the objects associated with 55 * the longest prefix matching matching key, or null 56 * if no matching entry is found. 57 */ 58 public Iterator<V> get(String text) { 59 return get(text, 0); 60 } 61 62 /** 63 * Gets an iterator of the objects associated with the 64 * longest prefix matching string key starting at the 65 * specified position. 66 * 67 * @param text The text to be matched with prefixes. 68 * @param start The start index of of the text 69 * @return An iterator of the objects associated with the 70 * longest prefix matching matching key, or null if no 71 * matching entry is found. 72 */ 73 public Iterator<V> get(CharSequence text, int start) { 74 return get(text, start, null); 75 } 76 77 public Iterator<V> get(CharSequence text, int start, int[] matchLen) { 78 LongestMatchHandler<V> handler = new LongestMatchHandler<V>(); 79 find(text, start, handler); 80 if (matchLen != null && matchLen.length > 0) { 81 matchLen[0] = handler.getMatchLength(); 82 } 83 return handler.getMatches(); 84 } 85 86 public void find(CharSequence text, ResultHandler<V> handler) { 87 find(text, 0, handler); 88 } 89 90 public void find(CharSequence text, int offset, ResultHandler<V> handler) { 91 CharIterator chitr = new CharIterator(text, offset, _ignoreCase); 92 find(_root, chitr, handler); 93 } 94 95 private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler) { 96 Iterator<V> values = node.values(); 97 if (values != null) { 98 if (!handler.handlePrefixMatch(chitr.processedLength(), values)) { 99 return; 100 } 101 } 102 103 Node nextMatch = node.findMatch(chitr); 104 if (nextMatch != null) { 105 find(nextMatch, chitr, handler); 106 } 107 } 108 109 /** 110 * Creates an object that consumes code points one at a time and returns intermediate prefix 111 * matches. Returns null if no match exists. 112 * 113 * @return An instance of {@link ParseState}, or null if the starting code point is not a 114 * prefix for any entry in the trie. 115 */ 116 public ParseState openParseState(int startingCp) { 117 // Check to see whether this is a valid starting character. If not, return null. 118 if (_ignoreCase) { 119 startingCp = UCharacter.foldCase(startingCp, true); 120 } 121 int count = Character.charCount(startingCp); 122 char ch1 = (count == 1) ? (char) startingCp : UTF16.getLeadSurrogate(startingCp); 123 if (!_root.hasChildFor(ch1)) { 124 return null; 125 } 126 127 return new ParseState(_root); 128 } 129 130 /** 131 * ParseState is mutable, not thread-safe, and intended to be used internally by parsers for 132 * consuming values from this trie. 133 */ 134 public class ParseState { 135 private Node node; 136 private int offset; 137 private Node.StepResult result; 138 139 ParseState(Node start) { 140 node = start; 141 offset = 0; 142 result = start.new StepResult(); 143 } 144 145 /** 146 * Consumes a code point and walk to the next node in the trie. 147 * 148 * @param cp The code point to consume. 149 */ 150 public void accept(int cp) { 151 assert node != null; 152 if (_ignoreCase) { 153 cp = UCharacter.foldCase(cp, true); 154 } 155 int count = Character.charCount(cp); 156 char ch1 = (count == 1) ? (char) cp : UTF16.getLeadSurrogate(cp); 157 node.takeStep(ch1, offset, result); 158 if (count == 2 && result.node != null) { 159 char ch2 = UTF16.getTrailSurrogate(cp); 160 result.node.takeStep(ch2, result.offset, result); 161 } 162 node = result.node; 163 offset = result.offset; 164 } 165 166 /** 167 * Gets the exact prefix matches for all code points that have been consumed so far. 168 * 169 * @return The matches. 170 */ 171 public Iterator<V> getCurrentMatches() { 172 if (node != null && offset == node.charCount()) { 173 return node.values(); 174 } 175 return null; 176 } 177 178 /** 179 * Checks whether any more code points can be consumed. 180 * 181 * @return true if no more code points can be consumed; false otherwise. 182 */ 183 public boolean atEnd() { 184 return node == null || (node.charCount() == offset && node._children == null); 185 } 186 } 187 188 public static class CharIterator implements Iterator<Character> { 189 private boolean _ignoreCase; 190 private CharSequence _text; 191 private int _nextIdx; 192 private int _startIdx; 193 194 private Character _remainingChar; 195 196 CharIterator(CharSequence text, int offset, boolean ignoreCase) { 197 _text = text; 198 _nextIdx = _startIdx = offset; 199 _ignoreCase = ignoreCase; 200 } 201 202 /* (non-Javadoc) 203 * @see java.util.Iterator#hasNext() 204 */ 205 @Override 206 public boolean hasNext() { 207 if (_nextIdx == _text.length() && _remainingChar == null) { 208 return false; 209 } 210 return true; 211 } 212 213 /* (non-Javadoc) 214 * @see java.util.Iterator#next() 215 */ 216 @Override 217 public Character next() { 218 if (_nextIdx == _text.length() && _remainingChar == null) { 219 return null; 220 } 221 Character next; 222 if (_remainingChar != null) { 223 next = _remainingChar; 224 _remainingChar = null; 225 } else { 226 if (_ignoreCase) { 227 int cp = UCharacter.foldCase(Character.codePointAt(_text, _nextIdx), true); 228 _nextIdx = _nextIdx + Character.charCount(cp); 229 230 char[] chars = Character.toChars(cp); 231 next = chars[0]; 232 if (chars.length == 2) { 233 _remainingChar = chars[1]; 234 } 235 } else { 236 next = _text.charAt(_nextIdx); 237 _nextIdx++; 238 } 239 } 240 return next; 241 } 242 243 /* (non-Javadoc) 244 * @see java.util.Iterator#remove() 245 */ 246 @Override 247 public void remove() { 248 throw new UnsupportedOperationException("remove() not supproted"); 249 } 250 251 public int nextIndex() { 252 return _nextIdx; 253 } 254 255 public int processedLength() { 256 if (_remainingChar != null) { 257 throw new IllegalStateException("In the middle of surrogate pair"); 258 } 259 return _nextIdx - _startIdx; 260 } 261 } 262 263 /** 264 * Callback handler for processing prefix matches used by 265 * find method. 266 */ 267 public interface ResultHandler<V> { 268 /** 269 * Handles a prefix key match 270 * 271 * @param matchLength Matched key's length 272 * @param values An iterator of the objects associated with the matched key 273 * @return Return true to continue the search in the trie, false to quit. 274 */ 275 public boolean handlePrefixMatch(int matchLength, Iterator<V> values); 276 } 277 278 private static class LongestMatchHandler<V> implements ResultHandler<V> { 279 private Iterator<V> matches = null; 280 private int length = 0; 281 282 @Override 283 public boolean handlePrefixMatch(int matchLength, Iterator<V> values) { 284 if (matchLength > length) { 285 length = matchLength; 286 matches = values; 287 } 288 return true; 289 } 290 291 public Iterator<V> getMatches() { 292 return matches; 293 } 294 295 public int getMatchLength() { 296 return length; 297 } 298 } 299 300 /** 301 * Inner class representing a text node in the trie. 302 */ 303 private class Node { 304 private char[] _text; 305 private List<V> _values; 306 private List<Node> _children; 307 308 private Node() { 309 } 310 311 private Node(char[] text, List<V> values, List<Node> children) { 312 _text = text; 313 _values = values; 314 _children = children; 315 } 316 317 public int charCount() { 318 return _text == null ? 0 : _text.length; 319 } 320 321 public boolean hasChildFor(char ch) { 322 for (int i=0; _children != null && i < _children.size(); i++) { 323 Node child = _children.get(i); 324 if (ch < child._text[0]) break; 325 if (ch == child._text[0]) { 326 return true; 327 } 328 } 329 return false; 330 } 331 332 public Iterator<V> values() { 333 if (_values == null) { 334 return null; 335 } 336 return _values.iterator(); 337 } 338 339 public void add(CharIterator chitr, V value) { 340 StringBuilder buf = new StringBuilder(); 341 while (chitr.hasNext()) { 342 buf.append(chitr.next()); 343 } 344 add(toCharArray(buf), 0, value); 345 } 346 347 public Node findMatch(CharIterator chitr) { 348 if (_children == null) { 349 return null; 350 } 351 if (!chitr.hasNext()) { 352 return null; 353 } 354 Node match = null; 355 Character ch = chitr.next(); 356 for (Node child : _children) { 357 if (ch < child._text[0]) { 358 break; 359 } 360 if (ch == child._text[0]) { 361 if (child.matchFollowing(chitr)) { 362 match = child; 363 } 364 break; 365 } 366 } 367 return match; 368 } 369 370 public class StepResult { 371 public Node node; 372 public int offset; 373 } 374 public void takeStep(char ch, int offset, StepResult result) { 375 assert offset <= charCount(); 376 if (offset == charCount()) { 377 // Go to a child node 378 for (int i=0; _children != null && i < _children.size(); i++) { 379 Node child = _children.get(i); 380 if (ch < child._text[0]) break; 381 if (ch == child._text[0]) { 382 // Found a matching child node 383 result.node = child; 384 result.offset = 1; 385 return; 386 } 387 } 388 // No matching children; fall through 389 } else if (_text[offset] == ch) { 390 // Return to this node; increase offset 391 result.node = this; 392 result.offset = offset + 1; 393 return; 394 } 395 // No matches 396 result.node = null; 397 result.offset = -1; 398 return; 399 } 400 401 private void add(char[] text, int offset, V value) { 402 if (text.length == offset) { 403 _values = addValue(_values, value); 404 return; 405 } 406 407 if (_children == null) { 408 _children = new LinkedList<Node>(); 409 Node child = new Node(subArray(text, offset), addValue(null, value), null); 410 _children.add(child); 411 return; 412 } 413 414 // walk through children 415 ListIterator<Node> litr = _children.listIterator(); 416 while (litr.hasNext()) { 417 Node next = litr.next(); 418 if (text[offset] < next._text[0]) { 419 litr.previous(); 420 break; 421 } 422 if (text[offset] == next._text[0]) { 423 int matchLen = next.lenMatches(text, offset); 424 if (matchLen == next._text.length) { 425 // full match 426 next.add(text, offset + matchLen, value); 427 } else { 428 // partial match, create a branch 429 next.split(matchLen); 430 next.add(text, offset + matchLen, value); 431 } 432 return; 433 } 434 } 435 // add a new child to this node 436 litr.add(new Node(subArray(text, offset), addValue(null, value), null)); 437 } 438 439 private boolean matchFollowing(CharIterator chitr) { 440 boolean matched = true; 441 int idx = 1; 442 while (idx < _text.length) { 443 if(!chitr.hasNext()) { 444 matched = false; 445 break; 446 } 447 Character ch = chitr.next(); 448 if (ch != _text[idx]) { 449 matched = false; 450 break; 451 } 452 idx++; 453 } 454 return matched; 455 } 456 457 private int lenMatches(char[] text, int offset) { 458 int textLen = text.length - offset; 459 int limit = _text.length < textLen ? _text.length : textLen; 460 int len = 0; 461 while (len < limit) { 462 if (_text[len] != text[offset + len]) { 463 break; 464 } 465 len++; 466 } 467 return len; 468 } 469 470 private void split(int offset) { 471 // split the current node at the offset 472 char[] childText = subArray(_text, offset); 473 _text = subArray(_text, 0, offset); 474 475 // add the Node representing after the offset as a child 476 Node child = new Node(childText, _values, _children); 477 _values = null; 478 479 _children = new LinkedList<Node>(); 480 _children.add(child); 481 } 482 483 private List<V> addValue(List<V> list, V value) { 484 if (list == null) { 485 list = new LinkedList<V>(); 486 } 487 list.add(value); 488 return list; 489 } 490 } 491 492 private static char[] toCharArray(CharSequence text) { 493 char[] array = new char[text.length()]; 494 for (int i = 0; i < array.length; i++) { 495 array[i] = text.charAt(i); 496 } 497 return array; 498 } 499 500 private static char[] subArray(char[] array, int start) { 501 if (start == 0) { 502 return array; 503 } 504 char[] sub = new char[array.length - start]; 505 System.arraycopy(array, start, sub, 0, sub.length); 506 return sub; 507 } 508 509 private static char[] subArray(char[] array, int start, int limit) { 510 if (start == 0 && limit == array.length) { 511 return array; 512 } 513 char[] sub = new char[limit - start]; 514 System.arraycopy(array, start, sub, 0, limit - start); 515 return sub; 516 } 517 } 518