1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.BitSet; 12 import java.util.Collection; 13 import java.util.Comparator; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 22 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness; 23 24 public class StateDictionary<T> extends Dictionary<T> { 25 26 private static final boolean DEBUG_FLATTEN = false; 27 28 // results of build 29 private final ArrayList<Row> builtRows; 30 31 private final Row builtBaseRow; 32 33 private final IntMap<T> builtResults; 34 35 private final int builtMaxByteLength; 36 37 private final StringByteConverter byteString; 38 39 // TODO remove before deployment; not thread safe 40 private static int debugReferenceCount = 0; 41 private int debugReferenceNumber = debugReferenceCount++; 42 43 /** 44 * Only should be called by StateDictionaryBuilder 45 * 46 * @param builtBaseRow2 47 * @param builtRows2 48 * @param builtResults2 49 * @param builtMaxByteLength 50 * TODO 51 * @param byteConverter 52 * TODO 53 */ 54 StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2, 55 IntMap<T> builtResults2, int builtMaxByteLength, 56 StringByteConverter byteConverter) { 57 builtBaseRow = builtBaseRow2; 58 builtRows = builtRows2; 59 builtResults = builtResults2; 60 this.builtMaxByteLength = builtMaxByteLength; 61 this.byteString = byteConverter; 62 // builtBaseValue = builtResults.get(0); 63 } 64 65 @Override 66 public Matcher<T> getMatcher() { 67 return new StateMatcher(); 68 } 69 70 public String toString() { 71 StringBuilder result = new StringBuilder(); 72 // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows); 73 for (Row row : builtRows) { 74 result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR); 75 } 76 Map<T, Integer> map = builtResults.getValueMap(); 77 Set<Pair<Integer, String>> sorted = new TreeSet<Pair<Integer, String>>(); 78 for (T item : map.keySet()) { 79 sorted.add(new Pair<Integer, String>(map.get(item), item.toString())); 80 } 81 for (Pair<Integer, String> pair : sorted) { 82 result.append(pair.getFirst()).append("*=").append(pair.getSecond()).append(CldrUtility.LINE_SEPARATOR); 83 } 84 return result.toString(); 85 } 86 87 public Iterator<Entry<CharSequence, T>> getMapping() { 88 // TODO Optimize this to only return the items on demand 89 return new TextFetcher().getWords().entrySet().iterator(); 90 } 91 92 @Override 93 public String debugShow() { 94 return new TextFetcher().debugShow(); 95 } 96 97 public int getRowCount() { 98 return builtRows.size(); 99 } 100 101 /** 102 * Internals. The text is transformed into a byte stream. A state table is 103 * used to successively map {state, byte, result} to {newstate, newresult, 104 * isReturn}. A state is represented by a Row, which is a mapping from byte to 105 * a Cell, where each cell has the {nextRow, delta result, returns flag}. 106 * 107 * <pre> 108 * state = next state (row) 109 * result += delta result 110 * if (returns) return the result 111 * <pre> 112 * However, the result and state are preserved for the next call on next(). 113 * 114 */ 115 116 static class Row implements Comparable { 117 enum Uniqueness { 118 // the unknown value is only used in building 119 UNIQUE, AMBIGUOUS, UNKNOWN; 120 121 public String debugName() { 122 switch (this) { 123 case UNIQUE: 124 return (""); 125 case AMBIGUOUS: 126 return ""; 127 default: 128 return "?"; 129 } 130 } 131 } 132 133 Uniqueness hasUniqueValue = Uniqueness.UNKNOWN; 134 135 final TreeMap<Byte, Cell> byteToCell = new TreeMap<Byte, Cell>(); 136 137 // keeps track of the number of cells with returns 138 transient int returnCount; 139 140 transient int terminatingReturnCount; 141 142 private int referenceNumber; 143 144 Row(int rowNumber) { 145 referenceNumber = rowNumber; 146 } 147 148 public int nonTerminating() { 149 return byteToCell.size() - terminatingReturnCount; 150 } 151 152 public int nonReturn() { 153 return byteToCell.size() - returnCount; 154 } 155 156 public int maximumDepth() { 157 int result = 0; 158 for (Cell cell : byteToCell.values()) { 159 if (cell.nextRow != null) { 160 int temp = cell.nextRow.maximumDepth() + 1; 161 if (result < temp) { 162 result = temp; 163 } 164 } 165 } 166 return result; 167 } 168 169 public int compareTo(Object o) { 170 Row other = (Row) o; 171 int result; 172 // we want to sort items first with the fewest number of non-terminating 173 // returns 174 // cells, then most 175 // number of terminating returns, then most number of returns 176 if (0 != (result = maximumDepth() - other.maximumDepth())) 177 return result; 178 if (0 != (result = byteToCell.size() - other.byteToCell.size())) 179 return result; 180 // otherwise, try alphabetic among the keys. We are guaranteed that the 181 // sizes are the same 182 java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator(); 183 for (byte key : byteToCell.keySet()) { 184 int otherKey = otherIt.next(); 185 if (0 != (result = key - otherKey)) { 186 return result; 187 } 188 // at this point, we are guaranteed that the keys are the same. Compare 189 // deltaResults, and row 190 Cell cell = byteToCell.get(key); 191 Cell otherCell = other.byteToCell.get(key); 192 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) { 193 return result; 194 } 195 } 196 // if we fail completely, use the age. 197 return referenceNumber - other.referenceNumber; 198 } 199 200 public String toString() { 201 StringBuilder buffer = new StringBuilder(); 202 buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{"); 203 boolean first = true; 204 Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator); 205 sorted.addAll(byteToCell.keySet()); 206 for (Byte key : sorted) { 207 if (first) { 208 first = false; 209 } else { 210 buffer.append(' '); 211 } 212 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 213 buffer.append('='); 214 buffer.append(byteToCell.get(key)); 215 } 216 buffer.append('}'); 217 return buffer.toString(); 218 } 219 220 public String toStringCells() { 221 StringBuilder buffer = new StringBuilder(); 222 for (Byte key : byteToCell.keySet()) { 223 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 224 buffer.append(byteToCell.get(key).toString()); 225 buffer.append(' '); 226 } 227 return buffer.toString(); 228 } 229 230 public int getReferenceNumber() { 231 return referenceNumber; 232 } 233 234 int compact(byte[] target) { 235 int pos = 0; 236 for (Byte key : byteToCell.keySet()) { 237 target[pos++] = key; 238 pos = byteToCell.get(key).addBytes(target, pos, 0); 239 } 240 target[pos++] = 0; 241 return pos; 242 } 243 } 244 245 static class Cell { 246 public Row nextRow; // next state 247 248 public int deltaResult; 249 250 public boolean returns; 251 252 public int addBytes(byte[] target, int pos, int rowDelta) { 253 pos = StateDictionary.addBytes(deltaResult, target, pos); 254 int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber(); 255 rowOffset <<= 1; // make room for returns 256 if (returns) 257 rowOffset |= 1; 258 return StateDictionary.addBytes(rowOffset, target, pos); 259 } 260 261 public String toString() { 262 String result = deltaResult == 0 ? "" : String.valueOf(deltaResult); 263 if (returns) { 264 result += "*"; 265 } 266 if (nextRow != null) { 267 if (result.length() != 0) { 268 result += "/"; 269 } 270 result += "R" + nextRow.getReferenceNumber(); 271 } 272 return result; 273 } 274 } 275 276 // should be private, but easier to debug if package private 277 class StateMatcher extends Matcher<T> { 278 private static final boolean SHOW_DEBUG = false; 279 280 final private byte[] matchByteBuffer = new byte[byteString 281 .getMaxBytesPerChar()]; 282 283 private int matchByteStringIndex; 284 285 private int matchByteBufferLength; 286 287 // only used in matching 288 private Row matchCurrentRow; 289 290 private int matchIntValue = -1; 291 292 private Row matchLastRow; 293 294 @Override 295 public Matcher<T> setOffset(int offset) { 296 matchCurrentRow = builtBaseRow; 297 partialLastRow = null; // can remove this later, only for debugging 298 partialMatchValue = 0; // ditto 299 matchIntValue = 0; 300 myMatchEnd = offset; 301 matchValue = null; 302 byteString.clear(); 303 matchByteStringIndex = offset; 304 return super.setOffset(offset); 305 } 306 307 int myMatchEnd; 308 309 private Row partialLastRow; 310 311 private int partialMatchValue; 312 313 public Status next() { 314 if (SHOW_DEBUG) { 315 System.out.println("NEXT: " + this); 316 } 317 if (matchCurrentRow == null) { 318 matchIntValue = -1; 319 matchValue = null; 320 return Status.NONE; 321 } 322 Status result = Status.PARTIAL; 323 324 while (text.hasCharAt(myMatchEnd)) { 325 // get more bytes IF matchEnd is set right 326 if (myMatchEnd == matchByteStringIndex) { 327 if (true) { // matchEnd < text.length() 328 char ch = text.charAt(matchByteStringIndex++); 329 matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0); 330 if (SHOW_DEBUG) { 331 System.out.println("\tChar: " + ch + "\t" + com.ibm.icu.impl.Utility.hex(ch) + "\t->\t" 332 + CldrUtility.hex(matchByteBuffer, 0, matchByteBufferLength, " ")); 333 } 334 } else { 335 matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0); 336 } 337 } 338 for (int i = 0; i < matchByteBufferLength; ++i) { 339 result = nextByte(matchByteBuffer[i]); 340 if (result != Status.PARTIAL) { 341 break; 342 } 343 } 344 // Normally, we will never have a return value except at the end of a character, so we don't need 345 // to check after each nextByte. However, if the byteString converts C to a sequence of bytes that 346 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* a character 347 348 if (result == Status.PARTIAL) { 349 ++myMatchEnd; 350 // and continue with the loop 351 } else if (result == Status.MATCH) { 352 ++myMatchEnd; 353 matchValue = builtResults.get(matchIntValue); 354 matchEnd = myMatchEnd; 355 if (SHOW_DEBUG) { 356 System.out.println("NEXT RESULT: " + result + "\t" + this); 357 } 358 return result; 359 } else { 360 // if we didn't get a MATCH, we have NONE. But in reality, there could be a possible match 361 // so we check to see whether the current row allows for any continuation. 362 if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) { 363 result = Status.PARTIAL; 364 } 365 if (result == Status.NONE) { 366 matchIntValue = -1; 367 matchValue = null; 368 } 369 break; 370 } 371 } 372 matchLastRow = matchCurrentRow; 373 matchCurrentRow = null; 374 if (result == Status.PARTIAL) { 375 matchValue = builtResults.get(matchIntValue); 376 matchEnd = myMatchEnd; 377 partialLastRow = matchLastRow; 378 partialMatchValue = matchIntValue; 379 if (SHOW_DEBUG) { 380 System.out.println("NEXT RESULT: " + result + "\t" + this); 381 } 382 } 383 return result; 384 } 385 386 /** 387 * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL otherwise. 388 * If we couldn't go any farther, then the currentRow is left alone. 389 * 390 * @param chunk 391 * @return 392 */ 393 private Status nextByte(int chunk) { 394 if (SHOW_DEBUG) { 395 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow); 396 } 397 Cell cell = matchCurrentRow.byteToCell.get((byte) chunk); 398 if (cell == null) { 399 return Status.NONE; 400 } 401 matchIntValue += cell.deltaResult; 402 matchCurrentRow = cell.nextRow; 403 if (cell.returns) { 404 return Status.MATCH; 405 } 406 return Status.PARTIAL; 407 } 408 409 public int getIntMatchValue() { 410 return matchIntValue; 411 } 412 413 public boolean nextUniquePartial() { 414 if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) { 415 matchValue = builtResults.get(partialMatchValue); 416 matchEnd = myMatchEnd; 417 return true; 418 } 419 return false; 420 } 421 422 @Override 423 public StateDictionary<T> getDictionary() { 424 return StateDictionary.this; 425 } 426 } 427 428 static final Comparator<Byte> unsignedByteComparator = new Comparator<Byte>() { 429 430 public int compare(Byte o1, Byte o2) { 431 int b1 = o1 & 0xFF; 432 int b2 = o2 & 0xFF; 433 return b1 < b2 ? -1 : b1 > b2 ? 1 : 0; 434 } 435 436 }; 437 438 static final Comparator<Row> rowComparator = new Comparator<Row>() { 439 440 public int compare(Row row1, Row row2) { 441 if (row1 == row2) { 442 return 0; 443 } else if (row1 == null) { 444 return -1; 445 } else if (row2 == null) { 446 return 1; 447 } 448 int result; 449 if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) { 450 return result; 451 } 452 java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator(); 453 for (byte key : row1.byteToCell.keySet()) { 454 byte otherKey = otherIt.next(); 455 if (0 != (result = key - otherKey)) { 456 return result; 457 } 458 // at this point, we are guaranteed that the keys are the same. Compare 459 // deltaResults, returns, and then recurse on the the row 460 Cell cell1 = row1.byteToCell.get(key); 461 Cell cell2 = row2.byteToCell.get(key); 462 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) { 463 return result; 464 } 465 if (cell1.returns != cell2.returns) { 466 return cell1.returns ? 1 : -1; 467 } 468 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) { 469 return result; 470 } 471 } 472 return 0; 473 474 } 475 476 }; 477 478 static int addBytes(int source, byte[] target, int pos) { 479 // swap the top bit 480 if (source < 0) { 481 source = ((-source) << 1) | 1; 482 } else { 483 source <<= 1; 484 } 485 // emit the rest as 7 bit quantities with 1 as termination bit 486 while (true) { 487 byte b = (byte) (source & 0x7F); 488 source >>>= 7; 489 if (source == 0) { 490 b |= 0x80; 491 target[pos++] = b; 492 return pos; 493 } 494 target[pos++] = b; 495 } 496 } 497 498 private class TextFetcher { 499 500 Map<CharSequence, T> result = new TreeMap<CharSequence, T>(); 501 502 byte[] soFar = new byte[builtMaxByteLength]; 503 504 BitSet shown = new BitSet(); 505 506 StringBuilder buffer = new StringBuilder(); 507 508 StringBuilder debugTreeView = new StringBuilder(); 509 510 private HashSet<Row> rowsSeen = new HashSet<Row>(); 511 512 public Map<CharSequence, T> getWords() { 513 result.clear(); 514 getWords(0, 0, builtBaseRow); 515 return result; 516 } 517 518 public String debugShow() { 519 rowsSeen.clear(); 520 Counter<Integer> debugCounter = new Counter<Integer>(); 521 getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE); 522 for (Row row : builtRows) { 523 debugCounter.add(row.byteToCell.size(), 1); 524 } 525 for (Integer item : (Collection<Integer>) debugCounter 526 .getKeysetSortedByKey()) { 527 debugTreeView.append("cells in row=\t").append(item).append( 528 "\trows with count=\t").append(debugCounter.getCount(item)).append( 529 CldrUtility.LINE_SEPARATOR); 530 } 531 return debugTreeView.toString(); 532 } 533 534 private void getDebugWords(int byteLength, int resultSoFar, Row row, 535 int suppressAbove) { 536 // we do this to show when rows have already been seen, and so the structure has been compacted 537 if (rowsSeen.contains(row)) { 538 // reset if bigger 539 if (suppressAbove > byteLength) { 540 suppressAbove = byteLength; 541 } 542 } else { 543 rowsSeen.add(row); 544 } 545 // walk through the cells, display and recurse 546 Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator); 547 sorted.addAll(row.byteToCell.keySet()); 548 for (Byte key : sorted) { 549 Cell cell = row.byteToCell.get(key); 550 soFar[byteLength] = key; 551 shown.set(byteLength, false); 552 int currentValue = resultSoFar + cell.deltaResult; 553 if (cell.returns) { 554 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 555 T value2 = builtResults.get(currentValue); 556 for (int i = 0; i <= byteLength; ++i) { 557 debugTreeView.append(' '); 558 if (i >= suppressAbove) { 559 debugTreeView.append("++"); 560 } else if (shown.get(i)) { 561 debugTreeView.append("--"); 562 } else { 563 debugTreeView.append(com.ibm.icu.impl.Utility.hex( 564 soFar[i] & 0xFF, 2)); 565 shown.set(i); 566 } 567 } 568 debugTreeView.append("\t<").append(key2).append(">\t<") 569 .append(value2).append(">" + CldrUtility.LINE_SEPARATOR); 570 } 571 if (cell.nextRow != null) { 572 getDebugWords(byteLength + 1, currentValue, cell.nextRow, 573 suppressAbove); 574 } 575 } 576 } 577 578 // recurse through the strings 579 private void getWords(int byteLength, int resultSoFar, Row row) { 580 for (Byte key : row.byteToCell.keySet()) { 581 Cell cell = row.byteToCell.get(key); 582 soFar[byteLength] = key; 583 int currentValue = resultSoFar + cell.deltaResult; 584 if (cell.returns) { 585 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 586 T value2 = builtResults.get(currentValue); 587 result.put(key2, value2); 588 } 589 if (cell.nextRow != null) { 590 getWords(byteLength + 1, currentValue, cell.nextRow); 591 } 592 } 593 } 594 595 private CharSequence stringFromBytes(byte[] soFar, int len) { 596 buffer.setLength(0); 597 byteString.fromBytes(soFar, 0, len, buffer); 598 return buffer.toString(); 599 } 600 } 601 602 /** 603 * Just for testing flattening. 604 * 605 * 606 */ 607 public void flatten() { 608 TreeSet<Row> s = new TreeSet<Row>(builtRows); 609 int count = 0; 610 int oldDepth = 999; 611 String oldCell = ""; 612 int uniqueCount = 0; 613 int cellCount = 0; 614 byte[] target = new byte[500]; 615 int totalBytesCompacted = 0; 616 for (Row row : s) { 617 row.referenceNumber = count++; 618 int depth = row.maximumDepth(); 619 if (depth != oldDepth) { 620 if (DEBUG_FLATTEN) { 621 System.out.println("*** " + depth + "***"); 622 } 623 oldDepth = depth; 624 } 625 int bytesCompacted = row.compact(target); 626 if (DEBUG_FLATTEN) { 627 System.out.println(bytesCompacted + "\t" + row); 628 } 629 String newCell = row.toStringCells(); 630 if (!newCell.equals(oldCell)) { 631 uniqueCount++; 632 totalBytesCompacted += bytesCompacted; 633 cellCount += row.byteToCell.size(); 634 } 635 oldCell = newCell; 636 637 for (Cell cell : row.byteToCell.values()) { 638 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) { 639 if (DEBUG_FLATTEN) { 640 System.out.println("*** Fail"); 641 } 642 break; 643 } 644 } 645 } 646 System.out.println("Count: " + count); 647 System.out.println("UniqueCount: " + uniqueCount); 648 System.out.println("CellCount: " + cellCount); 649 System.out.println("TotalBytesCompacted: " + totalBytesCompacted); 650 } 651 652 }