Home | History | Annotate | Download | only in util
      1 /*
      2  **********************************************************************
      3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
      4  **********************************************************************
      5  * Author: Mark Davis
      6  **********************************************************************
      7  */
      8 package org.unicode.cldr.util;
      9 
     10 import java.util.ArrayList;
     11 import java.util.BitSet;
     12 import java.util.Collection;
     13 import java.util.Comparator;
     14 import java.util.HashSet;
     15 import java.util.Iterator;
     16 import java.util.Map;
     17 import java.util.Map.Entry;
     18 import java.util.Set;
     19 import java.util.TreeMap;
     20 import java.util.TreeSet;
     21 
     22 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
     23 
     24 public class StateDictionary<T> extends Dictionary<T> {
     25 
     26     private static final boolean DEBUG_FLATTEN = false;
     27 
     28     // results of build
     29     private final ArrayList<Row> builtRows;
     30 
     31     private final Row builtBaseRow;
     32 
     33     private final IntMap<T> builtResults;
     34 
     35     private final int builtMaxByteLength;
     36 
     37     private final StringByteConverter byteString;
     38 
     39     // TODO remove before deployment; not thread safe
     40     private static int debugReferenceCount = 0;
     41     private int debugReferenceNumber = debugReferenceCount++;
     42 
     43     /**
     44      * Only should be called by StateDictionaryBuilder
     45      *
     46      * @param builtBaseRow2
     47      * @param builtRows2
     48      * @param builtResults2
     49      * @param builtMaxByteLength
     50      *            TODO
     51      * @param byteConverter
     52      *            TODO
     53      */
     54     StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2,
     55         IntMap<T> builtResults2, int builtMaxByteLength,
     56         StringByteConverter byteConverter) {
     57         builtBaseRow = builtBaseRow2;
     58         builtRows = builtRows2;
     59         builtResults = builtResults2;
     60         this.builtMaxByteLength = builtMaxByteLength;
     61         this.byteString = byteConverter;
     62         // builtBaseValue = builtResults.get(0);
     63     }
     64 
     65     @Override
     66     public Matcher<T> getMatcher() {
     67         return new StateMatcher();
     68     }
     69 
     70     public String toString() {
     71         StringBuilder result = new StringBuilder();
     72         // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows);
     73         for (Row row : builtRows) {
     74             result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR);
     75         }
     76         Map<T, Integer> map = builtResults.getValueMap();
     77         Set<Pair<Integer, String>> sorted = new TreeSet<Pair<Integer, String>>();
     78         for (T item : map.keySet()) {
     79             sorted.add(new Pair<Integer, String>(map.get(item), item.toString()));
     80         }
     81         for (Pair<Integer, String> pair : sorted) {
     82             result.append(pair.getFirst()).append("*=").append(pair.getSecond()).append(CldrUtility.LINE_SEPARATOR);
     83         }
     84         return result.toString();
     85     }
     86 
     87     public Iterator<Entry<CharSequence, T>> getMapping() {
     88         // TODO Optimize this to only return the items on demand
     89         return new TextFetcher().getWords().entrySet().iterator();
     90     }
     91 
     92     @Override
     93     public String debugShow() {
     94         return new TextFetcher().debugShow();
     95     }
     96 
     97     public int getRowCount() {
     98         return builtRows.size();
     99     }
    100 
    101     /**
    102      * Internals. The text is transformed into a byte stream. A state table is
    103      * used to successively map {state, byte, result} to {newstate, newresult,
    104      * isReturn}. A state is represented by a Row, which is a mapping from byte to
    105      * a Cell, where each cell has the {nextRow, delta result, returns flag}.
    106      *
    107      * <pre>
    108      *  state = next state (row)
    109      *  result += delta result
    110      *  if (returns) return the result
    111      *  &lt;pre&gt;
    112      * However, the result and state are preserved for the next call on next().
    113      *
    114      */
    115 
    116     static class Row implements Comparable {
    117         enum Uniqueness {
    118             // the unknown value is only used in building
    119             UNIQUE, AMBIGUOUS, UNKNOWN;
    120 
    121             public String debugName() {
    122                 switch (this) {
    123                 case UNIQUE:
    124                     return ("");
    125                 case AMBIGUOUS:
    126                     return "";
    127                 default:
    128                     return "?";
    129                 }
    130             }
    131         }
    132 
    133         Uniqueness hasUniqueValue = Uniqueness.UNKNOWN;
    134 
    135         final TreeMap<Byte, Cell> byteToCell = new TreeMap<Byte, Cell>();
    136 
    137         // keeps track of the number of cells with returns
    138         transient int returnCount;
    139 
    140         transient int terminatingReturnCount;
    141 
    142         private int referenceNumber;
    143 
    144         Row(int rowNumber) {
    145             referenceNumber = rowNumber;
    146         }
    147 
    148         public int nonTerminating() {
    149             return byteToCell.size() - terminatingReturnCount;
    150         }
    151 
    152         public int nonReturn() {
    153             return byteToCell.size() - returnCount;
    154         }
    155 
    156         public int maximumDepth() {
    157             int result = 0;
    158             for (Cell cell : byteToCell.values()) {
    159                 if (cell.nextRow != null) {
    160                     int temp = cell.nextRow.maximumDepth() + 1;
    161                     if (result < temp) {
    162                         result = temp;
    163                     }
    164                 }
    165             }
    166             return result;
    167         }
    168 
    169         public int compareTo(Object o) {
    170             Row other = (Row) o;
    171             int result;
    172             // we want to sort items first with the fewest number of non-terminating
    173             // returns
    174             // cells, then most
    175             // number of terminating returns, then most number of returns
    176             if (0 != (result = maximumDepth() - other.maximumDepth()))
    177                 return result;
    178             if (0 != (result = byteToCell.size() - other.byteToCell.size()))
    179                 return result;
    180             // otherwise, try alphabetic among the keys. We are guaranteed that the
    181             // sizes are the same
    182             java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator();
    183             for (byte key : byteToCell.keySet()) {
    184                 int otherKey = otherIt.next();
    185                 if (0 != (result = key - otherKey)) {
    186                     return result;
    187                 }
    188                 // at this point, we are guaranteed that the keys are the same. Compare
    189                 // deltaResults, and row
    190                 Cell cell = byteToCell.get(key);
    191                 Cell otherCell = other.byteToCell.get(key);
    192                 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) {
    193                     return result;
    194                 }
    195             }
    196             // if we fail completely, use the age.
    197             return referenceNumber - other.referenceNumber;
    198         }
    199 
    200         public String toString() {
    201             StringBuilder buffer = new StringBuilder();
    202             buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{");
    203             boolean first = true;
    204             Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator);
    205             sorted.addAll(byteToCell.keySet());
    206             for (Byte key : sorted) {
    207                 if (first) {
    208                     first = false;
    209                 } else {
    210                     buffer.append(' ');
    211                 }
    212                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
    213                 buffer.append('=');
    214                 buffer.append(byteToCell.get(key));
    215             }
    216             buffer.append('}');
    217             return buffer.toString();
    218         }
    219 
    220         public String toStringCells() {
    221             StringBuilder buffer = new StringBuilder();
    222             for (Byte key : byteToCell.keySet()) {
    223                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
    224                 buffer.append(byteToCell.get(key).toString());
    225                 buffer.append(' ');
    226             }
    227             return buffer.toString();
    228         }
    229 
    230         public int getReferenceNumber() {
    231             return referenceNumber;
    232         }
    233 
    234         int compact(byte[] target) {
    235             int pos = 0;
    236             for (Byte key : byteToCell.keySet()) {
    237                 target[pos++] = key;
    238                 pos = byteToCell.get(key).addBytes(target, pos, 0);
    239             }
    240             target[pos++] = 0;
    241             return pos;
    242         }
    243     }
    244 
    245     static class Cell {
    246         public Row nextRow; // next state
    247 
    248         public int deltaResult;
    249 
    250         public boolean returns;
    251 
    252         public int addBytes(byte[] target, int pos, int rowDelta) {
    253             pos = StateDictionary.addBytes(deltaResult, target, pos);
    254             int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber();
    255             rowOffset <<= 1; // make room for returns
    256             if (returns)
    257                 rowOffset |= 1;
    258             return StateDictionary.addBytes(rowOffset, target, pos);
    259         }
    260 
    261         public String toString() {
    262             String result = deltaResult == 0 ? "" : String.valueOf(deltaResult);
    263             if (returns) {
    264                 result += "*";
    265             }
    266             if (nextRow != null) {
    267                 if (result.length() != 0) {
    268                     result += "/";
    269                 }
    270                 result += "R" + nextRow.getReferenceNumber();
    271             }
    272             return result;
    273         }
    274     }
    275 
    276     // should be private, but easier to debug if package private
    277     class StateMatcher extends Matcher<T> {
    278         private static final boolean SHOW_DEBUG = false;
    279 
    280         final private byte[] matchByteBuffer = new byte[byteString
    281             .getMaxBytesPerChar()];
    282 
    283         private int matchByteStringIndex;
    284 
    285         private int matchByteBufferLength;
    286 
    287         // only used in matching
    288         private Row matchCurrentRow;
    289 
    290         private int matchIntValue = -1;
    291 
    292         private Row matchLastRow;
    293 
    294         @Override
    295         public Matcher<T> setOffset(int offset) {
    296             matchCurrentRow = builtBaseRow;
    297             partialLastRow = null; // can remove this later, only for debugging
    298             partialMatchValue = 0; // ditto
    299             matchIntValue = 0;
    300             myMatchEnd = offset;
    301             matchValue = null;
    302             byteString.clear();
    303             matchByteStringIndex = offset;
    304             return super.setOffset(offset);
    305         }
    306 
    307         int myMatchEnd;
    308 
    309         private Row partialLastRow;
    310 
    311         private int partialMatchValue;
    312 
    313         public Status next() {
    314             if (SHOW_DEBUG) {
    315                 System.out.println("NEXT: " + this);
    316             }
    317             if (matchCurrentRow == null) {
    318                 matchIntValue = -1;
    319                 matchValue = null;
    320                 return Status.NONE;
    321             }
    322             Status result = Status.PARTIAL;
    323 
    324             while (text.hasCharAt(myMatchEnd)) {
    325                 // get more bytes IF matchEnd is set right
    326                 if (myMatchEnd == matchByteStringIndex) {
    327                     if (true) { // matchEnd < text.length()
    328                         char ch = text.charAt(matchByteStringIndex++);
    329                         matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0);
    330                         if (SHOW_DEBUG) {
    331                             System.out.println("\tChar: " + ch + "\t" + com.ibm.icu.impl.Utility.hex(ch) + "\t->\t"
    332                                 + CldrUtility.hex(matchByteBuffer, 0, matchByteBufferLength, " "));
    333                         }
    334                     } else {
    335                         matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0);
    336                     }
    337                 }
    338                 for (int i = 0; i < matchByteBufferLength; ++i) {
    339                     result = nextByte(matchByteBuffer[i]);
    340                     if (result != Status.PARTIAL) {
    341                         break;
    342                     }
    343                 }
    344                 // Normally, we will never have a return value except at the end of a character, so we don't need
    345                 // to check after each nextByte. However, if the byteString converts C to a sequence of bytes that
    346                 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* a character
    347 
    348                 if (result == Status.PARTIAL) {
    349                     ++myMatchEnd;
    350                     // and continue with the loop
    351                 } else if (result == Status.MATCH) {
    352                     ++myMatchEnd;
    353                     matchValue = builtResults.get(matchIntValue);
    354                     matchEnd = myMatchEnd;
    355                     if (SHOW_DEBUG) {
    356                         System.out.println("NEXT RESULT: " + result + "\t" + this);
    357                     }
    358                     return result;
    359                 } else {
    360                     // if we didn't get a MATCH, we have NONE. But in reality, there could be a possible match
    361                     // so we check to see whether the current row allows for any continuation.
    362                     if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) {
    363                         result = Status.PARTIAL;
    364                     }
    365                     if (result == Status.NONE) {
    366                         matchIntValue = -1;
    367                         matchValue = null;
    368                     }
    369                     break;
    370                 }
    371             }
    372             matchLastRow = matchCurrentRow;
    373             matchCurrentRow = null;
    374             if (result == Status.PARTIAL) {
    375                 matchValue = builtResults.get(matchIntValue);
    376                 matchEnd = myMatchEnd;
    377                 partialLastRow = matchLastRow;
    378                 partialMatchValue = matchIntValue;
    379                 if (SHOW_DEBUG) {
    380                     System.out.println("NEXT RESULT: " + result + "\t" + this);
    381                 }
    382             }
    383             return result;
    384         }
    385 
    386         /**
    387          * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL otherwise.
    388          * If we couldn't go any farther, then the currentRow is left alone.
    389          *
    390          * @param chunk
    391          * @return
    392          */
    393         private Status nextByte(int chunk) {
    394             if (SHOW_DEBUG) {
    395                 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow);
    396             }
    397             Cell cell = matchCurrentRow.byteToCell.get((byte) chunk);
    398             if (cell == null) {
    399                 return Status.NONE;
    400             }
    401             matchIntValue += cell.deltaResult;
    402             matchCurrentRow = cell.nextRow;
    403             if (cell.returns) {
    404                 return Status.MATCH;
    405             }
    406             return Status.PARTIAL;
    407         }
    408 
    409         public int getIntMatchValue() {
    410             return matchIntValue;
    411         }
    412 
    413         public boolean nextUniquePartial() {
    414             if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) {
    415                 matchValue = builtResults.get(partialMatchValue);
    416                 matchEnd = myMatchEnd;
    417                 return true;
    418             }
    419             return false;
    420         }
    421 
    422         @Override
    423         public StateDictionary<T> getDictionary() {
    424             return StateDictionary.this;
    425         }
    426     }
    427 
    428     static final Comparator<Byte> unsignedByteComparator = new Comparator<Byte>() {
    429 
    430         public int compare(Byte o1, Byte o2) {
    431             int b1 = o1 & 0xFF;
    432             int b2 = o2 & 0xFF;
    433             return b1 < b2 ? -1 : b1 > b2 ? 1 : 0;
    434         }
    435 
    436     };
    437 
    438     static final Comparator<Row> rowComparator = new Comparator<Row>() {
    439 
    440         public int compare(Row row1, Row row2) {
    441             if (row1 == row2) {
    442                 return 0;
    443             } else if (row1 == null) {
    444                 return -1;
    445             } else if (row2 == null) {
    446                 return 1;
    447             }
    448             int result;
    449             if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) {
    450                 return result;
    451             }
    452             java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator();
    453             for (byte key : row1.byteToCell.keySet()) {
    454                 byte otherKey = otherIt.next();
    455                 if (0 != (result = key - otherKey)) {
    456                     return result;
    457                 }
    458                 // at this point, we are guaranteed that the keys are the same. Compare
    459                 // deltaResults, returns, and then recurse on the the row
    460                 Cell cell1 = row1.byteToCell.get(key);
    461                 Cell cell2 = row2.byteToCell.get(key);
    462                 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) {
    463                     return result;
    464                 }
    465                 if (cell1.returns != cell2.returns) {
    466                     return cell1.returns ? 1 : -1;
    467                 }
    468                 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) {
    469                     return result;
    470                 }
    471             }
    472             return 0;
    473 
    474         }
    475 
    476     };
    477 
    478     static int addBytes(int source, byte[] target, int pos) {
    479         // swap the top bit
    480         if (source < 0) {
    481             source = ((-source) << 1) | 1;
    482         } else {
    483             source <<= 1;
    484         }
    485         // emit the rest as 7 bit quantities with 1 as termination bit
    486         while (true) {
    487             byte b = (byte) (source & 0x7F);
    488             source >>>= 7;
    489             if (source == 0) {
    490                 b |= 0x80;
    491                 target[pos++] = b;
    492                 return pos;
    493             }
    494             target[pos++] = b;
    495         }
    496     }
    497 
    498     private class TextFetcher {
    499 
    500         Map<CharSequence, T> result = new TreeMap<CharSequence, T>();
    501 
    502         byte[] soFar = new byte[builtMaxByteLength];
    503 
    504         BitSet shown = new BitSet();
    505 
    506         StringBuilder buffer = new StringBuilder();
    507 
    508         StringBuilder debugTreeView = new StringBuilder();
    509 
    510         private HashSet<Row> rowsSeen = new HashSet<Row>();
    511 
    512         public Map<CharSequence, T> getWords() {
    513             result.clear();
    514             getWords(0, 0, builtBaseRow);
    515             return result;
    516         }
    517 
    518         public String debugShow() {
    519             rowsSeen.clear();
    520             Counter<Integer> debugCounter = new Counter<Integer>();
    521             getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE);
    522             for (Row row : builtRows) {
    523                 debugCounter.add(row.byteToCell.size(), 1);
    524             }
    525             for (Integer item : (Collection<Integer>) debugCounter
    526                 .getKeysetSortedByKey()) {
    527                 debugTreeView.append("cells in row=\t").append(item).append(
    528                     "\trows with count=\t").append(debugCounter.getCount(item)).append(
    529                         CldrUtility.LINE_SEPARATOR);
    530             }
    531             return debugTreeView.toString();
    532         }
    533 
    534         private void getDebugWords(int byteLength, int resultSoFar, Row row,
    535             int suppressAbove) {
    536             // we do this to show when rows have already been seen, and so the structure has been compacted
    537             if (rowsSeen.contains(row)) {
    538                 // reset if bigger
    539                 if (suppressAbove > byteLength) {
    540                     suppressAbove = byteLength;
    541                 }
    542             } else {
    543                 rowsSeen.add(row);
    544             }
    545             // walk through the cells, display and recurse
    546             Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator);
    547             sorted.addAll(row.byteToCell.keySet());
    548             for (Byte key : sorted) {
    549                 Cell cell = row.byteToCell.get(key);
    550                 soFar[byteLength] = key;
    551                 shown.set(byteLength, false);
    552                 int currentValue = resultSoFar + cell.deltaResult;
    553                 if (cell.returns) {
    554                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
    555                     T value2 = builtResults.get(currentValue);
    556                     for (int i = 0; i <= byteLength; ++i) {
    557                         debugTreeView.append(' ');
    558                         if (i >= suppressAbove) {
    559                             debugTreeView.append("++");
    560                         } else if (shown.get(i)) {
    561                             debugTreeView.append("--");
    562                         } else {
    563                             debugTreeView.append(com.ibm.icu.impl.Utility.hex(
    564                                 soFar[i] & 0xFF, 2));
    565                             shown.set(i);
    566                         }
    567                     }
    568                     debugTreeView.append("\t<").append(key2).append(">\t<")
    569                         .append(value2).append(">" + CldrUtility.LINE_SEPARATOR);
    570                 }
    571                 if (cell.nextRow != null) {
    572                     getDebugWords(byteLength + 1, currentValue, cell.nextRow,
    573                         suppressAbove);
    574                 }
    575             }
    576         }
    577 
    578         // recurse through the strings
    579         private void getWords(int byteLength, int resultSoFar, Row row) {
    580             for (Byte key : row.byteToCell.keySet()) {
    581                 Cell cell = row.byteToCell.get(key);
    582                 soFar[byteLength] = key;
    583                 int currentValue = resultSoFar + cell.deltaResult;
    584                 if (cell.returns) {
    585                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
    586                     T value2 = builtResults.get(currentValue);
    587                     result.put(key2, value2);
    588                 }
    589                 if (cell.nextRow != null) {
    590                     getWords(byteLength + 1, currentValue, cell.nextRow);
    591                 }
    592             }
    593         }
    594 
    595         private CharSequence stringFromBytes(byte[] soFar, int len) {
    596             buffer.setLength(0);
    597             byteString.fromBytes(soFar, 0, len, buffer);
    598             return buffer.toString();
    599         }
    600     }
    601 
    602     /**
    603      * Just for testing flattening.
    604      *
    605      *
    606      */
    607     public void flatten() {
    608         TreeSet<Row> s = new TreeSet<Row>(builtRows);
    609         int count = 0;
    610         int oldDepth = 999;
    611         String oldCell = "";
    612         int uniqueCount = 0;
    613         int cellCount = 0;
    614         byte[] target = new byte[500];
    615         int totalBytesCompacted = 0;
    616         for (Row row : s) {
    617             row.referenceNumber = count++;
    618             int depth = row.maximumDepth();
    619             if (depth != oldDepth) {
    620                 if (DEBUG_FLATTEN) {
    621                     System.out.println("*** " + depth + "***");
    622                 }
    623                 oldDepth = depth;
    624             }
    625             int bytesCompacted = row.compact(target);
    626             if (DEBUG_FLATTEN) {
    627                 System.out.println(bytesCompacted + "\t" + row);
    628             }
    629             String newCell = row.toStringCells();
    630             if (!newCell.equals(oldCell)) {
    631                 uniqueCount++;
    632                 totalBytesCompacted += bytesCompacted;
    633                 cellCount += row.byteToCell.size();
    634             }
    635             oldCell = newCell;
    636 
    637             for (Cell cell : row.byteToCell.values()) {
    638                 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) {
    639                     if (DEBUG_FLATTEN) {
    640                         System.out.println("*** Fail");
    641                     }
    642                     break;
    643                 }
    644             }
    645         }
    646         System.out.println("Count: " + count);
    647         System.out.println("UniqueCount: " + uniqueCount);
    648         System.out.println("CellCount: " + cellCount);
    649         System.out.println("TotalBytesCompacted: " + totalBytesCompacted);
    650     }
    651 
    652 }