Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  * ********************************************************************************
      5  * Copyright (C) 2007-2011, International Business Machines Corporation and others.
      6  * All Rights Reserved.
      7  * ********************************************************************************
      8  */
      9 package com.ibm.icu.impl;
     10 
     11 import java.util.Iterator;
     12 import java.util.LinkedList;
     13 import java.util.List;
     14 import java.util.ListIterator;
     15 
     16 import com.ibm.icu.lang.UCharacter;
     17 import com.ibm.icu.text.UTF16;
     18 
     19 /**
     20  * TextTrieMap is a trie implementation for supporting
     21  * fast prefix match for the key.
     22  */
     23 public class TextTrieMap<V> {
     24 
     25     private Node _root = new Node();
     26     boolean _ignoreCase;
     27 
     28     /**
     29      * Constructs a TextTrieMap object.
     30      *
     31      * @param ignoreCase true to use simple case insensitive match
     32      */
     33     public TextTrieMap(boolean ignoreCase) {
     34         _ignoreCase = ignoreCase;
     35     }
     36 
     37     /**
     38      * Adds the text key and its associated object in this object.
     39      *
     40      * @param text The text.
     41      * @param val The value object associated with the text.
     42      */
     43     public TextTrieMap<V> put(CharSequence text, V val) {
     44         CharIterator chitr = new CharIterator(text, 0, _ignoreCase);
     45         _root.add(chitr, val);
     46         return this;
     47     }
     48 
     49     /**
     50      * Gets an iterator of the objects associated with the
     51      * longest prefix matching string key.
     52      *
     53      * @param text The text to be matched with prefixes.
     54      * @return An iterator of the objects associated with
     55      * the longest prefix matching matching key, or null
     56      * if no matching entry is found.
     57      */
     58     public Iterator<V> get(String text) {
     59         return get(text, 0);
     60     }
     61 
     62     /**
     63      * Gets an iterator of the objects associated with the
     64      * longest prefix matching string key starting at the
     65      * specified position.
     66      *
     67      * @param text The text to be matched with prefixes.
     68      * @param start The start index of of the text
     69      * @return An iterator of the objects associated with the
     70      * longest prefix matching matching key, or null if no
     71      * matching entry is found.
     72      */
     73     public Iterator<V> get(CharSequence text, int start) {
     74         return get(text, start, null);
     75     }
     76 
     77     public Iterator<V> get(CharSequence text, int start, int[] matchLen) {
     78         LongestMatchHandler<V> handler = new LongestMatchHandler<V>();
     79         find(text, start, handler);
     80         if (matchLen != null && matchLen.length > 0) {
     81             matchLen[0] = handler.getMatchLength();
     82         }
     83         return handler.getMatches();
     84     }
     85 
     86     public void find(CharSequence text, ResultHandler<V> handler) {
     87         find(text, 0, handler);
     88     }
     89 
     90     public void find(CharSequence text, int offset, ResultHandler<V> handler) {
     91         CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
     92         find(_root, chitr, handler);
     93     }
     94 
     95     private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler) {
     96         Iterator<V> values = node.values();
     97         if (values != null) {
     98             if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
     99                 return;
    100             }
    101         }
    102 
    103         Node nextMatch = node.findMatch(chitr);
    104         if (nextMatch != null) {
    105             find(nextMatch, chitr, handler);
    106         }
    107     }
    108 
    109     /**
    110      * Creates an object that consumes code points one at a time and returns intermediate prefix
    111      * matches.  Returns null if no match exists.
    112      *
    113      * @return An instance of {@link ParseState}, or null if the starting code point is not a
    114      * prefix for any entry in the trie.
    115      */
    116     public ParseState openParseState(int startingCp) {
    117       // Check to see whether this is a valid starting character.  If not, return null.
    118       if (_ignoreCase) {
    119         startingCp = UCharacter.foldCase(startingCp, true);
    120       }
    121       int count = Character.charCount(startingCp);
    122       char ch1 = (count == 1) ? (char) startingCp : UTF16.getLeadSurrogate(startingCp);
    123       if (!_root.hasChildFor(ch1)) {
    124         return null;
    125       }
    126 
    127       return new ParseState(_root);
    128     }
    129 
    130     /**
    131      * ParseState is mutable, not thread-safe, and intended to be used internally by parsers for
    132      * consuming values from this trie.
    133      */
    134     public class ParseState {
    135       private Node node;
    136       private int offset;
    137       private Node.StepResult result;
    138 
    139       ParseState(Node start) {
    140         node = start;
    141         offset = 0;
    142         result = start.new StepResult();
    143       }
    144 
    145       /**
    146        * Consumes a code point and walk to the next node in the trie.
    147        *
    148        * @param cp The code point to consume.
    149        */
    150       public void accept(int cp) {
    151         assert node != null;
    152         if (_ignoreCase) {
    153           cp = UCharacter.foldCase(cp, true);
    154         }
    155         int count = Character.charCount(cp);
    156         char ch1 = (count == 1) ? (char) cp : UTF16.getLeadSurrogate(cp);
    157         node.takeStep(ch1, offset, result);
    158         if (count == 2 && result.node != null) {
    159           char ch2 = UTF16.getTrailSurrogate(cp);
    160           result.node.takeStep(ch2, result.offset, result);
    161         }
    162         node = result.node;
    163         offset = result.offset;
    164       }
    165 
    166       /**
    167        * Gets the exact prefix matches for all code points that have been consumed so far.
    168        *
    169        * @return The matches.
    170        */
    171       public Iterator<V> getCurrentMatches() {
    172         if (node != null && offset == node.charCount()) {
    173           return node.values();
    174         }
    175         return null;
    176       }
    177 
    178       /**
    179        * Checks whether any more code points can be consumed.
    180        *
    181        * @return true if no more code points can be consumed; false otherwise.
    182        */
    183       public boolean atEnd() {
    184         return node == null || (node.charCount() == offset && node._children == null);
    185       }
    186     }
    187 
    188     public static class CharIterator implements Iterator<Character> {
    189         private boolean _ignoreCase;
    190         private CharSequence _text;
    191         private int _nextIdx;
    192         private int _startIdx;
    193 
    194         private Character _remainingChar;
    195 
    196         CharIterator(CharSequence text, int offset, boolean ignoreCase) {
    197             _text = text;
    198             _nextIdx = _startIdx = offset;
    199             _ignoreCase = ignoreCase;
    200         }
    201 
    202         /* (non-Javadoc)
    203          * @see java.util.Iterator#hasNext()
    204          */
    205         @Override
    206         public boolean hasNext() {
    207             if (_nextIdx == _text.length() && _remainingChar == null) {
    208                 return false;
    209             }
    210             return true;
    211         }
    212 
    213         /* (non-Javadoc)
    214          * @see java.util.Iterator#next()
    215          */
    216         @Override
    217         public Character next() {
    218             if (_nextIdx == _text.length() && _remainingChar == null) {
    219                 return null;
    220             }
    221             Character next;
    222             if (_remainingChar != null) {
    223                 next = _remainingChar;
    224                 _remainingChar = null;
    225             } else {
    226                 if (_ignoreCase) {
    227                     int cp = UCharacter.foldCase(Character.codePointAt(_text, _nextIdx), true);
    228                     _nextIdx = _nextIdx + Character.charCount(cp);
    229 
    230                     char[] chars = Character.toChars(cp);
    231                     next = chars[0];
    232                     if (chars.length == 2) {
    233                         _remainingChar = chars[1];
    234                     }
    235                 } else {
    236                     next = _text.charAt(_nextIdx);
    237                     _nextIdx++;
    238                 }
    239             }
    240             return next;
    241         }
    242 
    243         /* (non-Javadoc)
    244          * @see java.util.Iterator#remove()
    245          */
    246         @Override
    247         public void remove() {
    248             throw new UnsupportedOperationException("remove() not supproted");
    249         }
    250 
    251         public int nextIndex() {
    252             return _nextIdx;
    253         }
    254 
    255         public int processedLength() {
    256             if (_remainingChar != null) {
    257                 throw new IllegalStateException("In the middle of surrogate pair");
    258             }
    259             return _nextIdx - _startIdx;
    260         }
    261     }
    262 
    263     /**
    264      * Callback handler for processing prefix matches used by
    265      * find method.
    266      */
    267     public interface ResultHandler<V> {
    268         /**
    269          * Handles a prefix key match
    270          *
    271          * @param matchLength Matched key's length
    272          * @param values An iterator of the objects associated with the matched key
    273          * @return Return true to continue the search in the trie, false to quit.
    274          */
    275         public boolean handlePrefixMatch(int matchLength, Iterator<V> values);
    276     }
    277 
    278     private static class LongestMatchHandler<V> implements ResultHandler<V> {
    279         private Iterator<V> matches = null;
    280         private int length = 0;
    281 
    282         @Override
    283         public boolean handlePrefixMatch(int matchLength, Iterator<V> values) {
    284             if (matchLength > length) {
    285                 length = matchLength;
    286                 matches = values;
    287             }
    288             return true;
    289         }
    290 
    291         public Iterator<V> getMatches() {
    292             return matches;
    293         }
    294 
    295         public int getMatchLength() {
    296             return length;
    297         }
    298     }
    299 
    300     /**
    301      * Inner class representing a text node in the trie.
    302      */
    303     private class Node {
    304         private char[] _text;
    305         private List<V> _values;
    306         private List<Node> _children;
    307 
    308         private Node() {
    309         }
    310 
    311         private Node(char[] text, List<V> values, List<Node> children) {
    312             _text = text;
    313             _values = values;
    314             _children = children;
    315         }
    316 
    317         public int charCount() {
    318           return _text == null ? 0 : _text.length;
    319         }
    320 
    321         public boolean hasChildFor(char ch) {
    322           for (int i=0; _children != null && i < _children.size(); i++) {
    323             Node child = _children.get(i);
    324             if (ch < child._text[0]) break;
    325             if (ch == child._text[0]) {
    326               return true;
    327             }
    328           }
    329           return false;
    330         }
    331 
    332         public Iterator<V> values() {
    333             if (_values == null) {
    334                 return null;
    335             }
    336             return _values.iterator();
    337         }
    338 
    339         public void add(CharIterator chitr, V value) {
    340             StringBuilder buf = new StringBuilder();
    341             while (chitr.hasNext()) {
    342                 buf.append(chitr.next());
    343             }
    344             add(toCharArray(buf), 0, value);
    345         }
    346 
    347         public Node findMatch(CharIterator chitr) {
    348             if (_children == null) {
    349                 return null;
    350             }
    351             if (!chitr.hasNext()) {
    352                 return null;
    353             }
    354             Node match = null;
    355             Character ch = chitr.next();
    356             for (Node child : _children) {
    357                 if (ch < child._text[0]) {
    358                     break;
    359                 }
    360                 if (ch == child._text[0]) {
    361                     if (child.matchFollowing(chitr)) {
    362                         match = child;
    363                     }
    364                     break;
    365                 }
    366             }
    367             return match;
    368         }
    369 
    370         public class StepResult {
    371           public Node node;
    372           public int offset;
    373         }
    374         public void takeStep(char ch, int offset, StepResult result) {
    375           assert offset <= charCount();
    376           if (offset == charCount()) {
    377             // Go to a child node
    378             for (int i=0; _children != null && i < _children.size(); i++) {
    379               Node child = _children.get(i);
    380               if (ch < child._text[0]) break;
    381               if (ch == child._text[0]) {
    382                 // Found a matching child node
    383                 result.node = child;
    384                 result.offset = 1;
    385                 return;
    386               }
    387             }
    388             // No matching children; fall through
    389           } else if (_text[offset] == ch) {
    390             // Return to this node; increase offset
    391             result.node = this;
    392             result.offset = offset + 1;
    393             return;
    394           }
    395           // No matches
    396           result.node = null;
    397           result.offset = -1;
    398           return;
    399         }
    400 
    401         private void add(char[] text, int offset, V value) {
    402             if (text.length == offset) {
    403                 _values = addValue(_values, value);
    404                 return;
    405             }
    406 
    407             if (_children == null) {
    408                 _children = new LinkedList<Node>();
    409                 Node child = new Node(subArray(text, offset), addValue(null, value), null);
    410                 _children.add(child);
    411                 return;
    412             }
    413 
    414             // walk through children
    415             ListIterator<Node> litr = _children.listIterator();
    416             while (litr.hasNext()) {
    417                 Node next = litr.next();
    418                 if (text[offset] < next._text[0]) {
    419                     litr.previous();
    420                     break;
    421                 }
    422                 if (text[offset] == next._text[0]) {
    423                     int matchLen = next.lenMatches(text, offset);
    424                     if (matchLen == next._text.length) {
    425                         // full match
    426                         next.add(text, offset + matchLen, value);
    427                     } else {
    428                         // partial match, create a branch
    429                         next.split(matchLen);
    430                         next.add(text, offset + matchLen, value);
    431                     }
    432                     return;
    433                 }
    434             }
    435             // add a new child to this node
    436             litr.add(new Node(subArray(text, offset), addValue(null, value), null));
    437         }
    438 
    439         private boolean matchFollowing(CharIterator chitr) {
    440             boolean matched = true;
    441             int idx = 1;
    442             while (idx < _text.length) {
    443                 if(!chitr.hasNext()) {
    444                     matched = false;
    445                     break;
    446                 }
    447                 Character ch = chitr.next();
    448                 if (ch != _text[idx]) {
    449                     matched = false;
    450                     break;
    451                 }
    452                 idx++;
    453             }
    454             return matched;
    455         }
    456 
    457         private int lenMatches(char[] text, int offset) {
    458             int textLen = text.length - offset;
    459             int limit = _text.length < textLen ? _text.length : textLen;
    460             int len = 0;
    461             while (len < limit) {
    462                 if (_text[len] != text[offset + len]) {
    463                     break;
    464                 }
    465                 len++;
    466             }
    467             return len;
    468         }
    469 
    470         private void split(int offset) {
    471             // split the current node at the offset
    472             char[] childText = subArray(_text, offset);
    473             _text = subArray(_text, 0, offset);
    474 
    475             // add the Node representing after the offset as a child
    476             Node child = new Node(childText, _values, _children);
    477             _values = null;
    478 
    479             _children = new LinkedList<Node>();
    480             _children.add(child);
    481         }
    482 
    483         private List<V> addValue(List<V> list, V value) {
    484             if (list == null) {
    485                 list = new LinkedList<V>();
    486             }
    487             list.add(value);
    488             return list;
    489         }
    490     }
    491 
    492     private static char[] toCharArray(CharSequence text) {
    493         char[] array = new char[text.length()];
    494         for (int i = 0; i < array.length; i++) {
    495             array[i] = text.charAt(i);
    496         }
    497         return array;
    498     }
    499 
    500     private static char[] subArray(char[] array, int start) {
    501         if (start == 0) {
    502             return array;
    503         }
    504         char[] sub = new char[array.length - start];
    505         System.arraycopy(array, start, sub, 0, sub.length);
    506         return sub;
    507     }
    508 
    509     private static char[] subArray(char[] array, int start, int limit) {
    510         if (start == 0 && limit == array.length) {
    511             return array;
    512         }
    513         char[] sub = new char[limit - start];
    514         System.arraycopy(array, start, sub, 0, limit - start);
    515         return sub;
    516     }
    517 }
    518