Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2010 Adam Barth. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef SuffixTree_h
     27 #define SuffixTree_h
     28 
     29 #include "wtf/Vector.h"
     30 #include "wtf/text/WTFString.h"
     31 
     32 namespace WebCore {
     33 
     34 class UnicodeCodebook {
     35 public:
     36     static int codeWord(UChar c) { return c; }
     37     enum { codeSize = 1 << 8 * sizeof(UChar) };
     38 };
     39 
     40 class ASCIICodebook {
     41 public:
     42     static int codeWord(UChar c) { return c & (codeSize - 1); }
     43     enum { codeSize = 1 << (8 * sizeof(char) - 1) };
     44 };
     45 
     46 template<typename Codebook>
     47 class SuffixTree {
     48 public:
     49     SuffixTree(const String& text, unsigned depth)
     50         : m_depth(depth)
     51         , m_leaf(true)
     52     {
     53         build(text);
     54     }
     55 
     56     bool mightContain(const String& query)
     57     {
     58         Node* current = &m_root;
     59         int limit = std::min(m_depth, query.length());
     60         for (int i = 0; i < limit; ++i) {
     61             current = current->at(Codebook::codeWord(query[i]));
     62             if (!current)
     63                 return false;
     64         }
     65         return true;
     66     }
     67 
     68 private:
     69     class Node {
     70     public:
     71         Node(bool isLeaf = false)
     72         {
     73             m_children.resize(Codebook::codeSize);
     74             m_children.fill(0);
     75             m_isLeaf = isLeaf;
     76         }
     77 
     78         ~Node()
     79         {
     80             for (unsigned i = 0; i < m_children.size(); ++i) {
     81                 Node* child = m_children.at(i);
     82                 if (child && !child->m_isLeaf)
     83                     delete child;
     84             }
     85         }
     86 
     87         Node*& at(int codeWord) { return m_children.at(codeWord); }
     88 
     89     private:
     90         typedef Vector<Node*, Codebook::codeSize> ChildrenVector;
     91 
     92         ChildrenVector m_children;
     93         bool m_isLeaf;
     94     };
     95 
     96     void build(const String& text)
     97     {
     98         for (unsigned base = 0; base < text.length(); ++base) {
     99             Node* current = &m_root;
    100             unsigned limit = std::min(base + m_depth, text.length());
    101             for (unsigned offset = 0; base + offset < limit; ++offset) {
    102                 ASSERT(current != &m_leaf);
    103                 Node*& child = current->at(Codebook::codeWord(text[base + offset]));
    104                 if (!child)
    105                     child = base + offset + 1 == limit ? &m_leaf : new Node();
    106                 current = child;
    107             }
    108         }
    109     }
    110 
    111     Node m_root;
    112     unsigned m_depth;
    113 
    114     // Instead of allocating a fresh empty leaf node for ever leaf in the tree
    115     // (there can be a lot of these), we alias all the leaves to this "static"
    116     // leaf node.
    117     Node m_leaf;
    118 };
    119 
    120 } // namespace WebCore
    121 
    122 #endif // SuffixTree_h
    123