Home | History | Annotate | Download | only in bsdiff
      1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "bsdiff/suffix_array_index.h"
      6 
      7 #include <limits>
      8 #include <vector>
      9 
     10 #include <divsufsort.h>
     11 #include <divsufsort64.h>
     12 
     13 #include "bsdiff/logging.h"
     14 
     15 namespace {
     16 
     17 // libdivsufsort C++ overloaded functions used to allow calling the right
     18 // implementation based on the pointer size.
     19 int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
     20   return divsufsort(text, sa, n);
     21 }
     22 int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
     23   return divsufsort64(text, sa, n);
     24 }
     25 
     26 saidx_t CallSaSearch(const uint8_t* text,
     27                      size_t text_size,
     28                      const uint8_t* pattern,
     29                      size_t pattern_size,
     30                      const saidx_t* sa,
     31                      size_t sa_size,
     32                      saidx_t* left) {
     33   return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
     34 }
     35 saidx64_t CallSaSearch(const uint8_t* text,
     36                        size_t text_size,
     37                        const uint8_t* pattern,
     38                        size_t pattern_size,
     39                        const saidx64_t* sa,
     40                        size_t sa_size,
     41                        saidx64_t* left) {
     42   return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
     43 }
     44 
     45 }  // namespace
     46 
     47 namespace bsdiff {
     48 
     49 // The SAIDX template type must be either saidx_t or saidx64_t, which will
     50 // depend on the maximum size of the input text needed.
     51 template <typename SAIDX>
     52 class SuffixArrayIndex : public SuffixArrayIndexInterface {
     53  public:
     54   SuffixArrayIndex() = default;
     55 
     56   // Initialize and construct the suffix array of the |text| string of length
     57   // |n|. The memory pointed by |text| must be kept alive. Returns whether the
     58   // construction succeeded.
     59   bool Init(const uint8_t* text, size_t n);
     60 
     61   // SuffixArrayIndexInterface overrides.
     62   void SearchPrefix(const uint8_t* target,
     63                     size_t length,
     64                     size_t* out_length,
     65                     uint64_t* out_pos) const override;
     66 
     67  private:
     68   const uint8_t* text_{nullptr};  // Owned by the caller.
     69   size_t n_{0};
     70 
     71   std::vector<SAIDX> sa_;
     72 };
     73 
     74 template <typename SAIDX>
     75 bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
     76   if (!sa_.empty()) {
     77     // Already initialized.
     78     LOG(ERROR) << "SuffixArray already initialized";
     79     return false;
     80   }
     81   if (static_cast<uint64_t>(n) >
     82       static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
     83     LOG(ERROR) << "Input too big (" << n << ") for this implementation";
     84     return false;
     85   }
     86   text_ = text;
     87   n_ = n;
     88   sa_.resize(n + 1);
     89 
     90   if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
     91     LOG(ERROR) << "divsufsrot() failed";
     92     return false;
     93   }
     94 
     95   return true;
     96 }
     97 
     98 template <typename SAIDX>
     99 void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
    100                                            size_t length,
    101                                            size_t* out_length,
    102                                            uint64_t* out_pos) const {
    103   SAIDX suf_left;
    104   SAIDX count =
    105       CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
    106   if (count > 0) {
    107     // This is the simple case where we found the whole |target| string was
    108     // found.
    109     *out_pos = sa_[suf_left];
    110     *out_length = length;
    111     return;
    112   }
    113   // In this case, |suf_left| points to the first suffix array position such
    114   // that the suffix at that position is lexicographically larger than |target|.
    115   // We only need to check whether the previous entry or the current entry is a
    116   // longer match.
    117   size_t prev_suffix_len = 0;
    118   if (suf_left > 0) {
    119     const size_t prev_max_len =
    120         std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
    121     const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
    122     prev_suffix_len =
    123         std::mismatch(target, target + prev_max_len, prev_suffix).first -
    124         target;
    125   }
    126 
    127   size_t next_suffix_len = 0;
    128   if (static_cast<size_t>(suf_left) < n_) {
    129     const uint8_t* next_suffix = text_ + sa_[suf_left];
    130     const size_t next_max_len =
    131         std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
    132     next_suffix_len =
    133         std::mismatch(target, target + next_max_len, next_suffix).first -
    134         target;
    135   }
    136 
    137   *out_length = std::max(next_suffix_len, prev_suffix_len);
    138   if (!*out_length) {
    139     *out_pos = 0;
    140   } else if (next_suffix_len >= prev_suffix_len) {
    141     *out_pos = sa_[suf_left];
    142   } else {
    143     *out_pos = sa_[suf_left - 1];
    144   }
    145 }
    146 
    147 std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
    148     const uint8_t* text,
    149     size_t n) {
    150   // The maximum supported size when using the suffix array based on the 32-bit
    151   // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
    152   // than the maximum representable number so references like "n + 1" are don't
    153   // overflow.
    154   const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
    155   std::unique_ptr<SuffixArrayIndexInterface> ret;
    156 
    157   if (n > kMaxSaidxSize) {
    158     SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
    159     ret.reset(sa_ptr);
    160     if (!sa_ptr->Init(text, n))
    161       return nullptr;
    162   } else {
    163     SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
    164     ret.reset(sa_ptr);
    165     if (!sa_ptr->Init(text, n))
    166       return nullptr;
    167   }
    168   return ret;
    169 }
    170 
    171 
    172 }  // namespace bsdiff
    173