// Copyright 2017 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "bsdiff/suffix_array_index.h" #include <limits> #include <vector> #include <divsufsort.h> #include <divsufsort64.h> #include "bsdiff/logging.h" namespace { // libdivsufsort C++ overloaded functions used to allow calling the right // implementation based on the pointer size. int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) { return divsufsort(text, sa, n); } int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) { return divsufsort64(text, sa, n); } saidx_t CallSaSearch(const uint8_t* text, size_t text_size, const uint8_t* pattern, size_t pattern_size, const saidx_t* sa, size_t sa_size, saidx_t* left) { return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left); } saidx64_t CallSaSearch(const uint8_t* text, size_t text_size, const uint8_t* pattern, size_t pattern_size, const saidx64_t* sa, size_t sa_size, saidx64_t* left) { return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left); } } // namespace namespace bsdiff { // The SAIDX template type must be either saidx_t or saidx64_t, which will // depend on the maximum size of the input text needed. template <typename SAIDX> class SuffixArrayIndex : public SuffixArrayIndexInterface { public: SuffixArrayIndex() = default; // Initialize and construct the suffix array of the |text| string of length // |n|. The memory pointed by |text| must be kept alive. Returns whether the // construction succeeded. bool Init(const uint8_t* text, size_t n); // SuffixArrayIndexInterface overrides. void SearchPrefix(const uint8_t* target, size_t length, size_t* out_length, uint64_t* out_pos) const override; private: const uint8_t* text_{nullptr}; // Owned by the caller. size_t n_{0}; std::vector<SAIDX> sa_; }; template <typename SAIDX> bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) { if (!sa_.empty()) { // Already initialized. LOG(ERROR) << "SuffixArray already initialized"; return false; } if (static_cast<uint64_t>(n) > static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) { LOG(ERROR) << "Input too big (" << n << ") for this implementation"; return false; } text_ = text; n_ = n; sa_.resize(n + 1); if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) { LOG(ERROR) << "divsufsrot() failed"; return false; } return true; } template <typename SAIDX> void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target, size_t length, size_t* out_length, uint64_t* out_pos) const { SAIDX suf_left; SAIDX count = CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left); if (count > 0) { // This is the simple case where we found the whole |target| string was // found. *out_pos = sa_[suf_left]; *out_length = length; return; } // In this case, |suf_left| points to the first suffix array position such // that the suffix at that position is lexicographically larger than |target|. // We only need to check whether the previous entry or the current entry is a // longer match. size_t prev_suffix_len = 0; if (suf_left > 0) { const size_t prev_max_len = std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length); const uint8_t* prev_suffix = text_ + sa_[suf_left - 1]; prev_suffix_len = std::mismatch(target, target + prev_max_len, prev_suffix).first - target; } size_t next_suffix_len = 0; if (static_cast<size_t>(suf_left) < n_) { const uint8_t* next_suffix = text_ + sa_[suf_left]; const size_t next_max_len = std::min(n_ - static_cast<size_t>(sa_[suf_left]), length); next_suffix_len = std::mismatch(target, target + next_max_len, next_suffix).first - target; } *out_length = std::max(next_suffix_len, prev_suffix_len); if (!*out_length) { *out_pos = 0; } else if (next_suffix_len >= prev_suffix_len) { *out_pos = sa_[suf_left]; } else { *out_pos = sa_[suf_left - 1]; } } std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex( const uint8_t* text, size_t n) { // The maximum supported size when using the suffix array based on the 32-bit // saidx_t type. We limit this to something a bit smaller (16 bytes smaller) // than the maximum representable number so references like "n + 1" are don't // overflow. const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16; std::unique_ptr<SuffixArrayIndexInterface> ret; if (n > kMaxSaidxSize) { SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>(); ret.reset(sa_ptr); if (!sa_ptr->Init(text, n)) return nullptr; } else { SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>(); ret.reset(sa_ptr); if (!sa_ptr->Init(text, n)) return nullptr; } return ret; } } // namespace bsdiff