Home | History | Annotate | Download | only in devtools
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/devtools/devtools_file_system_indexer.h"
      6 
      7 #include <iterator>
      8 
      9 #include "base/bind.h"
     10 #include "base/callback.h"
     11 #include "base/file_util.h"
     12 #include "base/files/file_enumerator.h"
     13 #include "base/files/file_util_proxy.h"
     14 #include "base/lazy_instance.h"
     15 #include "base/logging.h"
     16 #include "base/platform_file.h"
     17 #include "base/strings/utf_string_conversions.h"
     18 #include "content/public/browser/browser_thread.h"
     19 
     20 using base::Bind;
     21 using base::Callback;
     22 using base::FileEnumerator;
     23 using base::FilePath;
     24 using base::FileUtilProxy;
     25 using base::Time;
     26 using base::TimeDelta;
     27 using base::TimeTicks;
     28 using base::PassPlatformFile;
     29 using base::PlatformFile;
     30 using base::PlatformFileError;
     31 using base::PlatformFileInfo;
     32 using content::BrowserThread;
     33 using std::map;
     34 using std::set;
     35 using std::string;
     36 using std::vector;
     37 
     38 namespace {
     39 
     40 typedef int32 Trigram;
     41 typedef char TrigramChar;
     42 typedef uint16 FileId;
     43 
     44 const int kMinTimeoutBetweenWorkedNitification = 200;
     45 // Trigram characters include all ASCII printable characters (32-126) except for
     46 // the capital letters, because the index is case insensitive.
     47 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
     48 const size_t kTrigramCount =
     49     kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
     50 const int kMaxReadLength = 10 * 1024;
     51 const TrigramChar kUndefinedTrigramChar = -1;
     52 const Trigram kUndefinedTrigram = -1;
     53 
     54 base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
     55     LAZY_INSTANCE_INITIALIZER;
     56 
     57 base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
     58     LAZY_INSTANCE_INITIALIZER;
     59 
     60 class Index {
     61  public:
     62   Index();
     63   Time LastModifiedTimeForFile(const FilePath& file_path);
     64   void SetTrigramsForFile(const FilePath& file_path,
     65                           const vector<Trigram>& index,
     66                           const Time& time);
     67   vector<FilePath> Search(string query);
     68   void PrintStats();
     69   void NormalizeVectors();
     70 
     71  private:
     72   ~Index();
     73 
     74   FileId GetFileId(const FilePath& file_path);
     75 
     76   typedef map<FilePath, FileId> FileIdsMap;
     77   FileIdsMap file_ids_;
     78   FileId last_file_id_;
     79   // The index in this vector is the trigram id.
     80   vector<vector<FileId> > index_;
     81   typedef map<FilePath, Time> IndexedFilesMap;
     82   IndexedFilesMap index_times_;
     83   vector<bool> is_normalized_;
     84 
     85   DISALLOW_COPY_AND_ASSIGN(Index);
     86 };
     87 
     88 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
     89 
     90 void InitIsBinaryCharMap() {
     91   for (size_t i = 0; i < 256; ++i) {
     92     unsigned char ch = i;
     93     bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
     94     g_is_binary_char.Get().push_back(is_binary_char);
     95   }
     96 }
     97 
     98 bool IsBinaryChar(char c) {
     99   unsigned char uc = static_cast<unsigned char>(c);
    100   return g_is_binary_char.Get()[uc];
    101 }
    102 
    103 void InitTrigramCharsMap() {
    104   for (size_t i = 0; i < 256; ++i) {
    105     if (i > 127) {
    106       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
    107       continue;
    108     }
    109     char ch = i;
    110     if (ch == '\t')
    111       ch = ' ';
    112     if (ch >= 'A' && ch <= 'Z')
    113       ch = ch - 'A' + 'a';
    114     if ((IsBinaryChar(ch)) || (ch < ' ')) {
    115       g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
    116       continue;
    117     }
    118 
    119     if (ch >= 'Z')
    120       ch = ch - 'Z' - 1 + 'A';
    121     ch -= ' ';
    122     char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
    123     CHECK(ch >= 0 && ch < signed_trigram_char_count);
    124     g_trigram_chars.Get().push_back(ch);
    125   }
    126 }
    127 
    128 TrigramChar TrigramCharForChar(char c) {
    129   unsigned char uc = static_cast<unsigned char>(c);
    130   return g_trigram_chars.Get()[uc];
    131 }
    132 
    133 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
    134   static int kTrigramCharacterCountSquared =
    135       kTrigramCharacterCount * kTrigramCharacterCount;
    136   if (trigram_chars[index] == kUndefinedTrigramChar ||
    137       trigram_chars[index + 1] == kUndefinedTrigramChar ||
    138       trigram_chars[index + 2] == kUndefinedTrigramChar)
    139     return kUndefinedTrigram;
    140   Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
    141                     kTrigramCharacterCount * trigram_chars[index + 1] +
    142                     trigram_chars[index + 2];
    143   return trigram;
    144 }
    145 
    146 Index::Index() : last_file_id_(0) {
    147   index_.resize(kTrigramCount);
    148   is_normalized_.resize(kTrigramCount);
    149   std::fill(is_normalized_.begin(), is_normalized_.end(), true);
    150 }
    151 
    152 Index::~Index() {}
    153 
    154 Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
    155   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    156   Time last_modified_time;
    157   if (index_times_.find(file_path) != index_times_.end())
    158     last_modified_time = index_times_[file_path];
    159   return last_modified_time;
    160 }
    161 
    162 void Index::SetTrigramsForFile(const FilePath& file_path,
    163                                const vector<Trigram>& index,
    164                                const Time& time) {
    165   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    166   FileId file_id = GetFileId(file_path);
    167   vector<Trigram>::const_iterator it = index.begin();
    168   for (; it != index.end(); ++it) {
    169     Trigram trigram = *it;
    170     index_[trigram].push_back(file_id);
    171     is_normalized_[trigram] = false;
    172   }
    173   index_times_[file_path] = time;
    174 }
    175 
    176 vector<FilePath> Index::Search(string query) {
    177   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    178   const char* data = query.c_str();
    179   vector<TrigramChar> trigram_chars;
    180   trigram_chars.reserve(query.size());
    181   for (size_t i = 0; i < query.size(); ++i)
    182       trigram_chars.push_back(TrigramCharForChar(data[i]));
    183   vector<Trigram> trigrams;
    184   for (size_t i = 0; i + 2 < query.size(); ++i) {
    185     Trigram trigram = TrigramAtIndex(trigram_chars, i);
    186     if (trigram != kUndefinedTrigram)
    187       trigrams.push_back(trigram);
    188   }
    189   set<FileId> file_ids;
    190   bool first = true;
    191   vector<Trigram>::const_iterator it = trigrams.begin();
    192   for (; it != trigrams.end(); ++it) {
    193     Trigram trigram = *it;
    194     if (first) {
    195       std::copy(index_[trigram].begin(),
    196                 index_[trigram].end(),
    197                 std::inserter(file_ids, file_ids.begin()));
    198       first = false;
    199       continue;
    200     }
    201     set<FileId> intersection;
    202     std::set_intersection(file_ids.begin(),
    203                           file_ids.end(),
    204                           index_[trigram].begin(),
    205                           index_[trigram].end(),
    206                           std::inserter(intersection, intersection.begin()));
    207     file_ids.swap(intersection);
    208   }
    209   vector<FilePath> result;
    210   FileIdsMap::const_iterator ids_it = file_ids_.begin();
    211   for (; ids_it != file_ids_.end(); ++ids_it) {
    212     if (trigrams.size() == 0 ||
    213         file_ids.find(ids_it->second) != file_ids.end()) {
    214       result.push_back(ids_it->first);
    215     }
    216   }
    217   return result;
    218 }
    219 
    220 FileId Index::GetFileId(const FilePath& file_path) {
    221   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    222   string file_path_str = file_path.AsUTF8Unsafe();
    223   if (file_ids_.find(file_path) != file_ids_.end())
    224     return file_ids_[file_path];
    225   file_ids_[file_path] = ++last_file_id_;
    226   return last_file_id_;
    227 }
    228 
    229 void Index::NormalizeVectors() {
    230   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    231   for (size_t i = 0; i < kTrigramCount; ++i) {
    232     if (!is_normalized_[i]) {
    233       std::sort(index_[i].begin(), index_[i].end());
    234       if (index_[i].capacity() > index_[i].size())
    235         vector<FileId>(index_[i]).swap(index_[i]);
    236       is_normalized_[i] = true;
    237     }
    238   }
    239 }
    240 
    241 void Index::PrintStats() {
    242   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    243   LOG(ERROR) << "Index stats:";
    244   size_t size = 0;
    245   size_t maxSize = 0;
    246   size_t capacity = 0;
    247   for (size_t i = 0; i < kTrigramCount; ++i) {
    248     if (index_[i].size() > maxSize)
    249       maxSize = index_[i].size();
    250     size += index_[i].size();
    251     capacity += index_[i].capacity();
    252   }
    253   LOG(ERROR) << "  - total trigram count: " << size;
    254   LOG(ERROR) << "  - max file count per trigram: " << maxSize;
    255   LOG(ERROR) << "  - total vectors capacity " << capacity;
    256   size_t total_index_size =
    257       capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
    258   LOG(ERROR) << "  - estimated total index size " << total_index_size;
    259 }
    260 
    261 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
    262 
    263 }  // namespace
    264 
    265 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
    266     const FilePath& file_system_path,
    267     const TotalWorkCallback& total_work_callback,
    268     const WorkedCallback& worked_callback,
    269     const DoneCallback& done_callback)
    270     : file_system_path_(file_system_path),
    271       total_work_callback_(total_work_callback),
    272       worked_callback_(worked_callback),
    273       done_callback_(done_callback),
    274       files_indexed_(0),
    275       stopped_(false) {
    276   current_trigrams_set_.resize(kTrigramCount);
    277   current_trigrams_.reserve(kTrigramCount);
    278 }
    279 
    280 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
    281 
    282 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
    283   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    284   BrowserThread::PostTask(
    285       BrowserThread::FILE,
    286       FROM_HERE,
    287       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
    288 }
    289 
    290 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
    291   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    292   BrowserThread::PostTask(BrowserThread::FILE,
    293                           FROM_HERE,
    294                           Bind(&FileSystemIndexingJob::StopOnFileThread, this));
    295 }
    296 
    297 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
    298   stopped_ = true;
    299 }
    300 
    301 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
    302   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    303   if (stopped_)
    304     return;
    305   if (!file_enumerator_) {
    306     file_enumerator_.reset(
    307         new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
    308   }
    309   FilePath file_path = file_enumerator_->Next();
    310   if (file_path.empty()) {
    311     BrowserThread::PostTask(
    312         BrowserThread::UI,
    313         FROM_HERE,
    314         Bind(total_work_callback_, file_path_times_.size()));
    315     indexing_it_ = file_path_times_.begin();
    316     IndexFiles();
    317     return;
    318   }
    319   Time saved_last_modified_time =
    320       g_trigram_index.Get().LastModifiedTimeForFile(file_path);
    321   FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
    322   Time current_last_modified_time = file_info.GetLastModifiedTime();
    323   if (current_last_modified_time > saved_last_modified_time) {
    324     file_path_times_[file_path] = current_last_modified_time;
    325   }
    326   BrowserThread::PostTask(
    327       BrowserThread::FILE,
    328       FROM_HERE,
    329       Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
    330 }
    331 
    332 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
    333   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    334   if (stopped_)
    335     return;
    336   if (indexing_it_ == file_path_times_.end()) {
    337     g_trigram_index.Get().NormalizeVectors();
    338     BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
    339     return;
    340   }
    341   FilePath file_path = indexing_it_->first;
    342   FileUtilProxy::CreateOrOpen(
    343       BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
    344       file_path,
    345       base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ,
    346       Bind(&FileSystemIndexingJob::StartFileIndexing, this));
    347 }
    348 
    349 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
    350     PlatformFileError error,
    351     PassPlatformFile pass_file,
    352     bool) {
    353   if (error != base::PLATFORM_FILE_OK) {
    354     current_file_ = base::kInvalidPlatformFileValue;
    355     FinishFileIndexing(false);
    356     return;
    357   }
    358   current_file_ = pass_file.ReleaseValue();
    359   current_file_offset_ = 0;
    360   current_trigrams_.clear();
    361   std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
    362   ReadFromFile();
    363 }
    364 
    365 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
    366   if (stopped_) {
    367     CloseFile();
    368     return;
    369   }
    370   FileUtilProxy::Read(
    371       BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
    372       current_file_,
    373       current_file_offset_,
    374       kMaxReadLength,
    375       Bind(&FileSystemIndexingJob::OnRead, this));
    376 }
    377 
    378 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
    379     PlatformFileError error,
    380     const char* data,
    381     int bytes_read) {
    382   if (error != base::PLATFORM_FILE_OK) {
    383     FinishFileIndexing(false);
    384     return;
    385   }
    386 
    387   if (!bytes_read || bytes_read < 3) {
    388     FinishFileIndexing(true);
    389     return;
    390   }
    391 
    392   size_t size = static_cast<size_t>(bytes_read);
    393   vector<TrigramChar> trigram_chars;
    394   trigram_chars.reserve(size);
    395   for (size_t i = 0; i < size; ++i) {
    396     if (IsBinaryChar(data[i])) {
    397       current_trigrams_.clear();
    398       FinishFileIndexing(true);
    399       return;
    400     }
    401     trigram_chars.push_back(TrigramCharForChar(data[i]));
    402   }
    403 
    404   for (size_t i = 0; i + 2 < size; ++i) {
    405     Trigram trigram = TrigramAtIndex(trigram_chars, i);
    406     if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
    407       current_trigrams_set_[trigram] = true;
    408       current_trigrams_.push_back(trigram);
    409     }
    410   }
    411   current_file_offset_ += bytes_read - 2;
    412   ReadFromFile();
    413 }
    414 
    415 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
    416     bool success) {
    417   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    418   CloseFile();
    419   if (success) {
    420     FilePath file_path = indexing_it_->first;
    421     g_trigram_index.Get().SetTrigramsForFile(
    422         file_path, current_trigrams_, file_path_times_[file_path]);
    423   }
    424   ReportWorked();
    425   ++indexing_it_;
    426   IndexFiles();
    427 }
    428 
    429 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
    430   if (current_file_ != base::kInvalidPlatformFileValue) {
    431     FileUtilProxy::Close(
    432         BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
    433         current_file_,
    434         Bind(&FileSystemIndexingJob::CloseCallback, this));
    435   }
    436 }
    437 
    438 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
    439     PlatformFileError error) {}
    440 
    441 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
    442   TimeTicks current_time = TimeTicks::Now();
    443   bool should_send_worked_nitification = true;
    444   if (!last_worked_notification_time_.is_null()) {
    445     TimeDelta delta = current_time - last_worked_notification_time_;
    446     if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
    447       should_send_worked_nitification = false;
    448   }
    449   ++files_indexed_;
    450   if (should_send_worked_nitification) {
    451     last_worked_notification_time_ = current_time;
    452     BrowserThread::PostTask(
    453         BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
    454     files_indexed_ = 0;
    455   }
    456 }
    457 
    458 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
    459   static bool maps_initialized = false;
    460   if (!maps_initialized) {
    461     InitIsBinaryCharMap();
    462     InitTrigramCharsMap();
    463     maps_initialized = true;
    464   }
    465 }
    466 
    467 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
    468 
    469 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
    470 DevToolsFileSystemIndexer::IndexPath(
    471     const string& file_system_path,
    472     const TotalWorkCallback& total_work_callback,
    473     const WorkedCallback& worked_callback,
    474     const DoneCallback& done_callback) {
    475   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    476   scoped_refptr<FileSystemIndexingJob> indexing_job =
    477       new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
    478                                 total_work_callback,
    479                                 worked_callback,
    480                                 done_callback);
    481   indexing_job->Start();
    482   return indexing_job;
    483 }
    484 
    485 void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
    486                                              const string& query,
    487                                              const SearchCallback& callback) {
    488   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    489   BrowserThread::PostTask(
    490       BrowserThread::FILE,
    491       FROM_HERE,
    492       Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
    493            this,
    494            file_system_path,
    495            query,
    496            callback));
    497 }
    498 
    499 void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
    500     const string& file_system_path,
    501     const string& query,
    502     const SearchCallback& callback) {
    503   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
    504   vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
    505   vector<string> result;
    506   FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
    507   vector<FilePath>::const_iterator it = file_paths.begin();
    508   for (; it != file_paths.end(); ++it) {
    509     if (path.IsParent(*it))
    510       result.push_back(it->AsUTF8Unsafe());
    511   }
    512   BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
    513 }
    514