1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/browser/devtools/devtools_file_system_indexer.h" 6 7 #include <iterator> 8 9 #include "base/bind.h" 10 #include "base/callback.h" 11 #include "base/file_util.h" 12 #include "base/files/file_enumerator.h" 13 #include "base/files/file_util_proxy.h" 14 #include "base/lazy_instance.h" 15 #include "base/logging.h" 16 #include "base/platform_file.h" 17 #include "base/strings/utf_string_conversions.h" 18 #include "content/public/browser/browser_thread.h" 19 20 using base::Bind; 21 using base::Callback; 22 using base::FileEnumerator; 23 using base::FilePath; 24 using base::FileUtilProxy; 25 using base::Time; 26 using base::TimeDelta; 27 using base::TimeTicks; 28 using base::PassPlatformFile; 29 using base::PlatformFile; 30 using base::PlatformFileError; 31 using base::PlatformFileInfo; 32 using content::BrowserThread; 33 using std::map; 34 using std::set; 35 using std::string; 36 using std::vector; 37 38 namespace { 39 40 typedef int32 Trigram; 41 typedef char TrigramChar; 42 typedef uint16 FileId; 43 44 const int kMinTimeoutBetweenWorkedNitification = 200; 45 // Trigram characters include all ASCII printable characters (32-126) except for 46 // the capital letters, because the index is case insensitive. 47 const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1; 48 const size_t kTrigramCount = 49 kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount; 50 const int kMaxReadLength = 10 * 1024; 51 const TrigramChar kUndefinedTrigramChar = -1; 52 const Trigram kUndefinedTrigram = -1; 53 54 base::LazyInstance<vector<bool> >::Leaky g_is_binary_char = 55 LAZY_INSTANCE_INITIALIZER; 56 57 base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars = 58 LAZY_INSTANCE_INITIALIZER; 59 60 class Index { 61 public: 62 Index(); 63 Time LastModifiedTimeForFile(const FilePath& file_path); 64 void SetTrigramsForFile(const FilePath& file_path, 65 const vector<Trigram>& index, 66 const Time& time); 67 vector<FilePath> Search(string query); 68 void PrintStats(); 69 void NormalizeVectors(); 70 71 private: 72 ~Index(); 73 74 FileId GetFileId(const FilePath& file_path); 75 76 typedef map<FilePath, FileId> FileIdsMap; 77 FileIdsMap file_ids_; 78 FileId last_file_id_; 79 // The index in this vector is the trigram id. 80 vector<vector<FileId> > index_; 81 typedef map<FilePath, Time> IndexedFilesMap; 82 IndexedFilesMap index_times_; 83 vector<bool> is_normalized_; 84 85 DISALLOW_COPY_AND_ASSIGN(Index); 86 }; 87 88 base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER; 89 90 void InitIsBinaryCharMap() { 91 for (size_t i = 0; i < 256; ++i) { 92 unsigned char ch = i; 93 bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127; 94 g_is_binary_char.Get().push_back(is_binary_char); 95 } 96 } 97 98 bool IsBinaryChar(char c) { 99 unsigned char uc = static_cast<unsigned char>(c); 100 return g_is_binary_char.Get()[uc]; 101 } 102 103 void InitTrigramCharsMap() { 104 for (size_t i = 0; i < 256; ++i) { 105 if (i > 127) { 106 g_trigram_chars.Get().push_back(kUndefinedTrigramChar); 107 continue; 108 } 109 char ch = i; 110 if (ch == '\t') 111 ch = ' '; 112 if (ch >= 'A' && ch <= 'Z') 113 ch = ch - 'A' + 'a'; 114 if ((IsBinaryChar(ch)) || (ch < ' ')) { 115 g_trigram_chars.Get().push_back(kUndefinedTrigramChar); 116 continue; 117 } 118 119 if (ch >= 'Z') 120 ch = ch - 'Z' - 1 + 'A'; 121 ch -= ' '; 122 char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount); 123 CHECK(ch >= 0 && ch < signed_trigram_char_count); 124 g_trigram_chars.Get().push_back(ch); 125 } 126 } 127 128 TrigramChar TrigramCharForChar(char c) { 129 unsigned char uc = static_cast<unsigned char>(c); 130 return g_trigram_chars.Get()[uc]; 131 } 132 133 Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) { 134 static int kTrigramCharacterCountSquared = 135 kTrigramCharacterCount * kTrigramCharacterCount; 136 if (trigram_chars[index] == kUndefinedTrigramChar || 137 trigram_chars[index + 1] == kUndefinedTrigramChar || 138 trigram_chars[index + 2] == kUndefinedTrigramChar) 139 return kUndefinedTrigram; 140 Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] + 141 kTrigramCharacterCount * trigram_chars[index + 1] + 142 trigram_chars[index + 2]; 143 return trigram; 144 } 145 146 Index::Index() : last_file_id_(0) { 147 index_.resize(kTrigramCount); 148 is_normalized_.resize(kTrigramCount); 149 std::fill(is_normalized_.begin(), is_normalized_.end(), true); 150 } 151 152 Index::~Index() {} 153 154 Time Index::LastModifiedTimeForFile(const FilePath& file_path) { 155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 156 Time last_modified_time; 157 if (index_times_.find(file_path) != index_times_.end()) 158 last_modified_time = index_times_[file_path]; 159 return last_modified_time; 160 } 161 162 void Index::SetTrigramsForFile(const FilePath& file_path, 163 const vector<Trigram>& index, 164 const Time& time) { 165 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 166 FileId file_id = GetFileId(file_path); 167 vector<Trigram>::const_iterator it = index.begin(); 168 for (; it != index.end(); ++it) { 169 Trigram trigram = *it; 170 index_[trigram].push_back(file_id); 171 is_normalized_[trigram] = false; 172 } 173 index_times_[file_path] = time; 174 } 175 176 vector<FilePath> Index::Search(string query) { 177 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 178 const char* data = query.c_str(); 179 vector<TrigramChar> trigram_chars; 180 trigram_chars.reserve(query.size()); 181 for (size_t i = 0; i < query.size(); ++i) 182 trigram_chars.push_back(TrigramCharForChar(data[i])); 183 vector<Trigram> trigrams; 184 for (size_t i = 0; i + 2 < query.size(); ++i) { 185 Trigram trigram = TrigramAtIndex(trigram_chars, i); 186 if (trigram != kUndefinedTrigram) 187 trigrams.push_back(trigram); 188 } 189 set<FileId> file_ids; 190 bool first = true; 191 vector<Trigram>::const_iterator it = trigrams.begin(); 192 for (; it != trigrams.end(); ++it) { 193 Trigram trigram = *it; 194 if (first) { 195 std::copy(index_[trigram].begin(), 196 index_[trigram].end(), 197 std::inserter(file_ids, file_ids.begin())); 198 first = false; 199 continue; 200 } 201 set<FileId> intersection; 202 std::set_intersection(file_ids.begin(), 203 file_ids.end(), 204 index_[trigram].begin(), 205 index_[trigram].end(), 206 std::inserter(intersection, intersection.begin())); 207 file_ids.swap(intersection); 208 } 209 vector<FilePath> result; 210 FileIdsMap::const_iterator ids_it = file_ids_.begin(); 211 for (; ids_it != file_ids_.end(); ++ids_it) { 212 if (trigrams.size() == 0 || 213 file_ids.find(ids_it->second) != file_ids.end()) { 214 result.push_back(ids_it->first); 215 } 216 } 217 return result; 218 } 219 220 FileId Index::GetFileId(const FilePath& file_path) { 221 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 222 string file_path_str = file_path.AsUTF8Unsafe(); 223 if (file_ids_.find(file_path) != file_ids_.end()) 224 return file_ids_[file_path]; 225 file_ids_[file_path] = ++last_file_id_; 226 return last_file_id_; 227 } 228 229 void Index::NormalizeVectors() { 230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 231 for (size_t i = 0; i < kTrigramCount; ++i) { 232 if (!is_normalized_[i]) { 233 std::sort(index_[i].begin(), index_[i].end()); 234 if (index_[i].capacity() > index_[i].size()) 235 vector<FileId>(index_[i]).swap(index_[i]); 236 is_normalized_[i] = true; 237 } 238 } 239 } 240 241 void Index::PrintStats() { 242 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 243 LOG(ERROR) << "Index stats:"; 244 size_t size = 0; 245 size_t maxSize = 0; 246 size_t capacity = 0; 247 for (size_t i = 0; i < kTrigramCount; ++i) { 248 if (index_[i].size() > maxSize) 249 maxSize = index_[i].size(); 250 size += index_[i].size(); 251 capacity += index_[i].capacity(); 252 } 253 LOG(ERROR) << " - total trigram count: " << size; 254 LOG(ERROR) << " - max file count per trigram: " << maxSize; 255 LOG(ERROR) << " - total vectors capacity " << capacity; 256 size_t total_index_size = 257 capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount; 258 LOG(ERROR) << " - estimated total index size " << total_index_size; 259 } 260 261 typedef Callback<void(bool, const vector<bool>&)> IndexerCallback; 262 263 } // namespace 264 265 DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob( 266 const FilePath& file_system_path, 267 const TotalWorkCallback& total_work_callback, 268 const WorkedCallback& worked_callback, 269 const DoneCallback& done_callback) 270 : file_system_path_(file_system_path), 271 total_work_callback_(total_work_callback), 272 worked_callback_(worked_callback), 273 done_callback_(done_callback), 274 files_indexed_(0), 275 stopped_(false) { 276 current_trigrams_set_.resize(kTrigramCount); 277 current_trigrams_.reserve(kTrigramCount); 278 } 279 280 DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {} 281 282 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() { 283 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 284 BrowserThread::PostTask( 285 BrowserThread::FILE, 286 FROM_HERE, 287 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this)); 288 } 289 290 void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() { 291 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 292 BrowserThread::PostTask(BrowserThread::FILE, 293 FROM_HERE, 294 Bind(&FileSystemIndexingJob::StopOnFileThread, this)); 295 } 296 297 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() { 298 stopped_ = true; 299 } 300 301 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() { 302 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 303 if (stopped_) 304 return; 305 if (!file_enumerator_) { 306 file_enumerator_.reset( 307 new FileEnumerator(file_system_path_, true, FileEnumerator::FILES)); 308 } 309 FilePath file_path = file_enumerator_->Next(); 310 if (file_path.empty()) { 311 BrowserThread::PostTask( 312 BrowserThread::UI, 313 FROM_HERE, 314 Bind(total_work_callback_, file_path_times_.size())); 315 indexing_it_ = file_path_times_.begin(); 316 IndexFiles(); 317 return; 318 } 319 Time saved_last_modified_time = 320 g_trigram_index.Get().LastModifiedTimeForFile(file_path); 321 FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo(); 322 Time current_last_modified_time = file_info.GetLastModifiedTime(); 323 if (current_last_modified_time > saved_last_modified_time) { 324 file_path_times_[file_path] = current_last_modified_time; 325 } 326 BrowserThread::PostTask( 327 BrowserThread::FILE, 328 FROM_HERE, 329 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this)); 330 } 331 332 void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() { 333 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 334 if (stopped_) 335 return; 336 if (indexing_it_ == file_path_times_.end()) { 337 g_trigram_index.Get().NormalizeVectors(); 338 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_); 339 return; 340 } 341 FilePath file_path = indexing_it_->first; 342 FileUtilProxy::CreateOrOpen( 343 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(), 344 file_path, 345 base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ, 346 Bind(&FileSystemIndexingJob::StartFileIndexing, this)); 347 } 348 349 void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing( 350 PlatformFileError error, 351 PassPlatformFile pass_file, 352 bool) { 353 if (error != base::PLATFORM_FILE_OK) { 354 current_file_ = base::kInvalidPlatformFileValue; 355 FinishFileIndexing(false); 356 return; 357 } 358 current_file_ = pass_file.ReleaseValue(); 359 current_file_offset_ = 0; 360 current_trigrams_.clear(); 361 std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false); 362 ReadFromFile(); 363 } 364 365 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() { 366 if (stopped_) { 367 CloseFile(); 368 return; 369 } 370 FileUtilProxy::Read( 371 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(), 372 current_file_, 373 current_file_offset_, 374 kMaxReadLength, 375 Bind(&FileSystemIndexingJob::OnRead, this)); 376 } 377 378 void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead( 379 PlatformFileError error, 380 const char* data, 381 int bytes_read) { 382 if (error != base::PLATFORM_FILE_OK) { 383 FinishFileIndexing(false); 384 return; 385 } 386 387 if (!bytes_read || bytes_read < 3) { 388 FinishFileIndexing(true); 389 return; 390 } 391 392 size_t size = static_cast<size_t>(bytes_read); 393 vector<TrigramChar> trigram_chars; 394 trigram_chars.reserve(size); 395 for (size_t i = 0; i < size; ++i) { 396 if (IsBinaryChar(data[i])) { 397 current_trigrams_.clear(); 398 FinishFileIndexing(true); 399 return; 400 } 401 trigram_chars.push_back(TrigramCharForChar(data[i])); 402 } 403 404 for (size_t i = 0; i + 2 < size; ++i) { 405 Trigram trigram = TrigramAtIndex(trigram_chars, i); 406 if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) { 407 current_trigrams_set_[trigram] = true; 408 current_trigrams_.push_back(trigram); 409 } 410 } 411 current_file_offset_ += bytes_read - 2; 412 ReadFromFile(); 413 } 414 415 void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing( 416 bool success) { 417 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 418 CloseFile(); 419 if (success) { 420 FilePath file_path = indexing_it_->first; 421 g_trigram_index.Get().SetTrigramsForFile( 422 file_path, current_trigrams_, file_path_times_[file_path]); 423 } 424 ReportWorked(); 425 ++indexing_it_; 426 IndexFiles(); 427 } 428 429 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() { 430 if (current_file_ != base::kInvalidPlatformFileValue) { 431 FileUtilProxy::Close( 432 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(), 433 current_file_, 434 Bind(&FileSystemIndexingJob::CloseCallback, this)); 435 } 436 } 437 438 void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback( 439 PlatformFileError error) {} 440 441 void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() { 442 TimeTicks current_time = TimeTicks::Now(); 443 bool should_send_worked_nitification = true; 444 if (!last_worked_notification_time_.is_null()) { 445 TimeDelta delta = current_time - last_worked_notification_time_; 446 if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification) 447 should_send_worked_nitification = false; 448 } 449 ++files_indexed_; 450 if (should_send_worked_nitification) { 451 last_worked_notification_time_ = current_time; 452 BrowserThread::PostTask( 453 BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_)); 454 files_indexed_ = 0; 455 } 456 } 457 458 DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() { 459 static bool maps_initialized = false; 460 if (!maps_initialized) { 461 InitIsBinaryCharMap(); 462 InitTrigramCharsMap(); 463 maps_initialized = true; 464 } 465 } 466 467 DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {} 468 469 scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob> 470 DevToolsFileSystemIndexer::IndexPath( 471 const string& file_system_path, 472 const TotalWorkCallback& total_work_callback, 473 const WorkedCallback& worked_callback, 474 const DoneCallback& done_callback) { 475 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 476 scoped_refptr<FileSystemIndexingJob> indexing_job = 477 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path), 478 total_work_callback, 479 worked_callback, 480 done_callback); 481 indexing_job->Start(); 482 return indexing_job; 483 } 484 485 void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path, 486 const string& query, 487 const SearchCallback& callback) { 488 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); 489 BrowserThread::PostTask( 490 BrowserThread::FILE, 491 FROM_HERE, 492 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread, 493 this, 494 file_system_path, 495 query, 496 callback)); 497 } 498 499 void DevToolsFileSystemIndexer::SearchInPathOnFileThread( 500 const string& file_system_path, 501 const string& query, 502 const SearchCallback& callback) { 503 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE)); 504 vector<FilePath> file_paths = g_trigram_index.Get().Search(query); 505 vector<string> result; 506 FilePath path = FilePath::FromUTF8Unsafe(file_system_path); 507 vector<FilePath>::const_iterator it = file_paths.begin(); 508 for (; it != file_paths.end(); ++it) { 509 if (path.IsParent(*it)) 510 result.push_back(it->AsUTF8Unsafe()); 511 } 512 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result)); 513 } 514