1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // A policy for storing activity log data to a database that performs 6 // aggregation to reduce the size of the database. The database layout is 7 // nearly the same as FullStreamUIPolicy, which stores a complete log, with a 8 // few changes: 9 // - a "count" column is added to track how many log records were merged 10 // together into this row 11 // - the "time" column measures the most recent time that the current row was 12 // updated 13 // When writing a record, if a row already exists where all other columns 14 // (extension_id, action_type, api_name, args, urls, etc.) all match, and the 15 // previous time falls within today (the current time), then the count field on 16 // the old row is incremented. Otherwise, a new row is written. 17 // 18 // For many text columns, repeated strings are compressed by moving string 19 // storage to a separate table ("string_ids") and storing only an identifier in 20 // the logging table. For example, if the api_name_x column contained the 21 // value 4 and the string_ids table contained a row with primary key 4 and 22 // value 'tabs.query', then the api_name field should be taken to have the 23 // value 'tabs.query'. Each column ending with "_x" is compressed in this way. 24 // All lookups are to the string_ids table, except for the page_url_x and 25 // arg_url_x columns, which are converted via the url_ids table (this 26 // separation of URL values is to help simplify history clearing). 27 // 28 // The activitylog_uncompressed view allows for simpler reading of the activity 29 // log contents with identifiers already translated to string values. 30 31 #include "chrome/browser/extensions/activity_log/counting_policy.h" 32 33 #include <map> 34 #include <string> 35 #include <vector> 36 37 #include "base/callback.h" 38 #include "base/files/file_path.h" 39 #include "base/json/json_reader.h" 40 #include "base/json/json_string_value_serializer.h" 41 #include "base/strings/string_util.h" 42 #include "base/strings/stringprintf.h" 43 #include "chrome/common/chrome_constants.h" 44 45 using content::BrowserThread; 46 47 namespace { 48 49 // Delay between cleaning passes (to delete old action records) through the 50 // database. 51 const int kCleaningDelayInHours = 12; 52 53 // We should log the arguments to these API calls. Be careful when 54 // constructing this whitelist to not keep arguments that might compromise 55 // privacy by logging too much data to the activity log. 56 // 57 // TODO(mvrable): The contents of this whitelist should be reviewed and 58 // expanded as needed. 59 const char* kAlwaysLog[] = {"extension.connect", "extension.sendMessage", 60 "tabs.executeScript", "tabs.insertCSS"}; 61 62 // Columns in the main database table. See the file-level comment for a 63 // discussion of how data is stored and the meanings of the _x columns. 64 const char* kTableContentFields[] = { 65 "count", "extension_id_x", "time", "action_type", "api_name_x", "args_x", 66 "page_url_x", "page_title_x", "arg_url_x", "other_x"}; 67 const char* kTableFieldTypes[] = { 68 "INTEGER NOT NULL DEFAULT 1", "INTEGER NOT NULL", "INTEGER", "INTEGER", 69 "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", 70 "INTEGER"}; 71 72 // Miscellaneous SQL commands for initializing the database; these should be 73 // idempotent. 74 static const char kPolicyMiscSetup[] = 75 // The activitylog_uncompressed view performs string lookups for simpler 76 // access to the log data. 77 "DROP VIEW IF EXISTS activitylog_uncompressed;\n" 78 "CREATE VIEW activitylog_uncompressed AS\n" 79 "SELECT count,\n" 80 " x1.value AS extension_id,\n" 81 " time,\n" 82 " action_type,\n" 83 " x2.value AS api_name,\n" 84 " x3.value AS args,\n" 85 " x4.value AS page_url,\n" 86 " x5.value AS page_title,\n" 87 " x6.value AS arg_url,\n" 88 " x7.value AS other\n" 89 "FROM activitylog_compressed\n" 90 " LEFT JOIN string_ids AS x1 ON (x1.id = extension_id_x)\n" 91 " LEFT JOIN string_ids AS x2 ON (x2.id = api_name_x)\n" 92 " LEFT JOIN string_ids AS x3 ON (x3.id = args_x)\n" 93 " LEFT JOIN url_ids AS x4 ON (x4.id = page_url_x)\n" 94 " LEFT JOIN string_ids AS x5 ON (x5.id = page_title_x)\n" 95 " LEFT JOIN url_ids AS x6 ON (x6.id = arg_url_x)\n" 96 " LEFT JOIN string_ids AS x7 ON (x7.id = other_x);\n" 97 // An index on all fields except count and time: all the fields that aren't 98 // changed when incrementing a count. This should accelerate finding the 99 // rows to update (at worst several rows will need to be checked to find 100 // the one in the right time range). 101 "CREATE INDEX IF NOT EXISTS activitylog_compressed_index\n" 102 "ON activitylog_compressed(extension_id_x, action_type, api_name_x,\n" 103 " args_x, page_url_x, page_title_x, arg_url_x, other_x)"; 104 105 // SQL statements to clean old, unused entries out of the string and URL id 106 // tables. 107 static const char kStringTableCleanup[] = 108 "DELETE FROM string_ids WHERE id NOT IN\n" 109 "(SELECT extension_id_x FROM activitylog_compressed\n" 110 " UNION SELECT api_name_x FROM activitylog_compressed\n" 111 " UNION SELECT args_x FROM activitylog_compressed\n" 112 " UNION SELECT page_title_x FROM activitylog_compressed\n" 113 " UNION SELECT other_x FROM activitylog_compressed)"; 114 static const char kUrlTableCleanup[] = 115 "DELETE FROM url_ids WHERE id NOT IN\n" 116 "(SELECT page_url_x FROM activitylog_compressed\n" 117 " UNION SELECT arg_url_x FROM activitylog_compressed)"; 118 119 } // namespace 120 121 namespace extensions { 122 123 // A specialized Action subclass which is used to represent an action read from 124 // the database with a corresponding count. 125 class CountedAction : public Action { 126 public: 127 CountedAction(const std::string& extension_id, 128 const base::Time& time, 129 const ActionType action_type, 130 const std::string& api_name) 131 : Action(extension_id, time, action_type, api_name) {} 132 133 // Number of merged records for this action. 134 int count() const { return count_; } 135 void set_count(int count) { count_ = count; } 136 137 virtual std::string PrintForDebug() const OVERRIDE; 138 139 protected: 140 virtual ~CountedAction() {} 141 142 private: 143 int count_; 144 }; 145 146 std::string CountedAction::PrintForDebug() const { 147 return base::StringPrintf( 148 "%s COUNT=%d", Action::PrintForDebug().c_str(), count()); 149 } 150 151 const char* CountingPolicy::kTableName = "activitylog_compressed"; 152 const char* CountingPolicy::kReadViewName = "activitylog_uncompressed"; 153 154 CountingPolicy::CountingPolicy(Profile* profile) 155 : ActivityLogDatabasePolicy( 156 profile, 157 base::FilePath(chrome::kExtensionActivityLogFilename)), 158 string_table_("string_ids"), 159 url_table_("url_ids"), 160 retention_time_(base::TimeDelta::FromHours(60)) { 161 for (size_t i = 0; i < arraysize(kAlwaysLog); i++) { 162 api_arg_whitelist_.insert(kAlwaysLog[i]); 163 } 164 } 165 166 CountingPolicy::~CountingPolicy() {} 167 168 bool CountingPolicy::InitDatabase(sql::Connection* db) { 169 if (!Util::DropObsoleteTables(db)) 170 return false; 171 172 if (!string_table_.Initialize(db)) 173 return false; 174 if (!url_table_.Initialize(db)) 175 return false; 176 177 // Create the unified activity log entry table. 178 if (!ActivityDatabase::InitializeTable(db, 179 kTableName, 180 kTableContentFields, 181 kTableFieldTypes, 182 arraysize(kTableContentFields))) 183 return false; 184 185 // Create a view for easily accessing the uncompressed form of the data, and 186 // any necessary indexes if needed. 187 return db->Execute(kPolicyMiscSetup); 188 } 189 190 void CountingPolicy::ProcessAction(scoped_refptr<Action> action) { 191 ScheduleAndForget(this, &CountingPolicy::QueueAction, action); 192 } 193 194 void CountingPolicy::QueueAction(scoped_refptr<Action> action) { 195 if (activity_database()->is_db_valid()) { 196 action = action->Clone(); 197 Util::StripPrivacySensitiveFields(action); 198 Util::StripArguments(api_arg_whitelist_, action); 199 queued_actions_.push_back(action); 200 activity_database()->AdviseFlush(queued_actions_.size()); 201 } 202 } 203 204 bool CountingPolicy::FlushDatabase(sql::Connection* db) { 205 // Columns that must match exactly for database rows to be coalesced. 206 static const char* matched_columns[] = { 207 "extension_id_x", "action_type", "api_name_x", "args_x", "page_url_x", 208 "page_title_x", "arg_url_x", "other_x"}; 209 Action::ActionVector queue; 210 queue.swap(queued_actions_); 211 212 // Whether to clean old records out of the activity log database. Do this 213 // much less frequently than database flushes since it is expensive, but 214 // always check on the first database flush (since there might be a large 215 // amount of data to clear). 216 bool clean_database = (last_database_cleaning_time_.is_null() || 217 Now() - last_database_cleaning_time_ > 218 base::TimeDelta::FromHours(kCleaningDelayInHours)); 219 220 if (queue.empty() && !clean_database) 221 return true; 222 223 sql::Transaction transaction(db); 224 if (!transaction.Begin()) 225 return false; 226 227 std::string insert_str = 228 "INSERT INTO " + std::string(kTableName) + "(count, time"; 229 std::string update_str = 230 "UPDATE " + std::string(kTableName) + 231 " SET count = count + 1, time = max(?, time)" 232 " WHERE time >= ? AND time < ?"; 233 234 for (size_t i = 0; i < arraysize(matched_columns); i++) { 235 insert_str = 236 base::StringPrintf("%s, %s", insert_str.c_str(), matched_columns[i]); 237 update_str = base::StringPrintf( 238 "%s AND %s IS ?", update_str.c_str(), matched_columns[i]); 239 } 240 insert_str += ") VALUES (1, ?"; 241 for (size_t i = 0; i < arraysize(matched_columns); i++) { 242 insert_str += ", ?"; 243 } 244 insert_str += ")"; 245 246 Action::ActionVector::size_type i; 247 for (i = 0; i != queue.size(); ++i) { 248 const Action& action = *queue[i]; 249 250 base::Time day_start = action.time().LocalMidnight(); 251 base::Time next_day = Util::AddDays(day_start, 1); 252 253 // The contents in values must match up with fields in matched_columns. A 254 // value of -1 is used to encode a null database value. 255 int64 id; 256 std::vector<int64> matched_values; 257 258 if (!string_table_.StringToInt(db, action.extension_id(), &id)) 259 return false; 260 matched_values.push_back(id); 261 262 matched_values.push_back(static_cast<int>(action.action_type())); 263 264 if (!string_table_.StringToInt(db, action.api_name(), &id)) 265 return false; 266 matched_values.push_back(id); 267 268 if (action.args()) { 269 std::string args = Util::Serialize(action.args()); 270 // TODO(mvrable): For now, truncate long argument lists. This is a 271 // workaround for excessively-long values coming from DOM logging. When 272 // the V8ValueConverter is fixed to return more reasonable values, we can 273 // drop the truncation. 274 if (args.length() > 10000) { 275 args = "[\"<too_large>\"]"; 276 } 277 if (!string_table_.StringToInt(db, args, &id)) 278 return false; 279 matched_values.push_back(id); 280 } else { 281 matched_values.push_back(-1); 282 } 283 284 std::string page_url_string = action.SerializePageUrl(); 285 if (!page_url_string.empty()) { 286 if (!url_table_.StringToInt(db, page_url_string, &id)) 287 return false; 288 matched_values.push_back(id); 289 } else { 290 matched_values.push_back(-1); 291 } 292 293 // TODO(mvrable): Create a title_table_? 294 if (!action.page_title().empty()) { 295 if (!string_table_.StringToInt(db, action.page_title(), &id)) 296 return false; 297 matched_values.push_back(id); 298 } else { 299 matched_values.push_back(-1); 300 } 301 302 std::string arg_url_string = action.SerializeArgUrl(); 303 if (!arg_url_string.empty()) { 304 if (!url_table_.StringToInt(db, arg_url_string, &id)) 305 return false; 306 matched_values.push_back(id); 307 } else { 308 matched_values.push_back(-1); 309 } 310 311 if (action.other()) { 312 if (!string_table_.StringToInt(db, Util::Serialize(action.other()), &id)) 313 return false; 314 matched_values.push_back(id); 315 } else { 316 matched_values.push_back(-1); 317 } 318 319 // Assume there is an existing row for this action, and try to update the 320 // count. 321 sql::Statement update_statement(db->GetCachedStatement( 322 sql::StatementID(SQL_FROM_HERE), update_str.c_str())); 323 update_statement.BindInt64(0, action.time().ToInternalValue()); 324 update_statement.BindInt64(1, day_start.ToInternalValue()); 325 update_statement.BindInt64(2, next_day.ToInternalValue()); 326 for (size_t j = 0; j < matched_values.size(); j++) { 327 // A call to BindNull when matched_values contains -1 is likely not 328 // necessary as parameters default to null before they are explicitly 329 // bound. But to be completely clear, and in case a cached statement 330 // ever comes with some values already bound, we bind all parameters 331 // (even null ones) explicitly. 332 if (matched_values[j] == -1) 333 update_statement.BindNull(j + 3); 334 else 335 update_statement.BindInt64(j + 3, matched_values[j]); 336 } 337 if (!update_statement.Run()) 338 return false; 339 340 // Check if the update succeeded (was the count of updated rows non-zero)? 341 // If it failed because no matching row existed, fall back to inserting a 342 // new record. 343 if (db->GetLastChangeCount() > 0) { 344 if (db->GetLastChangeCount() > 1) { 345 LOG(WARNING) << "Found and updated multiple rows in the activity log " 346 << "database; counts may be off!"; 347 } 348 continue; 349 } 350 sql::Statement insert_statement(db->GetCachedStatement( 351 sql::StatementID(SQL_FROM_HERE), insert_str.c_str())); 352 insert_statement.BindInt64(0, action.time().ToInternalValue()); 353 for (size_t j = 0; j < matched_values.size(); j++) { 354 if (matched_values[j] == -1) 355 insert_statement.BindNull(j + 1); 356 else 357 insert_statement.BindInt64(j + 1, matched_values[j]); 358 } 359 if (!insert_statement.Run()) 360 return false; 361 } 362 363 if (clean_database) { 364 base::Time cutoff = (Now() - retention_time()).LocalMidnight(); 365 if (!CleanOlderThan(db, cutoff)) 366 return false; 367 last_database_cleaning_time_ = Now(); 368 } 369 370 if (!transaction.Commit()) 371 return false; 372 373 return true; 374 } 375 376 scoped_ptr<Action::ActionVector> CountingPolicy::DoReadData( 377 const std::string& extension_id, 378 const int days_ago) { 379 // Ensure data is flushed to the database first so that we query over all 380 // data. 381 activity_database()->AdviseFlush(ActivityDatabase::kFlushImmediately); 382 383 DCHECK_GE(days_ago, 0); 384 scoped_ptr<Action::ActionVector> actions(new Action::ActionVector()); 385 386 sql::Connection* db = GetDatabaseConnection(); 387 if (!db) { 388 return actions.Pass(); 389 } 390 391 int64 early_bound; 392 int64 late_bound; 393 Util::ComputeDatabaseTimeBounds(Now(), days_ago, &early_bound, &late_bound); 394 std::string query_str = base::StringPrintf( 395 "SELECT time, action_type, api_name, args, page_url, page_title, " 396 "arg_url, other, count " 397 "FROM %s WHERE extension_id=? AND time>? AND time<=? " 398 "ORDER BY time DESC", 399 kReadViewName); 400 sql::Statement query(db->GetCachedStatement(SQL_FROM_HERE, 401 query_str.c_str())); 402 query.BindString(0, extension_id); 403 query.BindInt64(1, early_bound); 404 query.BindInt64(2, late_bound); 405 406 while (query.is_valid() && query.Step()) { 407 scoped_refptr<CountedAction> action = 408 new CountedAction(extension_id, 409 base::Time::FromInternalValue(query.ColumnInt64(0)), 410 static_cast<Action::ActionType>(query.ColumnInt(1)), 411 query.ColumnString(2)); 412 413 if (query.ColumnType(3) != sql::COLUMN_TYPE_NULL) { 414 scoped_ptr<Value> parsed_value( 415 base::JSONReader::Read(query.ColumnString(3))); 416 if (parsed_value && parsed_value->IsType(Value::TYPE_LIST)) { 417 action->set_args( 418 make_scoped_ptr(static_cast<ListValue*>(parsed_value.release()))); 419 } else { 420 LOG(WARNING) << "Unable to parse args: '" << query.ColumnString(3) 421 << "'"; 422 } 423 } 424 425 action->ParsePageUrl(query.ColumnString(4)); 426 action->set_page_title(query.ColumnString(5)); 427 action->ParseArgUrl(query.ColumnString(6)); 428 429 if (query.ColumnType(7) != sql::COLUMN_TYPE_NULL) { 430 scoped_ptr<Value> parsed_value( 431 base::JSONReader::Read(query.ColumnString(7))); 432 if (parsed_value && parsed_value->IsType(Value::TYPE_DICTIONARY)) { 433 action->set_other(make_scoped_ptr( 434 static_cast<DictionaryValue*>(parsed_value.release()))); 435 } else { 436 LOG(WARNING) << "Unable to parse other: '" << query.ColumnString(7) 437 << "'"; 438 } 439 } 440 441 action->set_count(query.ColumnInt(8)); 442 443 actions->push_back(action); 444 } 445 446 return actions.Pass(); 447 } 448 449 void CountingPolicy::ReadData( 450 const std::string& extension_id, 451 const int day, 452 const base::Callback<void(scoped_ptr<Action::ActionVector>)>& callback) { 453 BrowserThread::PostTaskAndReplyWithResult( 454 BrowserThread::DB, 455 FROM_HERE, 456 base::Bind(&CountingPolicy::DoReadData, 457 base::Unretained(this), 458 extension_id, 459 day), 460 callback); 461 } 462 463 void CountingPolicy::OnDatabaseFailure() { 464 queued_actions_.clear(); 465 } 466 467 void CountingPolicy::OnDatabaseClose() { 468 delete this; 469 } 470 471 // Cleans old records from the activity log database. 472 bool CountingPolicy::CleanOlderThan(sql::Connection* db, 473 const base::Time& cutoff) { 474 std::string clean_statement = 475 "DELETE FROM " + std::string(kTableName) + " WHERE time < ?"; 476 sql::Statement cleaner(db->GetCachedStatement(sql::StatementID(SQL_FROM_HERE), 477 clean_statement.c_str())); 478 cleaner.BindInt64(0, cutoff.ToInternalValue()); 479 if (!cleaner.Run()) 480 return false; 481 return CleanStringTables(db); 482 } 483 484 // Cleans unused interned strings from the database. This should be run after 485 // deleting rows from the main log table to clean out stale values. 486 bool CountingPolicy::CleanStringTables(sql::Connection* db) { 487 sql::Statement cleaner1(db->GetCachedStatement( 488 sql::StatementID(SQL_FROM_HERE), kStringTableCleanup)); 489 if (!cleaner1.Run()) 490 return false; 491 if (db->GetLastChangeCount() > 0) 492 string_table_.ClearCache(); 493 494 sql::Statement cleaner2(db->GetCachedStatement( 495 sql::StatementID(SQL_FROM_HERE), kUrlTableCleanup)); 496 if (!cleaner2.Run()) 497 return false; 498 if (db->GetLastChangeCount() > 0) 499 url_table_.ClearCache(); 500 501 return true; 502 } 503 504 void CountingPolicy::Close() { 505 // The policy object should have never been created if there's no DB thread. 506 DCHECK(BrowserThread::IsMessageLoopValid(BrowserThread::DB)); 507 ScheduleAndForget(activity_database(), &ActivityDatabase::Close); 508 } 509 510 } // namespace extensions 511