Home | History | Annotate | Download | only in activity_log
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // A policy for storing activity log data to a database that performs
      6 // aggregation to reduce the size of the database.  The database layout is
      7 // nearly the same as FullStreamUIPolicy, which stores a complete log, with a
      8 // few changes:
      9 //   - a "count" column is added to track how many log records were merged
     10 //     together into this row
     11 //   - the "time" column measures the most recent time that the current row was
     12 //     updated
     13 // When writing a record, if a row already exists where all other columns
     14 // (extension_id, action_type, api_name, args, urls, etc.) all match, and the
     15 // previous time falls within today (the current time), then the count field on
     16 // the old row is incremented.  Otherwise, a new row is written.
     17 //
     18 // For many text columns, repeated strings are compressed by moving string
     19 // storage to a separate table ("string_ids") and storing only an identifier in
     20 // the logging table.  For example, if the api_name_x column contained the
     21 // value 4 and the string_ids table contained a row with primary key 4 and
     22 // value 'tabs.query', then the api_name field should be taken to have the
     23 // value 'tabs.query'.  Each column ending with "_x" is compressed in this way.
     24 // All lookups are to the string_ids table, except for the page_url_x and
     25 // arg_url_x columns, which are converted via the url_ids table (this
     26 // separation of URL values is to help simplify history clearing).
     27 //
     28 // The activitylog_uncompressed view allows for simpler reading of the activity
     29 // log contents with identifiers already translated to string values.
     30 
     31 #include "chrome/browser/extensions/activity_log/counting_policy.h"
     32 
     33 #include <map>
     34 #include <string>
     35 #include <vector>
     36 
     37 #include "base/callback.h"
     38 #include "base/files/file_path.h"
     39 #include "base/json/json_reader.h"
     40 #include "base/json/json_string_value_serializer.h"
     41 #include "base/strings/string_util.h"
     42 #include "base/strings/stringprintf.h"
     43 #include "chrome/common/chrome_constants.h"
     44 
     45 using content::BrowserThread;
     46 
     47 namespace {
     48 
     49 // Delay between cleaning passes (to delete old action records) through the
     50 // database.
     51 const int kCleaningDelayInHours = 12;
     52 
     53 // We should log the arguments to these API calls.  Be careful when
     54 // constructing this whitelist to not keep arguments that might compromise
     55 // privacy by logging too much data to the activity log.
     56 //
     57 // TODO(mvrable): The contents of this whitelist should be reviewed and
     58 // expanded as needed.
     59 const char* kAlwaysLog[] = {"extension.connect", "extension.sendMessage",
     60                             "tabs.executeScript", "tabs.insertCSS"};
     61 
     62 // Columns in the main database table.  See the file-level comment for a
     63 // discussion of how data is stored and the meanings of the _x columns.
     64 const char* kTableContentFields[] = {
     65     "count", "extension_id_x", "time", "action_type", "api_name_x", "args_x",
     66     "page_url_x", "page_title_x", "arg_url_x", "other_x"};
     67 const char* kTableFieldTypes[] = {
     68     "INTEGER NOT NULL DEFAULT 1", "INTEGER NOT NULL", "INTEGER", "INTEGER",
     69     "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
     70     "INTEGER"};
     71 
     72 // Miscellaneous SQL commands for initializing the database; these should be
     73 // idempotent.
     74 static const char kPolicyMiscSetup[] =
     75     // The activitylog_uncompressed view performs string lookups for simpler
     76     // access to the log data.
     77     "DROP VIEW IF EXISTS activitylog_uncompressed;\n"
     78     "CREATE VIEW activitylog_uncompressed AS\n"
     79     "SELECT count,\n"
     80     "    x1.value AS extension_id,\n"
     81     "    time,\n"
     82     "    action_type,\n"
     83     "    x2.value AS api_name,\n"
     84     "    x3.value AS args,\n"
     85     "    x4.value AS page_url,\n"
     86     "    x5.value AS page_title,\n"
     87     "    x6.value AS arg_url,\n"
     88     "    x7.value AS other\n"
     89     "FROM activitylog_compressed\n"
     90     "    LEFT JOIN string_ids AS x1 ON (x1.id = extension_id_x)\n"
     91     "    LEFT JOIN string_ids AS x2 ON (x2.id = api_name_x)\n"
     92     "    LEFT JOIN string_ids AS x3 ON (x3.id = args_x)\n"
     93     "    LEFT JOIN url_ids    AS x4 ON (x4.id = page_url_x)\n"
     94     "    LEFT JOIN string_ids AS x5 ON (x5.id = page_title_x)\n"
     95     "    LEFT JOIN url_ids    AS x6 ON (x6.id = arg_url_x)\n"
     96     "    LEFT JOIN string_ids AS x7 ON (x7.id = other_x);\n"
     97     // An index on all fields except count and time: all the fields that aren't
     98     // changed when incrementing a count.  This should accelerate finding the
     99     // rows to update (at worst several rows will need to be checked to find
    100     // the one in the right time range).
    101     "CREATE INDEX IF NOT EXISTS activitylog_compressed_index\n"
    102     "ON activitylog_compressed(extension_id_x, action_type, api_name_x,\n"
    103     "    args_x, page_url_x, page_title_x, arg_url_x, other_x)";
    104 
    105 // SQL statements to clean old, unused entries out of the string and URL id
    106 // tables.
    107 static const char kStringTableCleanup[] =
    108     "DELETE FROM string_ids WHERE id NOT IN\n"
    109     "(SELECT extension_id_x FROM activitylog_compressed\n"
    110     " UNION SELECT api_name_x FROM activitylog_compressed\n"
    111     " UNION SELECT args_x FROM activitylog_compressed\n"
    112     " UNION SELECT page_title_x FROM activitylog_compressed\n"
    113     " UNION SELECT other_x FROM activitylog_compressed)";
    114 static const char kUrlTableCleanup[] =
    115     "DELETE FROM url_ids WHERE id NOT IN\n"
    116     "(SELECT page_url_x FROM activitylog_compressed\n"
    117     " UNION SELECT arg_url_x FROM activitylog_compressed)";
    118 
    119 }  // namespace
    120 
    121 namespace extensions {
    122 
    123 // A specialized Action subclass which is used to represent an action read from
    124 // the database with a corresponding count.
    125 class CountedAction : public Action {
    126  public:
    127   CountedAction(const std::string& extension_id,
    128                 const base::Time& time,
    129                 const ActionType action_type,
    130                 const std::string& api_name)
    131       : Action(extension_id, time, action_type, api_name) {}
    132 
    133   // Number of merged records for this action.
    134   int count() const { return count_; }
    135   void set_count(int count) { count_ = count; }
    136 
    137   virtual std::string PrintForDebug() const OVERRIDE;
    138 
    139  protected:
    140   virtual ~CountedAction() {}
    141 
    142  private:
    143   int count_;
    144 };
    145 
    146 std::string CountedAction::PrintForDebug() const {
    147   return base::StringPrintf(
    148       "%s COUNT=%d", Action::PrintForDebug().c_str(), count());
    149 }
    150 
    151 const char* CountingPolicy::kTableName = "activitylog_compressed";
    152 const char* CountingPolicy::kReadViewName = "activitylog_uncompressed";
    153 
    154 CountingPolicy::CountingPolicy(Profile* profile)
    155     : ActivityLogDatabasePolicy(
    156           profile,
    157           base::FilePath(chrome::kExtensionActivityLogFilename)),
    158       string_table_("string_ids"),
    159       url_table_("url_ids"),
    160       retention_time_(base::TimeDelta::FromHours(60)) {
    161   for (size_t i = 0; i < arraysize(kAlwaysLog); i++) {
    162     api_arg_whitelist_.insert(kAlwaysLog[i]);
    163   }
    164 }
    165 
    166 CountingPolicy::~CountingPolicy() {}
    167 
    168 bool CountingPolicy::InitDatabase(sql::Connection* db) {
    169   if (!Util::DropObsoleteTables(db))
    170     return false;
    171 
    172   if (!string_table_.Initialize(db))
    173     return false;
    174   if (!url_table_.Initialize(db))
    175     return false;
    176 
    177   // Create the unified activity log entry table.
    178   if (!ActivityDatabase::InitializeTable(db,
    179                                          kTableName,
    180                                          kTableContentFields,
    181                                          kTableFieldTypes,
    182                                          arraysize(kTableContentFields)))
    183     return false;
    184 
    185   // Create a view for easily accessing the uncompressed form of the data, and
    186   // any necessary indexes if needed.
    187   return db->Execute(kPolicyMiscSetup);
    188 }
    189 
    190 void CountingPolicy::ProcessAction(scoped_refptr<Action> action) {
    191   ScheduleAndForget(this, &CountingPolicy::QueueAction, action);
    192 }
    193 
    194 void CountingPolicy::QueueAction(scoped_refptr<Action> action) {
    195   if (activity_database()->is_db_valid()) {
    196     action = action->Clone();
    197     Util::StripPrivacySensitiveFields(action);
    198     Util::StripArguments(api_arg_whitelist_, action);
    199     queued_actions_.push_back(action);
    200     activity_database()->AdviseFlush(queued_actions_.size());
    201   }
    202 }
    203 
    204 bool CountingPolicy::FlushDatabase(sql::Connection* db) {
    205   // Columns that must match exactly for database rows to be coalesced.
    206   static const char* matched_columns[] = {
    207       "extension_id_x", "action_type", "api_name_x", "args_x", "page_url_x",
    208       "page_title_x", "arg_url_x", "other_x"};
    209   Action::ActionVector queue;
    210   queue.swap(queued_actions_);
    211 
    212   // Whether to clean old records out of the activity log database.  Do this
    213   // much less frequently than database flushes since it is expensive, but
    214   // always check on the first database flush (since there might be a large
    215   // amount of data to clear).
    216   bool clean_database = (last_database_cleaning_time_.is_null() ||
    217                          Now() - last_database_cleaning_time_ >
    218                              base::TimeDelta::FromHours(kCleaningDelayInHours));
    219 
    220   if (queue.empty() && !clean_database)
    221     return true;
    222 
    223   sql::Transaction transaction(db);
    224   if (!transaction.Begin())
    225     return false;
    226 
    227   std::string insert_str =
    228       "INSERT INTO " + std::string(kTableName) + "(count, time";
    229   std::string update_str =
    230       "UPDATE " + std::string(kTableName) +
    231       " SET count = count + 1, time = max(?, time)"
    232       " WHERE time >= ? AND time < ?";
    233 
    234   for (size_t i = 0; i < arraysize(matched_columns); i++) {
    235     insert_str =
    236         base::StringPrintf("%s, %s", insert_str.c_str(), matched_columns[i]);
    237     update_str = base::StringPrintf(
    238         "%s AND %s IS ?", update_str.c_str(), matched_columns[i]);
    239   }
    240   insert_str += ") VALUES (1, ?";
    241   for (size_t i = 0; i < arraysize(matched_columns); i++) {
    242     insert_str += ", ?";
    243   }
    244   insert_str += ")";
    245 
    246   Action::ActionVector::size_type i;
    247   for (i = 0; i != queue.size(); ++i) {
    248     const Action& action = *queue[i];
    249 
    250     base::Time day_start = action.time().LocalMidnight();
    251     base::Time next_day = Util::AddDays(day_start, 1);
    252 
    253     // The contents in values must match up with fields in matched_columns.  A
    254     // value of -1 is used to encode a null database value.
    255     int64 id;
    256     std::vector<int64> matched_values;
    257 
    258     if (!string_table_.StringToInt(db, action.extension_id(), &id))
    259       return false;
    260     matched_values.push_back(id);
    261 
    262     matched_values.push_back(static_cast<int>(action.action_type()));
    263 
    264     if (!string_table_.StringToInt(db, action.api_name(), &id))
    265       return false;
    266     matched_values.push_back(id);
    267 
    268     if (action.args()) {
    269       std::string args = Util::Serialize(action.args());
    270       // TODO(mvrable): For now, truncate long argument lists.  This is a
    271       // workaround for excessively-long values coming from DOM logging.  When
    272       // the V8ValueConverter is fixed to return more reasonable values, we can
    273       // drop the truncation.
    274       if (args.length() > 10000) {
    275         args = "[\"<too_large>\"]";
    276       }
    277       if (!string_table_.StringToInt(db, args, &id))
    278         return false;
    279       matched_values.push_back(id);
    280     } else {
    281       matched_values.push_back(-1);
    282     }
    283 
    284     std::string page_url_string = action.SerializePageUrl();
    285     if (!page_url_string.empty()) {
    286       if (!url_table_.StringToInt(db, page_url_string, &id))
    287         return false;
    288       matched_values.push_back(id);
    289     } else {
    290       matched_values.push_back(-1);
    291     }
    292 
    293     // TODO(mvrable): Create a title_table_?
    294     if (!action.page_title().empty()) {
    295       if (!string_table_.StringToInt(db, action.page_title(), &id))
    296         return false;
    297       matched_values.push_back(id);
    298     } else {
    299       matched_values.push_back(-1);
    300     }
    301 
    302     std::string arg_url_string = action.SerializeArgUrl();
    303     if (!arg_url_string.empty()) {
    304       if (!url_table_.StringToInt(db, arg_url_string, &id))
    305         return false;
    306       matched_values.push_back(id);
    307     } else {
    308       matched_values.push_back(-1);
    309     }
    310 
    311     if (action.other()) {
    312       if (!string_table_.StringToInt(db, Util::Serialize(action.other()), &id))
    313         return false;
    314       matched_values.push_back(id);
    315     } else {
    316       matched_values.push_back(-1);
    317     }
    318 
    319     // Assume there is an existing row for this action, and try to update the
    320     // count.
    321     sql::Statement update_statement(db->GetCachedStatement(
    322         sql::StatementID(SQL_FROM_HERE), update_str.c_str()));
    323     update_statement.BindInt64(0, action.time().ToInternalValue());
    324     update_statement.BindInt64(1, day_start.ToInternalValue());
    325     update_statement.BindInt64(2, next_day.ToInternalValue());
    326     for (size_t j = 0; j < matched_values.size(); j++) {
    327       // A call to BindNull when matched_values contains -1 is likely not
    328       // necessary as parameters default to null before they are explicitly
    329       // bound.  But to be completely clear, and in case a cached statement
    330       // ever comes with some values already bound, we bind all parameters
    331       // (even null ones) explicitly.
    332       if (matched_values[j] == -1)
    333         update_statement.BindNull(j + 3);
    334       else
    335         update_statement.BindInt64(j + 3, matched_values[j]);
    336     }
    337     if (!update_statement.Run())
    338       return false;
    339 
    340     // Check if the update succeeded (was the count of updated rows non-zero)?
    341     // If it failed because no matching row existed, fall back to inserting a
    342     // new record.
    343     if (db->GetLastChangeCount() > 0) {
    344       if (db->GetLastChangeCount() > 1) {
    345         LOG(WARNING) << "Found and updated multiple rows in the activity log "
    346                      << "database; counts may be off!";
    347       }
    348       continue;
    349     }
    350     sql::Statement insert_statement(db->GetCachedStatement(
    351         sql::StatementID(SQL_FROM_HERE), insert_str.c_str()));
    352     insert_statement.BindInt64(0, action.time().ToInternalValue());
    353     for (size_t j = 0; j < matched_values.size(); j++) {
    354       if (matched_values[j] == -1)
    355         insert_statement.BindNull(j + 1);
    356       else
    357         insert_statement.BindInt64(j + 1, matched_values[j]);
    358     }
    359     if (!insert_statement.Run())
    360       return false;
    361   }
    362 
    363   if (clean_database) {
    364     base::Time cutoff = (Now() - retention_time()).LocalMidnight();
    365     if (!CleanOlderThan(db, cutoff))
    366       return false;
    367     last_database_cleaning_time_ = Now();
    368   }
    369 
    370   if (!transaction.Commit())
    371     return false;
    372 
    373   return true;
    374 }
    375 
    376 scoped_ptr<Action::ActionVector> CountingPolicy::DoReadData(
    377     const std::string& extension_id,
    378     const int days_ago) {
    379   // Ensure data is flushed to the database first so that we query over all
    380   // data.
    381   activity_database()->AdviseFlush(ActivityDatabase::kFlushImmediately);
    382 
    383   DCHECK_GE(days_ago, 0);
    384   scoped_ptr<Action::ActionVector> actions(new Action::ActionVector());
    385 
    386   sql::Connection* db = GetDatabaseConnection();
    387   if (!db) {
    388     return actions.Pass();
    389   }
    390 
    391   int64 early_bound;
    392   int64 late_bound;
    393   Util::ComputeDatabaseTimeBounds(Now(), days_ago, &early_bound, &late_bound);
    394   std::string query_str = base::StringPrintf(
    395       "SELECT time, action_type, api_name, args, page_url, page_title, "
    396       "arg_url, other, count "
    397       "FROM %s WHERE extension_id=? AND time>? AND time<=? "
    398       "ORDER BY time DESC",
    399       kReadViewName);
    400   sql::Statement query(db->GetCachedStatement(SQL_FROM_HERE,
    401                                               query_str.c_str()));
    402   query.BindString(0, extension_id);
    403   query.BindInt64(1, early_bound);
    404   query.BindInt64(2, late_bound);
    405 
    406   while (query.is_valid() && query.Step()) {
    407     scoped_refptr<CountedAction> action =
    408         new CountedAction(extension_id,
    409                           base::Time::FromInternalValue(query.ColumnInt64(0)),
    410                           static_cast<Action::ActionType>(query.ColumnInt(1)),
    411                           query.ColumnString(2));
    412 
    413     if (query.ColumnType(3) != sql::COLUMN_TYPE_NULL) {
    414       scoped_ptr<Value> parsed_value(
    415           base::JSONReader::Read(query.ColumnString(3)));
    416       if (parsed_value && parsed_value->IsType(Value::TYPE_LIST)) {
    417         action->set_args(
    418             make_scoped_ptr(static_cast<ListValue*>(parsed_value.release())));
    419       } else {
    420         LOG(WARNING) << "Unable to parse args: '" << query.ColumnString(3)
    421                      << "'";
    422       }
    423     }
    424 
    425     action->ParsePageUrl(query.ColumnString(4));
    426     action->set_page_title(query.ColumnString(5));
    427     action->ParseArgUrl(query.ColumnString(6));
    428 
    429     if (query.ColumnType(7) != sql::COLUMN_TYPE_NULL) {
    430       scoped_ptr<Value> parsed_value(
    431           base::JSONReader::Read(query.ColumnString(7)));
    432       if (parsed_value && parsed_value->IsType(Value::TYPE_DICTIONARY)) {
    433         action->set_other(make_scoped_ptr(
    434             static_cast<DictionaryValue*>(parsed_value.release())));
    435       } else {
    436         LOG(WARNING) << "Unable to parse other: '" << query.ColumnString(7)
    437                      << "'";
    438       }
    439     }
    440 
    441     action->set_count(query.ColumnInt(8));
    442 
    443     actions->push_back(action);
    444   }
    445 
    446   return actions.Pass();
    447 }
    448 
    449 void CountingPolicy::ReadData(
    450     const std::string& extension_id,
    451     const int day,
    452     const base::Callback<void(scoped_ptr<Action::ActionVector>)>& callback) {
    453   BrowserThread::PostTaskAndReplyWithResult(
    454       BrowserThread::DB,
    455       FROM_HERE,
    456       base::Bind(&CountingPolicy::DoReadData,
    457                  base::Unretained(this),
    458                  extension_id,
    459                  day),
    460       callback);
    461 }
    462 
    463 void CountingPolicy::OnDatabaseFailure() {
    464   queued_actions_.clear();
    465 }
    466 
    467 void CountingPolicy::OnDatabaseClose() {
    468   delete this;
    469 }
    470 
    471 // Cleans old records from the activity log database.
    472 bool CountingPolicy::CleanOlderThan(sql::Connection* db,
    473                                     const base::Time& cutoff) {
    474   std::string clean_statement =
    475       "DELETE FROM " + std::string(kTableName) + " WHERE time < ?";
    476   sql::Statement cleaner(db->GetCachedStatement(sql::StatementID(SQL_FROM_HERE),
    477                                                 clean_statement.c_str()));
    478   cleaner.BindInt64(0, cutoff.ToInternalValue());
    479   if (!cleaner.Run())
    480     return false;
    481   return CleanStringTables(db);
    482 }
    483 
    484 // Cleans unused interned strings from the database.  This should be run after
    485 // deleting rows from the main log table to clean out stale values.
    486 bool CountingPolicy::CleanStringTables(sql::Connection* db) {
    487   sql::Statement cleaner1(db->GetCachedStatement(
    488       sql::StatementID(SQL_FROM_HERE), kStringTableCleanup));
    489   if (!cleaner1.Run())
    490     return false;
    491   if (db->GetLastChangeCount() > 0)
    492     string_table_.ClearCache();
    493 
    494   sql::Statement cleaner2(db->GetCachedStatement(
    495       sql::StatementID(SQL_FROM_HERE), kUrlTableCleanup));
    496   if (!cleaner2.Run())
    497     return false;
    498   if (db->GetLastChangeCount() > 0)
    499     url_table_.ClearCache();
    500 
    501   return true;
    502 }
    503 
    504 void CountingPolicy::Close() {
    505   // The policy object should have never been created if there's no DB thread.
    506   DCHECK(BrowserThread::IsMessageLoopValid(BrowserThread::DB));
    507   ScheduleAndForget(activity_database(), &ActivityDatabase::Close);
    508 }
    509 
    510 }  // namespace extensions
    511