Home | History | Annotate | Download | only in importer
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* ***** BEGIN LICENSE BLOCK *****
      3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
      4  *
      5  * The contents of this file are subject to the Mozilla Public License Version
      6  * 1.1 (the "License"); you may not use this file except in compliance with
      7  * the License. You may obtain a copy of the License at
      8  * http://www.mozilla.org/MPL/
      9  *
     10  * Software distributed under the License is distributed on an "AS IS" basis,
     11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
     12  * for the specific language governing rights and limitations under the
     13  * License.
     14  *
     15  * The Original Code is the Mork Reader.
     16  *
     17  * The Initial Developer of the Original Code is
     18  * Google Inc.
     19  * Portions created by the Initial Developer are Copyright (C) 2006
     20  * the Initial Developer. All Rights Reserved.
     21  *
     22  * Contributor(s):
     23  *   Brian Ryner <bryner (at) brianryner.com> (original author)
     24  *
     25  * Alternatively, the contents of this file may be used under the terms of
     26  * either the GNU General Public License Version 2 or later (the "GPL"), or
     27  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
     28  * in which case the provisions of the GPL or the LGPL are applicable instead
     29  * of those above. If you wish to allow use of your version of this file only
     30  * under the terms of either the GPL or the LGPL, and not to allow others to
     31  * use your version of this file under the terms of the MPL, indicate your
     32  * decision by deleting the provisions above and replace them with the notice
     33  * and other provisions required by the GPL or the LGPL. If you do not delete
     34  * the provisions above, a recipient may use your version of this file under
     35  * the terms of any one of the MPL, the GPL or the LGPL.
     36  *
     37  * ***** END LICENSE BLOCK ***** */
     38 
     39 // Source:
     40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
     41 // This file has been converted to google style.
     42 
     43 #include "chrome/browser/importer/mork_reader.h"
     44 
     45 #include <algorithm>
     46 
     47 #include "base/file_path.h"
     48 #include "base/i18n/icu_string_conversions.h"
     49 #include "base/logging.h"
     50 #include "base/message_loop.h"
     51 #include "base/string_number_conversions.h"
     52 #include "base/string_util.h"
     53 #include "chrome/browser/history/history_types.h"
     54 #include "chrome/browser/importer/firefox_importer_utils.h"
     55 #include "chrome/browser/importer/importer_bridge.h"
     56 
     57 namespace {
     58 
     59 // Convert a hex character (0-9, A-F) to its corresponding byte value.
     60 // Returns -1 if the character is invalid.
     61 inline int HexCharToInt(char c) {
     62   if ('0' <= c && c <= '9')
     63     return c - '0';
     64   if ('A' <= c && c <= 'F')
     65     return c - 'A' + 10;
     66   return -1;
     67 }
     68 
     69 // Unescape a Mork value.  Mork uses $xx escaping to encode non-ASCII
     70 // characters.  Additionally, '$' and '\' are backslash-escaped.
     71 // The result of the unescape is in returned.
     72 std::string MorkUnescape(const std::string& input) {
     73   // We optimize for speed over space here -- size the result buffer to
     74   // the size of the source, which is an upper bound on the size of the
     75   // unescaped string.
     76   std::string result;
     77   size_t input_length = input.size();
     78   result.reserve(input_length);
     79 
     80   for (size_t i = 0; i < input_length; i++) {
     81     char c = input[i];
     82     if (c == '\\') {
     83       // Escaped literal, slip the backslash, append the next character.
     84       i++;
     85       if (i < input_length)
     86         result.push_back(input[i]);
     87     } else if (c == '$') {
     88       // Dollar sign denotes a hex character.
     89       if (i < input_length - 2) {
     90         // Would be nice to use ToInteger() here, but it currently
     91         // requires a null-terminated string.
     92         int first = HexCharToInt(input[++i]);
     93         int second = HexCharToInt(input[++i]);
     94         if (first >= 0 && second >= 0)
     95           result.push_back((first << 4) | second);
     96       }
     97     } else {
     98       // Regular character, just append.
     99       result.push_back(input[i]);
    100     }
    101   }
    102   return result;
    103 }
    104 
    105 }  // namespace
    106 
    107 MorkReader::MorkReader() {
    108 }
    109 
    110 MorkReader::~MorkReader() {
    111   // Need to delete all the pointers to vectors we have in the table.
    112   for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
    113     delete i->second;
    114 }
    115 
    116 bool MorkReader::Read(const FilePath& path) {
    117   stream_.open(path.value().c_str());
    118   if (!stream_.is_open())
    119     return false;
    120 
    121   std::string line;
    122   if (!ReadLine(&line) ||
    123       line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
    124     return false;  // Unexpected file format.
    125 
    126   IndexMap column_map;
    127   while (ReadLine(&line)) {
    128     // Trim off leading spaces
    129     size_t idx = 0;
    130     size_t len = line.size();
    131     while (idx < len && line[idx] == ' ')
    132       ++idx;
    133     if (idx >= len)
    134       continue;
    135 
    136     // Look at the line to figure out what section type this is
    137     if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
    138       // Column map.  We begin by creating a hash of column id to column name.
    139       StringMap column_name_map;
    140       ParseMap(line, idx, &column_name_map);
    141 
    142       // Now that we have the list of columns, we put them into a flat array.
    143       // Rows will have value arrays of the same size, with indexes that
    144       // correspond to the columns array.  As we insert each column into the
    145       // array, we also make an entry in columnMap so that we can look up the
    146       // index given the column id.
    147       columns_.reserve(column_name_map.size());
    148 
    149       for (StringMap::const_iterator i = column_name_map.begin();
    150            i != column_name_map.end(); ++i) {
    151         column_map[i->first] = static_cast<int>(columns_.size());
    152         MorkColumn col(i->first, i->second);
    153         columns_.push_back(col);
    154       }
    155     } else if (StartsWithASCII(&line[idx], "<(", true)) {
    156       // Value map.
    157       ParseMap(line, idx, &value_map_);
    158     } else if (line[idx] == '{' || line[idx] == '[') {
    159       // Table / table row.
    160       ParseTable(line, idx, &column_map);
    161     } else {
    162       // Don't know, hopefully don't care.
    163     }
    164   }
    165   return true;
    166 }
    167 
    168 // Parses a key/value map of the form
    169 // <(k1=v1)(k2=v2)...>
    170 bool MorkReader::ParseMap(const std::string& first_line,
    171                           size_t start_index,
    172                           StringMap* map) {
    173   // If the first line is the a=c line (column map), just skip over it.
    174   std::string line(first_line);
    175   if (StartsWithASCII(line, "< <(a=c)>", true))
    176     ReadLine(&line);
    177 
    178   std::string key;
    179   do {
    180     size_t idx = start_index;
    181     size_t len = line.size();
    182     size_t token_start;
    183 
    184     while (idx < len) {
    185       switch (line[idx++]) {
    186         case '(':
    187           // Beginning of a key/value pair.
    188           if (!key.empty()) {
    189             DLOG(WARNING) << "unterminated key/value pair?";
    190             key.clear();
    191           }
    192 
    193           token_start = idx;
    194           while (idx < len && line[idx] != '=')
    195             ++idx;
    196           key.assign(&line[token_start], idx - token_start);
    197           break;
    198 
    199         case '=': {
    200           // Beginning of the value.
    201           if (key.empty()) {
    202             DLOG(WARNING) << "stray value";
    203             break;
    204           }
    205 
    206           token_start = idx;
    207           while (idx < len && line[idx] != ')') {
    208             if (line[idx] == '\\')
    209               ++idx;  // Skip escaped ')' characters.
    210             ++idx;
    211           }
    212           size_t token_end = std::min(idx, len);
    213           ++idx;
    214 
    215           std::string value = MorkUnescape(
    216               std::string(&line[token_start], token_end - token_start));
    217           (*map)[key] = value;
    218           key.clear();
    219           break;
    220         }
    221         case '>':
    222           // End of the map.
    223           DLOG_IF(WARNING, key.empty()) <<
    224               "map terminates inside of key/value pair";
    225           return true;
    226       }
    227     }
    228 
    229     // We should start reading the next line at the beginning.
    230     start_index = 0;
    231   } while (ReadLine(&line));
    232 
    233   // We ran out of lines and the map never terminated.  This probably indicates
    234   // a parsing error.
    235   DLOG(WARNING) << "didn't find end of key/value map";
    236   return false;
    237 }
    238 
    239 // Parses a table row of the form [123(^45^67)..]
    240 // (row id 123 has the value with id 67 for the column with id 45).
    241 // A '^' prefix for a column or value references an entry in the column or
    242 // value map.  '=' is used as the separator when the value is a literal.
    243 void MorkReader::ParseTable(const std::string& first_line,
    244                             size_t start_index,
    245                             const IndexMap* column_map) {
    246   std::string line(first_line);
    247 
    248   // Column index of the cell we're parsing, minus one if invalid.
    249   int column_index = -1;
    250 
    251   // Points to the current row we're parsing inside of the |table_|, will be
    252   // NULL if we're not inside a row.
    253   ColumnDataList* current_row = NULL;
    254 
    255   bool in_meta_row = false;
    256 
    257   do {
    258     size_t idx = start_index;
    259     size_t len = line.size();
    260 
    261     while (idx < len) {
    262       switch (line[idx++]) {
    263         case '{':
    264           // This marks the beginning of a table section.  There's a lot of
    265           // junk before the first row that looks like cell values but isn't.
    266           // Skip to the first '['.
    267           while (idx < len && line[idx] != '[') {
    268             if (line[idx] == '{') {
    269               in_meta_row = true;  // The meta row is enclosed in { }
    270             } else if (line[idx] == '}') {
    271               in_meta_row = false;
    272             }
    273             ++idx;
    274           }
    275           break;
    276 
    277         case '[': {
    278           // Start of a new row.  Consume the row id, up to the first '('.
    279           // Row edits also have a table namespace, separated from the row id
    280           // by a colon.  We don't make use of the namespace, but we need to
    281           // make sure not to consider it part of the row id.
    282           if (current_row) {
    283             DLOG(WARNING) << "unterminated row?";
    284             current_row = NULL;
    285           }
    286 
    287           // Check for a '-' at the start of the id.  This signifies that
    288           // if the row already exists, we should delete all columns from it
    289           // before adding the new values.
    290           bool cut_columns;
    291           if (idx < len && line[idx] == '-') {
    292             cut_columns = true;
    293             ++idx;
    294           } else {
    295             cut_columns = false;
    296           }
    297 
    298           // Locate the range of the ID.
    299           size_t token_start = idx;  // Index of the first char of the token.
    300           while (idx < len &&
    301                  line[idx] != '(' &&
    302                  line[idx] != ']' &&
    303                  line[idx] != ':') {
    304             ++idx;
    305           }
    306           size_t token_end = idx;  // Index of the char following the token.
    307           while (idx < len && line[idx] != '(' && line[idx] != ']') {
    308             ++idx;
    309           }
    310 
    311           if (in_meta_row) {
    312             // Need to create the meta row.
    313             meta_row_.resize(columns_.size());
    314             current_row = &meta_row_;
    315           } else {
    316             // Find or create the regular row for this.
    317             IDString row_id(&line[token_start], token_end - token_start);
    318             RowMap::iterator found_row = table_.find(row_id);
    319             if (found_row == table_.end()) {
    320               // We don't already have this row, create a new one for it.
    321               current_row = new ColumnDataList(columns_.size());
    322               table_[row_id] = current_row;
    323             } else {
    324               // The row already exists and we're adding/replacing things.
    325               current_row = found_row->second;
    326             }
    327           }
    328           if (cut_columns) {
    329             for (size_t i = 0; i < current_row->size(); ++i)
    330               (*current_row)[i].clear();
    331           }
    332           break;
    333         }
    334 
    335         case ']':
    336           // We're done with the row.
    337           current_row = NULL;
    338           in_meta_row = false;
    339           break;
    340 
    341         case '(': {
    342           if (!current_row) {
    343             DLOG(WARNING) << "cell value outside of row";
    344             break;
    345           }
    346 
    347           bool column_is_atom;
    348           if (line[idx] == '^') {
    349             column_is_atom = true;
    350             ++idx;  // This is not part of the column id, advance past it.
    351           } else {
    352             column_is_atom = false;
    353           }
    354           size_t token_start = idx;
    355           while (idx < len && line[idx] != '^' && line[idx] != '=') {
    356             if (line[idx] == '\\')
    357               ++idx;  // Skip escaped characters.
    358             ++idx;
    359           }
    360 
    361           size_t token_end = std::min(idx, len);
    362 
    363           IDString column;
    364           if (column_is_atom)
    365             column.assign(&line[token_start], token_end - token_start);
    366           else
    367             column = MorkUnescape(line.substr(token_start,
    368                                               token_end - token_start));
    369 
    370           IndexMap::const_iterator found_column = column_map->find(column);
    371           if (found_column == column_map->end()) {
    372             DLOG(WARNING) << "Column not in column map, discarding it";
    373             column_index = -1;
    374           } else {
    375             column_index = found_column->second;
    376           }
    377           break;
    378         }
    379 
    380         case '=':
    381         case '^': {
    382           if (column_index == -1) {
    383             DLOG(WARNING) << "stray ^ or = marker";
    384             break;
    385           }
    386 
    387           bool value_is_atom = (line[idx - 1] == '^');
    388           size_t token_start = idx - 1;  // Include the '=' or '^' marker.
    389           while (idx < len && line[idx] != ')') {
    390             if (line[idx] == '\\')
    391               ++idx;  // Skip escaped characters.
    392             ++idx;
    393           }
    394           size_t token_end = std::min(idx, len);
    395           ++idx;
    396 
    397           if (value_is_atom) {
    398             (*current_row)[column_index].assign(&line[token_start],
    399                                                 token_end - token_start);
    400           } else {
    401             (*current_row)[column_index] =
    402                 MorkUnescape(line.substr(token_start, token_end - token_start));
    403           }
    404           column_index = -1;
    405         }
    406         break;
    407       }
    408     }
    409 
    410     // Start parsing the next line at the beginning.
    411     start_index = 0;
    412   } while (current_row && ReadLine(&line));
    413 }
    414 
    415 bool MorkReader::ReadLine(std::string* line) {
    416   line->resize(256);
    417   std::getline(stream_, *line);
    418   if (stream_.eof() || stream_.bad())
    419     return false;
    420 
    421   while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
    422     // There is a continuation for this line.  Read it and append.
    423     std::string new_line;
    424     std::getline(stream_, new_line);
    425     if (stream_.eof())
    426       return false;
    427     line->erase(line->size() - 1);
    428     line->append(new_line);
    429   }
    430 
    431   return true;
    432 }
    433 
    434 void MorkReader::NormalizeValue(std::string* value) const {
    435   if (value->empty())
    436     return;
    437   MorkReader::StringMap::const_iterator i;
    438   switch (value->at(0)) {
    439     case '^':
    440       // Hex ID, lookup the name for it in the |value_map_|.
    441       i = value_map_.find(value->substr(1));
    442       if (i == value_map_.end())
    443         value->clear();
    444       else
    445         *value = i->second;
    446       break;
    447     case '=':
    448       // Just use the literal after the equals sign.
    449       value->erase(value->begin());
    450       break;
    451     default:
    452       // Anything else is invalid.
    453       value->clear();
    454       break;
    455   }
    456 }
    457 
    458 // Source:
    459 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
    460 
    461 // Columns for entry (non-meta) history rows
    462 enum {
    463   kURLColumn,
    464   kNameColumn,
    465   kVisitCountColumn,
    466   kHiddenColumn,
    467   kTypedColumn,
    468   kLastVisitColumn,
    469   kColumnCount  // Keep me last.
    470 };
    471 
    472 static const char * const gColumnNames[] = {
    473   "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate"
    474 };
    475 
    476 struct TableReadClosure {
    477   explicit TableReadClosure(const MorkReader& r)
    478       : reader(r),
    479         swap_bytes(false),
    480         byte_order_column(-1) {
    481     for (int i = 0; i < kColumnCount; ++i)
    482       column_indexes[i] = -1;
    483   }
    484 
    485   // Backpointers to the reader and history we're operating on.
    486   const MorkReader& reader;
    487 
    488   // Whether we need to swap bytes (file format is other-endian).
    489   bool swap_bytes;
    490 
    491   // Indexes of the columns that we care about.
    492   int column_indexes[kColumnCount];
    493   int byte_order_column;
    494 };
    495 
    496 void AddToHistory(MorkReader::ColumnDataList* column_values,
    497                   const TableReadClosure& data,
    498                   std::vector<history::URLRow>* rows) {
    499   std::string values[kColumnCount];
    500 
    501   for (size_t i = 0; i < kColumnCount; ++i) {
    502     if (data.column_indexes[i] != -1) {
    503       values[i] = column_values->at(data.column_indexes[i]);
    504       data.reader.NormalizeValue(&values[i]);
    505       // Do not import hidden records.
    506       if (i == kHiddenColumn && values[i] == "1")
    507         return;
    508     }
    509   }
    510 
    511   GURL url(values[kURLColumn]);
    512 
    513   if (CanImportURL(url)) {
    514     history::URLRow row(url);
    515 
    516     string16 title;
    517     if (data.swap_bytes) {
    518       base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE,
    519                             base::OnStringConversionError::SKIP, &title);
    520     } else {
    521       base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE,
    522                             base::OnStringConversionError::SKIP, &title);
    523     }
    524     row.set_title(title);
    525 
    526     int count = atoi(values[kVisitCountColumn].c_str());
    527     if (count == 0)
    528       count = 1;
    529     row.set_visit_count(count);
    530 
    531     int64 date;
    532     base::StringToInt64(values[kLastVisitColumn], &date);
    533     if (date != 0)
    534       row.set_last_visit(base::Time::FromTimeT(date / 1000000));
    535 
    536     bool is_typed = (values[kTypedColumn] == "1");
    537     if (is_typed)
    538       row.set_typed_count(1);
    539 
    540     rows->push_back(row);
    541   }
    542 }
    543 
    544 // It sets up the file stream and loops over the lines in the file to
    545 // parse them, then adds the resulting row set to history.
    546 void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) {
    547   MorkReader reader;
    548   reader.Read(file);
    549 
    550   // Gather up the column ids so we don't need to find them on each row
    551   TableReadClosure data(reader);
    552   const MorkReader::MorkColumnList& columns = reader.columns();
    553   for (size_t i = 0; i < columns.size(); ++i) {
    554     for (int j = 0; j < kColumnCount; ++j)
    555       if (columns[i].name == gColumnNames[j]) {
    556         data.column_indexes[j] = static_cast<int>(i);
    557         break;
    558       }
    559     if (columns[i].name == "ByteOrder")
    560       data.byte_order_column = static_cast<int>(i);
    561   }
    562 
    563   // Determine the byte order from the table's meta-row.
    564   const MorkReader::ColumnDataList& meta_row = reader.meta_row();
    565   if (!meta_row.empty() && data.byte_order_column != -1) {
    566     std::string byte_order = meta_row[data.byte_order_column];
    567     if (!byte_order.empty()) {
    568       // Note whether the file uses a non-native byte ordering.
    569       // If it does, we'll have to swap bytes for PRUnichar values.
    570       // "BE" and "LE" are the only recognized values, anything
    571       // else is garbage and the file will be treated as native-endian
    572       // (no swapping).
    573       std::string byte_order_value(byte_order);
    574       reader.NormalizeValue(&byte_order_value);
    575       data.swap_bytes = (byte_order_value == "BE");
    576     }
    577   }
    578 
    579   std::vector<history::URLRow> rows;
    580   for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
    581     AddToHistory(i->second, data, &rows);
    582   if (!rows.empty())
    583     bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED);
    584 }
    585