1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* ***** BEGIN LICENSE BLOCK ***** 3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 4 * 5 * The contents of this file are subject to the Mozilla Public License Version 6 * 1.1 (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * http://www.mozilla.org/MPL/ 9 * 10 * Software distributed under the License is distributed on an "AS IS" basis, 11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 12 * for the specific language governing rights and limitations under the 13 * License. 14 * 15 * The Original Code is the Mork Reader. 16 * 17 * The Initial Developer of the Original Code is 18 * Google Inc. 19 * Portions created by the Initial Developer are Copyright (C) 2006 20 * the Initial Developer. All Rights Reserved. 21 * 22 * Contributor(s): 23 * Brian Ryner <bryner (at) brianryner.com> (original author) 24 * 25 * Alternatively, the contents of this file may be used under the terms of 26 * either the GNU General Public License Version 2 or later (the "GPL"), or 27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 28 * in which case the provisions of the GPL or the LGPL are applicable instead 29 * of those above. If you wish to allow use of your version of this file only 30 * under the terms of either the GPL or the LGPL, and not to allow others to 31 * use your version of this file under the terms of the MPL, indicate your 32 * decision by deleting the provisions above and replace them with the notice 33 * and other provisions required by the GPL or the LGPL. If you do not delete 34 * the provisions above, a recipient may use your version of this file under 35 * the terms of any one of the MPL, the GPL or the LGPL. 36 * 37 * ***** END LICENSE BLOCK ***** */ 38 39 // Source: 40 // http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp 41 // This file has been converted to google style. 42 43 #include "chrome/browser/importer/mork_reader.h" 44 45 #include <algorithm> 46 47 #include "base/file_path.h" 48 #include "base/i18n/icu_string_conversions.h" 49 #include "base/logging.h" 50 #include "base/message_loop.h" 51 #include "base/string_number_conversions.h" 52 #include "base/string_util.h" 53 #include "chrome/browser/history/history_types.h" 54 #include "chrome/browser/importer/firefox_importer_utils.h" 55 #include "chrome/browser/importer/importer_bridge.h" 56 57 namespace { 58 59 // Convert a hex character (0-9, A-F) to its corresponding byte value. 60 // Returns -1 if the character is invalid. 61 inline int HexCharToInt(char c) { 62 if ('0' <= c && c <= '9') 63 return c - '0'; 64 if ('A' <= c && c <= 'F') 65 return c - 'A' + 10; 66 return -1; 67 } 68 69 // Unescape a Mork value. Mork uses $xx escaping to encode non-ASCII 70 // characters. Additionally, '$' and '\' are backslash-escaped. 71 // The result of the unescape is in returned. 72 std::string MorkUnescape(const std::string& input) { 73 // We optimize for speed over space here -- size the result buffer to 74 // the size of the source, which is an upper bound on the size of the 75 // unescaped string. 76 std::string result; 77 size_t input_length = input.size(); 78 result.reserve(input_length); 79 80 for (size_t i = 0; i < input_length; i++) { 81 char c = input[i]; 82 if (c == '\\') { 83 // Escaped literal, slip the backslash, append the next character. 84 i++; 85 if (i < input_length) 86 result.push_back(input[i]); 87 } else if (c == '$') { 88 // Dollar sign denotes a hex character. 89 if (i < input_length - 2) { 90 // Would be nice to use ToInteger() here, but it currently 91 // requires a null-terminated string. 92 int first = HexCharToInt(input[++i]); 93 int second = HexCharToInt(input[++i]); 94 if (first >= 0 && second >= 0) 95 result.push_back((first << 4) | second); 96 } 97 } else { 98 // Regular character, just append. 99 result.push_back(input[i]); 100 } 101 } 102 return result; 103 } 104 105 } // namespace 106 107 MorkReader::MorkReader() { 108 } 109 110 MorkReader::~MorkReader() { 111 // Need to delete all the pointers to vectors we have in the table. 112 for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) 113 delete i->second; 114 } 115 116 bool MorkReader::Read(const FilePath& path) { 117 stream_.open(path.value().c_str()); 118 if (!stream_.is_open()) 119 return false; 120 121 std::string line; 122 if (!ReadLine(&line) || 123 line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) 124 return false; // Unexpected file format. 125 126 IndexMap column_map; 127 while (ReadLine(&line)) { 128 // Trim off leading spaces 129 size_t idx = 0; 130 size_t len = line.size(); 131 while (idx < len && line[idx] == ' ') 132 ++idx; 133 if (idx >= len) 134 continue; 135 136 // Look at the line to figure out what section type this is 137 if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { 138 // Column map. We begin by creating a hash of column id to column name. 139 StringMap column_name_map; 140 ParseMap(line, idx, &column_name_map); 141 142 // Now that we have the list of columns, we put them into a flat array. 143 // Rows will have value arrays of the same size, with indexes that 144 // correspond to the columns array. As we insert each column into the 145 // array, we also make an entry in columnMap so that we can look up the 146 // index given the column id. 147 columns_.reserve(column_name_map.size()); 148 149 for (StringMap::const_iterator i = column_name_map.begin(); 150 i != column_name_map.end(); ++i) { 151 column_map[i->first] = static_cast<int>(columns_.size()); 152 MorkColumn col(i->first, i->second); 153 columns_.push_back(col); 154 } 155 } else if (StartsWithASCII(&line[idx], "<(", true)) { 156 // Value map. 157 ParseMap(line, idx, &value_map_); 158 } else if (line[idx] == '{' || line[idx] == '[') { 159 // Table / table row. 160 ParseTable(line, idx, &column_map); 161 } else { 162 // Don't know, hopefully don't care. 163 } 164 } 165 return true; 166 } 167 168 // Parses a key/value map of the form 169 // <(k1=v1)(k2=v2)...> 170 bool MorkReader::ParseMap(const std::string& first_line, 171 size_t start_index, 172 StringMap* map) { 173 // If the first line is the a=c line (column map), just skip over it. 174 std::string line(first_line); 175 if (StartsWithASCII(line, "< <(a=c)>", true)) 176 ReadLine(&line); 177 178 std::string key; 179 do { 180 size_t idx = start_index; 181 size_t len = line.size(); 182 size_t token_start; 183 184 while (idx < len) { 185 switch (line[idx++]) { 186 case '(': 187 // Beginning of a key/value pair. 188 if (!key.empty()) { 189 DLOG(WARNING) << "unterminated key/value pair?"; 190 key.clear(); 191 } 192 193 token_start = idx; 194 while (idx < len && line[idx] != '=') 195 ++idx; 196 key.assign(&line[token_start], idx - token_start); 197 break; 198 199 case '=': { 200 // Beginning of the value. 201 if (key.empty()) { 202 DLOG(WARNING) << "stray value"; 203 break; 204 } 205 206 token_start = idx; 207 while (idx < len && line[idx] != ')') { 208 if (line[idx] == '\\') 209 ++idx; // Skip escaped ')' characters. 210 ++idx; 211 } 212 size_t token_end = std::min(idx, len); 213 ++idx; 214 215 std::string value = MorkUnescape( 216 std::string(&line[token_start], token_end - token_start)); 217 (*map)[key] = value; 218 key.clear(); 219 break; 220 } 221 case '>': 222 // End of the map. 223 DLOG_IF(WARNING, key.empty()) << 224 "map terminates inside of key/value pair"; 225 return true; 226 } 227 } 228 229 // We should start reading the next line at the beginning. 230 start_index = 0; 231 } while (ReadLine(&line)); 232 233 // We ran out of lines and the map never terminated. This probably indicates 234 // a parsing error. 235 DLOG(WARNING) << "didn't find end of key/value map"; 236 return false; 237 } 238 239 // Parses a table row of the form [123(^45^67)..] 240 // (row id 123 has the value with id 67 for the column with id 45). 241 // A '^' prefix for a column or value references an entry in the column or 242 // value map. '=' is used as the separator when the value is a literal. 243 void MorkReader::ParseTable(const std::string& first_line, 244 size_t start_index, 245 const IndexMap* column_map) { 246 std::string line(first_line); 247 248 // Column index of the cell we're parsing, minus one if invalid. 249 int column_index = -1; 250 251 // Points to the current row we're parsing inside of the |table_|, will be 252 // NULL if we're not inside a row. 253 ColumnDataList* current_row = NULL; 254 255 bool in_meta_row = false; 256 257 do { 258 size_t idx = start_index; 259 size_t len = line.size(); 260 261 while (idx < len) { 262 switch (line[idx++]) { 263 case '{': 264 // This marks the beginning of a table section. There's a lot of 265 // junk before the first row that looks like cell values but isn't. 266 // Skip to the first '['. 267 while (idx < len && line[idx] != '[') { 268 if (line[idx] == '{') { 269 in_meta_row = true; // The meta row is enclosed in { } 270 } else if (line[idx] == '}') { 271 in_meta_row = false; 272 } 273 ++idx; 274 } 275 break; 276 277 case '[': { 278 // Start of a new row. Consume the row id, up to the first '('. 279 // Row edits also have a table namespace, separated from the row id 280 // by a colon. We don't make use of the namespace, but we need to 281 // make sure not to consider it part of the row id. 282 if (current_row) { 283 DLOG(WARNING) << "unterminated row?"; 284 current_row = NULL; 285 } 286 287 // Check for a '-' at the start of the id. This signifies that 288 // if the row already exists, we should delete all columns from it 289 // before adding the new values. 290 bool cut_columns; 291 if (idx < len && line[idx] == '-') { 292 cut_columns = true; 293 ++idx; 294 } else { 295 cut_columns = false; 296 } 297 298 // Locate the range of the ID. 299 size_t token_start = idx; // Index of the first char of the token. 300 while (idx < len && 301 line[idx] != '(' && 302 line[idx] != ']' && 303 line[idx] != ':') { 304 ++idx; 305 } 306 size_t token_end = idx; // Index of the char following the token. 307 while (idx < len && line[idx] != '(' && line[idx] != ']') { 308 ++idx; 309 } 310 311 if (in_meta_row) { 312 // Need to create the meta row. 313 meta_row_.resize(columns_.size()); 314 current_row = &meta_row_; 315 } else { 316 // Find or create the regular row for this. 317 IDString row_id(&line[token_start], token_end - token_start); 318 RowMap::iterator found_row = table_.find(row_id); 319 if (found_row == table_.end()) { 320 // We don't already have this row, create a new one for it. 321 current_row = new ColumnDataList(columns_.size()); 322 table_[row_id] = current_row; 323 } else { 324 // The row already exists and we're adding/replacing things. 325 current_row = found_row->second; 326 } 327 } 328 if (cut_columns) { 329 for (size_t i = 0; i < current_row->size(); ++i) 330 (*current_row)[i].clear(); 331 } 332 break; 333 } 334 335 case ']': 336 // We're done with the row. 337 current_row = NULL; 338 in_meta_row = false; 339 break; 340 341 case '(': { 342 if (!current_row) { 343 DLOG(WARNING) << "cell value outside of row"; 344 break; 345 } 346 347 bool column_is_atom; 348 if (line[idx] == '^') { 349 column_is_atom = true; 350 ++idx; // This is not part of the column id, advance past it. 351 } else { 352 column_is_atom = false; 353 } 354 size_t token_start = idx; 355 while (idx < len && line[idx] != '^' && line[idx] != '=') { 356 if (line[idx] == '\\') 357 ++idx; // Skip escaped characters. 358 ++idx; 359 } 360 361 size_t token_end = std::min(idx, len); 362 363 IDString column; 364 if (column_is_atom) 365 column.assign(&line[token_start], token_end - token_start); 366 else 367 column = MorkUnescape(line.substr(token_start, 368 token_end - token_start)); 369 370 IndexMap::const_iterator found_column = column_map->find(column); 371 if (found_column == column_map->end()) { 372 DLOG(WARNING) << "Column not in column map, discarding it"; 373 column_index = -1; 374 } else { 375 column_index = found_column->second; 376 } 377 break; 378 } 379 380 case '=': 381 case '^': { 382 if (column_index == -1) { 383 DLOG(WARNING) << "stray ^ or = marker"; 384 break; 385 } 386 387 bool value_is_atom = (line[idx - 1] == '^'); 388 size_t token_start = idx - 1; // Include the '=' or '^' marker. 389 while (idx < len && line[idx] != ')') { 390 if (line[idx] == '\\') 391 ++idx; // Skip escaped characters. 392 ++idx; 393 } 394 size_t token_end = std::min(idx, len); 395 ++idx; 396 397 if (value_is_atom) { 398 (*current_row)[column_index].assign(&line[token_start], 399 token_end - token_start); 400 } else { 401 (*current_row)[column_index] = 402 MorkUnescape(line.substr(token_start, token_end - token_start)); 403 } 404 column_index = -1; 405 } 406 break; 407 } 408 } 409 410 // Start parsing the next line at the beginning. 411 start_index = 0; 412 } while (current_row && ReadLine(&line)); 413 } 414 415 bool MorkReader::ReadLine(std::string* line) { 416 line->resize(256); 417 std::getline(stream_, *line); 418 if (stream_.eof() || stream_.bad()) 419 return false; 420 421 while (!line->empty() && (*line)[line->size() - 1] == '\\') { 422 // There is a continuation for this line. Read it and append. 423 std::string new_line; 424 std::getline(stream_, new_line); 425 if (stream_.eof()) 426 return false; 427 line->erase(line->size() - 1); 428 line->append(new_line); 429 } 430 431 return true; 432 } 433 434 void MorkReader::NormalizeValue(std::string* value) const { 435 if (value->empty()) 436 return; 437 MorkReader::StringMap::const_iterator i; 438 switch (value->at(0)) { 439 case '^': 440 // Hex ID, lookup the name for it in the |value_map_|. 441 i = value_map_.find(value->substr(1)); 442 if (i == value_map_.end()) 443 value->clear(); 444 else 445 *value = i->second; 446 break; 447 case '=': 448 // Just use the literal after the equals sign. 449 value->erase(value->begin()); 450 break; 451 default: 452 // Anything else is invalid. 453 value->clear(); 454 break; 455 } 456 } 457 458 // Source: 459 // http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp 460 461 // Columns for entry (non-meta) history rows 462 enum { 463 kURLColumn, 464 kNameColumn, 465 kVisitCountColumn, 466 kHiddenColumn, 467 kTypedColumn, 468 kLastVisitColumn, 469 kColumnCount // Keep me last. 470 }; 471 472 static const char * const gColumnNames[] = { 473 "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate" 474 }; 475 476 struct TableReadClosure { 477 explicit TableReadClosure(const MorkReader& r) 478 : reader(r), 479 swap_bytes(false), 480 byte_order_column(-1) { 481 for (int i = 0; i < kColumnCount; ++i) 482 column_indexes[i] = -1; 483 } 484 485 // Backpointers to the reader and history we're operating on. 486 const MorkReader& reader; 487 488 // Whether we need to swap bytes (file format is other-endian). 489 bool swap_bytes; 490 491 // Indexes of the columns that we care about. 492 int column_indexes[kColumnCount]; 493 int byte_order_column; 494 }; 495 496 void AddToHistory(MorkReader::ColumnDataList* column_values, 497 const TableReadClosure& data, 498 std::vector<history::URLRow>* rows) { 499 std::string values[kColumnCount]; 500 501 for (size_t i = 0; i < kColumnCount; ++i) { 502 if (data.column_indexes[i] != -1) { 503 values[i] = column_values->at(data.column_indexes[i]); 504 data.reader.NormalizeValue(&values[i]); 505 // Do not import hidden records. 506 if (i == kHiddenColumn && values[i] == "1") 507 return; 508 } 509 } 510 511 GURL url(values[kURLColumn]); 512 513 if (CanImportURL(url)) { 514 history::URLRow row(url); 515 516 string16 title; 517 if (data.swap_bytes) { 518 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16BE, 519 base::OnStringConversionError::SKIP, &title); 520 } else { 521 base::CodepageToUTF16(values[kNameColumn], base::kCodepageUTF16LE, 522 base::OnStringConversionError::SKIP, &title); 523 } 524 row.set_title(title); 525 526 int count = atoi(values[kVisitCountColumn].c_str()); 527 if (count == 0) 528 count = 1; 529 row.set_visit_count(count); 530 531 int64 date; 532 base::StringToInt64(values[kLastVisitColumn], &date); 533 if (date != 0) 534 row.set_last_visit(base::Time::FromTimeT(date / 1000000)); 535 536 bool is_typed = (values[kTypedColumn] == "1"); 537 if (is_typed) 538 row.set_typed_count(1); 539 540 rows->push_back(row); 541 } 542 } 543 544 // It sets up the file stream and loops over the lines in the file to 545 // parse them, then adds the resulting row set to history. 546 void ImportHistoryFromFirefox2(const FilePath& file, ImporterBridge* bridge) { 547 MorkReader reader; 548 reader.Read(file); 549 550 // Gather up the column ids so we don't need to find them on each row 551 TableReadClosure data(reader); 552 const MorkReader::MorkColumnList& columns = reader.columns(); 553 for (size_t i = 0; i < columns.size(); ++i) { 554 for (int j = 0; j < kColumnCount; ++j) 555 if (columns[i].name == gColumnNames[j]) { 556 data.column_indexes[j] = static_cast<int>(i); 557 break; 558 } 559 if (columns[i].name == "ByteOrder") 560 data.byte_order_column = static_cast<int>(i); 561 } 562 563 // Determine the byte order from the table's meta-row. 564 const MorkReader::ColumnDataList& meta_row = reader.meta_row(); 565 if (!meta_row.empty() && data.byte_order_column != -1) { 566 std::string byte_order = meta_row[data.byte_order_column]; 567 if (!byte_order.empty()) { 568 // Note whether the file uses a non-native byte ordering. 569 // If it does, we'll have to swap bytes for PRUnichar values. 570 // "BE" and "LE" are the only recognized values, anything 571 // else is garbage and the file will be treated as native-endian 572 // (no swapping). 573 std::string byte_order_value(byte_order); 574 reader.NormalizeValue(&byte_order_value); 575 data.swap_bytes = (byte_order_value == "BE"); 576 } 577 } 578 579 std::vector<history::URLRow> rows; 580 for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) 581 AddToHistory(i->second, data, &rows); 582 if (!rows.empty()) 583 bridge->SetHistoryItems(rows, history::SOURCE_FIREFOX_IMPORTED); 584 } 585