1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/utility/importer/bookmark_html_reader.h" 6 7 #include "base/callback.h" 8 #include "base/file_util.h" 9 #include "base/i18n/icu_string_conversions.h" 10 #include "base/strings/string_number_conversions.h" 11 #include "base/strings/string_split.h" 12 #include "base/strings/string_util.h" 13 #include "base/time/time.h" 14 #include "chrome/common/importer/imported_bookmark_entry.h" 15 #include "chrome/common/importer/imported_favicon_usage.h" 16 #include "chrome/utility/importer/favicon_reencode.h" 17 #include "content/public/common/url_constants.h" 18 #include "net/base/data_url.h" 19 #include "net/base/escape.h" 20 #include "url/gurl.h" 21 22 namespace { 23 24 // Fetches the given |attribute| value from the |attribute_list|. Returns true 25 // if successful, and |value| will contain the value. 26 bool GetAttribute(const std::string& attribute_list, 27 const std::string& attribute, 28 std::string* value) { 29 const char kQuote[] = "\""; 30 31 size_t begin = attribute_list.find(attribute + "=" + kQuote); 32 if (begin == std::string::npos) 33 return false; // Can't find the attribute. 34 35 begin += attribute.size() + 2; 36 size_t end = begin + 1; 37 38 while (end < attribute_list.size()) { 39 if (attribute_list[end] == '"' && 40 attribute_list[end - 1] != '\\') { 41 break; 42 } 43 end++; 44 } 45 46 if (end == attribute_list.size()) 47 return false; // The value is not quoted. 48 49 *value = attribute_list.substr(begin, end - begin); 50 return true; 51 } 52 53 // Given the URL of a page and a favicon data URL, adds an appropriate record 54 // to the given favicon usage vector. 55 void DataURLToFaviconUsage( 56 const GURL& link_url, 57 const GURL& favicon_data, 58 std::vector<ImportedFaviconUsage>* favicons) { 59 if (!link_url.is_valid() || !favicon_data.is_valid() || 60 !favicon_data.SchemeIs(chrome::kDataScheme)) 61 return; 62 63 // Parse the data URL. 64 std::string mime_type, char_set, data; 65 if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) || 66 data.empty()) 67 return; 68 69 ImportedFaviconUsage usage; 70 if (!importer::ReencodeFavicon( 71 reinterpret_cast<const unsigned char*>(&data[0]), 72 data.size(), &usage.png_data)) 73 return; // Unable to decode. 74 75 // We need to make up a URL for the favicon. We use a version of the page's 76 // URL so that we can be sure it will not collide. 77 usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec()); 78 79 // We only have one URL per favicon for Firefox 2 bookmarks. 80 usage.urls.insert(link_url); 81 82 favicons->push_back(usage); 83 } 84 85 } // namespace 86 87 namespace bookmark_html_reader { 88 89 void ImportBookmarksFile( 90 const base::Callback<bool(void)>& cancellation_callback, 91 const base::Callback<bool(const GURL&)>& valid_url_callback, 92 const base::FilePath& file_path, 93 std::vector<ImportedBookmarkEntry>* bookmarks, 94 std::vector<ImportedFaviconUsage>* favicons) { 95 std::string content; 96 base::ReadFileToString(file_path, &content); 97 std::vector<std::string> lines; 98 base::SplitString(content, '\n', &lines); 99 100 base::string16 last_folder; 101 bool last_folder_on_toolbar = false; 102 bool last_folder_is_empty = true; 103 bool has_subfolder = false; 104 base::Time last_folder_add_date; 105 std::vector<base::string16> path; 106 size_t toolbar_folder_index = 0; 107 std::string charset; 108 for (size_t i = 0; 109 i < lines.size() && 110 (cancellation_callback.is_null() || !cancellation_callback.Run()); 111 ++i) { 112 std::string line; 113 base::TrimString(lines[i], " ", &line); 114 115 // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries 116 // separator in Firefox that Chrome does not support. Note that there can be 117 // multiple "<HR>" tags at the beginning of a single line. 118 // See http://crbug.com/257474. 119 static const char kHrTag[] = "<HR>"; 120 while (StartsWithASCII(line, kHrTag, false)) { 121 line.erase(0, arraysize(kHrTag) - 1); 122 base::TrimString(line, " ", &line); 123 } 124 125 // Get the encoding of the bookmark file. 126 if (internal::ParseCharsetFromLine(line, &charset)) 127 continue; 128 129 // Get the folder name. 130 if (internal::ParseFolderNameFromLine(line, 131 charset, 132 &last_folder, 133 &last_folder_on_toolbar, 134 &last_folder_add_date)) { 135 continue; 136 } 137 138 // Get the bookmark entry. 139 base::string16 title; 140 base::string16 shortcut; 141 GURL url, favicon; 142 base::Time add_date; 143 base::string16 post_data; 144 bool is_bookmark; 145 // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based 146 // keywords yet. 147 is_bookmark = 148 internal::ParseBookmarkFromLine(line, charset, &title, 149 &url, &favicon, &shortcut, 150 &add_date, &post_data) || 151 internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url); 152 153 if (is_bookmark) 154 last_folder_is_empty = false; 155 156 if (is_bookmark && 157 post_data.empty() && 158 (valid_url_callback.is_null() || valid_url_callback.Run(url))) { 159 if (toolbar_folder_index > path.size() && !path.empty()) { 160 NOTREACHED(); // error in parsing. 161 break; 162 } 163 164 ImportedBookmarkEntry entry; 165 entry.creation_time = add_date; 166 entry.url = url; 167 entry.title = title; 168 169 if (toolbar_folder_index) { 170 // The toolbar folder should be at the top level. 171 entry.in_toolbar = true; 172 entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end()); 173 } else { 174 // Add this bookmark to the list of |bookmarks|. 175 if (!has_subfolder && !last_folder.empty()) { 176 path.push_back(last_folder); 177 last_folder.clear(); 178 } 179 entry.path.assign(path.begin(), path.end()); 180 } 181 bookmarks->push_back(entry); 182 183 // Save the favicon. DataURLToFaviconUsage will handle the case where 184 // there is no favicon. 185 if (favicons) 186 DataURLToFaviconUsage(url, favicon, favicons); 187 188 continue; 189 } 190 191 // Bookmarks in sub-folder are encapsulated with <DL> tag. 192 if (StartsWithASCII(line, "<DL>", false)) { 193 has_subfolder = true; 194 if (!last_folder.empty()) { 195 path.push_back(last_folder); 196 last_folder.clear(); 197 } 198 if (last_folder_on_toolbar && !toolbar_folder_index) 199 toolbar_folder_index = path.size(); 200 201 // Mark next folder empty as initial state. 202 last_folder_is_empty = true; 203 } else if (StartsWithASCII(line, "</DL>", false)) { 204 if (path.empty()) 205 break; // Mismatch <DL>. 206 207 base::string16 folder_title = path.back(); 208 path.pop_back(); 209 210 if (last_folder_is_empty) { 211 // Empty folder should be added explicitly. 212 ImportedBookmarkEntry entry; 213 entry.is_folder = true; 214 entry.creation_time = last_folder_add_date; 215 entry.title = folder_title; 216 if (toolbar_folder_index) { 217 // The toolbar folder should be at the top level. 218 // Make sure we don't add the toolbar folder itself if it is empty. 219 if (toolbar_folder_index <= path.size()) { 220 entry.in_toolbar = true; 221 entry.path.assign(path.begin() + toolbar_folder_index - 1, 222 path.end()); 223 bookmarks->push_back(entry); 224 } 225 } else { 226 // Add this folder to the list of |bookmarks|. 227 entry.path.assign(path.begin(), path.end()); 228 bookmarks->push_back(entry); 229 } 230 231 // Parent folder include current one, so it's not empty. 232 last_folder_is_empty = false; 233 } 234 235 if (toolbar_folder_index > path.size()) 236 toolbar_folder_index = 0; 237 } 238 } 239 } 240 241 namespace internal { 242 243 bool ParseCharsetFromLine(const std::string& line, std::string* charset) { 244 const char kCharset[] = "charset="; 245 if (StartsWithASCII(line, "<META", false) && 246 (line.find("CONTENT=\"") != std::string::npos || 247 line.find("content=\"") != std::string::npos)) { 248 size_t begin = line.find(kCharset); 249 if (begin == std::string::npos) 250 return false; 251 begin += std::string(kCharset).size(); 252 size_t end = line.find_first_of('\"', begin); 253 *charset = line.substr(begin, end - begin); 254 return true; 255 } 256 return false; 257 } 258 259 bool ParseFolderNameFromLine(const std::string& line, 260 const std::string& charset, 261 base::string16* folder_name, 262 bool* is_toolbar_folder, 263 base::Time* add_date) { 264 const char kFolderOpen[] = "<DT><H3"; 265 const char kFolderClose[] = "</H3>"; 266 const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER"; 267 const char kAddDateAttribute[] = "ADD_DATE"; 268 269 if (!StartsWithASCII(line, kFolderOpen, true)) 270 return false; 271 272 size_t end = line.find(kFolderClose); 273 size_t tag_end = line.rfind('>', end) + 1; 274 // If no end tag or start tag is broken, we skip to find the folder name. 275 if (end == std::string::npos || tag_end < arraysize(kFolderOpen)) 276 return false; 277 278 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(), 279 base::OnStringConversionError::SKIP, folder_name); 280 *folder_name = net::UnescapeForHTML(*folder_name); 281 282 std::string attribute_list = line.substr(arraysize(kFolderOpen), 283 tag_end - arraysize(kFolderOpen) - 1); 284 std::string value; 285 286 // Add date 287 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) { 288 int64 time; 289 base::StringToInt64(value, &time); 290 // Upper bound it at 32 bits. 291 if (0 < time && time < (1LL << 32)) 292 *add_date = base::Time::FromTimeT(time); 293 } 294 295 if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) && 296 LowerCaseEqualsASCII(value, "true")) 297 *is_toolbar_folder = true; 298 else 299 *is_toolbar_folder = false; 300 301 return true; 302 } 303 304 bool ParseBookmarkFromLine(const std::string& line, 305 const std::string& charset, 306 base::string16* title, 307 GURL* url, 308 GURL* favicon, 309 base::string16* shortcut, 310 base::Time* add_date, 311 base::string16* post_data) { 312 const char kItemOpen[] = "<DT><A"; 313 const char kItemClose[] = "</A>"; 314 const char kFeedURLAttribute[] = "FEEDURL"; 315 const char kHrefAttribute[] = "HREF"; 316 const char kIconAttribute[] = "ICON"; 317 const char kShortcutURLAttribute[] = "SHORTCUTURL"; 318 const char kAddDateAttribute[] = "ADD_DATE"; 319 const char kPostDataAttribute[] = "POST_DATA"; 320 321 title->clear(); 322 *url = GURL(); 323 *favicon = GURL(); 324 shortcut->clear(); 325 post_data->clear(); 326 *add_date = base::Time(); 327 328 if (!StartsWithASCII(line, kItemOpen, true)) 329 return false; 330 331 size_t end = line.find(kItemClose); 332 size_t tag_end = line.rfind('>', end) + 1; 333 if (end == std::string::npos || tag_end < arraysize(kItemOpen)) 334 return false; // No end tag or start tag is broken. 335 336 std::string attribute_list = line.substr(arraysize(kItemOpen), 337 tag_end - arraysize(kItemOpen) - 1); 338 339 // We don't import Live Bookmark folders, which is Firefox's RSS reading 340 // feature, since the user never necessarily bookmarked them and we don't 341 // have this feature to update their contents. 342 std::string value; 343 if (GetAttribute(attribute_list, kFeedURLAttribute, &value)) 344 return false; 345 346 // Title 347 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(), 348 base::OnStringConversionError::SKIP, title); 349 *title = net::UnescapeForHTML(*title); 350 351 // URL 352 if (GetAttribute(attribute_list, kHrefAttribute, &value)) { 353 base::string16 url16; 354 base::CodepageToUTF16(value, charset.c_str(), 355 base::OnStringConversionError::SKIP, &url16); 356 url16 = net::UnescapeForHTML(url16); 357 358 *url = GURL(url16); 359 } 360 361 // Favicon 362 if (GetAttribute(attribute_list, kIconAttribute, &value)) 363 *favicon = GURL(value); 364 365 // Keyword 366 if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) { 367 base::CodepageToUTF16(value, charset.c_str(), 368 base::OnStringConversionError::SKIP, shortcut); 369 *shortcut = net::UnescapeForHTML(*shortcut); 370 } 371 372 // Add date 373 if (GetAttribute(attribute_list, kAddDateAttribute, &value)) { 374 int64 time; 375 base::StringToInt64(value, &time); 376 // Upper bound it at 32 bits. 377 if (0 < time && time < (1LL << 32)) 378 *add_date = base::Time::FromTimeT(time); 379 } 380 381 // Post data. 382 if (GetAttribute(attribute_list, kPostDataAttribute, &value)) { 383 base::CodepageToUTF16(value, charset.c_str(), 384 base::OnStringConversionError::SKIP, post_data); 385 *post_data = net::UnescapeForHTML(*post_data); 386 } 387 388 return true; 389 } 390 391 bool ParseMinimumBookmarkFromLine(const std::string& line, 392 const std::string& charset, 393 base::string16* title, 394 GURL* url) { 395 const char kItemOpen[] = "<DT><A"; 396 const char kItemClose[] = "</"; 397 const char kHrefAttributeUpper[] = "HREF"; 398 const char kHrefAttributeLower[] = "href"; 399 400 title->clear(); 401 *url = GURL(); 402 403 // Case-insensitive check of open tag. 404 if (!StartsWithASCII(line, kItemOpen, false)) 405 return false; 406 407 // Find any close tag. 408 size_t end = line.find(kItemClose); 409 size_t tag_end = line.rfind('>', end) + 1; 410 if (end == std::string::npos || tag_end < arraysize(kItemOpen)) 411 return false; // No end tag or start tag is broken. 412 413 std::string attribute_list = line.substr(arraysize(kItemOpen), 414 tag_end - arraysize(kItemOpen) - 1); 415 416 // Title 417 base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(), 418 base::OnStringConversionError::SKIP, title); 419 *title = net::UnescapeForHTML(*title); 420 421 // URL 422 std::string value; 423 if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) || 424 GetAttribute(attribute_list, kHrefAttributeLower, &value)) { 425 if (charset.length() != 0) { 426 base::string16 url16; 427 base::CodepageToUTF16(value, charset.c_str(), 428 base::OnStringConversionError::SKIP, &url16); 429 url16 = net::UnescapeForHTML(url16); 430 431 *url = GURL(url16); 432 } else { 433 *url = GURL(value); 434 } 435 } 436 437 return true; 438 } 439 440 } // namespace internal 441 442 } // namespace bookmark_html_reader 443