Home | History | Annotate | Download | only in importer
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/utility/importer/bookmark_html_reader.h"
      6 
      7 #include "base/callback.h"
      8 #include "base/file_util.h"
      9 #include "base/i18n/icu_string_conversions.h"
     10 #include "base/strings/string_number_conversions.h"
     11 #include "base/strings/string_split.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/time/time.h"
     14 #include "chrome/common/importer/imported_bookmark_entry.h"
     15 #include "chrome/common/importer/imported_favicon_usage.h"
     16 #include "chrome/utility/importer/favicon_reencode.h"
     17 #include "content/public/common/url_constants.h"
     18 #include "net/base/data_url.h"
     19 #include "net/base/escape.h"
     20 #include "url/gurl.h"
     21 
     22 namespace {
     23 
     24 // Fetches the given |attribute| value from the |attribute_list|. Returns true
     25 // if successful, and |value| will contain the value.
     26 bool GetAttribute(const std::string& attribute_list,
     27                   const std::string& attribute,
     28                   std::string* value) {
     29   const char kQuote[] = "\"";
     30 
     31   size_t begin = attribute_list.find(attribute + "=" + kQuote);
     32   if (begin == std::string::npos)
     33     return false;  // Can't find the attribute.
     34 
     35   begin += attribute.size() + 2;
     36   size_t end = begin + 1;
     37 
     38   while (end < attribute_list.size()) {
     39     if (attribute_list[end] == '"' &&
     40         attribute_list[end - 1] != '\\') {
     41       break;
     42     }
     43     end++;
     44   }
     45 
     46   if (end == attribute_list.size())
     47     return false;  // The value is not quoted.
     48 
     49   *value = attribute_list.substr(begin, end - begin);
     50   return true;
     51 }
     52 
     53 // Given the URL of a page and a favicon data URL, adds an appropriate record
     54 // to the given favicon usage vector.
     55 void DataURLToFaviconUsage(
     56     const GURL& link_url,
     57     const GURL& favicon_data,
     58     std::vector<ImportedFaviconUsage>* favicons) {
     59   if (!link_url.is_valid() || !favicon_data.is_valid() ||
     60       !favicon_data.SchemeIs(chrome::kDataScheme))
     61     return;
     62 
     63   // Parse the data URL.
     64   std::string mime_type, char_set, data;
     65   if (!net::DataURL::Parse(favicon_data, &mime_type, &char_set, &data) ||
     66       data.empty())
     67     return;
     68 
     69   ImportedFaviconUsage usage;
     70   if (!importer::ReencodeFavicon(
     71           reinterpret_cast<const unsigned char*>(&data[0]),
     72           data.size(), &usage.png_data))
     73     return;  // Unable to decode.
     74 
     75   // We need to make up a URL for the favicon. We use a version of the page's
     76   // URL so that we can be sure it will not collide.
     77   usage.favicon_url = GURL(std::string("made-up-favicon:") + link_url.spec());
     78 
     79   // We only have one URL per favicon for Firefox 2 bookmarks.
     80   usage.urls.insert(link_url);
     81 
     82   favicons->push_back(usage);
     83 }
     84 
     85 }  // namespace
     86 
     87 namespace bookmark_html_reader {
     88 
// Reads the bookmark HTML file at |file_path| and parses it one line at a
// time. An ImportedBookmarkEntry is appended to |bookmarks| for every accepted
// bookmark line and for every folder that is closed while still empty.
//
// |cancellation_callback|: if non-null, it is run before each line is
//     processed; parsing stops as soon as it returns true.
// |valid_url_callback|: if non-null, a bookmark is only added when the
//     callback returns true for its URL.
// |bookmarks|: output; dereferenced unconditionally.
// |favicons|: output; may be NULL, in which case favicons are not collected.
void ImportBookmarksFile(
      const base::Callback<bool(void)>& cancellation_callback,
      const base::Callback<bool(const GURL&)>& valid_url_callback,
      const base::FilePath& file_path,
      std::vector<ImportedBookmarkEntry>* bookmarks,
      std::vector<ImportedFaviconUsage>* favicons) {
  // NOTE(review): the result of ReadFileToString() is not checked; parsing
  // proceeds with whatever ended up in |content|.
  std::string content;
  base::ReadFileToString(file_path, &content);
  std::vector<std::string> lines;
  base::SplitString(content, '\n', &lines);

  // Parser state carried from one line to the next.
  // Name from the latest folder header line; cleared once pushed onto |path|.
  base::string16 last_folder;
  // Toolbar flag reported by the latest folder header line.
  bool last_folder_on_toolbar = false;
  // Reset to true at every <DL>; cleared by any bookmark line and after an
  // empty folder has been emitted at </DL>.
  bool last_folder_is_empty = true;
  // Set at the first <DL> and never reset afterwards.
  bool has_subfolder = false;
  // Add date reported by the latest folder header line that carried one.
  base::Time last_folder_add_date;
  // Titles of the folders currently open, outermost first.
  std::vector<base::string16> path;
  // 0 while outside the toolbar folder; otherwise the size of |path| at the
  // moment the toolbar folder's <DL> was opened.
  size_t toolbar_folder_index = 0;
  // Encoding name taken from the file's <META> line, empty until one is seen.
  std::string charset;
  for (size_t i = 0;
       i < lines.size() &&
           (cancellation_callback.is_null() || !cancellation_callback.Run());
       ++i) {
    std::string line;
    base::TrimString(lines[i], " ", &line);

    // Remove "<HR>" if |line| starts with it. "<HR>" is the bookmark entries
    // separator in Firefox that Chrome does not support. Note that there can be
    // multiple "<HR>" tags at the beginning of a single line.
    // See http://crbug.com/257474.
    static const char kHrTag[] = "<HR>";
    while (StartsWithASCII(line, kHrTag, false)) {
      // arraysize() includes the terminating NUL, hence the "- 1".
      line.erase(0, arraysize(kHrTag) - 1);
      base::TrimString(line, " ", &line);
    }

    // Get the encoding of the bookmark file.
    if (internal::ParseCharsetFromLine(line, &charset))
      continue;

    // Get the folder name.
    if (internal::ParseFolderNameFromLine(line,
                                          charset,
                                          &last_folder,
                                          &last_folder_on_toolbar,
                                          &last_folder_add_date)) {
      continue;
    }

    // Get the bookmark entry.
    base::string16 title;
    base::string16 shortcut;
    GURL url, favicon;
    base::Time add_date;
    base::string16 post_data;
    bool is_bookmark;
    // TODO(jcampan): http://b/issue?id=1196285 we do not support POST based
    //                keywords yet.
    // Try the full parser first and fall back to the minimal one, which only
    // fills in |title| and |url|.
    is_bookmark =
        internal::ParseBookmarkFromLine(line, charset, &title,
                                        &url, &favicon, &shortcut,
                                        &add_date, &post_data) ||
        internal::ParseMinimumBookmarkFromLine(line, charset, &title, &url);

    // Any recognised bookmark line marks the current folder as non-empty,
    // even if the bookmark itself ends up being skipped below.
    if (is_bookmark)
      last_folder_is_empty = false;

    // Bookmarks carrying POST data or rejected by |valid_url_callback| are not
    // added; such lines fall through to the <DL>/</DL> checks further down.
    if (is_bookmark &&
        post_data.empty() &&
        (valid_url_callback.is_null() || valid_url_callback.Run(url))) {
      if (toolbar_folder_index > path.size() && !path.empty()) {
        NOTREACHED();  // error in parsing.
        break;
      }

      ImportedBookmarkEntry entry;
      entry.creation_time = add_date;
      entry.url = url;
      entry.title = title;

      if (toolbar_folder_index) {
        // The toolbar folder should be at the top level.
        // The entry's path starts at index |toolbar_folder_index| - 1 of
        // |path|, dropping every folder above that position.
        entry.in_toolbar = true;
        entry.path.assign(path.begin() + toolbar_folder_index - 1, path.end());
      } else {
        // Add this bookmark to the list of |bookmarks|.
        // If no <DL> has been seen yet, a pending folder name has not been
        // pushed onto |path| by the <DL> branch, so push it here.
        if (!has_subfolder && !last_folder.empty()) {
          path.push_back(last_folder);
          last_folder.clear();
        }
        entry.path.assign(path.begin(), path.end());
      }
      bookmarks->push_back(entry);

      // Save the favicon. DataURLToFaviconUsage will handle the case where
      // there is no favicon.
      if (favicons)
        DataURLToFaviconUsage(url, favicon, favicons);

      continue;
    }

    // Bookmarks in sub-folder are encapsulated with <DL> tag.
    if (StartsWithASCII(line, "<DL>", false)) {
      has_subfolder = true;
      // Entering the folder named by the preceding header line, if any.
      if (!last_folder.empty()) {
        path.push_back(last_folder);
        last_folder.clear();
      }
      // Remember the depth at which the toolbar folder was entered; only the
      // first one is recorded while already inside a toolbar folder.
      if (last_folder_on_toolbar && !toolbar_folder_index)
        toolbar_folder_index = path.size();

      // Mark next folder empty as initial state.
      last_folder_is_empty = true;
    } else if (StartsWithASCII(line, "</DL>", false)) {
      if (path.empty())
        break;  // Mismatch <DL>.

      // Leave the innermost folder.
      base::string16 folder_title = path.back();
      path.pop_back();

      if (last_folder_is_empty) {
        // Empty folder should be added explicitly.
        ImportedBookmarkEntry entry;
        entry.is_folder = true;
        entry.creation_time = last_folder_add_date;
        entry.title = folder_title;
        if (toolbar_folder_index) {
          // The toolbar folder should be at the top level.
          // Make sure we don't add the toolbar folder itself if it is empty.
          if (toolbar_folder_index <= path.size()) {
            entry.in_toolbar = true;
            entry.path.assign(path.begin() + toolbar_folder_index - 1,
                              path.end());
            bookmarks->push_back(entry);
          }
        } else {
          // Add this folder to the list of |bookmarks|.
          entry.path.assign(path.begin(), path.end());
          bookmarks->push_back(entry);
        }

        // Parent folder include current one, so it's not empty.
        last_folder_is_empty = false;
      }

      // |path| is now shallower than the toolbar folder, so we have left it.
      if (toolbar_folder_index > path.size())
        toolbar_folder_index = 0;
    }
  }
}
    240 
    241 namespace internal {
    242 
    243 bool ParseCharsetFromLine(const std::string& line, std::string* charset) {
    244   const char kCharset[] = "charset=";
    245   if (StartsWithASCII(line, "<META", false) &&
    246       (line.find("CONTENT=\"") != std::string::npos ||
    247           line.find("content=\"") != std::string::npos)) {
    248     size_t begin = line.find(kCharset);
    249     if (begin == std::string::npos)
    250       return false;
    251     begin += std::string(kCharset).size();
    252     size_t end = line.find_first_of('\"', begin);
    253     *charset = line.substr(begin, end - begin);
    254     return true;
    255   }
    256   return false;
    257 }
    258 
    259 bool ParseFolderNameFromLine(const std::string& line,
    260                              const std::string& charset,
    261                              base::string16* folder_name,
    262                              bool* is_toolbar_folder,
    263                              base::Time* add_date) {
    264   const char kFolderOpen[] = "<DT><H3";
    265   const char kFolderClose[] = "</H3>";
    266   const char kToolbarFolderAttribute[] = "PERSONAL_TOOLBAR_FOLDER";
    267   const char kAddDateAttribute[] = "ADD_DATE";
    268 
    269   if (!StartsWithASCII(line, kFolderOpen, true))
    270     return false;
    271 
    272   size_t end = line.find(kFolderClose);
    273   size_t tag_end = line.rfind('>', end) + 1;
    274   // If no end tag or start tag is broken, we skip to find the folder name.
    275   if (end == std::string::npos || tag_end < arraysize(kFolderOpen))
    276     return false;
    277 
    278   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
    279                         base::OnStringConversionError::SKIP, folder_name);
    280   *folder_name = net::UnescapeForHTML(*folder_name);
    281 
    282   std::string attribute_list = line.substr(arraysize(kFolderOpen),
    283       tag_end - arraysize(kFolderOpen) - 1);
    284   std::string value;
    285 
    286   // Add date
    287   if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
    288     int64 time;
    289     base::StringToInt64(value, &time);
    290     // Upper bound it at 32 bits.
    291     if (0 < time && time < (1LL << 32))
    292       *add_date = base::Time::FromTimeT(time);
    293   }
    294 
    295   if (GetAttribute(attribute_list, kToolbarFolderAttribute, &value) &&
    296       LowerCaseEqualsASCII(value, "true"))
    297     *is_toolbar_folder = true;
    298   else
    299     *is_toolbar_folder = false;
    300 
    301   return true;
    302 }
    303 
    304 bool ParseBookmarkFromLine(const std::string& line,
    305                            const std::string& charset,
    306                            base::string16* title,
    307                            GURL* url,
    308                            GURL* favicon,
    309                            base::string16* shortcut,
    310                            base::Time* add_date,
    311                            base::string16* post_data) {
    312   const char kItemOpen[] = "<DT><A";
    313   const char kItemClose[] = "</A>";
    314   const char kFeedURLAttribute[] = "FEEDURL";
    315   const char kHrefAttribute[] = "HREF";
    316   const char kIconAttribute[] = "ICON";
    317   const char kShortcutURLAttribute[] = "SHORTCUTURL";
    318   const char kAddDateAttribute[] = "ADD_DATE";
    319   const char kPostDataAttribute[] = "POST_DATA";
    320 
    321   title->clear();
    322   *url = GURL();
    323   *favicon = GURL();
    324   shortcut->clear();
    325   post_data->clear();
    326   *add_date = base::Time();
    327 
    328   if (!StartsWithASCII(line, kItemOpen, true))
    329     return false;
    330 
    331   size_t end = line.find(kItemClose);
    332   size_t tag_end = line.rfind('>', end) + 1;
    333   if (end == std::string::npos || tag_end < arraysize(kItemOpen))
    334     return false;  // No end tag or start tag is broken.
    335 
    336   std::string attribute_list = line.substr(arraysize(kItemOpen),
    337       tag_end - arraysize(kItemOpen) - 1);
    338 
    339   // We don't import Live Bookmark folders, which is Firefox's RSS reading
    340   // feature, since the user never necessarily bookmarked them and we don't
    341   // have this feature to update their contents.
    342   std::string value;
    343   if (GetAttribute(attribute_list, kFeedURLAttribute, &value))
    344     return false;
    345 
    346   // Title
    347   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
    348                         base::OnStringConversionError::SKIP, title);
    349   *title = net::UnescapeForHTML(*title);
    350 
    351   // URL
    352   if (GetAttribute(attribute_list, kHrefAttribute, &value)) {
    353     base::string16 url16;
    354     base::CodepageToUTF16(value, charset.c_str(),
    355                           base::OnStringConversionError::SKIP, &url16);
    356     url16 = net::UnescapeForHTML(url16);
    357 
    358     *url = GURL(url16);
    359   }
    360 
    361   // Favicon
    362   if (GetAttribute(attribute_list, kIconAttribute, &value))
    363     *favicon = GURL(value);
    364 
    365   // Keyword
    366   if (GetAttribute(attribute_list, kShortcutURLAttribute, &value)) {
    367     base::CodepageToUTF16(value, charset.c_str(),
    368                           base::OnStringConversionError::SKIP, shortcut);
    369     *shortcut = net::UnescapeForHTML(*shortcut);
    370   }
    371 
    372   // Add date
    373   if (GetAttribute(attribute_list, kAddDateAttribute, &value)) {
    374     int64 time;
    375     base::StringToInt64(value, &time);
    376     // Upper bound it at 32 bits.
    377     if (0 < time && time < (1LL << 32))
    378       *add_date = base::Time::FromTimeT(time);
    379   }
    380 
    381   // Post data.
    382   if (GetAttribute(attribute_list, kPostDataAttribute, &value)) {
    383     base::CodepageToUTF16(value, charset.c_str(),
    384                           base::OnStringConversionError::SKIP, post_data);
    385     *post_data = net::UnescapeForHTML(*post_data);
    386   }
    387 
    388   return true;
    389 }
    390 
    391 bool ParseMinimumBookmarkFromLine(const std::string& line,
    392                                   const std::string& charset,
    393                                   base::string16* title,
    394                                   GURL* url) {
    395   const char kItemOpen[] = "<DT><A";
    396   const char kItemClose[] = "</";
    397   const char kHrefAttributeUpper[] = "HREF";
    398   const char kHrefAttributeLower[] = "href";
    399 
    400   title->clear();
    401   *url = GURL();
    402 
    403   // Case-insensitive check of open tag.
    404   if (!StartsWithASCII(line, kItemOpen, false))
    405     return false;
    406 
    407   // Find any close tag.
    408   size_t end = line.find(kItemClose);
    409   size_t tag_end = line.rfind('>', end) + 1;
    410   if (end == std::string::npos || tag_end < arraysize(kItemOpen))
    411     return false;  // No end tag or start tag is broken.
    412 
    413   std::string attribute_list = line.substr(arraysize(kItemOpen),
    414       tag_end - arraysize(kItemOpen) - 1);
    415 
    416   // Title
    417   base::CodepageToUTF16(line.substr(tag_end, end - tag_end), charset.c_str(),
    418                         base::OnStringConversionError::SKIP, title);
    419   *title = net::UnescapeForHTML(*title);
    420 
    421   // URL
    422   std::string value;
    423   if (GetAttribute(attribute_list, kHrefAttributeUpper, &value) ||
    424       GetAttribute(attribute_list, kHrefAttributeLower, &value)) {
    425     if (charset.length() != 0) {
    426       base::string16 url16;
    427       base::CodepageToUTF16(value, charset.c_str(),
    428                             base::OnStringConversionError::SKIP, &url16);
    429       url16 = net::UnescapeForHTML(url16);
    430 
    431       *url = GURL(url16);
    432     } else {
    433       *url = GURL(value);
    434     }
    435   }
    436 
    437   return true;
    438 }
    439 
    440 }  // namespace internal
    441 
    442 }  // namespace bookmark_html_reader
    443