Home | History | Annotate | Download | only in history
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/history/snippet.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/string_split.h"
     10 #include "base/string_util.h"
     11 #include "base/utf_string_conversions.h"
     12 #include "testing/gtest/include/gtest/gtest.h"
     13 
     14 namespace {
     15 
     16 // A sample document to compute snippets of.
     17 // The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN,
     18 // and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion.
     19 const char* kSampleDocument = "Google\xe2\x84\xa2 Terms of Service "
     20 "Welcome to Google! "
     21 "1. Your relationship with Google "
     22 "1.1 Your use of Google's products, software, services and web sites "
     23 "(referred to collectively as the \"Services\" in this document and excluding "
     24 "any services provided to you by Google under a separate written agreement) "
     25 "is subject to the terms of a legal agreement between you and Google. "
     26 "\"Google\" means Google Inc., whose principal place of business is at 1600 "
     27 "Amphitheatre Parkway, Mountain View, CA 94043, United States. This document "
     28 "explains how the agreement is made up, and sets out some of the terms of "
     29 "that agreement.";
     30 };
     31 
     32 // Thai sample taken from http://www.google.co.th/intl/th/privacy.html
     33 // TODO(jungshik) : Add more samples (e.g. Hindi) after porting
     34 // ICU 4.0's character iterator changes to our copy of ICU 3.8 to get
     35 // grapheme clusters in Indic scripts handled more reasonably.
     36 const char* kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87"
     37 "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
     38 "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
     39 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
     40 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1"
     41 "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
     42 "\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A"
     43 "\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
     44 "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A"
     45 "\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82"
     46 "\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7"
     47 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89"
     48 "\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
     49 "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7"
     50 "\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1"
     51 "\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
     52 "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7"
     53 "\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
     54 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
     55 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5"
     56 "\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
     57 "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88"
     58 "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80"
     59 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A"
     60 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
     61 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4"
     62 "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
     63 "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB"
     64 "\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
     65 "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA"
     66 "\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88"
     67 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9"
     68 "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94"
     69 "\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3"
     70 "\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3"
     71 "\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94"
     72 "\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
     73 "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87"
     74 "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
     75 "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89"
     76 "\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"
     77 "\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA"
     78 "\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84"
     79 "\xE0\xB8\xB8\xE0\xB8\x93";
     80 
     81 // Comparator for sorting by the first element in a pair.
     82 bool ComparePair1st(const Snippet::MatchPosition& a,
     83                     const Snippet::MatchPosition& b) {
     84   return a.first < b.first;
     85 }
     86 
     87 // For testing, we'll compute the match positions manually instead of using
     88 // sqlite's FTS matching.  BuildSnippet returns the snippet for matching
     89 // |query| against |document|.  Matches are surrounded by "**".
     90 string16 BuildSnippet(const std::string& document,
     91                       const std::string& query) {
     92   // This function assumes that |document| does not contain
     93   // any character for which lowercasing changes its length. Further,
     94   // it's assumed that lowercasing only the ASCII-portion works for
     95   // |document|. We need to add more test cases and change this function
     96   // to be more generic depending on how we deal with 'folding for match'
     97   // in history.
     98   const std::string document_folded = StringToLowerASCII(std::string(document));
     99 
    100   std::vector<std::string> query_words;
    101   base::SplitString(query, ' ', &query_words);
    102 
    103   // Manually construct match_positions of the document.
    104   Snippet::MatchPositions match_positions;
    105   match_positions.clear();
    106   for (std::vector<std::string>::iterator qw = query_words.begin();
    107        qw != query_words.end(); ++qw) {
    108     // Insert all instances of this word into match_pairs.
    109     size_t ofs = 0;
    110     while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) {
    111       match_positions.push_back(std::make_pair(ofs, ofs + qw->size()));
    112       ofs += qw->size();
    113     }
    114   }
    115   // Sort match_positions in order of increasing offset.
    116   std::sort(match_positions.begin(), match_positions.end(), ComparePair1st);
    117 
    118   // Compute the snippet.
    119   Snippet snippet;
    120   snippet.ComputeSnippet(match_positions, document);
    121 
    122   // Now "highlight" all matches in the snippet with **.
    123   string16 star_snippet;
    124   Snippet::MatchPositions::const_iterator match;
    125   size_t pos = 0;
    126   for (match = snippet.matches().begin();
    127        match != snippet.matches().end(); ++match) {
    128     star_snippet += snippet.text().substr(pos, match->first - pos);
    129     star_snippet += UTF8ToUTF16("**");
    130     star_snippet += snippet.text().substr(match->first,
    131                                           match->second - match->first);
    132     star_snippet += UTF8ToUTF16("**");
    133     pos = match->second;
    134   }
    135   star_snippet += snippet.text().substr(pos);
    136 
    137   return star_snippet;
    138 }
    139 
    140 TEST(Snippets, SimpleQuery) {
    141   ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this "
    142             "**document** and excluding any services provided to you by "
    143             "Goo ...  ... way, Mountain View, CA 94043, United States. This "
    144             "**document** explains how the agreement is made up, and sets "
    145             "o ... ",
    146             UTF16ToUTF8(BuildSnippet(kSampleDocument, "document")));
    147 }
    148 
    149 // Test that two words that are near each other don't produce two elided bits.
    150 TEST(Snippets, NearbyWords) {
    151   ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, "
    152             "**Mountain** **View**, CA 94043, United States. This "
    153             "document explains  ... ",
    154             UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view")));
    155 }
    156 
    157 // The above tests already test that we get byte offsets correct, but here's
    158 // one that gets the "TM" in its snippet.
    159 TEST(Snippets, UTF8) {
    160   ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! "
    161             "1. Your **relationship** with Google 1.1 Your use of Google's "
    162             "products, so ... ",
    163             UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship")));
    164 }
    165 
    166 TEST(Snippets, ThaiUTF8) {
    167   // There are 3 instances of '\u0E43\u0E2B\u0E49'
    168   // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample.
    169   // The 1st is more than |kSniipetContext| graphemes away from the
    170   // 2nd while the 2nd and 3rd are within that window. However, with
    171   // the 2nd match added, the snippet goes over the size limit so that
    172   // the snippet ends right before the 3rd match.
    173   ASSERT_EQ(" ... "
    174             "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
    175             "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8"
    176             "\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8"
    177             "\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8"
    178             "\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0"
    179             "\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0"
    180             "\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0"
    181             "\xB8\x9A\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0"
    182             "\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0"
    183             "\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0"
    184             "\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0"
    185             "\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8"
    186             "\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0"
    187             "\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0"
    188             "\xB8\x94\xE0\xB8\xB1\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0"
    189             "\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0"
    190             "\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0"
    191             "\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
    192             "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3"
    193             "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
    194             "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88"
    195             "\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
    196             "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88"
    197             "\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
    198             "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
    199             "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8"
    200             "\xE0\xB8\x93\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2"
    201             "\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A ...  ... \xE0\xB8\x82"
    202             "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
    203             "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3"
    204             "\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD"
    205             "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD"
    206             "\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0"
    207             "\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0"
    208             "\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0"
    209             "\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
    210             "\xE0\xB9\x88\xE0\xB8\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9"
    211             "\x89**\xE0\xB8\x9C\xE0\xB8\xB9\xE0\xB9\x89\xE0\xB9\x83\xE0"
    212             "\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94\xE0\xB9\x89\xE0"
    213             "\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3\xE0"
    214             "\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0"
    215             "\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0"
    216             "\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0"
    217             "\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
    218             "\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87\xE0\xB8\x9B"
    219             "\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
    220             "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7"
    221             "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2",
    222             UTF16ToUTF8(BuildSnippet(kThaiSample,
    223                                      "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89")));
    224 }
    225 
    226 TEST(Snippets, ExtractMatchPositions) {
    227   struct TestData {
    228     const std::string offsets_string;
    229     const size_t expected_match_count;
    230     const size_t expected_matches[10];
    231   } data[] = {
    232     { "0 0 1 2 0 0 4 1 0 0 1 5",            1,     { 1, 6 } },
    233     { "0 0 1 4 0 0 2 1",                    1,     { 1, 5 } },
    234     { "0 0 4 1 0 0 2 1",                    2,     { 2, 3, 4, 5 } },
    235     { "0 0 0 1",                            1,     { 0, 1 } },
    236     { "0 0 0 1 0 0 0 2",                    1,     { 0, 2 } },
    237     { "0 0 1 1 0 0 1 2",                    1,     { 1, 3 } },
    238     { "0 0 1 2 0 0 4 3 0 0 3 1",            1,     { 1, 7 } },
    239     { "0 0 1 4 0 0 2 5",                    1,     { 1, 7 } },
    240     { "0 0 1 2 0 0 1 1",                    1,     { 1, 3 } },
    241     { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10",  2,     { 1, 2, 3, 13 } },
    242   };
    243   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) {
    244     Snippet::MatchPositions matches;
    245     Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches);
    246     EXPECT_EQ(data[i].expected_match_count, matches.size());
    247     for (size_t j = 0; j < data[i].expected_match_count; ++j) {
    248       EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first);
    249       EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second);
    250     }
    251   }
    252 }
    253