Home | History | Annotate | Download | only in dump_cache
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/tools/dump_cache/url_to_filename_encoder.h"
      6 
      7 #include <string>
      8 #include <vector>
      9 
     10 #include "base/strings/string_util.h"
     11 #include "base/strings/stringprintf.h"
     12 #include "base/strings/string_piece.h"
     13 #include "testing/gtest/include/gtest/gtest.h"
     14 
     15 using base::StringPiece;
     16 using std::string;
     17 
     18 namespace net {
     19 
     20 #ifdef WIN32
     21 char kDirSeparator = '\\';
     22 char kOtherDirSeparator = '/';
     23 #else
     24 char kDirSeparator = '/';
     25 char kOtherDirSeparator = '\\';
     26 #endif
     27 
     28 class UrlToFilenameEncoderTest : public ::testing::Test {
     29  protected:
     30   UrlToFilenameEncoderTest() : escape_(1, UrlToFilenameEncoder::kEscapeChar),
     31                                dir_sep_(1, kDirSeparator) {
     32   }
     33 
     34   void CheckSegmentLength(const StringPiece& escaped_word) {
     35     std::vector<StringPiece> components;
     36     Tokenize(escaped_word, StringPiece("/"), &components);
     37     for (size_t i = 0; i < components.size(); ++i) {
     38       EXPECT_GE(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
     39                 components[i].size());
     40     }
     41   }
     42 
     43   void CheckValidChars(const StringPiece& escaped_word, char invalid_slash) {
     44     // These characters are invalid in Windows.  We add in ', as that's pretty
     45     // inconvenient in a Unix filename.
     46     //
     47     // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
     48     const string kInvalidChars = "<>:\"|?*'";
     49     for (size_t i = 0; i < escaped_word.size(); ++i) {
     50       char c = escaped_word[i];
     51       EXPECT_EQ(string::npos, kInvalidChars.find(c));
     52       EXPECT_NE(invalid_slash, c);
     53       EXPECT_NE('\0', c);  // only invalid character in Posix
     54       EXPECT_GT(0x7E, c);  // only English printable characters
     55     }
     56   }
     57 
     58   void Validate(const string& in_word, const string& gold_word) {
     59     string escaped_word, url;
     60     UrlToFilenameEncoder::EncodeSegment(
     61         std::string(), in_word, '/', &escaped_word);
     62     EXPECT_EQ(gold_word, escaped_word);
     63     CheckSegmentLength(escaped_word);
     64     CheckValidChars(escaped_word, '\\');
     65     UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
     66     EXPECT_EQ(in_word, url);
     67   }
     68 
     69   void ValidateAllSegmentsSmall(const string& in_word) {
     70     string escaped_word, url;
     71     UrlToFilenameEncoder::EncodeSegment(
     72         std::string(), in_word, '/', &escaped_word);
     73     CheckSegmentLength(escaped_word);
     74     CheckValidChars(escaped_word, '\\');
     75     UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
     76     EXPECT_EQ(in_word, url);
     77   }
     78 
     79   void ValidateNoChange(const string& word) {
     80     // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
     81     Validate(word, word + escape_);
     82   }
     83 
     84   void ValidateEscaped(unsigned char ch) {
     85     // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
     86     char escaped[100];
     87     const char escape = UrlToFilenameEncoder::kEscapeChar;
     88     base::snprintf(escaped, sizeof(escaped), "%c%02X%c", escape, ch, escape);
     89     Validate(string(1, ch), escaped);
     90   }
     91 
     92   void ValidateUrl(const string& url, const string& base_path,
     93                    bool legacy_escape, const string& gold_filename) {
     94     string encoded_filename = UrlToFilenameEncoder::Encode(
     95         url, base_path, legacy_escape);
     96     EXPECT_EQ(gold_filename, encoded_filename);
     97     if (!legacy_escape) {
     98       CheckSegmentLength(encoded_filename);
     99       CheckValidChars(encoded_filename, kOtherDirSeparator);
    100       string decoded_url;
    101       UrlToFilenameEncoder::Decode(encoded_filename, kDirSeparator,
    102                                    &decoded_url);
    103       if (url != decoded_url) {
    104         EXPECT_EQ(url, "http://" + decoded_url);
    105       }
    106     }
    107   }
    108 
    109   void ValidateUrlOldNew(const string& url, const string& gold_old_filename,
    110                          const string& gold_new_filename) {
    111     ValidateUrl(url, std::string(), true, gold_old_filename);
    112     ValidateUrl(url, std::string(), false, gold_new_filename);
    113   }
    114 
    115   void ValidateEncodeSame(const string& url1, const string& url2) {
    116     string filename1 = UrlToFilenameEncoder::Encode(url1, std::string(), false);
    117     string filename2 = UrlToFilenameEncoder::Encode(url2, std::string(), false);
    118     EXPECT_EQ(filename1, filename2);
    119   }
    120 
    121   string escape_;
    122   string dir_sep_;
    123 };
    124 
    125 TEST_F(UrlToFilenameEncoderTest, DoesNotEscape) {
    126   ValidateNoChange(std::string());
    127   ValidateNoChange("abcdefg");
    128   ValidateNoChange("abcdefghijklmnopqrstuvwxyz");
    129   ValidateNoChange("ZYXWVUT");
    130   ValidateNoChange("ZYXWVUTSRQPONMLKJIHGFEDCBA");
    131   ValidateNoChange("01234567689");
    132   ValidateNoChange("_.=+-");
    133   ValidateNoChange("abcdefghijklmnopqrstuvwxyzZYXWVUTSRQPONMLKJIHGFEDCBA"
    134                    "01234567689_.=+-");
    135   ValidateNoChange("index.html");
    136   ValidateNoChange("/");
    137   ValidateNoChange("/.");
    138   ValidateNoChange(".");
    139   ValidateNoChange("..");
    140 }
    141 
    142 TEST_F(UrlToFilenameEncoderTest, Escapes) {
    143   const string bad_chars =
    144       "<>:\"\\|?*"      // Illegal on Windows
    145       "~`!$^&(){}[]';"  // Bad for Unix shells
    146       "^@"              // Build tool doesn't like
    147       "#%"              // Tool doesn't like
    148       ",";              // The escape char has to be escaped
    149 
    150   for (size_t i = 0; i < bad_chars.size(); ++i) {
    151     ValidateEscaped(bad_chars[i]);
    152   }
    153 
    154   // Check non-printable characters.
    155   ValidateEscaped('\0');
    156   for (size_t i = 127; i < 256; ++i) {
    157     ValidateEscaped(static_cast<char>(i));
    158   }
    159 }
    160 
    161 TEST_F(UrlToFilenameEncoderTest, DoesEscapeCorrectly) {
    162   Validate("mysite.com&x", "mysite.com" + escape_ + "26x" + escape_);
    163   Validate("/./", "/" + escape_ + "./" + escape_);
    164   Validate("/../", "/" + escape_ + "../" + escape_);
    165   Validate("//", "/" + escape_ + "2F" + escape_);
    166   Validate("/./leaf", "/" + escape_ + "./leaf" + escape_);
    167   Validate("/../leaf", "/" + escape_ + "../leaf" + escape_);
    168   Validate("//leaf", "/" + escape_ + "2Fleaf" + escape_);
    169   Validate("mysite/u?param1=x&param2=y",
    170            "mysite/u" + escape_ + "3Fparam1=x" + escape_ + "26param2=y" +
    171            escape_);
    172   Validate("search?q=dogs&go=&form=QBLH&qs=n",  // from Latency Labs bing test.
    173            "search" + escape_ + "3Fq=dogs" + escape_ + "26go=" + escape_ +
    174            "26form=QBLH" + escape_ + "26qs=n" + escape_);
    175   Validate("~joebob/my_neeto-website+with_stuff.asp?id=138&content=true",
    176            "" + escape_ + "7Ejoebob/my_neeto-website+with_stuff.asp" + escape_ +
    177            "3Fid=138" + escape_ + "26content=true" + escape_);
    178 }
    179 
    180 TEST_F(UrlToFilenameEncoderTest, EncodeUrlCorrectly) {
    181   ValidateUrlOldNew("http://www.google.com/index.html",
    182                     "www.google.com" + dir_sep_ + "indexx2Ehtml",
    183                     "www.google.com" + dir_sep_ + "index.html" + escape_);
    184   ValidateUrlOldNew("http://www.google.com/x/search?hl=en&q=dogs&oq=",
    185                     "www.google.com" + dir_sep_ + "x" + dir_sep_ +
    186                     "searchx3Fhlx3Denx26qx3Ddogsx26oqx3D",
    187 
    188                     "www.google.com" + dir_sep_ + "x" + dir_sep_ + "search" +
    189                     escape_ + "3Fhl=en" + escape_ + "26q=dogs" + escape_ +
    190                     "26oq=" + escape_);
    191   ValidateUrlOldNew("http://www.foo.com/a//",
    192                     "www.foo.com" + dir_sep_ + "ax255Cx255Cindexx2Ehtml",
    193                     "www.foo.com" + dir_sep_ + "a" + dir_sep_ + escape_ + "2F" +
    194                     escape_);
    195 
    196   // From bug: Double slash preserved.
    197   ValidateUrl("http://www.foo.com/u?site=http://www.google.com/index.html",
    198               std::string(),
    199               false,
    200               "www.foo.com" + dir_sep_ + "u" + escape_ + "3Fsite=http" +
    201               escape_ + "3A" + dir_sep_ + escape_ + "2Fwww.google.com" +
    202               dir_sep_ + "index.html" + escape_);
    203   ValidateUrlOldNew(
    204       "http://blogutils.net/olct/online.php?"
    205       "site=http://thelwordfanfics.blogspot.&interval=600",
    206 
    207       "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "onlinex2Ephpx3F"
    208       "sitex3Dhttpx3Ax255Cx255Cthelwordfanficsx2Eblogspotx2Ex26intervalx3D600",
    209 
    210       "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "online.php" + escape_ +
    211       "3Fsite=http" + escape_ + "3A" + dir_sep_ + escape_ +
    212       "2Fthelwordfanfics.blogspot." + escape_ + "26interval=600" + escape_);
    213 }
    214 
    215 // From bug: Escapes treated the same as normal char.
    216 TEST_F(UrlToFilenameEncoderTest, UnescapeUrlsBeforeEncode) {
    217   for (int i = 0; i < 128; ++i) {
    218     string unescaped(1, static_cast<char>(i));
    219     string escaped = base::StringPrintf("%%%02X", i);
    220     ValidateEncodeSame(unescaped, escaped);
    221   }
    222 
    223   ValidateEncodeSame(
    224       "http://www.blogger.com/navbar.g?bName=God!&Mode=FOO&searchRoot"
    225       "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch",
    226 
    227       "http://www.blogger.com/navbar.g?bName=God%21&Mode=FOO&searchRoot"
    228       "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch");
    229 }
    230 
    231 // From bug: Filename encoding is not prefix-free.
    232 TEST_F(UrlToFilenameEncoderTest, EscapeSecondSlash) {
    233   Validate("/", "/" + escape_);
    234   Validate("//", "/" + escape_ + "2F" + escape_);
    235   Validate("///", "/" + escape_ + "2F" + "/" + escape_);
    236 }
    237 
    238 TEST_F(UrlToFilenameEncoderTest, LongTail) {
    239   static char long_word[] =
    240       "~joebob/briggs/12345678901234567890123456789012345678901234567890"
    241       "1234567890123456789012345678901234567890123456789012345678901234567890"
    242       "1234567890123456789012345678901234567890123456789012345678901234567890"
    243       "1234567890123456789012345678901234567890123456789012345678901234567890"
    244       "1234567890123456789012345678901234567890123456789012345678901234567890"
    245       "1234567890123456789012345678901234567890123456789012345678901234567890";
    246 
    247   // the long lines in the string below are 64 characters, so we can see
    248   // the slashes every 128.
    249   string gold_long_word =
    250       escape_ + "7Ejoebob/briggs/"
    251       "1234567890123456789012345678901234567890123456789012345678901234"
    252       "56789012345678901234567890123456789012345678901234567890123456" +
    253       escape_ + "-/"
    254       "7890123456789012345678901234567890123456789012345678901234567890"
    255       "12345678901234567890123456789012345678901234567890123456789012" +
    256       escape_ + "-/"
    257       "3456789012345678901234567890123456789012345678901234567890123456"
    258       "78901234567890123456789012345678901234567890123456789012345678" +
    259       escape_ + "-/"
    260       "9012345678901234567890" + escape_;
    261   EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
    262             sizeof(long_word));
    263   Validate(long_word, gold_long_word);
    264 }
    265 
    266 TEST_F(UrlToFilenameEncoderTest, LongTailQuestion) {
    267   // Here the '?' in the last path segment expands to @3F, making
    268   // it hit 128 chars before the input segment gets that big.
    269   static char long_word[] =
    270       "~joebob/briggs/1234567?1234567?1234567?1234567?1234567?"
    271       "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
    272       "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
    273       "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
    274       "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
    275       "1234567?1234567?1234567?1234567?1234567?1234567?1234567?";
    276 
    277   // Notice that at the end of the third segment, we avoid splitting
    278   // the (escape_ + "3F") that was generated from the "?", so that segment is
    279   // only 127 characters.
    280   string pattern = "1234567" + escape_ + "3F";  // 10 characters
    281   string gold_long_word =
    282       escape_ + "7Ejoebob/briggs/" +
    283       pattern + pattern + pattern + pattern + pattern + pattern + "1234"
    284       "567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
    285        "123456" + escape_ + "-/"
    286       "7" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
    287       pattern + pattern + pattern + pattern + pattern + pattern + pattern +
    288       "12" +
    289       escape_ + "-/"
    290       "34567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern
    291       + "1234567" + escape_ + "3F" + pattern + pattern + pattern + pattern
    292       + pattern + "1234567" +
    293       escape_ + "-/" +
    294       escape_ + "3F" + pattern + pattern + escape_;
    295   EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
    296             sizeof(long_word));
    297   Validate(long_word, gold_long_word);
    298 }
    299 
    300 TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenNoEscape) {
    301   // hit corner cases, +/- 4 characters from kMaxLen
    302   for (int i = -4; i <= 4; ++i) {
    303     string input;
    304     input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength, 'x');
    305     ValidateAllSegmentsSmall(input);
    306   }
    307 }
    308 
    309 TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenWithEscape) {
    310   // hit corner cases, +/- 4 characters from kMaxLen.  This time we
    311   // leave off the last 'x' and put in a '.', which ensures that we
    312   // are truncating with '/' *after* the expansion.
    313   for (int i = -4; i <= 4; ++i) {
    314     string input;
    315     input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength - 1, 'x');
    316     input.append(1, '.');  // this will expand to 3 characters.
    317     ValidateAllSegmentsSmall(input);
    318   }
    319 }
    320 
    321 TEST_F(UrlToFilenameEncoderTest, LeafBranchAlias) {
    322   Validate("/a/b/c", "/a/b/c" + escape_);        // c is leaf file "c,"
    323   Validate("/a/b/c/d", "/a/b/c/d" + escape_);    // c is directory "c"
    324   Validate("/a/b/c/d/", "/a/b/c/d/" + escape_);
    325 }
    326 
    327 
    328 TEST_F(UrlToFilenameEncoderTest, BackslashSeparator) {
    329   string long_word;
    330   string escaped_word;
    331   long_word.append(UrlToFilenameEncoder::kMaximumSubdirectoryLength + 1, 'x');
    332   UrlToFilenameEncoder::EncodeSegment(
    333       std::string(), long_word, '\\', &escaped_word);
    334 
    335   // check that one backslash, plus the escape ",-", and the ending , got added.
    336   EXPECT_EQ(long_word.size() + 4, escaped_word.size());
    337   ASSERT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
    338             escaped_word.size());
    339   // Check that the backslash got inserted at the correct spot.
    340   EXPECT_EQ('\\', escaped_word[
    341       UrlToFilenameEncoder::kMaximumSubdirectoryLength]);
    342 }
    343 
    344 }  // namespace net
    345 
    346