Home | History | Annotate | Download | only in dump_cache
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // URL filename encoder goals:
      6 //
      7 // 1. Allow URLs with arbitrary path-segment length, generating filenames
      8 //    with a maximum of 128 characters.
      9 // 2. Provide somewhat human-readable filenames, for an easy debugging flow.
     10 // 3. Provide reverse-mapping from filenames back to URLs.
     11 // 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
     12 //    Those can all be different URLs.
     13 // 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
     14 //    with Facebook Connect.
     15 //
     16 // We need an escape-character for representing characters that are legal
     17 // in URL paths, but not in filenames, such as '?'.
     18 //
     19 // We can pick any legal character as an escape, as long as we escape it too.
     20 // But as we have a goal of having filenames that humans can correlate with
     21 // URLs, we should pick one that doesn't show up frequently in URLs. Candidates
     22 // are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
     23 // shell escapes or that various build tools use.
     24 //
     25 // .#&%-=_+ occur frequently in URLs.
     26 // <>:"/\|?* are illegal in Windows
     27 //   See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
     28 // ~`!$^&(){}[]'; are special to Unix shells
     29 // In addition, build tools do not like ^@#%
     30 //
     31 // Josh took a quick look at the frequency of some special characters in
     32 // Sadeesh's slurped directory from Fall 09 and found the following occurrences:
     33 //
     34 //   ^   3               build tool doesn't like ^ in testdata filenames
     35 //   @   10              build tool doesn't like @ in testdata filenames
     36 //   .   1676            too frequent in URLs
     37 //   ,   76              THE WINNER
     38 //   #   0               build tool doesn't like it
     39 //   &   487             Prefer to avoid shell escapes
     40 //   %   374             g4 doesn't like it
     41 //   =   579             very frequent in URLs -- leave unmodified
     42 //   -   464             very frequent in URLs -- leave unmodified
     43 //   _   798             very frequent in URLs -- leave unmodified
     44 //
     45 //
     46 // The escaping algorithm is:
     47 //  1) Escape all unfriendly symbols as ,XX where XX is the hex code.
     48 //  2) Add a ',' at the end (We do not allow ',' at end of any directory name,
     49 //     so this assures that e.g. /a and /a/b can coexist in the filesystem).
     50 //  3) Go through the path segment by segment (where a segment is one directory
     51 //     or leaf in the path) and
     52 //     3a) If the segment is empty, escape the second slash. i.e. if it was
     53 //         www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
     54 //     3b) If it is "." or ".." prepend with ',' (so that we have a non-
     55 //         empty and non-reserved filename).
     56 //     3c) If it is over 128 characters, break it up into smaller segments by
     57 //         inserting ,-/ (Windows limits paths to 128 chars, other OSes also
     58 //         have limits that would restrict us).
     59 //
     60 // For example:
     61 //     URL               File
     62 //     /                 /,
     63 //     /index.html       /index.html,
     64 //     /.                /.,
     65 //     /a/b              /a/b,
     66 //     /a/b/             /a/b/,
     67 //     /a/b/c            /a/b/c,   Note: no prefix problem
     68 //     /u?foo=bar        /u,3Ffoo=bar,
     69 //     //                /,2F,
     70 //     /./               /,./,
     71 //     /../              /,../,
     72 //     /,                /,2C,
     73 //     /,./              /,2C./,
     74 //     /very...longname/ /very...long,-/name   If very...long is about 126 long.
     75 
     76 // NOTE: we avoid using some classes here (like FilePath and GURL) because we
     77 //       share this code with other projects externally.
     78 
     79 #ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
     80 #define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
     81 #pragma once
     82 
     83 #include <string>
     84 
     85 #include "base/string_util.h"
     86 #include "net/tools/dump_cache/url_utilities.h"
     87 
     88 namespace net {
     89 
     90 // Helper class for converting a URL into a filename.
     91 class UrlToFilenameEncoder {
     92  public:
     93   // Given a |url| and a |base_path|, returns a filename which represents this
     94   // |url|. |url| may include URL escaping such as %21 for !
     95   // |legacy_escape| indicates that this function should use the old-style
     96   // of encoding.
     97   // TODO(mbelshe): delete the legacy_escape code.
     98   static std::string Encode(const std::string& url, std::string base_path,
     99                             bool legacy_escape) {
    100     std::string filename;
    101     if (!legacy_escape) {
    102       std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
    103       EncodeSegment(base_path, url_no_scheme, '/', &filename);
    104 #ifdef WIN32
    105       ReplaceAll(&filename, "/", "\\");
    106 #endif
    107     } else {
    108       std::string clean_url(url);
    109       if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
    110         clean_url.append("index.html");
    111 
    112       std::string host = UrlUtilities::GetUrlHost(clean_url);
    113       filename.append(base_path);
    114       filename.append(host);
    115 #ifdef WIN32
    116       filename.append("\\");
    117 #else
    118       filename.append("/");
    119 #endif
    120 
    121       std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
    122       // Strip the leading '/'.
    123       if (url_filename[0] == '/')
    124         url_filename = url_filename.substr(1);
    125 
    126       // Replace '/' with '\'.
    127       ConvertToSlashes(&url_filename);
    128 
    129       // Strip double back-slashes ("\\\\").
    130       StripDoubleSlashes(&url_filename);
    131 
    132       // Save path as filesystem-safe characters.
    133       url_filename = LegacyEscape(url_filename);
    134       filename.append(url_filename);
    135 
    136 #ifndef WIN32
    137       // Last step - convert to native slashes.
    138       const std::string slash("/");
    139       const std::string backslash("\\");
    140       ReplaceAll(&filename, backslash, slash);
    141 #endif
    142     }
    143 
    144     return filename;
    145   }
    146 
    147   // Rewrite HTML in a form that the SPDY in-memory server
    148   // can read.
    149   // |filename_prefix| is prepended without escaping.
    150   // |escaped_ending| is the URL to be encoded into a filename. It may have URL
    151   // escaped characters (like %21 for !).
    152   // |dir_separator| is "/" on Unix, "\" on Windows.
    153   // |encoded_filename| is the resultant filename.
    154   static void EncodeSegment(
    155       const std::string& filename_prefix,
    156       const std::string& escaped_ending,
    157       char dir_separator,
    158       std::string* encoded_filename);
    159 
    160   // Decodes a filename that was encoded with EncodeSegment,
    161   // yielding back the original URL.
    162   static bool Decode(const std::string& encoded_filename,
    163                      char dir_separator,
    164                      std::string* decoded_url);
    165 
    166   static const char kEscapeChar;
    167   static const char kTruncationChar;
    168   static const size_t kMaximumSubdirectoryLength;
    169 
    170   friend class UrlToFilenameEncoderTest;
    171 
    172  private:
    173   // Appends a segment of the path, special-casing "." and "..", and
    174   // ensuring that the segment does not exceed the path length.  If it does,
    175   // it chops the end off the segment, writes the segment with a separator of
    176   // ",-/", and then rewrites segment to contain just the truncated piece so
    177   // it can be used in the next iteration.
    178   // |segment| is a read/write parameter containing segment to write
    179   // Note: this should not be called with empty segment.
    180   static void AppendSegment(std::string* segment, std::string* dest);
    181 
    182   // Allow reading of old slurped files.
    183   static std::string LegacyEscape(const std::string& path);
    184 
    185   // Replace all instances of |from| within |str| as |to|.
    186   static void ReplaceAll(std::string* str, const std::string& from,
    187                          const std::string& to) {
    188     std::string::size_type pos(0);
    189     while ((pos = str->find(from, pos)) != std::string::npos) {
    190       str->replace(pos, from.size(), to);
    191       pos += from.size();
    192     }
    193   }
    194 
    195   // Replace all instances of "/" with "\" in |path|.
    196   static void ConvertToSlashes(std::string* path) {
    197     const std::string slash("/");
    198     const std::string backslash("\\");
    199     ReplaceAll(path, slash, backslash);
    200   }
    201 
    202   // Replace all instances of "\\" with "%5C%5C" in |path|.
    203   static void StripDoubleSlashes(std::string* path) {
    204     const std::string doubleslash("\\\\");
    205     const std::string escaped_doubleslash("%5C%5C");
    206     ReplaceAll(path, doubleslash, escaped_doubleslash);
    207   }
    208 };
    209 
    210 }  // namespace net
    211 
    212 #endif  // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
    213