// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// URL filename encoder goals:
//
// 1. Allow URLs with arbitrary path-segment length, generating filenames
//    with a maximum of 128 characters.
// 2. Provide somewhat human-readable filenames, for easy debugging flow.
// 3. Provide reverse-mapping from filenames back to URLs.
// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
//    Those can all be different URLs.
// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
//    with Facebook Connect.
//
// We need an escape-character for representing characters that are legal
// in URL paths, but not in filenames, such as '?'.
//
// We can pick any legal character as an escape, as long as we escape it too.
// But as we have a goal of having filenames that humans can correlate with
// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
// shell escapes or that various build tools use.
//
// .#&%-=_+ occur frequently in URLs.
// <>:"/\|?* are illegal in Windows
//   See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
// ~`!$^&(){}[]'; are special to Unix shells
// In addition, build tools do not like ^@#%
//
// Josh took a quick look at the frequency of some special characters in
// Sadeesh's slurped directory from Fall 09 and found the following
// occurrences:
//
//   ^           3               build tool doesn't like ^ in testdata filenames
//   @           10              build tool doesn't like @ in testdata filenames
//   .           1676            too frequent in URLs
//   ,           76              THE WINNER
//   #           0               build tool doesn't like it
//   &           487             Prefer to avoid shell escapes
//   %           374             g4 doesn't like it
//   =           579             very frequent in URLs -- leave unmodified
//   -           464             very frequent in URLs -- leave unmodified
//   _           798             very frequent in URLs -- leave unmodified
//
//
// The escaping algorithm is:
//  1) Escape all unfriendly symbols as ,XX where XX is the hex code.
//  2) Add a ',' at the end (We do not allow ',' at end of any directory name,
//     so this assures that e.g. /a and /a/b can coexist in the filesystem).
//  3) Go through the path segment by segment (where a segment is one directory
//     or leaf in the path) and
//     3a) If the segment is empty, escape the second slash. i.e. if it was
//         www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
//     3b) If it is "." or ".." prepend with ',' (so that we have a non-
//         empty and non-reserved filename).
//     3c) If it is over 128 characters, break it up into smaller segments by
//         inserting ,-/ (Windows limits paths to 128 chars, other OSes also
//         have limits that would restrict us)
//
// For example:
//     URL               File
//     /                 /,
//     /index.html       /index.html,
//     /.                /.,
//     /a/b              /a/b,
//     /a/b/             /a/b/,
//     /a/b/c            /a/b/c,   Note: no prefix problem
//     /u?foo=bar        /u,3Ffoo=bar,
//     //                /,2F,
//     /./               /,./,
//     /../              /,../,
//     /,                /,2C,
//     /,./              /,2C./,
//     /very...longname/ /very...long,-/name   If very...long is about 126 long.

// NOTE: we avoid using some classes here (like FilePath and GURL) because we
// share this code with other projects externally.

#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
#pragma once

#include <string>

#include "base/string_util.h"
#include "net/tools/dump_cache/url_utilities.h"

namespace net {

// Helper class for converting a URL into a filename.
91 class UrlToFilenameEncoder { 92 public: 93 // Given a |url| and a |base_path|, returns a filename which represents this 94 // |url|. |url| may include URL escaping such as %21 for ! 95 // |legacy_escape| indicates that this function should use the old-style 96 // of encoding. 97 // TODO(mbelshe): delete the legacy_escape code. 98 static std::string Encode(const std::string& url, std::string base_path, 99 bool legacy_escape) { 100 std::string filename; 101 if (!legacy_escape) { 102 std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url); 103 EncodeSegment(base_path, url_no_scheme, '/', &filename); 104 #ifdef WIN32 105 ReplaceAll(&filename, "/", "\\"); 106 #endif 107 } else { 108 std::string clean_url(url); 109 if (clean_url.length() && clean_url[clean_url.length()-1] == '/') 110 clean_url.append("index.html"); 111 112 std::string host = UrlUtilities::GetUrlHost(clean_url); 113 filename.append(base_path); 114 filename.append(host); 115 #ifdef WIN32 116 filename.append("\\"); 117 #else 118 filename.append("/"); 119 #endif 120 121 std::string url_filename = UrlUtilities::GetUrlPath(clean_url); 122 // Strip the leading '/'. 123 if (url_filename[0] == '/') 124 url_filename = url_filename.substr(1); 125 126 // Replace '/' with '\'. 127 ConvertToSlashes(&url_filename); 128 129 // Strip double back-slashes ("\\\\"). 130 StripDoubleSlashes(&url_filename); 131 132 // Save path as filesystem-safe characters. 133 url_filename = LegacyEscape(url_filename); 134 filename.append(url_filename); 135 136 #ifndef WIN32 137 // Last step - convert to native slashes. 138 const std::string slash("/"); 139 const std::string backslash("\\"); 140 ReplaceAll(&filename, backslash, slash); 141 #endif 142 } 143 144 return filename; 145 } 146 147 // Rewrite HTML in a form that the SPDY in-memory server 148 // can read. 149 // |filename_prefix| is prepended without escaping. 150 // |escaped_ending| is the URL to be encoded into a filename. 
It may have URL 151 // escaped characters (like %21 for !). 152 // |dir_separator| is "/" on Unix, "\" on Windows. 153 // |encoded_filename| is the resultant filename. 154 static void EncodeSegment( 155 const std::string& filename_prefix, 156 const std::string& escaped_ending, 157 char dir_separator, 158 std::string* encoded_filename); 159 160 // Decodes a filename that was encoded with EncodeSegment, 161 // yielding back the original URL. 162 static bool Decode(const std::string& encoded_filename, 163 char dir_separator, 164 std::string* decoded_url); 165 166 static const char kEscapeChar; 167 static const char kTruncationChar; 168 static const size_t kMaximumSubdirectoryLength; 169 170 friend class UrlToFilenameEncoderTest; 171 172 private: 173 // Appends a segment of the path, special-casing "." and "..", and 174 // ensuring that the segment does not exceed the path length. If it does, 175 // it chops the end off the segment, writes the segment with a separator of 176 // ",-/", and then rewrites segment to contain just the truncated piece so 177 // it can be used in the next iteration. 178 // |segment| is a read/write parameter containing segment to write 179 // Note: this should not be called with empty segment. 180 static void AppendSegment(std::string* segment, std::string* dest); 181 182 // Allow reading of old slurped files. 183 static std::string LegacyEscape(const std::string& path); 184 185 // Replace all instances of |from| within |str| as |to|. 186 static void ReplaceAll(std::string* str, const std::string& from, 187 const std::string& to) { 188 std::string::size_type pos(0); 189 while ((pos = str->find(from, pos)) != std::string::npos) { 190 str->replace(pos, from.size(), to); 191 pos += from.size(); 192 } 193 } 194 195 // Replace all instances of "/" with "\" in |path|. 
196 static void ConvertToSlashes(std::string* path) { 197 const std::string slash("/"); 198 const std::string backslash("\\"); 199 ReplaceAll(path, slash, backslash); 200 } 201 202 // Replace all instances of "\\" with "%5C%5C" in |path|. 203 static void StripDoubleSlashes(std::string* path) { 204 const std::string doubleslash("\\\\"); 205 const std::string escaped_doubleslash("%5C%5C"); 206 ReplaceAll(path, doubleslash, escaped_doubleslash); 207 } 208 }; 209 210 } // namespace net 211 212 #endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_ 213