1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <stdlib.h> 6 7 #include "base/logging.h" 8 #include "base/string_util.h" 9 #include "net/base/net_util.h" 10 #include "net/tools/dump_cache/url_to_filename_encoder.h" 11 12 using std::string; 13 14 namespace { 15 16 // Returns 1 if buf is prefixed by "num_digits" of hex digits 17 // Teturns 0 otherwise. 18 // The function checks for '\0' for string termination. 19 int HexDigitsPrefix(const char* buf, int num_digits) { 20 for (int i = 0; i < num_digits; i++) { 21 if (!IsHexDigit(buf[i])) 22 return 0; // This also detects end of string as '\0' is not xdigit. 23 } 24 return 1; 25 } 26 27 #ifdef WIN32 28 #define strtoull _strtoui64 29 #endif 30 31 // A simple parser for long long values. Returns the parsed value if a 32 // valid integer is found; else returns deflt 33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s. 34 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { 35 char *error = NULL; 36 const uint64 value = strtoull(str, &error, 16); 37 return (error == str) ? deflt : value; 38 } 39 40 } 41 42 namespace net { 43 44 // The escape character choice is made here -- all code and tests in this 45 // directory are based off of this constant. However, our testdata 46 // has tons of dependencies on this, so it cannot be changed without 47 // re-running those tests and fixing them. 48 const char UrlToFilenameEncoder::kEscapeChar = ','; 49 const char UrlToFilenameEncoder::kTruncationChar = '-'; 50 const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128; 51 52 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) { 53 CHECK(!segment->empty()); 54 if ((*segment == ".") || (*segment == "..")) { 55 dest->append(1, kEscapeChar); 56 dest->append(*segment); 57 segment->clear(); 58 } else { 59 size_t segment_size = segment->size(); 60 if (segment_size > kMaximumSubdirectoryLength) { 61 // We need to inject ",-" at the end of the segment to signify that 62 // we are inserting an artificial '/'. This means we have to chop 63 // off at least two characters to make room. 64 segment_size = kMaximumSubdirectoryLength - 2; 65 66 // But we don't want to break up an escape sequence that happens to lie at 67 // the end. Escape sequences are at most 2 characters. 68 if ((*segment)[segment_size - 1] == kEscapeChar) { 69 segment_size -= 1; 70 } else if ((*segment)[segment_size - 2] == kEscapeChar) { 71 segment_size -= 2; 72 } 73 dest->append(segment->data(), segment_size); 74 dest->append(1, kEscapeChar); 75 dest->append(1, kTruncationChar); 76 segment->erase(0, segment_size); 77 78 // At this point, if we had segment_size=3, and segment="abcd", 79 // then after this erase, we will have written "abc,-" and set segment="d" 80 } else { 81 dest->append(*segment); 82 segment->clear(); 83 } 84 } 85 } 86 87 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix, 88 const string& escaped_ending, 89 char dir_separator, 90 string* encoded_filename) { 91 string filename_ending = UrlUtilities::Unescape(escaped_ending); 92 93 char encoded[3]; 94 int encoded_len; 95 string segment; 96 97 // TODO(jmarantz): This code would be a bit simpler if we disallowed 98 // Instaweb allowing filename_prefix to not end in "/". We could 99 // then change the is routine to just take one input string. 100 size_t start_of_segment = filename_prefix.find_last_of(dir_separator); 101 if (start_of_segment == string::npos) { 102 segment = filename_prefix; 103 } else { 104 segment = filename_prefix.substr(start_of_segment + 1); 105 *encoded_filename = filename_prefix.substr(0, start_of_segment + 1); 106 } 107 108 size_t index = 0; 109 // Special case the first / to avoid adding a leading kEscapeChar. 110 if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) { 111 encoded_filename->append(segment); 112 segment.clear(); 113 encoded_filename->append(1, dir_separator); 114 ++index; 115 } 116 117 for (; index < filename_ending.length(); ++index) { 118 unsigned char ch = static_cast<unsigned char>(filename_ending[index]); 119 120 // Note: instead of outputing an empty segment, we let the second slash 121 // be escaped below. 122 if ((ch == dir_separator) && !segment.empty()) { 123 AppendSegment(&segment, encoded_filename); 124 encoded_filename->append(1, dir_separator); 125 segment.clear(); 126 } else { 127 // After removing unsafe chars the only safe ones are _.=+- and alphanums. 128 if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') || 129 (ch == '-') || (('0' <= ch) && (ch <= '9')) || 130 (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) { 131 encoded[0] = ch; 132 encoded_len = 1; 133 } else { 134 encoded[0] = kEscapeChar; 135 encoded[1] = ch / 16; 136 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; 137 encoded[2] = ch % 16; 138 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; 139 encoded_len = 3; 140 } 141 segment.append(encoded, encoded_len); 142 143 // If segment is too big, we must chop it into chunks. 144 if (segment.size() > kMaximumSubdirectoryLength) { 145 AppendSegment(&segment, encoded_filename); 146 encoded_filename->append(1, dir_separator); 147 } 148 } 149 } 150 151 // Append "," to the leaf filename so the leaf can also be a branch., e.g. 152 // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and 153 // /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed 154 // us over the 128 char limit, then we will need to append "/" and the 155 // remaining chars. 156 segment += kEscapeChar; 157 AppendSegment(&segment, encoded_filename); 158 if (!segment.empty()) { 159 // The last overflow segment is special, because we appended in 160 // kEscapeChar above. We won't need to check it again for size 161 // or further escaping. 162 encoded_filename->append(1, dir_separator); 163 encoded_filename->append(segment); 164 } 165 } 166 167 // Note: this decoder is not the exact inverse of the EncodeSegment above, 168 // because it does not take into account a prefix. 169 bool UrlToFilenameEncoder::Decode(const string& encoded_filename, 170 char dir_separator, 171 string* decoded_url) { 172 enum State { 173 kStart, 174 kEscape, 175 kFirstDigit, 176 kTruncate, 177 kEscapeDot 178 }; 179 State state = kStart; 180 int char_code = 0; 181 char hex_buffer[3]; 182 hex_buffer[2] = '\0'; 183 for (size_t i = 0; i < encoded_filename.size(); ++i) { 184 char ch = encoded_filename[i]; 185 switch (state) { 186 case kStart: 187 if (ch == kEscapeChar) { 188 state = kEscape; 189 } else if (ch == dir_separator) { 190 decoded_url->append(1, '/'); // URLs only use '/' not '\\' 191 } else { 192 decoded_url->append(1, ch); 193 } 194 break; 195 case kEscape: 196 if (HexDigitsPrefix(&ch, 1) == 1) { 197 hex_buffer[0] = ch; 198 state = kFirstDigit; 199 } else if (ch == kTruncationChar) { 200 state = kTruncate; 201 } else if (ch == '.') { 202 decoded_url->append(1, '.'); 203 state = kEscapeDot; // Look for at most one more dot. 204 } else if (ch == dir_separator) { 205 // Consider url "//x". This was once encoded to "/,/x,". 206 // This code is what skips the first Escape. 207 decoded_url->append(1, '/'); // URLs only use '/' not '\\' 208 state = kStart; 209 } else { 210 return false; 211 } 212 break; 213 case kFirstDigit: 214 if (HexDigitsPrefix(&ch, 1) == 1) { 215 hex_buffer[1] = ch; 216 uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0); 217 decoded_url->append(1, static_cast<char>(hex_value)); 218 char_code = 0; 219 state = kStart; 220 } else { 221 return false; 222 } 223 break; 224 case kTruncate: 225 if (ch == dir_separator) { 226 // Skip this separator, it was only put in to break up long 227 // path segments, but is not part of the URL. 228 state = kStart; 229 } else { 230 return false; 231 } 232 break; 233 case kEscapeDot: 234 decoded_url->append(1, ch); 235 state = kStart; 236 break; 237 } 238 } 239 240 // All legal encoded filenames end in kEscapeChar. 241 return (state == kEscape); 242 } 243 244 // Escape the given input |path| and chop any individual components 245 // of the path which are greater than kMaximumSubdirectoryLength characters 246 // into two chunks. 247 // 248 // This legacy version has several issues with aliasing of different URLs, 249 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode 250 // the filenames back into URLs. 251 // 252 // But there is a large body of slurped data which depends on this format, 253 // so leave it as the default for spdy_in_mem_edsm_server. 254 string UrlToFilenameEncoder::LegacyEscape(const string& path) { 255 string output; 256 257 // Note: We also chop paths into medium sized 'chunks'. 258 // This is due to the incompetence of the windows 259 // filesystem, which still hasn't figured out how 260 // to deal with long filenames. 261 int last_slash = 0; 262 for (size_t index = 0; index < path.length(); index++) { 263 char ch = path[index]; 264 if (ch == 0x5C) 265 last_slash = index; 266 if ((ch == 0x2D) || // hyphen 267 (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore 268 ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] 269 ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] 270 ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] 271 output.append(&path[index], 1); 272 } else { 273 char encoded[3]; 274 encoded[0] = 'x'; 275 encoded[1] = ch / 16; 276 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; 277 encoded[2] = ch % 16; 278 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; 279 output.append(encoded, 3); 280 } 281 if (index - last_slash > kMaximumSubdirectoryLength) { 282 #ifdef WIN32 283 char slash = '\\'; 284 #else 285 char slash = '/'; 286 #endif 287 output.append(&slash, 1); 288 last_slash = index; 289 } 290 } 291 return output; 292 } 293 294 } // namespace net 295