Home | History | Annotate | Download | only in dump_cache
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <stdlib.h>
      6 
      7 #include "base/logging.h"
      8 #include "base/string_util.h"
      9 #include "net/base/net_util.h"
     10 #include "net/tools/dump_cache/url_to_filename_encoder.h"
     11 
     12 using std::string;
     13 
     14 namespace {
     15 
     16 // Returns 1 if buf is prefixed by "num_digits" of hex digits
     17 // Teturns 0 otherwise.
     18 // The function checks for '\0' for string termination.
     19 int HexDigitsPrefix(const char* buf, int num_digits) {
     20   for (int i = 0; i < num_digits; i++) {
     21     if (!IsHexDigit(buf[i]))
     22       return 0;  // This also detects end of string as '\0' is not xdigit.
     23   }
     24   return 1;
     25 }
     26 
     27 #ifdef WIN32
     28 #define strtoull _strtoui64
     29 #endif
     30 
     31 // A simple parser for long long values. Returns the parsed value if a
     32 // valid integer is found; else returns deflt
     33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s.
     34 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
     35   char *error = NULL;
     36   const uint64 value = strtoull(str, &error, 16);
     37   return (error == str) ? deflt : value;
     38 }
     39 
     40 }
     41 
     42 namespace net {
     43 
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant.  However, our testdata
// has tons of dependencies on this, so it cannot be changed without
// re-running those tests and fixing them.
//
// kEscapeChar starts every escape sequence the encoder writes: it precedes
// the two hex digits of an escaped byte, precedes "." and ".." segments,
// precedes kTruncationChar, and is appended to the leaf filename.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written right after kEscapeChar when AppendSegment cuts an over-long
// segment, marking the directory separator that follows as artificial.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Segments longer than this many characters are chopped by AppendSegment.
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
     51 
     52 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
     53   CHECK(!segment->empty());
     54   if ((*segment == ".") || (*segment == "..")) {
     55     dest->append(1, kEscapeChar);
     56     dest->append(*segment);
     57     segment->clear();
     58   } else {
     59     size_t segment_size = segment->size();
     60     if (segment_size > kMaximumSubdirectoryLength) {
     61       // We need to inject ",-" at the end of the segment to signify that
     62       // we are inserting an artificial '/'.  This means we have to chop
     63       // off at least two characters to make room.
     64       segment_size = kMaximumSubdirectoryLength - 2;
     65 
     66       // But we don't want to break up an escape sequence that happens to lie at
     67       // the end.  Escape sequences are at most 2 characters.
     68       if ((*segment)[segment_size - 1] == kEscapeChar) {
     69         segment_size -= 1;
     70       } else if ((*segment)[segment_size - 2] == kEscapeChar) {
     71         segment_size -= 2;
     72       }
     73       dest->append(segment->data(), segment_size);
     74       dest->append(1, kEscapeChar);
     75       dest->append(1, kTruncationChar);
     76       segment->erase(0, segment_size);
     77 
     78       // At this point, if we had segment_size=3, and segment="abcd",
     79       // then after this erase, we will have written "abc,-" and set segment="d"
     80     } else {
     81       dest->append(*segment);
     82       segment->clear();
     83     }
     84   }
     85 }
     86 
     87 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
     88                                          const string& escaped_ending,
     89                                          char dir_separator,
     90                                          string* encoded_filename) {
     91   string filename_ending = UrlUtilities::Unescape(escaped_ending);
     92 
     93   char encoded[3];
     94   int encoded_len;
     95   string segment;
     96 
     97   // TODO(jmarantz): This code would be a bit simpler if we disallowed
     98   // Instaweb allowing filename_prefix to not end in "/".  We could
     99   // then change the is routine to just take one input string.
    100   size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
    101   if (start_of_segment == string::npos) {
    102     segment = filename_prefix;
    103   } else {
    104     segment = filename_prefix.substr(start_of_segment + 1);
    105     *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
    106   }
    107 
    108   size_t index = 0;
    109   // Special case the first / to avoid adding a leading kEscapeChar.
    110   if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
    111     encoded_filename->append(segment);
    112     segment.clear();
    113     encoded_filename->append(1, dir_separator);
    114     ++index;
    115   }
    116 
    117   for (; index < filename_ending.length(); ++index) {
    118     unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
    119 
    120     // Note: instead of outputing an empty segment, we let the second slash
    121     // be escaped below.
    122     if ((ch == dir_separator) && !segment.empty()) {
    123       AppendSegment(&segment, encoded_filename);
    124       encoded_filename->append(1, dir_separator);
    125       segment.clear();
    126     } else {
    127       // After removing unsafe chars the only safe ones are _.=+- and alphanums.
    128       if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
    129           (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
    130           (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
    131         encoded[0] = ch;
    132         encoded_len = 1;
    133       } else {
    134         encoded[0] = kEscapeChar;
    135         encoded[1] = ch / 16;
    136         encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
    137         encoded[2] = ch % 16;
    138         encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
    139         encoded_len = 3;
    140       }
    141       segment.append(encoded, encoded_len);
    142 
    143       // If segment is too big, we must chop it into chunks.
    144       if (segment.size() > kMaximumSubdirectoryLength) {
    145         AppendSegment(&segment, encoded_filename);
    146         encoded_filename->append(1, dir_separator);
    147       }
    148     }
    149   }
    150 
    151   // Append "," to the leaf filename so the leaf can also be a branch., e.g.
    152   // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
    153   // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed
    154   // us over the 128 char limit, then we will need to append "/" and the
    155   // remaining chars.
    156   segment += kEscapeChar;
    157   AppendSegment(&segment, encoded_filename);
    158   if (!segment.empty()) {
    159     // The last overflow segment is special, because we appended in
    160     // kEscapeChar above.  We won't need to check it again for size
    161     // or further escaping.
    162     encoded_filename->append(1, dir_separator);
    163     encoded_filename->append(segment);
    164   }
    165 }
    166 
    167 // Note: this decoder is not the exact inverse of the EncodeSegment above,
    168 // because it does not take into account a prefix.
    169 bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
    170                                   char dir_separator,
    171                                   string* decoded_url) {
    172   enum State {
    173     kStart,
    174     kEscape,
    175     kFirstDigit,
    176     kTruncate,
    177     kEscapeDot
    178   };
    179   State state = kStart;
    180   int char_code = 0;
    181   char hex_buffer[3];
    182   hex_buffer[2] = '\0';
    183   for (size_t i = 0; i < encoded_filename.size(); ++i) {
    184     char ch = encoded_filename[i];
    185     switch (state) {
    186       case kStart:
    187         if (ch == kEscapeChar) {
    188           state = kEscape;
    189         } else if (ch == dir_separator) {
    190           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
    191         } else {
    192           decoded_url->append(1, ch);
    193         }
    194         break;
    195       case kEscape:
    196         if (HexDigitsPrefix(&ch, 1) == 1) {
    197           hex_buffer[0] = ch;
    198           state = kFirstDigit;
    199         } else if (ch == kTruncationChar) {
    200           state = kTruncate;
    201         } else if (ch == '.') {
    202           decoded_url->append(1, '.');
    203           state = kEscapeDot;  // Look for at most one more dot.
    204         } else if (ch == dir_separator) {
    205           // Consider url "//x".  This was once encoded to "/,/x,".
    206           // This code is what skips the first Escape.
    207           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
    208           state = kStart;
    209         } else {
    210           return false;
    211         }
    212         break;
    213       case kFirstDigit:
    214         if (HexDigitsPrefix(&ch, 1) == 1) {
    215           hex_buffer[1] = ch;
    216           uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
    217           decoded_url->append(1, static_cast<char>(hex_value));
    218           char_code = 0;
    219           state = kStart;
    220         } else {
    221           return false;
    222         }
    223         break;
    224       case kTruncate:
    225         if (ch == dir_separator) {
    226           // Skip this separator, it was only put in to break up long
    227           // path segments, but is not part of the URL.
    228           state = kStart;
    229         } else {
    230           return false;
    231         }
    232         break;
    233       case kEscapeDot:
    234         decoded_url->append(1, ch);
    235         state = kStart;
    236         break;
    237     }
    238   }
    239 
    240   // All legal encoded filenames end in kEscapeChar.
    241   return (state == kEscape);
    242 }
    243 
    244 // Escape the given input |path| and chop any individual components
    245 // of the path which are greater than kMaximumSubdirectoryLength characters
    246 // into two chunks.
    247 //
    248 // This legacy version has several issues with aliasing of different URLs,
    249 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
    250 // the filenames back into URLs.
    251 //
    252 // But there is a large body of slurped data which depends on this format,
    253 // so leave it as the default for spdy_in_mem_edsm_server.
    254 string UrlToFilenameEncoder::LegacyEscape(const string& path) {
    255   string output;
    256 
    257   // Note:  We also chop paths into medium sized 'chunks'.
    258   //        This is due to the incompetence of the windows
    259   //        filesystem, which still hasn't figured out how
    260   //        to deal with long filenames.
    261   int last_slash = 0;
    262   for (size_t index = 0; index < path.length(); index++) {
    263     char ch = path[index];
    264     if (ch == 0x5C)
    265       last_slash = index;
    266     if ((ch == 0x2D) ||                    // hyphen
    267         (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
    268         ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
    269         ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
    270         ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
    271       output.append(&path[index], 1);
    272     } else {
    273       char encoded[3];
    274       encoded[0] = 'x';
    275       encoded[1] = ch / 16;
    276       encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
    277       encoded[2] = ch % 16;
    278       encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
    279       output.append(encoded, 3);
    280     }
    281     if (index - last_slash > kMaximumSubdirectoryLength) {
    282 #ifdef WIN32
    283       char slash = '\\';
    284 #else
    285       char slash = '/';
    286 #endif
    287       output.append(&slash, 1);
    288       last_slash = index;
    289     }
    290   }
    291   return output;
    292 }
    293 
    294 }  // namespace net
    295