Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/base/escape.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/logging.h"
     10 #include "base/scoped_ptr.h"
     11 #include "base/string_piece.h"
     12 #include "base/string_util.h"
     13 #include "base/utf_string_conversions.h"
     14 #include "base/utf_offset_string_conversions.h"
     15 
     16 namespace {
     17 
     18 static const char* const kHexString = "0123456789ABCDEF";
     19 inline char IntToHex(int i) {
     20   DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
     21   return kHexString[i];
     22 }
     23 
     24 // A fast bit-vector map for ascii characters.
     25 //
     26 // Internally stores 256 bits in an array of 8 ints.
     27 // Does quick bit-flicking to lookup needed characters.
     28 class Charmap {
     29  public:
     30   Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
     31           uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
     32     map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
     33     map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
     34   }
     35 
     36   bool Contains(unsigned char c) const {
     37     return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
     38   }
     39 
     40  private:
     41   uint32 map_[8];
     42 };
     43 
     44 // Given text to escape and a Charmap defining which values to escape,
     45 // return an escaped string.  If use_plus is true, spaces are converted
     46 // to +, otherwise, if spaces are in the charmap, they are converted to
     47 // %20.
     48 std::string Escape(const std::string& text, const Charmap& charmap,
     49                    bool use_plus) {
     50   std::string escaped;
     51   escaped.reserve(text.length() * 3);
     52   for (unsigned int i = 0; i < text.length(); ++i) {
     53     unsigned char c = static_cast<unsigned char>(text[i]);
     54     if (use_plus && ' ' == c) {
     55       escaped.push_back('+');
     56     } else if (charmap.Contains(c)) {
     57       escaped.push_back('%');
     58       escaped.push_back(IntToHex(c >> 4));
     59       escaped.push_back(IntToHex(c & 0xf));
     60     } else {
     61       escaped.push_back(c);
     62     }
     63   }
     64   return escaped;
     65 }
     66 
     67 // Contains nonzero when the corresponding character is unescapable for normal
     68 // URLs. These characters are the ones that may change the parsing of a URL, so
     69 // we don't want to unescape them sometimes. In many case we won't want to
     70 // unescape spaces, but that is controlled by parameters to Unescape*.
     71 //
     72 // The basic rule is that we can't unescape anything that would changing parsing
     73 // like # or ?. We also can't unescape &, =, or + since that could be part of a
     74 // query and that could change the server's parsing of the query. Nor can we
     75 // unescape \ since googleurl will convert it to a /.
     76 //
     77 // Lastly, we can't unescape anything that doesn't have a canonical
     78 // representation in a URL. This means that unescaping will change the URL, and
     79 // you could get different behavior if you copy and paste the URL, or press
     80 // enter in the URL bar. The list of characters that fall into this category
     81 // are the ones labeled PASS (allow either escaped or unescaped) in the big
     82 // lookup table at the top of googleurl/src/url_canon_path.cc
     83 const char kUrlUnescape[128] = {
     84 //   NULL, control chars...
     85      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     86      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     87 //  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     88      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
     89 //   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     90      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
     91 //   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     92      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     93 //   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     94      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
     95 //   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     96      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     97 //   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <NBSP>
     98      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
     99 };
    100 
    101 template<typename STR>
    102 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
    103                                UnescapeRule::Type rules,
    104                                std::vector<size_t>* offsets_for_adjustment) {
    105   if (offsets_for_adjustment) {
    106     std::for_each(offsets_for_adjustment->begin(),
    107                   offsets_for_adjustment->end(),
    108                   LimitOffset<std::wstring>(escaped_text.length()));
    109   }
    110   // Do not unescape anything, return the |escaped_text| text.
    111   if (rules == UnescapeRule::NONE)
    112     return escaped_text;
    113 
    114   // The output of the unescaping is always smaller than the input, so we can
    115   // reserve the input size to make sure we have enough buffer and don't have
    116   // to allocate in the loop below.
    117   STR result;
    118   result.reserve(escaped_text.length());
    119 
    120   AdjustEncodingOffset::Adjustments adjustments;  // Locations of adjusted text.
    121   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
    122     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
    123       // Non ASCII character, append as is.
    124       result.push_back(escaped_text[i]);
    125       continue;
    126     }
    127 
    128     char current_char = static_cast<char>(escaped_text[i]);
    129     if (current_char == '%' && i + 2 < max) {
    130       const typename STR::value_type most_sig_digit(
    131           static_cast<typename STR::value_type>(escaped_text[i + 1]));
    132       const typename STR::value_type least_sig_digit(
    133           static_cast<typename STR::value_type>(escaped_text[i + 2]));
    134       if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
    135         unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
    136             HexDigitToInt(least_sig_digit);
    137         if (value >= 0x80 ||  // Unescape all high-bit characters.
    138             // For 7-bit characters, the lookup table tells us all valid chars.
    139             (kUrlUnescape[value] ||
    140              // ...and we allow some additional unescaping when flags are set.
    141              (value == ' ' && (rules & UnescapeRule::SPACES)) ||
    142              // Allow any of the prohibited but non-control characters when
    143              // we're doing "special" chars.
    144              (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
    145              // Additionally allow control characters if requested.
    146              (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
    147           // Use the unescaped version of the character.
    148           adjustments.push_back(i);
    149           result.push_back(value);
    150           i += 2;
    151         } else {
    152           // Keep escaped. Append a percent and we'll get the following two
    153           // digits on the next loops through.
    154           result.push_back('%');
    155         }
    156       } else {
    157         // Invalid escape sequence, just pass the percent through and continue
    158         // right after it.
    159         result.push_back('%');
    160       }
    161     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
    162                escaped_text[i] == '+') {
    163       result.push_back(' ');
    164     } else {
    165       // Normal case for unescaped characters.
    166       result.push_back(escaped_text[i]);
    167     }
    168   }
    169 
    170   // Make offset adjustment.
    171   if (offsets_for_adjustment && !adjustments.empty()) {
    172     std::for_each(offsets_for_adjustment->begin(),
    173                    offsets_for_adjustment->end(),
    174                    AdjustEncodingOffset(adjustments));
    175   }
    176 
    177   return result;
    178 }
    179 
    180 template<typename STR>
    181 STR UnescapeURLImpl(const STR& escaped_text,
    182                     UnescapeRule::Type rules,
    183                     size_t* offset_for_adjustment) {
    184   std::vector<size_t> offsets;
    185   if (offset_for_adjustment)
    186     offsets.push_back(*offset_for_adjustment);
    187   STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets);
    188   if (offset_for_adjustment)
    189     *offset_for_adjustment = offsets[0];
    190   return result;
    191 }
    192 
    193 }  // namespace
    194 
    195 // Everything except alphanumerics and !'()*-._~
    196 // See RFC 2396 for the list of reserved characters.
    197 static const Charmap kQueryCharmap(
    198   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
    199   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    200 
    201 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
    202   return Escape(text, kQueryCharmap, use_plus);
    203 }
    204 
    205 // Convert the string to a sequence of bytes and then % escape anything
    206 // except alphanumerics and !'()*-._~
    207 string16 EscapeQueryParamValueUTF8(const string16& text,
    208                                    bool use_plus) {
    209   return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus));
    210 }
    211 
    212 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
    213 static const Charmap kPathCharmap(
    214   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
    215   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    216 
    217 std::string EscapePath(const std::string& path) {
    218   return Escape(path, kPathCharmap, false);
    219 }
    220 
    221 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
    222 static const Charmap kUrlEscape(
    223   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
    224   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
    225 );
    226 
    227 std::string EscapeUrlEncodedData(const std::string& path) {
    228   return Escape(path, kUrlEscape, true);
    229 }
    230 
    231 // non-7bit
    232 static const Charmap kNonASCIICharmap(
    233   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
    234   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    235 
    236 std::string EscapeNonASCII(const std::string& input) {
    237   return Escape(input, kNonASCIICharmap, false);
    238 }
    239 
    240 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
    241 // !'()*-._~%
    242 static const Charmap kExternalHandlerCharmap(
    243   0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
    244   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    245 
    246 std::string EscapeExternalHandlerValue(const std::string& text) {
    247   return Escape(text, kExternalHandlerCharmap, false);
    248 }
    249 
    250 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
    251     const std::string& text,
    252     UnescapeRule::Type rules,
    253     std::vector<size_t>* offsets_for_adjustment) {
    254   std::wstring result;
    255   std::vector<size_t> original_offsets;
    256   if (offsets_for_adjustment)
    257     original_offsets = *offsets_for_adjustment;
    258   std::string unescaped_url(
    259       UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
    260   if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(),
    261                                 &result, offsets_for_adjustment))
    262     return WideToUTF16Hack(result);      // Character set looks like it's valid.
    263 
    264   // Not valid.  Return the escaped version.  Undo our changes to
    265   // |offset_for_adjustment| since we haven't changed the string after all.
    266   if (offsets_for_adjustment)
    267     *offsets_for_adjustment = original_offsets;
    268   return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets(
    269       text, offsets_for_adjustment));
    270 }
    271 
    272 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
    273                                            UnescapeRule::Type rules,
    274                                            size_t* offset_for_adjustment) {
    275   std::vector<size_t> offsets;
    276   if (offset_for_adjustment)
    277     offsets.push_back(*offset_for_adjustment);
    278   string16 result =
    279       UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
    280   if (offset_for_adjustment)
    281     *offset_for_adjustment = offsets[0];
    282   return result;
    283 }
    284 
    285 std::string UnescapeURLComponent(const std::string& escaped_text,
    286                                  UnescapeRule::Type rules) {
    287   return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL);
    288 }
    289 
    290 string16 UnescapeURLComponent(const string16& escaped_text,
    291                               UnescapeRule::Type rules) {
    292   return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules, NULL);
    293 }
    294 
    295 
    296 template <class str>
    297 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
    298   static const struct {
    299     char key;
    300     const char* replacement;
    301   } kCharsToEscape[] = {
    302     { '<', "&lt;" },
    303     { '>', "&gt;" },
    304     { '&', "&amp;" },
    305     { '"', "&quot;" },
    306     { '\'', "&#39;" },
    307   };
    308   size_t k;
    309   for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) {
    310     if (c == kCharsToEscape[k].key) {
    311       const char* p = kCharsToEscape[k].replacement;
    312       while (*p)
    313         output->push_back(*p++);
    314       break;
    315     }
    316   }
    317   if (k == ARRAYSIZE_UNSAFE(kCharsToEscape))
    318     output->push_back(c);
    319 }
    320 
    321 void AppendEscapedCharForHTML(char c, std::string* output) {
    322   AppendEscapedCharForHTMLImpl(c, output);
    323 }
    324 
    325 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
    326   AppendEscapedCharForHTMLImpl(c, output);
    327 }
    328 
    329 template <class str>
    330 str EscapeForHTMLImpl(const str& input) {
    331   str result;
    332   result.reserve(input.size());  // optimize for no escaping
    333 
    334   for (typename str::const_iterator it = input.begin(); it != input.end(); ++it)
    335     AppendEscapedCharForHTMLImpl(*it, &result);
    336 
    337   return result;
    338 }
    339 
    340 std::string EscapeForHTML(const std::string& input) {
    341   return EscapeForHTMLImpl(input);
    342 }
    343 
    344 string16 EscapeForHTML(const string16& input) {
    345   return EscapeForHTMLImpl(input);
    346 }
    347 
    348 string16 UnescapeForHTML(const string16& input) {
    349   static const struct {
    350     const wchar_t* ampersand_code;
    351     const char replacement;
    352   } kEscapeToChars[] = {
    353     { L"&lt;", '<' },
    354     { L"&gt;", '>' },
    355     { L"&amp;", '&' },
    356     { L"&quot;", '"' },
    357     { L"&#39;", '\''},
    358   };
    359 
    360   if (input.find(WideToUTF16(L"&")) == std::string::npos)
    361     return input;
    362 
    363   string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
    364   string16 text(input);
    365   for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) {
    366     if (*iter == '&') {
    367       // Potential ampersand encode char.
    368       size_t index = iter - text.begin();
    369       for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
    370         if (ampersand_chars[i].empty())
    371           ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
    372         if (text.find(ampersand_chars[i], index) == index) {
    373           text.replace(iter, iter + ampersand_chars[i].length(),
    374                        1, kEscapeToChars[i].replacement);
    375           break;
    376         }
    377       }
    378     }
    379   }
    380   return text;
    381 }
    382 
    383 AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
    384   : adjustments(adjustments) {}
    385 
    386 void AdjustEncodingOffset::operator()(size_t& offset) {
    387   // For each encoded character occurring before an offset subtract 2.
    388   if (offset == string16::npos)
    389     return;
    390   size_t adjusted_offset = offset;
    391   for (Adjustments::const_iterator i = adjustments.begin();
    392        i != adjustments.end(); ++i) {
    393     size_t location = *i;
    394     if (offset <= location) {
    395       offset = adjusted_offset;
    396       return;
    397     }
    398     if (offset <= (location + 2)) {
    399       offset = string16::npos;
    400       return;
    401     }
    402     adjusted_offset -= 2;
    403   }
    404   offset = adjusted_offset;
    405 }
    406