Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 
      7 #include "net/base/escape.h"
      8 
      9 #include "base/i18n/icu_string_conversions.h"
     10 #include "base/logging.h"
     11 #include "base/string_piece.h"
     12 #include "base/utf_string_conversions.h"
     13 #include "base/utf_offset_string_conversions.h"
     14 
     15 namespace {
     16 
     17 template <class char_type>
     18 inline bool IsHex(char_type ch) {
     19   return (ch >= '0' && ch <= '9') ||
     20          (ch >= 'A' && ch <= 'F') ||
     21          (ch >= 'a' && ch <= 'f');
     22 }
     23 
     24 template <class char_type>
     25 inline char_type HexToInt(char_type ch) {
     26   if (ch >= '0' && ch <= '9')
     27     return ch - '0';
     28   if (ch >= 'A' && ch <= 'F')
     29     return ch - 'A' + 10;
     30   if (ch >= 'a' && ch <= 'f')
     31     return ch - 'a' + 10;
     32   NOTREACHED();
     33   return 0;
     34 }
     35 
     36 static const char* const kHexString = "0123456789ABCDEF";
     37 inline char IntToHex(int i) {
     38   DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
     39   return kHexString[i];
     40 }
     41 
     42 // A fast bit-vector map for ascii characters.
     43 //
     44 // Internally stores 256 bits in an array of 8 ints.
     45 // Does quick bit-flicking to lookup needed characters.
     46 class Charmap {
     47  public:
     48   Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
     49           uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
     50     map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
     51     map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
     52   }
     53 
     54   bool Contains(unsigned char c) const {
     55     return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
     56   }
     57 
     58  private:
     59   uint32 map_[8];
     60 };
     61 
     62 // Given text to escape and a Charmap defining which values to escape,
     63 // return an escaped string.  If use_plus is true, spaces are converted
     64 // to +, otherwise, if spaces are in the charmap, they are converted to
     65 // %20.
     66 const std::string Escape(const std::string& text, const Charmap& charmap,
     67                          bool use_plus) {
     68   std::string escaped;
     69   escaped.reserve(text.length() * 3);
     70   for (unsigned int i = 0; i < text.length(); ++i) {
     71     unsigned char c = static_cast<unsigned char>(text[i]);
     72     if (use_plus && ' ' == c) {
     73       escaped.push_back('+');
     74     } else if (charmap.Contains(c)) {
     75       escaped.push_back('%');
     76       escaped.push_back(IntToHex(c >> 4));
     77       escaped.push_back(IntToHex(c & 0xf));
     78     } else {
     79       escaped.push_back(c);
     80     }
     81   }
     82   return escaped;
     83 }
     84 
// Lookup table indexed by 7-bit ASCII code. An entry is nonzero when the
// corresponding character may always be unescaped in normal URLs. The
// characters with a zero entry are the ones that may change the parsing of a
// URL, so we only unescape them when the caller asks for it. In many cases we
// won't want to unescape spaces, but that is controlled by parameters to
// Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since they could be part of a
// query and unescaping them could change the server's parsing of the query.
const char kUrlUnescape[128] = {
//   NULL, control chars...
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
//   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
//   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
    110 
    111 template<typename STR>
    112 STR UnescapeURLImpl(const STR& escaped_text,
    113                     UnescapeRule::Type rules,
    114                     size_t* offset_for_adjustment) {
    115   size_t offset_temp = string16::npos;
    116   if (!offset_for_adjustment)
    117     offset_for_adjustment = &offset_temp;
    118   else if (*offset_for_adjustment >= escaped_text.length())
    119     *offset_for_adjustment = string16::npos;
    120 
    121   // Do not unescape anything, return the |escaped_text| text.
    122   if (rules == UnescapeRule::NONE)
    123     return escaped_text;
    124 
    125   // The output of the unescaping is always smaller than the input, so we can
    126   // reserve the input size to make sure we have enough buffer and don't have
    127   // to allocate in the loop below.
    128   STR result;
    129   result.reserve(escaped_text.length());
    130 
    131   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
    132     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
    133       // Non ASCII character, append as is.
    134       result.push_back(escaped_text[i]);
    135       continue;
    136     }
    137 
    138     char current_char = static_cast<char>(escaped_text[i]);
    139     if (current_char == '%' && i + 2 < max) {
    140       const typename STR::value_type most_sig_digit(
    141           static_cast<typename STR::value_type>(escaped_text[i + 1]));
    142       const typename STR::value_type least_sig_digit(
    143           static_cast<typename STR::value_type>(escaped_text[i + 2]));
    144       if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) {
    145         unsigned char value = HexToInt(most_sig_digit) * 16 +
    146             HexToInt(least_sig_digit);
    147         if (value >= 0x80 ||  // Unescape all high-bit characters.
    148             // For 7-bit characters, the lookup table tells us all valid chars.
    149             (kUrlUnescape[value] ||
    150              // ...and we allow some additional unescaping when flags are set.
    151              (value == ' ' && (rules & UnescapeRule::SPACES)) ||
    152              // Allow any of the prohibited but non-control characters when
    153              // we're doing "special" chars.
    154              (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
    155              // Additionally allow control characters if requested.
    156              (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
    157           // Use the unescaped version of the character.
    158           size_t length_before_append = result.length();
    159           result.push_back(value);
    160           i += 2;
    161 
    162           // Adjust offset to match length change.
    163           if (*offset_for_adjustment != std::string::npos) {
    164             if (*offset_for_adjustment > (length_before_append + 2))
    165               *offset_for_adjustment -= 2;
    166             else if (*offset_for_adjustment > length_before_append)
    167               *offset_for_adjustment = std::string::npos;
    168           }
    169         } else {
    170           // Keep escaped. Append a percent and we'll get the following two
    171           // digits on the next loops through.
    172           result.push_back('%');
    173         }
    174       } else {
    175         // Invalid escape sequence, just pass the percent through and continue
    176         // right after it.
    177         result.push_back('%');
    178       }
    179     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
    180                escaped_text[i] == '+') {
    181       result.push_back(' ');
    182     } else {
    183       // Normal case for unescaped characters.
    184       result.push_back(escaped_text[i]);
    185     }
    186   }
    187 
    188   return result;
    189 }
    190 
    191 }  // namespace
    192 
    193 // Everything except alphanumerics and !'()*-._~
    194 // See RFC 2396 for the list of reserved characters.
    195 static const Charmap kQueryCharmap(
    196   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
    197   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    198 
    199 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
    200   return Escape(text, kQueryCharmap, use_plus);
    201 }
    202 
    203 // Convert the string to a sequence of bytes and then % escape anything
    204 // except alphanumerics and !'()*-._~
    205 std::wstring EscapeQueryParamValueUTF8(const std::wstring& text,
    206                                        bool use_plus) {
    207   return UTF8ToWide(Escape(WideToUTF8(text), kQueryCharmap, use_plus));
    208 }
    209 
    210 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
    211 static const Charmap kPathCharmap(
    212   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
    213   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    214 
    215 std::string EscapePath(const std::string& path) {
    216   return Escape(path, kPathCharmap, false);
    217 }
    218 
    219 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
    220 static const Charmap kUrlEscape(
    221   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
    222   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
    223 );
    224 
    225 std::string EscapeUrlEncodedData(const std::string& path) {
    226   return Escape(path, kUrlEscape, true);
    227 }
    228 
    229 // non-7bit
    230 static const Charmap kNonASCIICharmap(
    231   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
    232   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    233 
    234 std::string EscapeNonASCII(const std::string& input) {
    235   return Escape(input, kNonASCIICharmap, false);
    236 }
    237 
    238 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
    239 // !'()*-._~%
    240 static const Charmap kExternalHandlerCharmap(
    241   0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
    242   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
    243 
    244 std::string EscapeExternalHandlerValue(const std::string& text) {
    245   return Escape(text, kExternalHandlerCharmap, false);
    246 }
    247 
    248 bool EscapeQueryParamValue(const string16& text, const char* codepage,
    249                            bool use_plus, string16* escaped) {
    250   // TODO(brettw) bug 1201094: this function should be removed, this "SKIP"
    251   // behavior is wrong when the character can't be encoded properly.
    252   std::string encoded;
    253   if (!base::UTF16ToCodepage(text, codepage,
    254                              base::OnStringConversionError::SKIP, &encoded))
    255     return false;
    256 
    257   escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus)));
    258   return true;
    259 }
    260 
    261 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
    262                                            UnescapeRule::Type rules,
    263                                            size_t* offset_for_adjustment) {
    264   std::wstring result;
    265   size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0;
    266   std::string unescaped_url(
    267       UnescapeURLImpl(text, rules, offset_for_adjustment));
    268   if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(),
    269                                 &result, offset_for_adjustment))
    270     return WideToUTF16Hack(result);      // Character set looks like it's valid.
    271 
    272   // Not valid.  Return the escaped version.  Undo our changes to
    273   // |offset_for_adjustment| since we haven't changed the string after all.
    274   if (offset_for_adjustment)
    275     *offset_for_adjustment = original_offset;
    276   return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text,
    277                                                    offset_for_adjustment));
    278 }
    279 
    280 std::string UnescapeURLComponent(const std::string& escaped_text,
    281                                  UnescapeRule::Type rules) {
    282   return UnescapeURLImpl(escaped_text, rules, NULL);
    283 }
    284 
    285 string16 UnescapeURLComponent(const string16& escaped_text,
    286                               UnescapeRule::Type rules) {
    287   return UnescapeURLImpl(escaped_text, rules, NULL);
    288 }
    289 
    290 
    291 template <class str>
    292 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
    293   static const struct {
    294     char key;
    295     const char* replacement;
    296   } kCharsToEscape[] = {
    297     { '<', "&lt;" },
    298     { '>', "&gt;" },
    299     { '&', "&amp;" },
    300     { '"', "&quot;" },
    301     { '\'', "&#39;" },
    302   };
    303   size_t k;
    304   for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) {
    305     if (c == kCharsToEscape[k].key) {
    306       const char* p = kCharsToEscape[k].replacement;
    307       while (*p)
    308         output->push_back(*p++);
    309       break;
    310     }
    311   }
    312   if (k == ARRAYSIZE_UNSAFE(kCharsToEscape))
    313     output->push_back(c);
    314 }
    315 
    316 void AppendEscapedCharForHTML(char c, std::string* output) {
    317   AppendEscapedCharForHTMLImpl(c, output);
    318 }
    319 
    320 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
    321   AppendEscapedCharForHTMLImpl(c, output);
    322 }
    323 
    324 template <class str>
    325 str EscapeForHTMLImpl(const str& input) {
    326   str result;
    327   result.reserve(input.size());  // optimize for no escaping
    328 
    329   for (typename str::const_iterator it = input.begin(); it != input.end(); ++it)
    330     AppendEscapedCharForHTMLImpl(*it, &result);
    331 
    332   return result;
    333 }
    334 
    335 std::string EscapeForHTML(const std::string& input) {
    336   return EscapeForHTMLImpl(input);
    337 }
    338 
    339 string16 EscapeForHTML(const string16& input) {
    340   return EscapeForHTMLImpl(input);
    341 }
    342 
    343 string16 UnescapeForHTML(const string16& input) {
    344   static const struct {
    345     const wchar_t* ampersand_code;
    346     const char replacement;
    347   } kEscapeToChars[] = {
    348     { L"&lt;", '<' },
    349     { L"&gt;", '>' },
    350     { L"&amp;", '&' },
    351     { L"&quot;", '"' },
    352     { L"&#39;", '\''},
    353   };
    354 
    355   if (input.find(WideToUTF16(L"&")) == std::string::npos)
    356     return input;
    357 
    358   string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
    359   string16 text(input);
    360   for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) {
    361     if (*iter == '&') {
    362       // Potential ampersand encode char.
    363       size_t index = iter - text.begin();
    364       for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
    365         if (ampersand_chars[i].empty())
    366           ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
    367         if (text.find(ampersand_chars[i], index) == index) {
    368           text.replace(iter, iter + ampersand_chars[i].length(),
    369                        1, kEscapeToChars[i].replacement);
    370           break;
    371         }
    372       }
    373     }
    374   }
    375   return text;
    376 }
    377