Home | History | Annotate | Download | only in base
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/base/escape.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/logging.h"
     10 #include "base/memory/scoped_ptr.h"
     11 #include "base/strings/string_piece.h"
     12 #include "base/strings/string_util.h"
     13 #include "base/strings/utf_offset_string_conversions.h"
     14 #include "base/strings/utf_string_conversions.h"
     15 
     16 namespace net {
     17 
     18 namespace {
     19 
// Uppercase hexadecimal digits, indexed by nibble value.
const char kHexString[] = "0123456789ABCDEF";

// Returns the uppercase hexadecimal digit for |i|. |i| must be in [0, 15];
// the DCHECKs below enforce that range.
inline char IntToHex(int i) {
  DCHECK_GE(i, 0) << i << " not a hex value";
  DCHECK_LE(i, 15) << i << " not a hex value";
  return kHexString[i];
}
     26 
     27 // A fast bit-vector map for ascii characters.
     28 //
     29 // Internally stores 256 bits in an array of 8 ints.
     30 // Does quick bit-flicking to lookup needed characters.
     31 struct Charmap {
     32   bool Contains(unsigned char c) const {
     33     return ((map[c >> 5] & (1 << (c & 31))) != 0);
     34   }
     35 
     36   uint32 map[8];
     37 };
     38 
     39 // Given text to escape and a Charmap defining which values to escape,
     40 // return an escaped string.  If use_plus is true, spaces are converted
     41 // to +, otherwise, if spaces are in the charmap, they are converted to
     42 // %20.
     43 std::string Escape(const std::string& text, const Charmap& charmap,
     44                    bool use_plus) {
     45   std::string escaped;
     46   escaped.reserve(text.length() * 3);
     47   for (unsigned int i = 0; i < text.length(); ++i) {
     48     unsigned char c = static_cast<unsigned char>(text[i]);
     49     if (use_plus && ' ' == c) {
     50       escaped.push_back('+');
     51     } else if (charmap.Contains(c)) {
     52       escaped.push_back('%');
     53       escaped.push_back(IntToHex(c >> 4));
     54       escaped.push_back(IntToHex(c & 0xf));
     55     } else {
     56       escaped.push_back(c);
     57     }
     58   }
     59   return escaped;
     60 }
     61 
// Contains nonzero when the corresponding character is unescapable for normal
// URLs, i.e. its %XX form is unescaped regardless of the UnescapeRule flags.
// The characters marked zero are the ones that may change the parsing of a
// URL, so we don't always want to unescape them. In many cases we won't want
// to unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc.  Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
//
// The table is indexed by 7-bit character code; each row below covers 16
// consecutive codes, labeled by the comment line above it.
const char kUrlUnescape[128] = {
//   NULL, control chars...
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
//   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
//   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
//   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
};
     99 
    100 template<typename STR>
    101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
    102                                UnescapeRule::Type rules,
    103                                std::vector<size_t>* offsets_for_adjustment) {
    104   if (offsets_for_adjustment) {
    105     std::for_each(offsets_for_adjustment->begin(),
    106                   offsets_for_adjustment->end(),
    107                   base::LimitOffset<STR>(escaped_text.length()));
    108   }
    109   // Do not unescape anything, return the |escaped_text| text.
    110   if (rules == UnescapeRule::NONE)
    111     return escaped_text;
    112 
    113   // The output of the unescaping is always smaller than the input, so we can
    114   // reserve the input size to make sure we have enough buffer and don't have
    115   // to allocate in the loop below.
    116   STR result;
    117   result.reserve(escaped_text.length());
    118 
    119   // Locations of adjusted text.
    120   net::internal::AdjustEncodingOffset::Adjustments adjustments;
    121   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
    122     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
    123       // Non ASCII character, append as is.
    124       result.push_back(escaped_text[i]);
    125       continue;
    126     }
    127 
    128     char current_char = static_cast<char>(escaped_text[i]);
    129     if (current_char == '%' && i + 2 < max) {
    130       const typename STR::value_type most_sig_digit(
    131           static_cast<typename STR::value_type>(escaped_text[i + 1]));
    132       const typename STR::value_type least_sig_digit(
    133           static_cast<typename STR::value_type>(escaped_text[i + 2]));
    134       if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
    135         unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
    136             HexDigitToInt(least_sig_digit);
    137         if (value >= 0x80 ||  // Unescape all high-bit characters.
    138             // For 7-bit characters, the lookup table tells us all valid chars.
    139             (kUrlUnescape[value] ||
    140              // ...and we allow some additional unescaping when flags are set.
    141              (value == ' ' && (rules & UnescapeRule::SPACES)) ||
    142              // Allow any of the prohibited but non-control characters when
    143              // we're doing "special" chars.
    144              (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
    145              // Additionally allow control characters if requested.
    146              (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
    147           // Use the unescaped version of the character.
    148           adjustments.push_back(i);
    149           result.push_back(value);
    150           i += 2;
    151         } else {
    152           // Keep escaped. Append a percent and we'll get the following two
    153           // digits on the next loops through.
    154           result.push_back('%');
    155         }
    156       } else {
    157         // Invalid escape sequence, just pass the percent through and continue
    158         // right after it.
    159         result.push_back('%');
    160       }
    161     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
    162                escaped_text[i] == '+') {
    163       result.push_back(' ');
    164     } else {
    165       // Normal case for unescaped characters.
    166       result.push_back(escaped_text[i]);
    167     }
    168   }
    169 
    170   // Make offset adjustment.
    171   if (offsets_for_adjustment && !adjustments.empty()) {
    172     std::for_each(offsets_for_adjustment->begin(),
    173                    offsets_for_adjustment->end(),
    174                    net::internal::AdjustEncodingOffset(adjustments));
    175   }
    176 
    177   return result;
    178 }
    179 
    180 template <class str>
    181 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
    182   static const struct {
    183     char key;
    184     const char* replacement;
    185   } kCharsToEscape[] = {
    186     { '<', "&lt;" },
    187     { '>', "&gt;" },
    188     { '&', "&amp;" },
    189     { '"', "&quot;" },
    190     { '\'', "&#39;" },
    191   };
    192   size_t k;
    193   for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) {
    194     if (c == kCharsToEscape[k].key) {
    195       const char* p = kCharsToEscape[k].replacement;
    196       while (*p)
    197         output->push_back(*p++);
    198       break;
    199     }
    200   }
    201   if (k == ARRAYSIZE_UNSAFE(kCharsToEscape))
    202     output->push_back(c);
    203 }
    204 
    205 template <class str>
    206 str EscapeForHTMLImpl(const str& input) {
    207   str result;
    208   result.reserve(input.size());  // Optimize for no escaping.
    209 
    210   for (typename str::const_iterator i = input.begin(); i != input.end(); ++i)
    211     AppendEscapedCharForHTMLImpl(*i, &result);
    212 
    213   return result;
    214 }
    215 
// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
//
// Layout (as read by Charmap::Contains): word k covers character codes
// 32*k .. 32*k+31, least-significant bit first. Words 0 and 4-7 are all ones,
// so control characters and every byte >= 0x80 are escaped.
static const Charmap kQueryCharmap = {{
  0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
    222 
// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
// "non-printable" here covers the control characters 0x00-0x1F and 0x7F.
static const Charmap kPathCharmap = {{
  0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
    228 
// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"@[\]^`{|}
// NOTE(review): this list is derived from the bit pattern below. Unlike what
// an earlier version of this comment stated, '!' (bit 1 of the second word)
// is NOT escaped, while '@' (bit 0 of the third word) IS escaped.
static const Charmap kUrlEscape = {{
  0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
    234 
// non-7bit: only bytes 0x80-0xFF are escaped (words 4-7 are all ones, words
// 0-3 are all zeros).
static const Charmap kNonASCIICharmap = {{
  0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
    240 
// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~%
// NOTE(review): the bit pattern below does not match that description in two
// places -- confirm which one is intended before relying on either:
//   - '+' (bit 11 of the second word) is set, so '+' IS escaped;
//   - '\' (bit 28 of the third word) is clear, so '\' is NOT escaped.
// The 7-bit characters actually escaped are the controls 0x00-0x1F, space,
// " # + < > [ ] ^ ` { | } and 0x7F.
static const Charmap kExternalHandlerCharmap = {{
  0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
    247 
    248 }  // namespace
    249 
// Escapes |text| with kQueryCharmap, i.e. everything except alphanumerics and
// !'()*-._~ is percent-escaped. When |use_plus| is true, spaces are written
// as '+' instead.
std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
  return Escape(text, kQueryCharmap, use_plus);
}
    253 
// Escapes |path| with kPathCharmap. Spaces are never turned into '+'; they are
// in the charmap and so come out as %20.
std::string EscapePath(const std::string& path) {
  return Escape(path, kPathCharmap, false);
}
    257 
// Escapes |path| with kUrlEscape. When |use_plus| is true, spaces are written
// as '+' instead of being percent-escaped.
std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
  return Escape(path, kUrlEscape, use_plus);
}
    261 
// Percent-escapes only the bytes of |input| that are >= 0x80
// (kNonASCIICharmap); all 7-bit characters are copied through unchanged.
std::string EscapeNonASCII(const std::string& input) {
  return Escape(input, kNonASCIICharmap, false);
}
    265 
// Escapes |text| with kExternalHandlerCharmap. Spaces are never turned into
// '+'; they are in the charmap and so come out as %20.
std::string EscapeExternalHandlerValue(const std::string& text) {
  return Escape(text, kExternalHandlerCharmap, false);
}
    269 
// Appends |c| to |output|, replacing < > & " and ' with their HTML entities.
// Thin wrapper over the templated implementation for std::string.
void AppendEscapedCharForHTML(char c, std::string* output) {
  AppendEscapedCharForHTMLImpl(c, output);
}
    273 
// Returns |input| with < > & " and ' replaced by their HTML entities
// (8-bit string overload).
std::string EscapeForHTML(const std::string& input) {
  return EscapeForHTMLImpl(input);
}
    277 
// Returns |input| with < > & " and ' replaced by their HTML entities
// (base::string16 overload).
base::string16 EscapeForHTML(const base::string16& input) {
  return EscapeForHTMLImpl(input);
}
    281 
// Unescapes |escaped_text| according to |rules| (a bitmask of UnescapeRule
// values) and returns the result. No offsets are tracked.
std::string UnescapeURLComponent(const std::string& escaped_text,
                                 UnescapeRule::Type rules) {
  return UnescapeURLWithOffsetsImpl(escaped_text, rules, NULL);
}
    286 
// base::string16 overload: unescapes |escaped_text| according to |rules| (a
// bitmask of UnescapeRule values) and returns the result. No offsets are
// tracked.
base::string16 UnescapeURLComponent(const base::string16& escaped_text,
                                    UnescapeRule::Type rules) {
  return UnescapeURLWithOffsetsImpl(escaped_text, rules, NULL);
}
    291 
    292 base::string16 UnescapeAndDecodeUTF8URLComponent(
    293     const std::string& text,
    294     UnescapeRule::Type rules,
    295     size_t* offset_for_adjustment) {
    296   std::vector<size_t> offsets;
    297   if (offset_for_adjustment)
    298     offsets.push_back(*offset_for_adjustment);
    299   base::string16 result =
    300       UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
    301   if (offset_for_adjustment)
    302     *offset_for_adjustment = offsets[0];
    303   return result;
    304 }
    305 
    306 base::string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
    307     const std::string& text,
    308     UnescapeRule::Type rules,
    309     std::vector<size_t>* offsets_for_adjustment) {
    310   base::string16 result;
    311   std::vector<size_t> original_offsets;
    312   if (offsets_for_adjustment)
    313     original_offsets = *offsets_for_adjustment;
    314   std::string unescaped_url(
    315       UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
    316   if (base::UTF8ToUTF16AndAdjustOffsets(unescaped_url.data(),
    317                                         unescaped_url.length(),
    318                                         &result, offsets_for_adjustment))
    319     return result;  // Character set looks like it's valid.
    320 
    321   // Not valid.  Return the escaped version.  Undo our changes to
    322   // |offset_for_adjustment| since we haven't changed the string after all.
    323   if (offsets_for_adjustment)
    324     *offsets_for_adjustment = original_offsets;
    325   return base::UTF8ToUTF16AndAdjustOffsets(text, offsets_for_adjustment);
    326 }
    327 
    328 base::string16 UnescapeForHTML(const base::string16& input) {
    329   static const struct {
    330     const char* ampersand_code;
    331     const char replacement;
    332   } kEscapeToChars[] = {
    333     { "&lt;", '<' },
    334     { "&gt;", '>' },
    335     { "&amp;", '&' },
    336     { "&quot;", '"' },
    337     { "&#39;", '\''},
    338   };
    339 
    340   if (input.find(ASCIIToUTF16("&")) == std::string::npos)
    341     return input;
    342 
    343   base::string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
    344   base::string16 text(input);
    345   for (base::string16::iterator iter = text.begin();
    346        iter != text.end(); ++iter) {
    347     if (*iter == '&') {
    348       // Potential ampersand encode char.
    349       size_t index = iter - text.begin();
    350       for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
    351         if (ampersand_chars[i].empty())
    352           ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
    353         if (text.find(ampersand_chars[i], index) == index) {
    354           text.replace(iter, iter + ampersand_chars[i].length(),
    355                        1, kEscapeToChars[i].replacement);
    356           break;
    357         }
    358       }
    359     }
    360   }
    361   return text;
    362 }
    363 
    364 namespace internal {
    365 
// Initializes the |adjustments| member from the argument of the same name.
// NOTE(review): whether the member holds a copy or a reference is determined
// by the class declaration in the header -- confirm the argument's required
// lifetime there.
AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
  : adjustments(adjustments) {}
    368 
    369 void AdjustEncodingOffset::operator()(size_t& offset) {
    370   // For each encoded character occurring before an offset subtract 2.
    371   if (offset == base::string16::npos)
    372     return;
    373   size_t adjusted_offset = offset;
    374   for (Adjustments::const_iterator i = adjustments.begin();
    375        i != adjustments.end(); ++i) {
    376     size_t location = *i;
    377     if (offset <= location) {
    378       offset = adjusted_offset;
    379       return;
    380     }
    381     if (offset <= (location + 2)) {
    382       offset = base::string16::npos;
    383       return;
    384     }
    385     adjusted_offset -= 2;
    386   }
    387   offset = adjusted_offset;
    388 }
    389 
    390 }  // namespace internal
    391 
    392 }  // namespace net
    393