1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/base/escape.h" 6 7 #include <algorithm> 8 9 #include "base/logging.h" 10 #include "base/scoped_ptr.h" 11 #include "base/string_piece.h" 12 #include "base/string_util.h" 13 #include "base/utf_string_conversions.h" 14 #include "base/utf_offset_string_conversions.h" 15 16 namespace { 17 18 static const char* const kHexString = "0123456789ABCDEF"; 19 inline char IntToHex(int i) { 20 DCHECK(i >= 0 && i <= 15) << i << " not a hex value"; 21 return kHexString[i]; 22 } 23 24 // A fast bit-vector map for ascii characters. 25 // 26 // Internally stores 256 bits in an array of 8 ints. 27 // Does quick bit-flicking to lookup needed characters. 28 class Charmap { 29 public: 30 Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3, 31 uint32 b4, uint32 b5, uint32 b6, uint32 b7) { 32 map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3; 33 map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7; 34 } 35 36 bool Contains(unsigned char c) const { 37 return (map_[c >> 5] & (1 << (c & 31))) ? true : false; 38 } 39 40 private: 41 uint32 map_[8]; 42 }; 43 44 // Given text to escape and a Charmap defining which values to escape, 45 // return an escaped string. If use_plus is true, spaces are converted 46 // to +, otherwise, if spaces are in the charmap, they are converted to 47 // %20. 48 std::string Escape(const std::string& text, const Charmap& charmap, 49 bool use_plus) { 50 std::string escaped; 51 escaped.reserve(text.length() * 3); 52 for (unsigned int i = 0; i < text.length(); ++i) { 53 unsigned char c = static_cast<unsigned char>(text[i]); 54 if (use_plus && ' ' == c) { 55 escaped.push_back('+'); 56 } else if (charmap.Contains(c)) { 57 escaped.push_back('%'); 58 escaped.push_back(IntToHex(c >> 4)); 59 escaped.push_back(IntToHex(c & 0xf)); 60 } else { 61 escaped.push_back(c); 62 } 63 } 64 return escaped; 65 } 66 67 // Contains nonzero when the corresponding character is unescapable for normal 68 // URLs. These characters are the ones that may change the parsing of a URL, so 69 // we don't want to unescape them sometimes. In many case we won't want to 70 // unescape spaces, but that is controlled by parameters to Unescape*. 71 // 72 // The basic rule is that we can't unescape anything that would changing parsing 73 // like # or ?. We also can't unescape &, =, or + since that could be part of a 74 // query and that could change the server's parsing of the query. Nor can we 75 // unescape \ since googleurl will convert it to a /. 76 // 77 // Lastly, we can't unescape anything that doesn't have a canonical 78 // representation in a URL. This means that unescaping will change the URL, and 79 // you could get different behavior if you copy and paste the URL, or press 80 // enter in the URL bar. The list of characters that fall into this category 81 // are the ones labeled PASS (allow either escaped or unescaped) in the big 82 // lookup table at the top of googleurl/src/url_canon_path.cc 83 const char kUrlUnescape[128] = { 84 // NULL, control chars... 85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87 // ' ' ! " # $ % & ' ( ) * + , - . / 88 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 89 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 91 // @ A B C D E F G H I J K L M N O 92 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 93 // P Q R S T U V W X Y Z [ \ ] ^ _ 94 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 95 // ` a b c d e f g h i j k l m n o 96 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 97 // p q r s t u v w x y z { | } ~ <NBSP> 98 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 99 }; 100 101 template<typename STR> 102 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, 103 UnescapeRule::Type rules, 104 std::vector<size_t>* offsets_for_adjustment) { 105 if (offsets_for_adjustment) { 106 std::for_each(offsets_for_adjustment->begin(), 107 offsets_for_adjustment->end(), 108 LimitOffset<std::wstring>(escaped_text.length())); 109 } 110 // Do not unescape anything, return the |escaped_text| text. 111 if (rules == UnescapeRule::NONE) 112 return escaped_text; 113 114 // The output of the unescaping is always smaller than the input, so we can 115 // reserve the input size to make sure we have enough buffer and don't have 116 // to allocate in the loop below. 117 STR result; 118 result.reserve(escaped_text.length()); 119 120 AdjustEncodingOffset::Adjustments adjustments; // Locations of adjusted text. 121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { 122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { 123 // Non ASCII character, append as is. 124 result.push_back(escaped_text[i]); 125 continue; 126 } 127 128 char current_char = static_cast<char>(escaped_text[i]); 129 if (current_char == '%' && i + 2 < max) { 130 const typename STR::value_type most_sig_digit( 131 static_cast<typename STR::value_type>(escaped_text[i + 1])); 132 const typename STR::value_type least_sig_digit( 133 static_cast<typename STR::value_type>(escaped_text[i + 2])); 134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { 135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 + 136 HexDigitToInt(least_sig_digit); 137 if (value >= 0x80 || // Unescape all high-bit characters. 138 // For 7-bit characters, the lookup table tells us all valid chars. 139 (kUrlUnescape[value] || 140 // ...and we allow some additional unescaping when flags are set. 141 (value == ' ' && (rules & UnescapeRule::SPACES)) || 142 // Allow any of the prohibited but non-control characters when 143 // we're doing "special" chars. 144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || 145 // Additionally allow control characters if requested. 146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { 147 // Use the unescaped version of the character. 148 adjustments.push_back(i); 149 result.push_back(value); 150 i += 2; 151 } else { 152 // Keep escaped. Append a percent and we'll get the following two 153 // digits on the next loops through. 154 result.push_back('%'); 155 } 156 } else { 157 // Invalid escape sequence, just pass the percent through and continue 158 // right after it. 159 result.push_back('%'); 160 } 161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && 162 escaped_text[i] == '+') { 163 result.push_back(' '); 164 } else { 165 // Normal case for unescaped characters. 166 result.push_back(escaped_text[i]); 167 } 168 } 169 170 // Make offset adjustment. 171 if (offsets_for_adjustment && !adjustments.empty()) { 172 std::for_each(offsets_for_adjustment->begin(), 173 offsets_for_adjustment->end(), 174 AdjustEncodingOffset(adjustments)); 175 } 176 177 return result; 178 } 179 180 template<typename STR> 181 STR UnescapeURLImpl(const STR& escaped_text, 182 UnescapeRule::Type rules, 183 size_t* offset_for_adjustment) { 184 std::vector<size_t> offsets; 185 if (offset_for_adjustment) 186 offsets.push_back(*offset_for_adjustment); 187 STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets); 188 if (offset_for_adjustment) 189 *offset_for_adjustment = offsets[0]; 190 return result; 191 } 192 193 } // namespace 194 195 // Everything except alphanumerics and !'()*-._~ 196 // See RFC 2396 for the list of reserved characters. 197 static const Charmap kQueryCharmap( 198 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, 199 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 200 201 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { 202 return Escape(text, kQueryCharmap, use_plus); 203 } 204 205 // Convert the string to a sequence of bytes and then % escape anything 206 // except alphanumerics and !'()*-._~ 207 string16 EscapeQueryParamValueUTF8(const string16& text, 208 bool use_plus) { 209 return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus)); 210 } 211 212 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} 213 static const Charmap kPathCharmap( 214 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, 215 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 216 217 std::string EscapePath(const std::string& path) { 218 return Escape(path, kPathCharmap, false); 219 } 220 221 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} 222 static const Charmap kUrlEscape( 223 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, 224 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL 225 ); 226 227 std::string EscapeUrlEncodedData(const std::string& path) { 228 return Escape(path, kUrlEscape, true); 229 } 230 231 // non-7bit 232 static const Charmap kNonASCIICharmap( 233 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, 234 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 235 236 std::string EscapeNonASCII(const std::string& input) { 237 return Escape(input, kNonASCIICharmap, false); 238 } 239 240 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and 241 // !'()*-._~% 242 static const Charmap kExternalHandlerCharmap( 243 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L, 244 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 245 246 std::string EscapeExternalHandlerValue(const std::string& text) { 247 return Escape(text, kExternalHandlerCharmap, false); 248 } 249 250 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets( 251 const std::string& text, 252 UnescapeRule::Type rules, 253 std::vector<size_t>* offsets_for_adjustment) { 254 std::wstring result; 255 std::vector<size_t> original_offsets; 256 if (offsets_for_adjustment) 257 original_offsets = *offsets_for_adjustment; 258 std::string unescaped_url( 259 UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment)); 260 if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(), 261 &result, offsets_for_adjustment)) 262 return WideToUTF16Hack(result); // Character set looks like it's valid. 263 264 // Not valid. Return the escaped version. Undo our changes to 265 // |offset_for_adjustment| since we haven't changed the string after all. 266 if (offsets_for_adjustment) 267 *offsets_for_adjustment = original_offsets; 268 return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets( 269 text, offsets_for_adjustment)); 270 } 271 272 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, 273 UnescapeRule::Type rules, 274 size_t* offset_for_adjustment) { 275 std::vector<size_t> offsets; 276 if (offset_for_adjustment) 277 offsets.push_back(*offset_for_adjustment); 278 string16 result = 279 UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets); 280 if (offset_for_adjustment) 281 *offset_for_adjustment = offsets[0]; 282 return result; 283 } 284 285 std::string UnescapeURLComponent(const std::string& escaped_text, 286 UnescapeRule::Type rules) { 287 return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL); 288 } 289 290 string16 UnescapeURLComponent(const string16& escaped_text, 291 UnescapeRule::Type rules) { 292 return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules, NULL); 293 } 294 295 296 template <class str> 297 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { 298 static const struct { 299 char key; 300 const char* replacement; 301 } kCharsToEscape[] = { 302 { '<', "<" }, 303 { '>', ">" }, 304 { '&', "&" }, 305 { '"', """ }, 306 { '\'', "'" }, 307 }; 308 size_t k; 309 for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) { 310 if (c == kCharsToEscape[k].key) { 311 const char* p = kCharsToEscape[k].replacement; 312 while (*p) 313 output->push_back(*p++); 314 break; 315 } 316 } 317 if (k == ARRAYSIZE_UNSAFE(kCharsToEscape)) 318 output->push_back(c); 319 } 320 321 void AppendEscapedCharForHTML(char c, std::string* output) { 322 AppendEscapedCharForHTMLImpl(c, output); 323 } 324 325 void AppendEscapedCharForHTML(wchar_t c, string16* output) { 326 AppendEscapedCharForHTMLImpl(c, output); 327 } 328 329 template <class str> 330 str EscapeForHTMLImpl(const str& input) { 331 str result; 332 result.reserve(input.size()); // optimize for no escaping 333 334 for (typename str::const_iterator it = input.begin(); it != input.end(); ++it) 335 AppendEscapedCharForHTMLImpl(*it, &result); 336 337 return result; 338 } 339 340 std::string EscapeForHTML(const std::string& input) { 341 return EscapeForHTMLImpl(input); 342 } 343 344 string16 EscapeForHTML(const string16& input) { 345 return EscapeForHTMLImpl(input); 346 } 347 348 string16 UnescapeForHTML(const string16& input) { 349 static const struct { 350 const wchar_t* ampersand_code; 351 const char replacement; 352 } kEscapeToChars[] = { 353 { L"<", '<' }, 354 { L">", '>' }, 355 { L"&", '&' }, 356 { L""", '"' }, 357 { L"'", '\''}, 358 }; 359 360 if (input.find(WideToUTF16(L"&")) == std::string::npos) 361 return input; 362 363 string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; 364 string16 text(input); 365 for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) { 366 if (*iter == '&') { 367 // Potential ampersand encode char. 368 size_t index = iter - text.begin(); 369 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { 370 if (ampersand_chars[i].empty()) 371 ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code); 372 if (text.find(ampersand_chars[i], index) == index) { 373 text.replace(iter, iter + ampersand_chars[i].length(), 374 1, kEscapeToChars[i].replacement); 375 break; 376 } 377 } 378 } 379 } 380 return text; 381 } 382 383 AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments) 384 : adjustments(adjustments) {} 385 386 void AdjustEncodingOffset::operator()(size_t& offset) { 387 // For each encoded character occurring before an offset subtract 2. 388 if (offset == string16::npos) 389 return; 390 size_t adjusted_offset = offset; 391 for (Adjustments::const_iterator i = adjustments.begin(); 392 i != adjustments.end(); ++i) { 393 size_t location = *i; 394 if (offset <= location) { 395 offset = adjusted_offset; 396 return; 397 } 398 if (offset <= (location + 2)) { 399 offset = string16::npos; 400 return; 401 } 402 adjusted_offset -= 2; 403 } 404 offset = adjusted_offset; 405 } 406