1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <algorithm> 6 7 #include "net/base/escape.h" 8 9 #include "base/i18n/icu_string_conversions.h" 10 #include "base/logging.h" 11 #include "base/string_piece.h" 12 #include "base/utf_string_conversions.h" 13 #include "base/utf_offset_string_conversions.h" 14 15 namespace { 16 17 template <class char_type> 18 inline bool IsHex(char_type ch) { 19 return (ch >= '0' && ch <= '9') || 20 (ch >= 'A' && ch <= 'F') || 21 (ch >= 'a' && ch <= 'f'); 22 } 23 24 template <class char_type> 25 inline char_type HexToInt(char_type ch) { 26 if (ch >= '0' && ch <= '9') 27 return ch - '0'; 28 if (ch >= 'A' && ch <= 'F') 29 return ch - 'A' + 10; 30 if (ch >= 'a' && ch <= 'f') 31 return ch - 'a' + 10; 32 NOTREACHED(); 33 return 0; 34 } 35 36 static const char* const kHexString = "0123456789ABCDEF"; 37 inline char IntToHex(int i) { 38 DCHECK(i >= 0 && i <= 15) << i << " not a hex value"; 39 return kHexString[i]; 40 } 41 42 // A fast bit-vector map for ascii characters. 43 // 44 // Internally stores 256 bits in an array of 8 ints. 45 // Does quick bit-flicking to lookup needed characters. 46 class Charmap { 47 public: 48 Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3, 49 uint32 b4, uint32 b5, uint32 b6, uint32 b7) { 50 map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3; 51 map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7; 52 } 53 54 bool Contains(unsigned char c) const { 55 return (map_[c >> 5] & (1 << (c & 31))) ? true : false; 56 } 57 58 private: 59 uint32 map_[8]; 60 }; 61 62 // Given text to escape and a Charmap defining which values to escape, 63 // return an escaped string. If use_plus is true, spaces are converted 64 // to +, otherwise, if spaces are in the charmap, they are converted to 65 // %20. 66 const std::string Escape(const std::string& text, const Charmap& charmap, 67 bool use_plus) { 68 std::string escaped; 69 escaped.reserve(text.length() * 3); 70 for (unsigned int i = 0; i < text.length(); ++i) { 71 unsigned char c = static_cast<unsigned char>(text[i]); 72 if (use_plus && ' ' == c) { 73 escaped.push_back('+'); 74 } else if (charmap.Contains(c)) { 75 escaped.push_back('%'); 76 escaped.push_back(IntToHex(c >> 4)); 77 escaped.push_back(IntToHex(c & 0xf)); 78 } else { 79 escaped.push_back(c); 80 } 81 } 82 return escaped; 83 } 84 85 // Contains nonzero when the corresponding character is unescapable for normal 86 // URLs. These characters are the ones that may change the parsing of a URL, so 87 // we don't want to unescape them sometimes. In many case we won't want to 88 // unescape spaces, but that is controlled by parameters to Unescape*. 89 // 90 // The basic rule is that we can't unescape anything that would changing parsing 91 // like # or ?. We also can't unescape &, =, or + since that could be part of a 92 // query and that could change the server's parsing of the query. 93 const char kUrlUnescape[128] = { 94 // NULL, control chars... 95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97 // ' ' ! " # $ % & ' ( ) * + , - . / 98 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 99 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 100 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 101 // @ A B C D E F G H I J K L M N O 102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 103 // P Q R S T U V W X Y Z [ \ ] ^ _ 104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 105 // ` a b c d e f g h i j k l m n o 106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 107 // p q r s t u v w x y z { | } ~ <NBSP> 108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 109 }; 110 111 template<typename STR> 112 STR UnescapeURLImpl(const STR& escaped_text, 113 UnescapeRule::Type rules, 114 size_t* offset_for_adjustment) { 115 size_t offset_temp = string16::npos; 116 if (!offset_for_adjustment) 117 offset_for_adjustment = &offset_temp; 118 else if (*offset_for_adjustment >= escaped_text.length()) 119 *offset_for_adjustment = string16::npos; 120 121 // Do not unescape anything, return the |escaped_text| text. 122 if (rules == UnescapeRule::NONE) 123 return escaped_text; 124 125 // The output of the unescaping is always smaller than the input, so we can 126 // reserve the input size to make sure we have enough buffer and don't have 127 // to allocate in the loop below. 128 STR result; 129 result.reserve(escaped_text.length()); 130 131 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { 132 if (static_cast<unsigned char>(escaped_text[i]) >= 128) { 133 // Non ASCII character, append as is. 134 result.push_back(escaped_text[i]); 135 continue; 136 } 137 138 char current_char = static_cast<char>(escaped_text[i]); 139 if (current_char == '%' && i + 2 < max) { 140 const typename STR::value_type most_sig_digit( 141 static_cast<typename STR::value_type>(escaped_text[i + 1])); 142 const typename STR::value_type least_sig_digit( 143 static_cast<typename STR::value_type>(escaped_text[i + 2])); 144 if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) { 145 unsigned char value = HexToInt(most_sig_digit) * 16 + 146 HexToInt(least_sig_digit); 147 if (value >= 0x80 || // Unescape all high-bit characters. 148 // For 7-bit characters, the lookup table tells us all valid chars. 149 (kUrlUnescape[value] || 150 // ...and we allow some additional unescaping when flags are set. 151 (value == ' ' && (rules & UnescapeRule::SPACES)) || 152 // Allow any of the prohibited but non-control characters when 153 // we're doing "special" chars. 154 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || 155 // Additionally allow control characters if requested. 156 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { 157 // Use the unescaped version of the character. 158 size_t length_before_append = result.length(); 159 result.push_back(value); 160 i += 2; 161 162 // Adjust offset to match length change. 163 if (*offset_for_adjustment != std::string::npos) { 164 if (*offset_for_adjustment > (length_before_append + 2)) 165 *offset_for_adjustment -= 2; 166 else if (*offset_for_adjustment > length_before_append) 167 *offset_for_adjustment = std::string::npos; 168 } 169 } else { 170 // Keep escaped. Append a percent and we'll get the following two 171 // digits on the next loops through. 172 result.push_back('%'); 173 } 174 } else { 175 // Invalid escape sequence, just pass the percent through and continue 176 // right after it. 177 result.push_back('%'); 178 } 179 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && 180 escaped_text[i] == '+') { 181 result.push_back(' '); 182 } else { 183 // Normal case for unescaped characters. 184 result.push_back(escaped_text[i]); 185 } 186 } 187 188 return result; 189 } 190 191 } // namespace 192 193 // Everything except alphanumerics and !'()*-._~ 194 // See RFC 2396 for the list of reserved characters. 195 static const Charmap kQueryCharmap( 196 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L, 197 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 198 199 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) { 200 return Escape(text, kQueryCharmap, use_plus); 201 } 202 203 // Convert the string to a sequence of bytes and then % escape anything 204 // except alphanumerics and !'()*-._~ 205 std::wstring EscapeQueryParamValueUTF8(const std::wstring& text, 206 bool use_plus) { 207 return UTF8ToWide(Escape(WideToUTF8(text), kQueryCharmap, use_plus)); 208 } 209 210 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} 211 static const Charmap kPathCharmap( 212 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L, 213 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 214 215 std::string EscapePath(const std::string& path) { 216 return Escape(path, kPathCharmap, false); 217 } 218 219 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} 220 static const Charmap kUrlEscape( 221 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L, 222 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL 223 ); 224 225 std::string EscapeUrlEncodedData(const std::string& path) { 226 return Escape(path, kUrlEscape, true); 227 } 228 229 // non-7bit 230 static const Charmap kNonASCIICharmap( 231 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L, 232 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 233 234 std::string EscapeNonASCII(const std::string& input) { 235 return Escape(input, kNonASCIICharmap, false); 236 } 237 238 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and 239 // !'()*-._~% 240 static const Charmap kExternalHandlerCharmap( 241 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L, 242 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL); 243 244 std::string EscapeExternalHandlerValue(const std::string& text) { 245 return Escape(text, kExternalHandlerCharmap, false); 246 } 247 248 bool EscapeQueryParamValue(const string16& text, const char* codepage, 249 bool use_plus, string16* escaped) { 250 // TODO(brettw) bug 1201094: this function should be removed, this "SKIP" 251 // behavior is wrong when the character can't be encoded properly. 252 std::string encoded; 253 if (!base::UTF16ToCodepage(text, codepage, 254 base::OnStringConversionError::SKIP, &encoded)) 255 return false; 256 257 escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus))); 258 return true; 259 } 260 261 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, 262 UnescapeRule::Type rules, 263 size_t* offset_for_adjustment) { 264 std::wstring result; 265 size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0; 266 std::string unescaped_url( 267 UnescapeURLImpl(text, rules, offset_for_adjustment)); 268 if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(), 269 &result, offset_for_adjustment)) 270 return WideToUTF16Hack(result); // Character set looks like it's valid. 271 272 // Not valid. Return the escaped version. Undo our changes to 273 // |offset_for_adjustment| since we haven't changed the string after all. 274 if (offset_for_adjustment) 275 *offset_for_adjustment = original_offset; 276 return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text, 277 offset_for_adjustment)); 278 } 279 280 std::string UnescapeURLComponent(const std::string& escaped_text, 281 UnescapeRule::Type rules) { 282 return UnescapeURLImpl(escaped_text, rules, NULL); 283 } 284 285 string16 UnescapeURLComponent(const string16& escaped_text, 286 UnescapeRule::Type rules) { 287 return UnescapeURLImpl(escaped_text, rules, NULL); 288 } 289 290 291 template <class str> 292 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { 293 static const struct { 294 char key; 295 const char* replacement; 296 } kCharsToEscape[] = { 297 { '<', "<" }, 298 { '>', ">" }, 299 { '&', "&" }, 300 { '"', """ }, 301 { '\'', "'" }, 302 }; 303 size_t k; 304 for (k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) { 305 if (c == kCharsToEscape[k].key) { 306 const char* p = kCharsToEscape[k].replacement; 307 while (*p) 308 output->push_back(*p++); 309 break; 310 } 311 } 312 if (k == ARRAYSIZE_UNSAFE(kCharsToEscape)) 313 output->push_back(c); 314 } 315 316 void AppendEscapedCharForHTML(char c, std::string* output) { 317 AppendEscapedCharForHTMLImpl(c, output); 318 } 319 320 void AppendEscapedCharForHTML(wchar_t c, string16* output) { 321 AppendEscapedCharForHTMLImpl(c, output); 322 } 323 324 template <class str> 325 str EscapeForHTMLImpl(const str& input) { 326 str result; 327 result.reserve(input.size()); // optimize for no escaping 328 329 for (typename str::const_iterator it = input.begin(); it != input.end(); ++it) 330 AppendEscapedCharForHTMLImpl(*it, &result); 331 332 return result; 333 } 334 335 std::string EscapeForHTML(const std::string& input) { 336 return EscapeForHTMLImpl(input); 337 } 338 339 string16 EscapeForHTML(const string16& input) { 340 return EscapeForHTMLImpl(input); 341 } 342 343 string16 UnescapeForHTML(const string16& input) { 344 static const struct { 345 const wchar_t* ampersand_code; 346 const char replacement; 347 } kEscapeToChars[] = { 348 { L"<", '<' }, 349 { L">", '>' }, 350 { L"&", '&' }, 351 { L""", '"' }, 352 { L"'", '\''}, 353 }; 354 355 if (input.find(WideToUTF16(L"&")) == std::string::npos) 356 return input; 357 358 string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; 359 string16 text(input); 360 for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) { 361 if (*iter == '&') { 362 // Potential ampersand encode char. 363 size_t index = iter - text.begin(); 364 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { 365 if (ampersand_chars[i].empty()) 366 ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code); 367 if (text.find(ampersand_chars[i], index) == index) { 368 text.replace(iter, iter + ampersand_chars[i].length(), 369 1, kEscapeToChars[i].replacement); 370 break; 371 } 372 } 373 } 374 } 375 return text; 376 } 377