1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <errno.h> 6 #include <stdlib.h> 7 8 #include <cstdio> 9 #include <string> 10 11 #include "url/url_canon_internal.h" 12 13 namespace url_canon { 14 15 namespace { 16 17 template<typename CHAR, typename UCHAR> 18 void DoAppendStringOfType(const CHAR* source, int length, 19 SharedCharTypes type, 20 CanonOutput* output) { 21 for (int i = 0; i < length; i++) { 22 if (static_cast<UCHAR>(source[i]) >= 0x80) { 23 // ReadChar will fill the code point with kUnicodeReplacementCharacter 24 // when the input is invalid, which is what we want. 25 unsigned code_point; 26 ReadUTFChar(source, &i, length, &code_point); 27 AppendUTF8EscapedValue(code_point, output); 28 } else { 29 // Just append the 7-bit character, possibly escaping it. 30 unsigned char uch = static_cast<unsigned char>(source[i]); 31 if (!IsCharOfType(uch, type)) 32 AppendEscapedChar(uch, output); 33 else 34 output->push_back(uch); 35 } 36 } 37 } 38 39 // This function assumes the input values are all contained in 8-bit, 40 // although it allows any type. Returns true if input is valid, false if not. 41 template<typename CHAR, typename UCHAR> 42 void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end, 43 CanonOutput* output) { 44 for (int i = begin; i < end; i++) { 45 UCHAR uch = static_cast<UCHAR>(spec[i]); 46 if (uch >= 0x80) { 47 // Handle UTF-8/16 encodings. This call will correctly handle the error 48 // case by appending the invalid character. 49 AppendUTF8EscapedChar(spec, &i, end, output); 50 } else if (uch <= ' ' || uch == 0x7f) { 51 // This function is for error handling, so we escape all control 52 // characters and spaces, but not anything else since we lack 53 // context to do something more specific. 54 AppendEscapedChar(static_cast<unsigned char>(uch), output); 55 } else { 56 output->push_back(static_cast<char>(uch)); 57 } 58 } 59 } 60 61 // Overrides one component, see the url_canon::Replacements structure for 62 // what the various combionations of source pointer and component mean. 63 void DoOverrideComponent(const char* override_source, 64 const url_parse::Component& override_component, 65 const char** dest, 66 url_parse::Component* dest_component) { 67 if (override_source) { 68 *dest = override_source; 69 *dest_component = override_component; 70 } 71 } 72 73 // Similar to DoOverrideComponent except that it takes a UTF-16 input and does 74 // not actually set the output character pointer. 75 // 76 // The input is converted to UTF-8 at the end of the given buffer as a temporary 77 // holding place. The component indentifying the portion of the buffer used in 78 // the |utf8_buffer| will be specified in |*dest_component|. 79 // 80 // This will not actually set any |dest| pointer like DoOverrideComponent 81 // does because all of the pointers will point into the |utf8_buffer|, which 82 // may get resized while we're overriding a subsequent component. Instead, the 83 // caller should use the beginning of the |utf8_buffer| as the string pointer 84 // for all components once all overrides have been prepared. 85 bool PrepareUTF16OverrideComponent( 86 const base::char16* override_source, 87 const url_parse::Component& override_component, 88 CanonOutput* utf8_buffer, 89 url_parse::Component* dest_component) { 90 bool success = true; 91 if (override_source) { 92 if (!override_component.is_valid()) { 93 // Non-"valid" component (means delete), so we need to preserve that. 94 *dest_component = url_parse::Component(); 95 } else { 96 // Convert to UTF-8. 97 dest_component->begin = utf8_buffer->length(); 98 success = ConvertUTF16ToUTF8(&override_source[override_component.begin], 99 override_component.len, utf8_buffer); 100 dest_component->len = utf8_buffer->length() - dest_component->begin; 101 } 102 } 103 return success; 104 } 105 106 } // namespace 107 108 // See the header file for this array's declaration. 109 const unsigned char kSharedCharTypeTable[0x100] = { 110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f 111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f 112 0, // 0x20 ' ' (escape spaces in queries) 113 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 ! 114 0, // 0x22 " 115 0, // 0x23 # (invalid in query since it marks the ref) 116 CHAR_QUERY | CHAR_USERINFO, // 0x24 $ 117 CHAR_QUERY | CHAR_USERINFO, // 0x25 % 118 CHAR_QUERY | CHAR_USERINFO, // 0x26 & 119 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x27 ' 120 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 ( 121 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 ) 122 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a * 123 CHAR_QUERY | CHAR_USERINFO, // 0x2b + 124 CHAR_QUERY | CHAR_USERINFO, // 0x2c , 125 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d - 126 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e . 127 CHAR_QUERY, // 0x2f / 128 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x30 0 129 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x31 1 130 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x32 2 131 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x33 3 132 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x34 4 133 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x35 5 134 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x36 6 135 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x37 7 136 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x38 8 137 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x39 9 138 CHAR_QUERY, // 0x3a : 139 CHAR_QUERY, // 0x3b ; 140 0, // 0x3c < (Try to prevent certain types of XSS.) 141 CHAR_QUERY, // 0x3d = 142 0, // 0x3e > (Try to prevent certain types of XSS.) 143 CHAR_QUERY, // 0x3f ? 144 CHAR_QUERY, // 0x40 @ 145 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A 146 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B 147 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C 148 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D 149 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E 150 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F 151 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G 152 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H 153 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I 154 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J 155 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K 156 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L 157 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M 158 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N 159 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O 160 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P 161 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q 162 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R 163 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S 164 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T 165 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U 166 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V 167 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W 168 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X 169 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y 170 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z 171 CHAR_QUERY, // 0x5b [ 172 CHAR_QUERY, // 0x5c '\' 173 CHAR_QUERY, // 0x5d ] 174 CHAR_QUERY, // 0x5e ^ 175 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _ 176 CHAR_QUERY, // 0x60 ` 177 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a 178 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b 179 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c 180 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d 181 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e 182 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f 183 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g 184 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h 185 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i 186 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j 187 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k 188 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l 189 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m 190 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n 191 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o 192 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p 193 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q 194 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r 195 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s 196 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t 197 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u 198 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v 199 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w 200 CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x 201 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y 202 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z 203 CHAR_QUERY, // 0x7b { 204 CHAR_QUERY, // 0x7c | 205 CHAR_QUERY, // 0x7d } 206 CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~ 207 0, // 0x7f 208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f 209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f 210 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf 211 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf 212 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf 213 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf 214 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef 215 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff 216 }; 217 218 const char kHexCharLookup[0x10] = { 219 '0', '1', '2', '3', '4', '5', '6', '7', 220 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 221 }; 222 223 const char kCharToHexLookup[8] = { 224 0, // 0x00 - 0x1f 225 '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39 226 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46 227 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66 228 0, // 0x80 - 0x9F 229 0, // 0xA0 - 0xBF 230 0, // 0xC0 - 0xDF 231 0, // 0xE0 - 0xFF 232 }; 233 234 const base::char16 kUnicodeReplacementCharacter = 0xfffd; 235 236 void AppendStringOfType(const char* source, int length, 237 SharedCharTypes type, 238 CanonOutput* output) { 239 DoAppendStringOfType<char, unsigned char>(source, length, type, output); 240 } 241 242 void AppendStringOfType(const base::char16* source, int length, 243 SharedCharTypes type, 244 CanonOutput* output) { 245 DoAppendStringOfType<base::char16, base::char16>( 246 source, length, type, output); 247 } 248 249 void AppendInvalidNarrowString(const char* spec, int begin, int end, 250 CanonOutput* output) { 251 DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output); 252 } 253 254 void AppendInvalidNarrowString(const base::char16* spec, int begin, int end, 255 CanonOutput* output) { 256 DoAppendInvalidNarrowString<base::char16, base::char16>( 257 spec, begin, end, output); 258 } 259 260 bool ConvertUTF16ToUTF8(const base::char16* input, int input_len, 261 CanonOutput* output) { 262 bool success = true; 263 for (int i = 0; i < input_len; i++) { 264 unsigned code_point; 265 success &= ReadUTFChar(input, &i, input_len, &code_point); 266 AppendUTF8Value(code_point, output); 267 } 268 return success; 269 } 270 271 bool ConvertUTF8ToUTF16(const char* input, int input_len, 272 CanonOutputT<base::char16>* output) { 273 bool success = true; 274 for (int i = 0; i < input_len; i++) { 275 unsigned code_point; 276 success &= ReadUTFChar(input, &i, input_len, &code_point); 277 AppendUTF16Value(code_point, output); 278 } 279 return success; 280 } 281 282 void SetupOverrideComponents(const char* base, 283 const Replacements<char>& repl, 284 URLComponentSource<char>* source, 285 url_parse::Parsed* parsed) { 286 // Get the source and parsed structures of the things we are replacing. 287 const URLComponentSource<char>& repl_source = repl.sources(); 288 const url_parse::Parsed& repl_parsed = repl.components(); 289 290 DoOverrideComponent(repl_source.scheme, repl_parsed.scheme, 291 &source->scheme, &parsed->scheme); 292 DoOverrideComponent(repl_source.username, repl_parsed.username, 293 &source->username, &parsed->username); 294 DoOverrideComponent(repl_source.password, repl_parsed.password, 295 &source->password, &parsed->password); 296 297 // Our host should be empty if not present, so override the default setup. 298 DoOverrideComponent(repl_source.host, repl_parsed.host, 299 &source->host, &parsed->host); 300 if (parsed->host.len == -1) 301 parsed->host.len = 0; 302 303 DoOverrideComponent(repl_source.port, repl_parsed.port, 304 &source->port, &parsed->port); 305 DoOverrideComponent(repl_source.path, repl_parsed.path, 306 &source->path, &parsed->path); 307 DoOverrideComponent(repl_source.query, repl_parsed.query, 308 &source->query, &parsed->query); 309 DoOverrideComponent(repl_source.ref, repl_parsed.ref, 310 &source->ref, &parsed->ref); 311 } 312 313 bool SetupUTF16OverrideComponents(const char* base, 314 const Replacements<base::char16>& repl, 315 CanonOutput* utf8_buffer, 316 URLComponentSource<char>* source, 317 url_parse::Parsed* parsed) { 318 bool success = true; 319 320 // Get the source and parsed structures of the things we are replacing. 321 const URLComponentSource<base::char16>& repl_source = repl.sources(); 322 const url_parse::Parsed& repl_parsed = repl.components(); 323 324 success &= PrepareUTF16OverrideComponent( 325 repl_source.scheme, repl_parsed.scheme, 326 utf8_buffer, &parsed->scheme); 327 success &= PrepareUTF16OverrideComponent( 328 repl_source.username, repl_parsed.username, 329 utf8_buffer, &parsed->username); 330 success &= PrepareUTF16OverrideComponent( 331 repl_source.password, repl_parsed.password, 332 utf8_buffer, &parsed->password); 333 success &= PrepareUTF16OverrideComponent( 334 repl_source.host, repl_parsed.host, 335 utf8_buffer, &parsed->host); 336 success &= PrepareUTF16OverrideComponent( 337 repl_source.port, repl_parsed.port, 338 utf8_buffer, &parsed->port); 339 success &= PrepareUTF16OverrideComponent( 340 repl_source.path, repl_parsed.path, 341 utf8_buffer, &parsed->path); 342 success &= PrepareUTF16OverrideComponent( 343 repl_source.query, repl_parsed.query, 344 utf8_buffer, &parsed->query); 345 success &= PrepareUTF16OverrideComponent( 346 repl_source.ref, repl_parsed.ref, 347 utf8_buffer, &parsed->ref); 348 349 // PrepareUTF16OverrideComponent will not have set the data pointer since the 350 // buffer could be resized, invalidating the pointers. We set the data 351 // pointers for affected components now that the buffer is finalized. 352 if (repl_source.scheme) source->scheme = utf8_buffer->data(); 353 if (repl_source.username) source->username = utf8_buffer->data(); 354 if (repl_source.password) source->password = utf8_buffer->data(); 355 if (repl_source.host) source->host = utf8_buffer->data(); 356 if (repl_source.port) source->port = utf8_buffer->data(); 357 if (repl_source.path) source->path = utf8_buffer->data(); 358 if (repl_source.query) source->query = utf8_buffer->data(); 359 if (repl_source.ref) source->ref = utf8_buffer->data(); 360 361 return success; 362 } 363 364 #ifndef WIN32 365 366 int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) { 367 const char* format_str; 368 if (radix == 10) 369 format_str = "%d"; 370 else if (radix == 16) 371 format_str = "%x"; 372 else 373 return EINVAL; 374 375 int written = snprintf(buffer, size_in_chars, format_str, value); 376 if (static_cast<size_t>(written) >= size_in_chars) { 377 // Output was truncated, or written was negative. 378 return EINVAL; 379 } 380 return 0; 381 } 382 383 int _itow_s(int value, base::char16* buffer, size_t size_in_chars, int radix) { 384 if (radix != 10) 385 return EINVAL; 386 387 // No more than 12 characters will be required for a 32-bit integer. 388 // Add an extra byte for the terminating null. 389 char temp[13]; 390 int written = snprintf(temp, sizeof(temp), "%d", value); 391 if (static_cast<size_t>(written) >= size_in_chars) { 392 // Output was truncated, or written was negative. 393 return EINVAL; 394 } 395 396 for (int i = 0; i < written; ++i) { 397 buffer[i] = static_cast<base::char16>(temp[i]); 398 } 399 buffer[written] = '\0'; 400 return 0; 401 } 402 403 #endif // !WIN32 404 405 } // namespace url_canon 406