1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/logging.h" 6 #include "url/url_canon.h" 7 #include "url/url_canon_internal.h" 8 9 namespace url { 10 11 namespace { 12 13 // For reference, here's what IE supports: 14 // Key: 0 (disallowed: failure if present in the input) 15 // + (allowed either escaped or unescaped, and unmodified) 16 // U (allowed escaped or unescaped but always unescaped if present in 17 // escaped form) 18 // E (allowed escaped or unescaped but always escaped if present in 19 // unescaped form) 20 // % (only allowed escaped in the input, will be unmodified). 21 // I left blank alpha numeric characters. 22 // 23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 24 // ----------------------------------------------- 25 // 0 0 E E E E E E E E E E E E E E E 26 // 1 E E E E E E E E E E E E E E E E 27 // 2 E + E E + E + + + + + + + U U 0 28 // 3 % % E + E 0 <-- Those are : ; < = > ? 29 // 4 % 30 // 5 U 0 U U U <-- Those are [ \ ] ^ _ 31 // 6 E <-- That's ` 32 // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) 33 // 34 // NOTE: I didn't actually test all the control characters. Some may be 35 // disallowed in the input, but they are all accepted escaped except for 0. 36 // I also didn't test if characters affecting HTML parsing are allowed 37 // unescaped, eg. (") or (#), which would indicate the beginning of the path. 38 // Surprisingly, space is accepted in the input and always escaped. 39 40 // This table lists the canonical version of all characters we allow in the 41 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar 42 // value to indicate that this character should be escaped. We are a little more 43 // restrictive than IE, but less restrictive than Firefox. 44 // 45 // Note that we disallow the % character. We will allow it when part of an 46 // escape sequence, of course, but this disallows "%25". Even though IE allows 47 // it, allowing it would put us in a funny state. If there was an invalid 48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail. 49 // Allowing percents means we'll succeed a second time, so validity would change 50 // based on how many times you run the canonicalizer. We prefer to always report 51 // the same vailidity, so reject this. 52 const unsigned char kEsc = 0xff; 53 const unsigned char kHostCharLookup[0x80] = { 54 // 00-1f: all are invalid 55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57 // ' ' ! " # $ % & ' ( ) * + , - . / 58 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0, 59 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 60 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 , 61 // @ A B C D E F G H I J K L M N O 62 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 63 // P Q R S T U V W X Y Z [ \ ] ^ _ 64 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_', 65 // ` a b c d e f g h i j k l m n o 66 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 67 // p q r s t u v w x y z { | } ~ 68 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; 69 70 const int kTempHostBufferLen = 1024; 71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; 72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW; 73 74 // Scans a host name and fills in the output flags according to what we find. 75 // |has_non_ascii| will be true if there are any non-7-bit characters, and 76 // |has_escaped| will be true if there is a percent sign. 77 template<typename CHAR, typename UCHAR> 78 void ScanHostname(const CHAR* spec, 79 const Component& host, 80 bool* has_non_ascii, 81 bool* has_escaped) { 82 int end = host.end(); 83 *has_non_ascii = false; 84 *has_escaped = false; 85 for (int i = host.begin; i < end; i++) { 86 if (static_cast<UCHAR>(spec[i]) >= 0x80) 87 *has_non_ascii = true; 88 else if (spec[i] == '%') 89 *has_escaped = true; 90 } 91 } 92 93 // Canonicalizes a host name that is entirely 8-bit characters (even though 94 // the type holding them may be 16 bits. Escaped characters will be unescaped. 95 // Non-7-bit characters (for example, UTF-8) will be passed unchanged. 96 // 97 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in 98 // the output. 99 // 100 // This function is used in two situations: 101 // 102 // * When the caller knows there is no non-ASCII or percent escaped 103 // characters. This is what DoHost does. The result will be a completely 104 // canonicalized host since we know nothing weird can happen (escaped 105 // characters could be unescaped to non-7-bit, so they have to be treated 106 // with suspicion at this point). It does not use the |has_non_ascii| flag. 107 // 108 // * When the caller has an 8-bit string that may need unescaping. 109 // DoComplexHost calls us this situation to do unescaping and validation. 110 // After this, it may do other IDN operations depending on the value of the 111 // |*has_non_ascii| flag. 112 // 113 // The return value indicates if the output is a potentially valid host name. 114 template<typename INCHAR, typename OUTCHAR> 115 bool DoSimpleHost(const INCHAR* host, 116 int host_len, 117 CanonOutputT<OUTCHAR>* output, 118 bool* has_non_ascii) { 119 *has_non_ascii = false; 120 121 bool success = true; 122 for (int i = 0; i < host_len; ++i) { 123 unsigned int source = host[i]; 124 if (source == '%') { 125 // Unescape first, if possible. 126 // Source will be used only if decode operation was successful. 127 if (!DecodeEscaped(host, &i, host_len, 128 reinterpret_cast<unsigned char*>(&source))) { 129 // Invalid escaped character. There is nothing that can make this 130 // host valid. We append an escaped percent so the URL looks reasonable 131 // and mark as failed. 132 AppendEscapedChar('%', output); 133 success = false; 134 continue; 135 } 136 } 137 138 if (source < 0x80) { 139 // We have ASCII input, we can use our lookup table. 140 unsigned char replacement = kHostCharLookup[source]; 141 if (!replacement) { 142 // Invalid character, add it as percent-escaped and mark as failed. 143 AppendEscapedChar(source, output); 144 success = false; 145 } else if (replacement == kEsc) { 146 // This character is valid but should be escaped. 147 AppendEscapedChar(source, output); 148 } else { 149 // Common case, the given character is valid in a hostname, the lookup 150 // table tells us the canonical representation of that character (lower 151 // cased). 152 output->push_back(replacement); 153 } 154 } else { 155 // It's a non-ascii char. Just push it to the output. 156 // In case where we have char16 input, and char output it's safe to 157 // cast char16->char only if input string was converted to ASCII. 158 output->push_back(static_cast<OUTCHAR>(source)); 159 *has_non_ascii = true; 160 } 161 } 162 163 return success; 164 } 165 166 // Canonicalizes a host that requires IDN conversion. Returns true on success 167 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) { 168 // We need to escape URL before doing IDN conversion, since punicode strings 169 // cannot be escaped after they are created. 170 RawCanonOutputW<kTempHostBufferLen> url_escaped_host; 171 bool has_non_ascii; 172 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); 173 174 StackBufferW wide_output; 175 if (!IDNToASCII(url_escaped_host.data(), 176 url_escaped_host.length(), 177 &wide_output)) { 178 // Some error, give up. This will write some reasonable looking 179 // representation of the string to the output. 180 AppendInvalidNarrowString(src, 0, src_len, output); 181 return false; 182 } 183 184 // Now we check the ASCII output like a normal host. It will also handle 185 // unescaping. Although we unescaped everything before this function call, if 186 // somebody does %00 as fullwidth, ICU will convert this to ASCII. 187 bool success = DoSimpleHost(wide_output.data(), 188 wide_output.length(), 189 output, &has_non_ascii); 190 DCHECK(!has_non_ascii); 191 return success; 192 } 193 194 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to 195 // UTF-16. The has_escaped flag should be set if the input string requires 196 // unescaping. 197 bool DoComplexHost(const char* host, int host_len, 198 bool has_non_ascii, bool has_escaped, CanonOutput* output) { 199 // Save the current position in the output. We may write stuff and rewind it 200 // below, so we need to know where to rewind to. 201 int begin_length = output->length(); 202 203 // Points to the UTF-8 data we want to convert. This will either be the 204 // input or the unescaped version written to |*output| if necessary. 205 const char* utf8_source; 206 int utf8_source_len; 207 if (has_escaped) { 208 // Unescape before converting to UTF-16 for IDN. We write this into the 209 // output because it most likely does not require IDNization, and we can 210 // save another huge stack buffer. It will be replaced below if it requires 211 // IDN. This will also update our non-ASCII flag so we know whether the 212 // unescaped input requires IDN. 213 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { 214 // Error with some escape sequence. We'll call the current output 215 // complete. DoSimpleHost will have written some "reasonable" output. 216 return false; 217 } 218 219 // Unescaping may have left us with ASCII input, in which case the 220 // unescaped version we wrote to output is complete. 221 if (!has_non_ascii) { 222 return true; 223 } 224 225 // Save the pointer into the data was just converted (it may be appended to 226 // other data in the output buffer). 227 utf8_source = &output->data()[begin_length]; 228 utf8_source_len = output->length() - begin_length; 229 } else { 230 // We don't need to unescape, use input for IDNization later. (We know the 231 // input has non-ASCII, or the simple version would have been called 232 // instead of us.) 233 utf8_source = host; 234 utf8_source_len = host_len; 235 } 236 237 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion. 238 // Above, we may have used the output to write the unescaped values to, so 239 // we have to rewind it to where we started after we convert it to UTF-16. 240 StackBufferW utf16; 241 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) { 242 // In this error case, the input may or may not be the output. 243 StackBuffer utf8; 244 for (int i = 0; i < utf8_source_len; i++) 245 utf8.push_back(utf8_source[i]); 246 output->set_length(begin_length); 247 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output); 248 return false; 249 } 250 output->set_length(begin_length); 251 252 // This will call DoSimpleHost which will do normal ASCII canonicalization 253 // and also check for IP addresses in the outpt. 254 return DoIDNHost(utf16.data(), utf16.length(), output); 255 } 256 257 // UTF-16 convert host to its ASCII version. The set up is already ready for 258 // the backend, so we just pass through. The has_escaped flag should be set if 259 // the input string requires unescaping. 260 bool DoComplexHost(const base::char16* host, int host_len, 261 bool has_non_ascii, bool has_escaped, CanonOutput* output) { 262 if (has_escaped) { 263 // Yikes, we have escaped characters with wide input. The escaped 264 // characters should be interpreted as UTF-8. To solve this problem, 265 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN. 266 // 267 // We don't bother to optimize the conversion in the ASCII case (which 268 // *could* just be a copy) and use the UTF-8 path, because it should be 269 // very rare that host names have escaped characters, and it is relatively 270 // fast to do the conversion anyway. 271 StackBuffer utf8; 272 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) { 273 AppendInvalidNarrowString(host, 0, host_len, output); 274 return false; 275 } 276 277 // Once we convert to UTF-8, we can use the 8-bit version of the complex 278 // host handling code above. 279 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, 280 has_escaped, output); 281 } 282 283 // No unescaping necessary, we can safely pass the input to ICU. This 284 // function will only get called if we either have escaped or non-ascii 285 // input, so it's safe to just use ICU now. Even if the input is ASCII, 286 // this function will do the right thing (just slower than we could). 287 return DoIDNHost(host, host_len, output); 288 } 289 290 template<typename CHAR, typename UCHAR> 291 void DoHost(const CHAR* spec, 292 const Component& host, 293 CanonOutput* output, 294 CanonHostInfo* host_info) { 295 if (host.len <= 0) { 296 // Empty hosts don't need anything. 297 host_info->family = CanonHostInfo::NEUTRAL; 298 host_info->out_host = Component(); 299 return; 300 } 301 302 bool has_non_ascii, has_escaped; 303 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); 304 305 // Keep track of output's initial length, so we can rewind later. 306 const int output_begin = output->length(); 307 308 bool success; 309 if (!has_non_ascii && !has_escaped) { 310 success = DoSimpleHost(&spec[host.begin], host.len, 311 output, &has_non_ascii); 312 DCHECK(!has_non_ascii); 313 } else { 314 success = DoComplexHost(&spec[host.begin], host.len, 315 has_non_ascii, has_escaped, output); 316 } 317 318 if (!success) { 319 // Canonicalization failed. Set BROKEN to notify the caller. 320 host_info->family = CanonHostInfo::BROKEN; 321 } else { 322 // After all the other canonicalization, check if we ended up with an IP 323 // address. IP addresses are small, so writing into this temporary buffer 324 // should not cause an allocation. 325 RawCanonOutput<64> canon_ip; 326 CanonicalizeIPAddress(output->data(), 327 MakeRange(output_begin, output->length()), 328 &canon_ip, host_info); 329 330 // If we got an IPv4/IPv6 address, copy the canonical form back to the 331 // real buffer. Otherwise, it's a hostname or broken IP, in which case 332 // we just leave it in place. 333 if (host_info->IsIPAddress()) { 334 output->set_length(output_begin); 335 output->Append(canon_ip.data(), canon_ip.length()); 336 } 337 } 338 339 host_info->out_host = MakeRange(output_begin, output->length()); 340 } 341 342 } // namespace 343 344 bool CanonicalizeHost(const char* spec, 345 const Component& host, 346 CanonOutput* output, 347 Component* out_host) { 348 CanonHostInfo host_info; 349 DoHost<char, unsigned char>(spec, host, output, &host_info); 350 *out_host = host_info.out_host; 351 return (host_info.family != CanonHostInfo::BROKEN); 352 } 353 354 bool CanonicalizeHost(const base::char16* spec, 355 const Component& host, 356 CanonOutput* output, 357 Component* out_host) { 358 CanonHostInfo host_info; 359 DoHost<base::char16, base::char16>(spec, host, output, &host_info); 360 *out_host = host_info.out_host; 361 return (host_info.family != CanonHostInfo::BROKEN); 362 } 363 364 void CanonicalizeHostVerbose(const char* spec, 365 const Component& host, 366 CanonOutput* output, 367 CanonHostInfo* host_info) { 368 DoHost<char, unsigned char>(spec, host, output, host_info); 369 } 370 371 void CanonicalizeHostVerbose(const base::char16* spec, 372 const Component& host, 373 CanonOutput* output, 374 CanonHostInfo* host_info) { 375 DoHost<base::char16, base::char16>(spec, host, output, host_info); 376 } 377 378 } // namespace url 379