1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Canonicalizers for random bits that aren't big enough for their own files. 6 7 #include <string.h> 8 9 #include "url/url_canon.h" 10 #include "url/url_canon_internal.h" 11 12 namespace url { 13 14 namespace { 15 16 // Returns true if the given character should be removed from the middle of a 17 // URL. 18 inline bool IsRemovableURLWhitespace(int ch) { 19 return ch == '\r' || ch == '\n' || ch == '\t'; 20 } 21 22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h). 23 // It sucks that we have to do this, since this takes about 13% of the total URL 24 // canonicalization time. 25 template<typename CHAR> 26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, 27 CanonOutputT<CHAR>* buffer, 28 int* output_len) { 29 // Fast verification that there's nothing that needs removal. This is the 99% 30 // case, so we want it to be fast and don't care about impacting the speed 31 // when we do find whitespace. 32 int found_whitespace = false; 33 for (int i = 0; i < input_len; i++) { 34 if (!IsRemovableURLWhitespace(input[i])) 35 continue; 36 found_whitespace = true; 37 break; 38 } 39 40 if (!found_whitespace) { 41 // Didn't find any whitespace, we don't need to do anything. We can just 42 // return the input as the output. 43 *output_len = input_len; 44 return input; 45 } 46 47 // Remove the whitespace into the new buffer and return it. 48 for (int i = 0; i < input_len; i++) { 49 if (!IsRemovableURLWhitespace(input[i])) 50 buffer->push_back(input[i]); 51 } 52 *output_len = buffer->length(); 53 return buffer->data(); 54 } 55 56 // Contains the canonical version of each possible input letter in the scheme 57 // (basically, lower-cased). The corresponding entry will be 0 if the letter 58 // is not allowed in a scheme. 59 const char kSchemeCanonical[0x80] = { 60 // 00-1f: all are invalid 61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63 // ' ' ! " # $ % & ' ( ) * + , - . / 64 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, 65 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 66 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , 67 // @ A B C D E F G H I J K L M N O 68 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 69 // P Q R S T U V W X Y Z [ \ ] ^ _ 70 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, 71 // ` a b c d e f g h i j k l m n o 72 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 73 // p q r s t u v w x y z { | } ~ 74 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; 75 76 // This could be a table lookup as well by setting the high bit for each 77 // valid character, but it's only called once per URL, and it makes the lookup 78 // table easier to read not having extra stuff in it. 79 inline bool IsSchemeFirstChar(unsigned char c) { 80 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 81 } 82 83 template<typename CHAR, typename UCHAR> 84 bool DoScheme(const CHAR* spec, 85 const Component& scheme, 86 CanonOutput* output, 87 Component* out_scheme) { 88 if (scheme.len <= 0) { 89 // Scheme is unspecified or empty, convert to empty by appending a colon. 90 *out_scheme = Component(output->length(), 0); 91 output->push_back(':'); 92 return true; 93 } 94 95 // The output scheme starts from the current position. 96 out_scheme->begin = output->length(); 97 98 // Danger: it's important that this code does not strip any characters: it 99 // only emits the canonical version (be it valid or escaped) of each of 100 // the input characters. Stripping would put it out of sync with 101 // FindAndCompareScheme, which could cause some security checks on 102 // schemes to be incorrect. 103 bool success = true; 104 int end = scheme.end(); 105 for (int i = scheme.begin; i < end; i++) { 106 UCHAR ch = static_cast<UCHAR>(spec[i]); 107 char replacement = 0; 108 if (ch < 0x80) { 109 if (i == scheme.begin) { 110 // Need to do a special check for the first letter of the scheme. 111 if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) 112 replacement = kSchemeCanonical[ch]; 113 } else { 114 replacement = kSchemeCanonical[ch]; 115 } 116 } 117 118 if (replacement) { 119 output->push_back(replacement); 120 } else if (ch == '%') { 121 // Canonicalizing the scheme multiple times should lead to the same 122 // result. Since invalid characters will be escaped, we need to preserve 123 // the percent to avoid multiple escaping. The scheme will be invalid. 124 success = false; 125 output->push_back('%'); 126 } else { 127 // Invalid character, store it but mark this scheme as invalid. 128 success = false; 129 130 // This will escape the output and also handle encoding issues. 131 // Ignore the return value since we already failed. 132 AppendUTF8EscapedChar(spec, &i, end, output); 133 } 134 } 135 136 // The output scheme ends with the the current position, before appending 137 // the colon. 138 out_scheme->len = output->length() - out_scheme->begin; 139 output->push_back(':'); 140 return success; 141 } 142 143 // The username and password components reference ranges in the corresponding 144 // *_spec strings. Typically, these specs will be the same (we're 145 // canonicalizing a single source string), but may be different when 146 // replacing components. 147 template<typename CHAR, typename UCHAR> 148 bool DoUserInfo(const CHAR* username_spec, 149 const Component& username, 150 const CHAR* password_spec, 151 const Component& password, 152 CanonOutput* output, 153 Component* out_username, 154 Component* out_password) { 155 if (username.len <= 0 && password.len <= 0) { 156 // Common case: no user info. We strip empty username/passwords. 157 *out_username = Component(); 158 *out_password = Component(); 159 return true; 160 } 161 162 // Write the username. 163 out_username->begin = output->length(); 164 if (username.len > 0) { 165 // This will escape characters not valid for the username. 166 AppendStringOfType(&username_spec[username.begin], username.len, 167 CHAR_USERINFO, output); 168 } 169 out_username->len = output->length() - out_username->begin; 170 171 // When there is a password, we need the separator. Note that we strip 172 // empty but specified passwords. 173 if (password.len > 0) { 174 output->push_back(':'); 175 out_password->begin = output->length(); 176 AppendStringOfType(&password_spec[password.begin], password.len, 177 CHAR_USERINFO, output); 178 out_password->len = output->length() - out_password->begin; 179 } else { 180 *out_password = Component(); 181 } 182 183 output->push_back('@'); 184 return true; 185 } 186 187 // Helper functions for converting port integers to strings. 188 inline void WritePortInt(char* output, int output_len, int port) { 189 _itoa_s(port, output, output_len, 10); 190 } 191 192 // This function will prepend the colon if there will be a port. 193 template<typename CHAR, typename UCHAR> 194 bool DoPort(const CHAR* spec, 195 const Component& port, 196 int default_port_for_scheme, 197 CanonOutput* output, 198 Component* out_port) { 199 int port_num = ParsePort(spec, port); 200 if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) { 201 *out_port = Component(); 202 return true; // Leave port empty. 203 } 204 205 if (port_num == PORT_INVALID) { 206 // Invalid port: We'll copy the text from the input so the user can see 207 // what the error was, and mark the URL as invalid by returning false. 208 output->push_back(':'); 209 out_port->begin = output->length(); 210 AppendInvalidNarrowString(spec, port.begin, port.end(), output); 211 out_port->len = output->length() - out_port->begin; 212 return false; 213 } 214 215 // Convert port number back to an integer. Max port value is 5 digits, and 216 // the Parsed::ExtractPort will have made sure the integer is in range. 217 const int buf_size = 6; 218 char buf[buf_size]; 219 WritePortInt(buf, buf_size, port_num); 220 221 // Append the port number to the output, preceeded by a colon. 222 output->push_back(':'); 223 out_port->begin = output->length(); 224 for (int i = 0; i < buf_size && buf[i]; i++) 225 output->push_back(buf[i]); 226 227 out_port->len = output->length() - out_port->begin; 228 return true; 229 } 230 231 template<typename CHAR, typename UCHAR> 232 void DoCanonicalizeRef(const CHAR* spec, 233 const Component& ref, 234 CanonOutput* output, 235 Component* out_ref) { 236 if (ref.len < 0) { 237 // Common case of no ref. 238 *out_ref = Component(); 239 return; 240 } 241 242 // Append the ref separator. Note that we need to do this even when the ref 243 // is empty but present. 244 output->push_back('#'); 245 out_ref->begin = output->length(); 246 247 // Now iterate through all the characters, converting to UTF-8 and validating. 248 int end = ref.end(); 249 for (int i = ref.begin; i < end; i++) { 250 if (spec[i] == 0) { 251 // IE just strips NULLs, so we do too. 252 continue; 253 } else if (static_cast<UCHAR>(spec[i]) < 0x20) { 254 // Unline IE seems to, we escape control characters. This will probably 255 // make the reference fragment unusable on a web page, but people 256 // shouldn't be using control characters in their anchor names. 257 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); 258 } else if (static_cast<UCHAR>(spec[i]) < 0x80) { 259 // Normal ASCII characters are just appended. 260 output->push_back(static_cast<char>(spec[i])); 261 } else { 262 // Non-ASCII characters are appended unescaped, but only when they are 263 // valid. Invalid Unicode characters are replaced with the "invalid 264 // character" as IE seems to (ReadUTFChar puts the unicode replacement 265 // character in the output on failure for us). 266 unsigned code_point; 267 ReadUTFChar(spec, &i, end, &code_point); 268 AppendUTF8Value(code_point, output); 269 } 270 } 271 272 out_ref->len = output->length() - out_ref->begin; 273 } 274 275 } // namespace 276 277 const char* RemoveURLWhitespace(const char* input, int input_len, 278 CanonOutputT<char>* buffer, 279 int* output_len) { 280 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 281 } 282 283 const base::char16* RemoveURLWhitespace(const base::char16* input, 284 int input_len, 285 CanonOutputT<base::char16>* buffer, 286 int* output_len) { 287 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 288 } 289 290 char CanonicalSchemeChar(base::char16 ch) { 291 if (ch >= 0x80) 292 return 0; // Non-ASCII is not supported by schemes. 293 return kSchemeCanonical[ch]; 294 } 295 296 bool CanonicalizeScheme(const char* spec, 297 const Component& scheme, 298 CanonOutput* output, 299 Component* out_scheme) { 300 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); 301 } 302 303 bool CanonicalizeScheme(const base::char16* spec, 304 const Component& scheme, 305 CanonOutput* output, 306 Component* out_scheme) { 307 return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme); 308 } 309 310 bool CanonicalizeUserInfo(const char* username_source, 311 const Component& username, 312 const char* password_source, 313 const Component& password, 314 CanonOutput* output, 315 Component* out_username, 316 Component* out_password) { 317 return DoUserInfo<char, unsigned char>( 318 username_source, username, password_source, password, 319 output, out_username, out_password); 320 } 321 322 bool CanonicalizeUserInfo(const base::char16* username_source, 323 const Component& username, 324 const base::char16* password_source, 325 const Component& password, 326 CanonOutput* output, 327 Component* out_username, 328 Component* out_password) { 329 return DoUserInfo<base::char16, base::char16>( 330 username_source, username, password_source, password, 331 output, out_username, out_password); 332 } 333 334 bool CanonicalizePort(const char* spec, 335 const Component& port, 336 int default_port_for_scheme, 337 CanonOutput* output, 338 Component* out_port) { 339 return DoPort<char, unsigned char>(spec, port, 340 default_port_for_scheme, 341 output, out_port); 342 } 343 344 bool CanonicalizePort(const base::char16* spec, 345 const Component& port, 346 int default_port_for_scheme, 347 CanonOutput* output, 348 Component* out_port) { 349 return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme, 350 output, out_port); 351 } 352 353 void CanonicalizeRef(const char* spec, 354 const Component& ref, 355 CanonOutput* output, 356 Component* out_ref) { 357 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref); 358 } 359 360 void CanonicalizeRef(const base::char16* spec, 361 const Component& ref, 362 CanonOutput* output, 363 Component* out_ref) { 364 DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref); 365 } 366 367 } // namespace url 368