1 // Copyright 2007, Google Inc. 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are 6 // met: 7 // 8 // * Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // * Redistributions in binary form must reproduce the above 11 // copyright notice, this list of conditions and the following disclaimer 12 // in the documentation and/or other materials provided with the 13 // distribution. 14 // * Neither the name of Google Inc. nor the names of its 15 // contributors may be used to endorse or promote products derived from 16 // this software without specific prior written permission. 17 // 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30 // Canonicalizers for random bits that aren't big enough for their own files. 31 32 #include <string.h> 33 34 #include "googleurl/src/url_canon.h" 35 #include "googleurl/src/url_canon_internal.h" 36 37 namespace url_canon { 38 39 namespace { 40 41 // Returns true if the given character should be removed from the middle of a 42 // URL. 43 inline bool IsRemovableURLWhitespace(int ch) { 44 return ch == '\r' || ch == '\n' || ch == '\t'; 45 } 46 47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h). 48 // It sucks that we have to do this, since this takes about 13% of the total URL 49 // canonicalization time. 50 template<typename CHAR> 51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, 52 CanonOutputT<CHAR>* buffer, 53 int* output_len) { 54 // Fast verification that there's nothing that needs removal. This is the 99% 55 // case, so we want it to be fast and don't care about impacting the speed 56 // when we do find whitespace. 57 int found_whitespace = false; 58 for (int i = 0; i < input_len; i++) { 59 if (!IsRemovableURLWhitespace(input[i])) 60 continue; 61 found_whitespace = true; 62 break; 63 } 64 65 if (!found_whitespace) { 66 // Didn't find any whitespace, we don't need to do anything. We can just 67 // return the input as the output. 68 *output_len = input_len; 69 return input; 70 } 71 72 // Remove the whitespace into the new buffer and return it. 73 for (int i = 0; i < input_len; i++) { 74 if (!IsRemovableURLWhitespace(input[i])) 75 buffer->push_back(input[i]); 76 } 77 *output_len = buffer->length(); 78 return buffer->data(); 79 } 80 81 // Contains the canonical version of each possible input letter in the scheme 82 // (basically, lower-cased). The corresponding entry will be 0 if the letter 83 // is not allowed in a scheme. 84 const char kSchemeCanonical[0x80] = { 85 // 00-1f: all are invalid 86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88 // ' ' ! " # $ % & ' ( ) * + , - . / 89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, 90 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 91 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , 92 // @ A B C D E F G H I J K L M N O 93 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 94 // P Q R S T U V W X Y Z [ \ ] ^ _ 95 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, 96 // ` a b c d e f g h i j k l m n o 97 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 98 // p q r s t u v w x y z { | } ~ 99 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; 100 101 // This could be a table lookup as well by setting the high bit for each 102 // valid character, but it's only called once per URL, and it makes the lookup 103 // table easier to read not having extra stuff in it. 104 inline bool IsSchemeFirstChar(unsigned char c) { 105 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 106 } 107 108 template<typename CHAR, typename UCHAR> 109 bool DoScheme(const CHAR* spec, 110 const url_parse::Component& scheme, 111 CanonOutput* output, 112 url_parse::Component* out_scheme) { 113 if (scheme.len <= 0) { 114 // Scheme is unspecified or empty, convert to empty by appending a colon. 115 *out_scheme = url_parse::Component(output->length(), 0); 116 output->push_back(':'); 117 return true; 118 } 119 120 // The output scheme starts from the current position. 121 out_scheme->begin = output->length(); 122 123 bool success = true; 124 int end = scheme.end(); 125 for (int i = scheme.begin; i < end; i++) { 126 UCHAR ch = static_cast<UCHAR>(spec[i]); 127 char replacement = 0; 128 if (ch < 0x80) { 129 if (i == scheme.begin) { 130 // Need to do a special check for the first letter of the scheme. 131 if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) 132 replacement = kSchemeCanonical[ch]; 133 } else { 134 replacement = kSchemeCanonical[ch]; 135 } 136 } 137 138 if (replacement) { 139 output->push_back(replacement); 140 } else if (ch == '%') { 141 // Canonicalizing the scheme multiple times should lead to the same 142 // result. Since invalid characters will be escaped, we need to preserve 143 // the percent to avoid multiple escaping. The scheme will be invalid. 144 success = false; 145 output->push_back('%'); 146 } else { 147 // Invalid character, store it but mark this scheme as invalid. 148 success = false; 149 150 // This will escape the output and also handle encoding issues. 151 // Ignore the return value since we already failed. 152 AppendUTF8EscapedChar(spec, &i, end, output); 153 } 154 } 155 156 // The output scheme ends with the the current position, before appending 157 // the colon. 158 out_scheme->len = output->length() - out_scheme->begin; 159 output->push_back(':'); 160 return success; 161 } 162 163 // The username and password components reference ranges in the corresponding 164 // *_spec strings. Typically, these specs will be the same (we're 165 // canonicalizing a single source string), but may be different when 166 // replacing components. 167 template<typename CHAR, typename UCHAR> 168 bool DoUserInfo(const CHAR* username_spec, 169 const url_parse::Component& username, 170 const CHAR* password_spec, 171 const url_parse::Component& password, 172 CanonOutput* output, 173 url_parse::Component* out_username, 174 url_parse::Component* out_password) { 175 if (username.len <= 0 && password.len <= 0) { 176 // Common case: no user info. We strip empty username/passwords. 177 *out_username = url_parse::Component(); 178 *out_password = url_parse::Component(); 179 return true; 180 } 181 182 // Write the username. 183 out_username->begin = output->length(); 184 if (username.len > 0) { 185 // This will escape characters not valid for the username. 186 AppendStringOfType(&username_spec[username.begin], username.len, 187 CHAR_USERINFO, output); 188 } 189 out_username->len = output->length() - out_username->begin; 190 191 // When there is a password, we need the separator. Note that we strip 192 // empty but specified passwords. 193 if (password.len > 0) { 194 output->push_back(':'); 195 out_password->begin = output->length(); 196 AppendStringOfType(&password_spec[password.begin], password.len, 197 CHAR_USERINFO, output); 198 out_password->len = output->length() - out_password->begin; 199 } else { 200 *out_password = url_parse::Component(); 201 } 202 203 output->push_back('@'); 204 return true; 205 } 206 207 // Helper functions for converting port integers to strings. 208 inline void WritePortInt(char* output, int output_len, int port) { 209 _itoa_s(port, output, output_len, 10); 210 } 211 inline void WritePortInt(char16* output, int output_len, int port) { 212 _itow_s(port, output, output_len, 10); 213 } 214 215 // This function will prepend the colon if there will be a port. 216 template<typename CHAR, typename UCHAR> 217 bool DoPort(const CHAR* spec, 218 const url_parse::Component& port, 219 int default_port_for_scheme, 220 CanonOutput* output, 221 url_parse::Component* out_port) { 222 int port_num = url_parse::ParsePort(spec, port); 223 if (port_num == url_parse::PORT_UNSPECIFIED || 224 port_num == default_port_for_scheme) { 225 *out_port = url_parse::Component(); 226 return true; // Leave port empty. 227 } 228 229 if (port_num == url_parse::PORT_INVALID) { 230 // Invalid port: We'll copy the text from the input so the user can see 231 // what the error was, and mark the URL as invalid by returning false. 232 output->push_back(':'); 233 out_port->begin = output->length(); 234 AppendInvalidNarrowString(spec, port.begin, port.end(), output); 235 out_port->len = output->length() - out_port->begin; 236 return false; 237 } 238 239 // Convert port number back to an integer. Max port value is 5 digits, and 240 // the Parsed::ExtractPort will have made sure the integer is in range. 241 const int buf_size = 6; 242 char buf[buf_size]; 243 WritePortInt(buf, buf_size, port_num); 244 245 // Append the port number to the output, preceeded by a colon. 246 output->push_back(':'); 247 out_port->begin = output->length(); 248 for (int i = 0; i < buf_size && buf[i]; i++) 249 output->push_back(buf[i]); 250 251 out_port->len = output->length() - out_port->begin; 252 return true; 253 } 254 255 template<typename CHAR, typename UCHAR> 256 void DoCanonicalizeRef(const CHAR* spec, 257 const url_parse::Component& ref, 258 CanonOutput* output, 259 url_parse::Component* out_ref) { 260 if (ref.len < 0) { 261 // Common case of no ref. 262 *out_ref = url_parse::Component(); 263 return; 264 } 265 266 // Append the ref separator. Note that we need to do this even when the ref 267 // is empty but present. 268 output->push_back('#'); 269 out_ref->begin = output->length(); 270 271 // Now iterate through all the characters, converting to UTF-8 and validating. 272 int end = ref.end(); 273 for (int i = ref.begin; i < end; i++) { 274 if (spec[i] == 0) { 275 // IE just strips NULLs, so we do too. 276 continue; 277 } else if (static_cast<UCHAR>(spec[i]) < 0x20) { 278 // Unline IE seems to, we escape control characters. This will probably 279 // make the reference fragment unusable on a web page, but people 280 // shouldn't be using control characters in their anchor names. 281 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); 282 } else if (static_cast<UCHAR>(spec[i]) < 0x80) { 283 // Normal ASCII characters are just appended. 284 output->push_back(static_cast<char>(spec[i])); 285 } else { 286 // Non-ASCII characters are appended unescaped, but only when they are 287 // valid. Invalid Unicode characters are replaced with the "invalid 288 // character" as IE seems to. 289 unsigned code_point; 290 if (!ReadUTFChar(spec, &i, end, &code_point)) 291 AppendUTF8Value(kUnicodeReplacementCharacter, output); 292 else 293 AppendUTF8Value(code_point, output); 294 } 295 } 296 297 out_ref->len = output->length() - out_ref->begin; 298 } 299 300 } // namespace 301 302 const char* RemoveURLWhitespace(const char* input, int input_len, 303 CanonOutputT<char>* buffer, 304 int* output_len) { 305 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 306 } 307 308 const char16* RemoveURLWhitespace(const char16* input, int input_len, 309 CanonOutputT<char16>* buffer, 310 int* output_len) { 311 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 312 } 313 314 char CanonicalSchemeChar(char16 ch) { 315 if (ch >= 0x80) 316 return 0; // Non-ASCII is not supported by schemes. 317 return kSchemeCanonical[ch]; 318 } 319 320 bool CanonicalizeScheme(const char* spec, 321 const url_parse::Component& scheme, 322 CanonOutput* output, 323 url_parse::Component* out_scheme) { 324 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); 325 } 326 327 bool CanonicalizeScheme(const char16* spec, 328 const url_parse::Component& scheme, 329 CanonOutput* output, 330 url_parse::Component* out_scheme) { 331 return DoScheme<char16, char16>(spec, scheme, output, out_scheme); 332 } 333 334 bool CanonicalizeUserInfo(const char* username_source, 335 const url_parse::Component& username, 336 const char* password_source, 337 const url_parse::Component& password, 338 CanonOutput* output, 339 url_parse::Component* out_username, 340 url_parse::Component* out_password) { 341 return DoUserInfo<char, unsigned char>( 342 username_source, username, password_source, password, 343 output, out_username, out_password); 344 } 345 346 bool CanonicalizeUserInfo(const char16* username_source, 347 const url_parse::Component& username, 348 const char16* password_source, 349 const url_parse::Component& password, 350 CanonOutput* output, 351 url_parse::Component* out_username, 352 url_parse::Component* out_password) { 353 return DoUserInfo<char16, char16>( 354 username_source, username, password_source, password, 355 output, out_username, out_password); 356 } 357 358 bool CanonicalizePort(const char* spec, 359 const url_parse::Component& port, 360 int default_port_for_scheme, 361 CanonOutput* output, 362 url_parse::Component* out_port) { 363 return DoPort<char, unsigned char>(spec, port, 364 default_port_for_scheme, 365 output, out_port); 366 } 367 368 bool CanonicalizePort(const char16* spec, 369 const url_parse::Component& port, 370 int default_port_for_scheme, 371 CanonOutput* output, 372 url_parse::Component* out_port) { 373 return DoPort<char16, char16>(spec, port, default_port_for_scheme, 374 output, out_port); 375 } 376 377 void CanonicalizeRef(const char* spec, 378 const url_parse::Component& ref, 379 CanonOutput* output, 380 url_parse::Component* out_ref) { 381 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref); 382 } 383 384 void CanonicalizeRef(const char16* spec, 385 const url_parse::Component& ref, 386 CanonOutput* output, 387 url_parse::Component* out_ref) { 388 DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref); 389 } 390 391 } // namespace url_canon 392