Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Canonicalizers for random bits that aren't big enough for their own files.
      6 
      7 #include <string.h>
      8 
      9 #include "url/url_canon.h"
     10 #include "url/url_canon_internal.h"
     11 
     12 namespace url_canon {
     13 
     14 namespace {
     15 
     16 // Returns true if the given character should be removed from the middle of a
     17 // URL.
     18 inline bool IsRemovableURLWhitespace(int ch) {
     19   return ch == '\r' || ch == '\n' || ch == '\t';
     20 }
     21 
     22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
     23 // It sucks that we have to do this, since this takes about 13% of the total URL
     24 // canonicalization time.
     25 template<typename CHAR>
     26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
     27                                   CanonOutputT<CHAR>* buffer,
     28                                   int* output_len) {
     29   // Fast verification that there's nothing that needs removal. This is the 99%
     30   // case, so we want it to be fast and don't care about impacting the speed
     31   // when we do find whitespace.
     32   int found_whitespace = false;
     33   for (int i = 0; i < input_len; i++) {
     34     if (!IsRemovableURLWhitespace(input[i]))
     35       continue;
     36     found_whitespace = true;
     37     break;
     38   }
     39 
     40   if (!found_whitespace) {
     41     // Didn't find any whitespace, we don't need to do anything. We can just
     42     // return the input as the output.
     43     *output_len = input_len;
     44     return input;
     45   }
     46 
     47   // Remove the whitespace into the new buffer and return it.
     48   for (int i = 0; i < input_len; i++) {
     49     if (!IsRemovableURLWhitespace(input[i]))
     50       buffer->push_back(input[i]);
     51   }
     52   *output_len = buffer->length();
     53   return buffer->data();
     54 }
     55 
     56 // Contains the canonical version of each possible input letter in the scheme
     57 // (basically, lower-cased). The corresponding entry will be 0 if the letter
     58 // is not allowed in a scheme.
     59 const char kSchemeCanonical[0x80] = {
     60 // 00-1f: all are invalid
     61      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     62      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     63 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     64      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
     65 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
     66     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
     67 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     68      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     69 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
     70     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
     71 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     72      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     73 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
     74     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
     75 
     76 // This could be a table lookup as well by setting the high bit for each
     77 // valid character, but it's only called once per URL, and it makes the lookup
     78 // table easier to read not having extra stuff in it.
     79 inline bool IsSchemeFirstChar(unsigned char c) {
     80   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
     81 }
     82 
     83 template<typename CHAR, typename UCHAR>
     84 bool DoScheme(const CHAR* spec,
     85               const url_parse::Component& scheme,
     86               CanonOutput* output,
     87               url_parse::Component* out_scheme) {
     88   if (scheme.len <= 0) {
     89     // Scheme is unspecified or empty, convert to empty by appending a colon.
     90     *out_scheme = url_parse::Component(output->length(), 0);
     91     output->push_back(':');
     92     return true;
     93   }
     94 
     95   // The output scheme starts from the current position.
     96   out_scheme->begin = output->length();
     97 
     98   // Danger: it's important that this code does not strip any characters: it
     99   // only emits the canonical version (be it valid or escaped) of each of
    100   // the input characters. Stripping would put it out of sync with
    101   // url_util::FindAndCompareScheme, which could cause some security checks on
    102   // schemes to be incorrect.
    103   bool success = true;
    104   int end = scheme.end();
    105   for (int i = scheme.begin; i < end; i++) {
    106     UCHAR ch = static_cast<UCHAR>(spec[i]);
    107     char replacement = 0;
    108     if (ch < 0x80) {
    109       if (i == scheme.begin) {
    110         // Need to do a special check for the first letter of the scheme.
    111         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
    112           replacement = kSchemeCanonical[ch];
    113       } else {
    114         replacement = kSchemeCanonical[ch];
    115       }
    116     }
    117 
    118     if (replacement) {
    119       output->push_back(replacement);
    120     } else if (ch == '%') {
    121       // Canonicalizing the scheme multiple times should lead to the same
    122       // result. Since invalid characters will be escaped, we need to preserve
    123       // the percent to avoid multiple escaping. The scheme will be invalid.
    124       success = false;
    125       output->push_back('%');
    126     } else {
    127       // Invalid character, store it but mark this scheme as invalid.
    128       success = false;
    129 
    130       // This will escape the output and also handle encoding issues.
    131       // Ignore the return value since we already failed.
    132       AppendUTF8EscapedChar(spec, &i, end, output);
    133     }
    134   }
    135 
    136   // The output scheme ends with the the current position, before appending
    137   // the colon.
    138   out_scheme->len = output->length() - out_scheme->begin;
    139   output->push_back(':');
    140   return success;
    141 }
    142 
    143 // The username and password components reference ranges in the corresponding
    144 // *_spec strings. Typically, these specs will be the same (we're
    145 // canonicalizing a single source string), but may be different when
    146 // replacing components.
    147 template<typename CHAR, typename UCHAR>
    148 bool DoUserInfo(const CHAR* username_spec,
    149                 const url_parse::Component& username,
    150                 const CHAR* password_spec,
    151                 const url_parse::Component& password,
    152                 CanonOutput* output,
    153                 url_parse::Component* out_username,
    154                 url_parse::Component* out_password) {
    155   if (username.len <= 0 && password.len <= 0) {
    156     // Common case: no user info. We strip empty username/passwords.
    157     *out_username = url_parse::Component();
    158     *out_password = url_parse::Component();
    159     return true;
    160   }
    161 
    162   // Write the username.
    163   out_username->begin = output->length();
    164   if (username.len > 0) {
    165     // This will escape characters not valid for the username.
    166     AppendStringOfType(&username_spec[username.begin], username.len,
    167                        CHAR_USERINFO, output);
    168   }
    169   out_username->len = output->length() - out_username->begin;
    170 
    171   // When there is a password, we need the separator. Note that we strip
    172   // empty but specified passwords.
    173   if (password.len > 0) {
    174     output->push_back(':');
    175     out_password->begin = output->length();
    176     AppendStringOfType(&password_spec[password.begin], password.len,
    177                        CHAR_USERINFO, output);
    178     out_password->len = output->length() - out_password->begin;
    179   } else {
    180     *out_password = url_parse::Component();
    181   }
    182 
    183   output->push_back('@');
    184   return true;
    185 }
    186 
    187 // Helper functions for converting port integers to strings.
    188 inline void WritePortInt(char* output, int output_len, int port) {
    189   _itoa_s(port, output, output_len, 10);
    190 }
    191 
    192 // This function will prepend the colon if there will be a port.
    193 template<typename CHAR, typename UCHAR>
    194 bool DoPort(const CHAR* spec,
    195             const url_parse::Component& port,
    196             int default_port_for_scheme,
    197             CanonOutput* output,
    198             url_parse::Component* out_port) {
    199   int port_num = url_parse::ParsePort(spec, port);
    200   if (port_num == url_parse::PORT_UNSPECIFIED ||
    201       port_num == default_port_for_scheme) {
    202     *out_port = url_parse::Component();
    203     return true;  // Leave port empty.
    204   }
    205 
    206   if (port_num == url_parse::PORT_INVALID) {
    207     // Invalid port: We'll copy the text from the input so the user can see
    208     // what the error was, and mark the URL as invalid by returning false.
    209     output->push_back(':');
    210     out_port->begin = output->length();
    211     AppendInvalidNarrowString(spec, port.begin, port.end(), output);
    212     out_port->len = output->length() - out_port->begin;
    213     return false;
    214   }
    215 
    216   // Convert port number back to an integer. Max port value is 5 digits, and
    217   // the Parsed::ExtractPort will have made sure the integer is in range.
    218   const int buf_size = 6;
    219   char buf[buf_size];
    220   WritePortInt(buf, buf_size, port_num);
    221 
    222   // Append the port number to the output, preceeded by a colon.
    223   output->push_back(':');
    224   out_port->begin = output->length();
    225   for (int i = 0; i < buf_size && buf[i]; i++)
    226     output->push_back(buf[i]);
    227 
    228   out_port->len = output->length() - out_port->begin;
    229   return true;
    230 }
    231 
    232 template<typename CHAR, typename UCHAR>
    233 void DoCanonicalizeRef(const CHAR* spec,
    234                        const url_parse::Component& ref,
    235                        CanonOutput* output,
    236                        url_parse::Component* out_ref) {
    237   if (ref.len < 0) {
    238     // Common case of no ref.
    239     *out_ref = url_parse::Component();
    240     return;
    241   }
    242 
    243   // Append the ref separator. Note that we need to do this even when the ref
    244   // is empty but present.
    245   output->push_back('#');
    246   out_ref->begin = output->length();
    247 
    248   // Now iterate through all the characters, converting to UTF-8 and validating.
    249   int end = ref.end();
    250   for (int i = ref.begin; i < end; i++) {
    251     if (spec[i] == 0) {
    252       // IE just strips NULLs, so we do too.
    253       continue;
    254     } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
    255       // Unline IE seems to, we escape control characters. This will probably
    256       // make the reference fragment unusable on a web page, but people
    257       // shouldn't be using control characters in their anchor names.
    258       AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
    259     } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
    260       // Normal ASCII characters are just appended.
    261       output->push_back(static_cast<char>(spec[i]));
    262     } else {
    263       // Non-ASCII characters are appended unescaped, but only when they are
    264       // valid. Invalid Unicode characters are replaced with the "invalid
    265       // character" as IE seems to (ReadUTFChar puts the unicode replacement
    266       // character in the output on failure for us).
    267       unsigned code_point;
    268       ReadUTFChar(spec, &i, end, &code_point);
    269       AppendUTF8Value(code_point, output);
    270     }
    271   }
    272 
    273   out_ref->len = output->length() - out_ref->begin;
    274 }
    275 
    276 }  // namespace
    277 
    278 const char* RemoveURLWhitespace(const char* input, int input_len,
    279                                 CanonOutputT<char>* buffer,
    280                                 int* output_len) {
    281   return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
    282 }
    283 
    284 const base::char16* RemoveURLWhitespace(const base::char16* input,
    285                                         int input_len,
    286                                         CanonOutputT<base::char16>* buffer,
    287                                         int* output_len) {
    288   return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
    289 }
    290 
    291 char CanonicalSchemeChar(base::char16 ch) {
    292   if (ch >= 0x80)
    293     return 0;  // Non-ASCII is not supported by schemes.
    294   return kSchemeCanonical[ch];
    295 }
    296 
    297 bool CanonicalizeScheme(const char* spec,
    298                         const url_parse::Component& scheme,
    299                         CanonOutput* output,
    300                         url_parse::Component* out_scheme) {
    301   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
    302 }
    303 
    304 bool CanonicalizeScheme(const base::char16* spec,
    305                         const url_parse::Component& scheme,
    306                         CanonOutput* output,
    307                         url_parse::Component* out_scheme) {
    308   return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
    309 }
    310 
    311 bool CanonicalizeUserInfo(const char* username_source,
    312                           const url_parse::Component& username,
    313                           const char* password_source,
    314                           const url_parse::Component& password,
    315                           CanonOutput* output,
    316                           url_parse::Component* out_username,
    317                           url_parse::Component* out_password) {
    318   return DoUserInfo<char, unsigned char>(
    319       username_source, username, password_source, password,
    320       output, out_username, out_password);
    321 }
    322 
    323 bool CanonicalizeUserInfo(const base::char16* username_source,
    324                           const url_parse::Component& username,
    325                           const base::char16* password_source,
    326                           const url_parse::Component& password,
    327                           CanonOutput* output,
    328                           url_parse::Component* out_username,
    329                           url_parse::Component* out_password) {
    330   return DoUserInfo<base::char16, base::char16>(
    331       username_source, username, password_source, password,
    332       output, out_username, out_password);
    333 }
    334 
    335 bool CanonicalizePort(const char* spec,
    336                       const url_parse::Component& port,
    337                       int default_port_for_scheme,
    338                       CanonOutput* output,
    339                       url_parse::Component* out_port) {
    340   return DoPort<char, unsigned char>(spec, port,
    341                                      default_port_for_scheme,
    342                                      output, out_port);
    343 }
    344 
    345 bool CanonicalizePort(const base::char16* spec,
    346                       const url_parse::Component& port,
    347                       int default_port_for_scheme,
    348                       CanonOutput* output,
    349                       url_parse::Component* out_port) {
    350   return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
    351                                             output, out_port);
    352 }
    353 
    354 void CanonicalizeRef(const char* spec,
    355                      const url_parse::Component& ref,
    356                      CanonOutput* output,
    357                      url_parse::Component* out_ref) {
    358   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
    359 }
    360 
    361 void CanonicalizeRef(const base::char16* spec,
    362                      const url_parse::Component& ref,
    363                      CanonOutput* output,
    364                      url_parse::Component* out_ref) {
    365   DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
    366 }
    367 
    368 }  // namespace url_canon
    369