Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 // Canonicalizers for random bits that aren't big enough for their own files.
     31 
     32 #include <string.h>
     33 
     34 #include "googleurl/src/url_canon.h"
     35 #include "googleurl/src/url_canon_internal.h"
     36 
     37 namespace url_canon {
     38 
     39 namespace {
     40 
     41 // Returns true if the given character should be removed from the middle of a
     42 // URL.
     43 inline bool IsRemovableURLWhitespace(int ch) {
     44   return ch == '\r' || ch == '\n' || ch == '\t';
     45 }
     46 
     47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
     48 // It sucks that we have to do this, since this takes about 13% of the total URL
     49 // canonicalization time.
     50 template<typename CHAR>
     51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
     52                                   CanonOutputT<CHAR>* buffer,
     53                                   int* output_len) {
     54   // Fast verification that there's nothing that needs removal. This is the 99%
     55   // case, so we want it to be fast and don't care about impacting the speed
     56   // when we do find whitespace.
     57   int found_whitespace = false;
     58   for (int i = 0; i < input_len; i++) {
     59     if (!IsRemovableURLWhitespace(input[i]))
     60       continue;
     61     found_whitespace = true;
     62     break;
     63   }
     64 
     65   if (!found_whitespace) {
     66     // Didn't find any whitespace, we don't need to do anything. We can just
     67     // return the input as the output.
     68     *output_len = input_len;
     69     return input;
     70   }
     71 
     72   // Remove the whitespace into the new buffer and return it.
     73   for (int i = 0; i < input_len; i++) {
     74     if (!IsRemovableURLWhitespace(input[i]))
     75       buffer->push_back(input[i]);
     76   }
     77   *output_len = buffer->length();
     78   return buffer->data();
     79 }
     80 
// Lookup table, indexed by 7-bit ASCII code, holding the canonical version of
// each possible input character in a scheme (basically, lower-cased). Letters
// map to their lower-case form; digits, '+', '-' and '.' map to themselves.
// The corresponding entry is 0 if the character is not allowed in a scheme.
// The table has exactly 0x80 entries, so callers must verify that a value is
// below 0x80 before using it as an index.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
    100 
    101 // This could be a table lookup as well by setting the high bit for each
    102 // valid character, but it's only called once per URL, and it makes the lookup
    103 // table easier to read not having extra stuff in it.
    104 inline bool IsSchemeFirstChar(unsigned char c) {
    105   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    106 }
    107 
    108 template<typename CHAR, typename UCHAR>
    109 bool DoScheme(const CHAR* spec,
    110               const url_parse::Component& scheme,
    111               CanonOutput* output,
    112               url_parse::Component* out_scheme) {
    113   if (scheme.len <= 0) {
    114     // Scheme is unspecified or empty, convert to empty by appending a colon.
    115     *out_scheme = url_parse::Component(output->length(), 0);
    116     output->push_back(':');
    117     return true;
    118   }
    119 
    120   // The output scheme starts from the current position.
    121   out_scheme->begin = output->length();
    122 
    123   // Danger: it's important that this code does not strip any characters: it
    124   // only emits the canonical version (be it valid or escaped) of each of
    125   // the input characters. Stripping would put it out of sync with
    126   // url_util::FindAndCompareScheme, which could cause some security checks on
    127   // schemes to be incorrect.
    128   bool success = true;
    129   int end = scheme.end();
    130   for (int i = scheme.begin; i < end; i++) {
    131     UCHAR ch = static_cast<UCHAR>(spec[i]);
    132     char replacement = 0;
    133     if (ch < 0x80) {
    134       if (i == scheme.begin) {
    135         // Need to do a special check for the first letter of the scheme.
    136         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
    137           replacement = kSchemeCanonical[ch];
    138       } else {
    139         replacement = kSchemeCanonical[ch];
    140       }
    141     }
    142 
    143     if (replacement) {
    144       output->push_back(replacement);
    145     } else if (ch == '%') {
    146       // Canonicalizing the scheme multiple times should lead to the same
    147       // result. Since invalid characters will be escaped, we need to preserve
    148       // the percent to avoid multiple escaping. The scheme will be invalid.
    149       success = false;
    150       output->push_back('%');
    151     } else {
    152       // Invalid character, store it but mark this scheme as invalid.
    153       success = false;
    154 
    155       // This will escape the output and also handle encoding issues.
    156       // Ignore the return value since we already failed.
    157       AppendUTF8EscapedChar(spec, &i, end, output);
    158     }
    159   }
    160 
    161   // The output scheme ends with the the current position, before appending
    162   // the colon.
    163   out_scheme->len = output->length() - out_scheme->begin;
    164   output->push_back(':');
    165   return success;
    166 }
    167 
    168 // The username and password components reference ranges in the corresponding
    169 // *_spec strings. Typically, these specs will be the same (we're
    170 // canonicalizing a single source string), but may be different when
    171 // replacing components.
    172 template<typename CHAR, typename UCHAR>
    173 bool DoUserInfo(const CHAR* username_spec,
    174                 const url_parse::Component& username,
    175                 const CHAR* password_spec,
    176                 const url_parse::Component& password,
    177                 CanonOutput* output,
    178                 url_parse::Component* out_username,
    179                 url_parse::Component* out_password) {
    180   if (username.len <= 0 && password.len <= 0) {
    181     // Common case: no user info. We strip empty username/passwords.
    182     *out_username = url_parse::Component();
    183     *out_password = url_parse::Component();
    184     return true;
    185   }
    186 
    187   // Write the username.
    188   out_username->begin = output->length();
    189   if (username.len > 0) {
    190     // This will escape characters not valid for the username.
    191     AppendStringOfType(&username_spec[username.begin], username.len,
    192                        CHAR_USERINFO, output);
    193   }
    194   out_username->len = output->length() - out_username->begin;
    195 
    196   // When there is a password, we need the separator. Note that we strip
    197   // empty but specified passwords.
    198   if (password.len > 0) {
    199     output->push_back(':');
    200     out_password->begin = output->length();
    201     AppendStringOfType(&password_spec[password.begin], password.len,
    202                        CHAR_USERINFO, output);
    203     out_password->len = output->length() - out_password->begin;
    204   } else {
    205     *out_password = url_parse::Component();
    206   }
    207 
    208   output->push_back('@');
    209   return true;
    210 }
    211 
// Helper function for converting port integers to strings. Passes |port|,
// the destination |output|, its capacity |output_len| and radix 10 to
// _itoa_s; any status that call reports is ignored.
// NOTE(review): _itoa_s is not declared in this file; presumably it comes from
// the platform C runtime or from a shim in an included header -- confirm.
inline void WritePortInt(char* output, int output_len, int port) {
  _itoa_s(port, output, output_len, 10);
}
    216 
    217 // This function will prepend the colon if there will be a port.
    218 template<typename CHAR, typename UCHAR>
    219 bool DoPort(const CHAR* spec,
    220             const url_parse::Component& port,
    221             int default_port_for_scheme,
    222             CanonOutput* output,
    223             url_parse::Component* out_port) {
    224   int port_num = url_parse::ParsePort(spec, port);
    225   if (port_num == url_parse::PORT_UNSPECIFIED ||
    226       port_num == default_port_for_scheme) {
    227     *out_port = url_parse::Component();
    228     return true;  // Leave port empty.
    229   }
    230 
    231   if (port_num == url_parse::PORT_INVALID) {
    232     // Invalid port: We'll copy the text from the input so the user can see
    233     // what the error was, and mark the URL as invalid by returning false.
    234     output->push_back(':');
    235     out_port->begin = output->length();
    236     AppendInvalidNarrowString(spec, port.begin, port.end(), output);
    237     out_port->len = output->length() - out_port->begin;
    238     return false;
    239   }
    240 
    241   // Convert port number back to an integer. Max port value is 5 digits, and
    242   // the Parsed::ExtractPort will have made sure the integer is in range.
    243   const int buf_size = 6;
    244   char buf[buf_size];
    245   WritePortInt(buf, buf_size, port_num);
    246 
    247   // Append the port number to the output, preceeded by a colon.
    248   output->push_back(':');
    249   out_port->begin = output->length();
    250   for (int i = 0; i < buf_size && buf[i]; i++)
    251     output->push_back(buf[i]);
    252 
    253   out_port->len = output->length() - out_port->begin;
    254   return true;
    255 }
    256 
    257 template<typename CHAR, typename UCHAR>
    258 void DoCanonicalizeRef(const CHAR* spec,
    259                        const url_parse::Component& ref,
    260                        CanonOutput* output,
    261                        url_parse::Component* out_ref) {
    262   if (ref.len < 0) {
    263     // Common case of no ref.
    264     *out_ref = url_parse::Component();
    265     return;
    266   }
    267 
    268   // Append the ref separator. Note that we need to do this even when the ref
    269   // is empty but present.
    270   output->push_back('#');
    271   out_ref->begin = output->length();
    272 
    273   // Now iterate through all the characters, converting to UTF-8 and validating.
    274   int end = ref.end();
    275   for (int i = ref.begin; i < end; i++) {
    276     if (spec[i] == 0) {
    277       // IE just strips NULLs, so we do too.
    278       continue;
    279     } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
    280       // Unline IE seems to, we escape control characters. This will probably
    281       // make the reference fragment unusable on a web page, but people
    282       // shouldn't be using control characters in their anchor names.
    283       AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
    284     } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
    285       // Normal ASCII characters are just appended.
    286       output->push_back(static_cast<char>(spec[i]));
    287     } else {
    288       // Non-ASCII characters are appended unescaped, but only when they are
    289       // valid. Invalid Unicode characters are replaced with the "invalid
    290       // character" as IE seems to (ReadUTFChar puts the unicode replacement
    291       // character in the output on failure for us).
    292       unsigned code_point;
    293       ReadUTFChar(spec, &i, end, &code_point);
    294       AppendUTF8Value(code_point, output);
    295     }
    296   }
    297 
    298   out_ref->len = output->length() - out_ref->begin;
    299 }
    300 
    301 }  // namespace
    302 
// 8-bit (char) overload of RemoveURLWhitespace. Forwards all arguments
// unchanged to the DoRemoveURLWhitespace template and returns its result.
const char* RemoveURLWhitespace(const char* input, int input_len,
                                CanonOutputT<char>* buffer,
                                int* output_len) {
  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
    308 
// char16 overload of RemoveURLWhitespace. Forwards all arguments unchanged to
// the DoRemoveURLWhitespace template and returns its result.
const char16* RemoveURLWhitespace(const char16* input, int input_len,
                                  CanonOutputT<char16>* buffer,
                                  int* output_len) {
  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
    314 
    315 char CanonicalSchemeChar(char16 ch) {
    316   if (ch >= 0x80)
    317     return 0;  // Non-ASCII is not supported by schemes.
    318   return kSchemeCanonical[ch];
    319 }
    320 
// 8-bit (char) overload of CanonicalizeScheme. Instantiates DoScheme with
// CHAR = char and UCHAR = unsigned char, and returns that call's result
// unchanged.
bool CanonicalizeScheme(const char* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme) {
  return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
}
    327 
// char16 overload of CanonicalizeScheme. Instantiates DoScheme with both CHAR
// and UCHAR set to char16, and returns that call's result unchanged.
bool CanonicalizeScheme(const char16* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme) {
  return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
}
    334 
// 8-bit (char) overload of CanonicalizeUserInfo. |username| and |password|
// are ranges into |username_source| and |password_source| respectively.
// Instantiates DoUserInfo with CHAR = char and UCHAR = unsigned char, and
// returns that call's result unchanged.
bool CanonicalizeUserInfo(const char* username_source,
                          const url_parse::Component& username,
                          const char* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password) {
  return DoUserInfo<char, unsigned char>(
      username_source, username, password_source, password,
      output, out_username, out_password);
}
    346 
// char16 overload of CanonicalizeUserInfo. |username| and |password| are
// ranges into |username_source| and |password_source| respectively.
// Instantiates DoUserInfo with both CHAR and UCHAR set to char16, and returns
// that call's result unchanged.
bool CanonicalizeUserInfo(const char16* username_source,
                          const url_parse::Component& username,
                          const char16* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password) {
  return DoUserInfo<char16, char16>(
      username_source, username, password_source, password,
      output, out_username, out_password);
}
    358 
// 8-bit (char) overload of CanonicalizePort. Instantiates DoPort with
// CHAR = char and UCHAR = unsigned char, and returns that call's result
// unchanged.
bool CanonicalizePort(const char* spec,
                      const url_parse::Component& port,
                      int default_port_for_scheme,
                      CanonOutput* output,
                      url_parse::Component* out_port) {
  return DoPort<char, unsigned char>(spec, port,
                                     default_port_for_scheme,
                                     output, out_port);
}
    368 
// char16 overload of CanonicalizePort. Instantiates DoPort with both CHAR and
// UCHAR set to char16, and returns that call's result unchanged.
bool CanonicalizePort(const char16* spec,
                      const url_parse::Component& port,
                      int default_port_for_scheme,
                      CanonOutput* output,
                      url_parse::Component* out_port) {
  return DoPort<char16, char16>(spec, port, default_port_for_scheme,
                                      output, out_port);
}
    377 
// 8-bit (char) overload of CanonicalizeRef. Instantiates DoCanonicalizeRef
// with CHAR = char and UCHAR = unsigned char.
void CanonicalizeRef(const char* spec,
                     const url_parse::Component& ref,
                     CanonOutput* output,
                     url_parse::Component* out_ref) {
  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
}
    384 
// char16 overload of CanonicalizeRef. Instantiates DoCanonicalizeRef with both
// CHAR and UCHAR set to char16.
void CanonicalizeRef(const char16* spec,
                     const url_parse::Component& ref,
                     CanonOutput* output,
                     url_parse::Component* out_ref) {
  DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
}
    391 
    392 }  // namespace url_canon
    393