Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 // Canonicalizers for random bits that aren't big enough for their own files.
     31 
     32 #include <string.h>
     33 
     34 #include "googleurl/src/url_canon.h"
     35 #include "googleurl/src/url_canon_internal.h"
     36 
     37 namespace url_canon {
     38 
     39 namespace {
     40 
     41 // Returns true if the given character should be removed from the middle of a
     42 // URL.
     43 inline bool IsRemovableURLWhitespace(int ch) {
     44   return ch == '\r' || ch == '\n' || ch == '\t';
     45 }
     46 
     47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
     48 // It sucks that we have to do this, since this takes about 13% of the total URL
     49 // canonicalization time.
     50 template<typename CHAR>
     51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
     52                                   CanonOutputT<CHAR>* buffer,
     53                                   int* output_len) {
     54   // Fast verification that there's nothing that needs removal. This is the 99%
     55   // case, so we want it to be fast and don't care about impacting the speed
     56   // when we do find whitespace.
     57   int found_whitespace = false;
     58   for (int i = 0; i < input_len; i++) {
     59     if (!IsRemovableURLWhitespace(input[i]))
     60       continue;
     61     found_whitespace = true;
     62     break;
     63   }
     64 
     65   if (!found_whitespace) {
     66     // Didn't find any whitespace, we don't need to do anything. We can just
     67     // return the input as the output.
     68     *output_len = input_len;
     69     return input;
     70   }
     71 
     72   // Remove the whitespace into the new buffer and return it.
     73   for (int i = 0; i < input_len; i++) {
     74     if (!IsRemovableURLWhitespace(input[i]))
     75       buffer->push_back(input[i]);
     76   }
     77   *output_len = buffer->length();
     78   return buffer->data();
     79 }
     80 
     81 // Contains the canonical version of each possible input letter in the scheme
     82 // (basically, lower-cased). The corresponding entry will be 0 if the letter
     83 // is not allowed in a scheme.
     84 const char kSchemeCanonical[0x80] = {
     85 // 00-1f: all are invalid
     86      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     87      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     88 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     89      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
     90 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
     91     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
     92 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     93      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     94 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
     95     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
     96 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     97      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     98 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
     99     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
    100 
    101 // This could be a table lookup as well by setting the high bit for each
    102 // valid character, but it's only called once per URL, and it makes the lookup
    103 // table easier to read not having extra stuff in it.
    104 inline bool IsSchemeFirstChar(unsigned char c) {
    105   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    106 }
    107 
    108 template<typename CHAR, typename UCHAR>
    109 bool DoScheme(const CHAR* spec,
    110               const url_parse::Component& scheme,
    111               CanonOutput* output,
    112               url_parse::Component* out_scheme) {
    113   if (scheme.len <= 0) {
    114     // Scheme is unspecified or empty, convert to empty by appending a colon.
    115     *out_scheme = url_parse::Component(output->length(), 0);
    116     output->push_back(':');
    117     return true;
    118   }
    119 
    120   // The output scheme starts from the current position.
    121   out_scheme->begin = output->length();
    122 
    123   bool success = true;
    124   int end = scheme.end();
    125   for (int i = scheme.begin; i < end; i++) {
    126     UCHAR ch = static_cast<UCHAR>(spec[i]);
    127     char replacement = 0;
    128     if (ch < 0x80) {
    129       if (i == scheme.begin) {
    130         // Need to do a special check for the first letter of the scheme.
    131         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
    132           replacement = kSchemeCanonical[ch];
    133       } else {
    134         replacement = kSchemeCanonical[ch];
    135       }
    136     }
    137 
    138     if (replacement) {
    139       output->push_back(replacement);
    140     } else if (ch == '%') {
    141       // Canonicalizing the scheme multiple times should lead to the same
    142       // result. Since invalid characters will be escaped, we need to preserve
    143       // the percent to avoid multiple escaping. The scheme will be invalid.
    144       success = false;
    145       output->push_back('%');
    146     } else {
    147       // Invalid character, store it but mark this scheme as invalid.
    148       success = false;
    149 
    150       // This will escape the output and also handle encoding issues.
    151       // Ignore the return value since we already failed.
    152       AppendUTF8EscapedChar(spec, &i, end, output);
    153     }
    154   }
    155 
    156   // The output scheme ends with the the current position, before appending
    157   // the colon.
    158   out_scheme->len = output->length() - out_scheme->begin;
    159   output->push_back(':');
    160   return success;
    161 }
    162 
    163 // The username and password components reference ranges in the corresponding
    164 // *_spec strings. Typically, these specs will be the same (we're
    165 // canonicalizing a single source string), but may be different when
    166 // replacing components.
    167 template<typename CHAR, typename UCHAR>
    168 bool DoUserInfo(const CHAR* username_spec,
    169                 const url_parse::Component& username,
    170                 const CHAR* password_spec,
    171                 const url_parse::Component& password,
    172                 CanonOutput* output,
    173                 url_parse::Component* out_username,
    174                 url_parse::Component* out_password) {
    175   if (username.len <= 0 && password.len <= 0) {
    176     // Common case: no user info. We strip empty username/passwords.
    177     *out_username = url_parse::Component();
    178     *out_password = url_parse::Component();
    179     return true;
    180   }
    181 
    182   // Write the username.
    183   out_username->begin = output->length();
    184   if (username.len > 0) {
    185     // This will escape characters not valid for the username.
    186     AppendStringOfType(&username_spec[username.begin], username.len,
    187                        CHAR_USERINFO, output);
    188   }
    189   out_username->len = output->length() - out_username->begin;
    190 
    191   // When there is a password, we need the separator. Note that we strip
    192   // empty but specified passwords.
    193   if (password.len > 0) {
    194     output->push_back(':');
    195     out_password->begin = output->length();
    196     AppendStringOfType(&password_spec[password.begin], password.len,
    197                        CHAR_USERINFO, output);
    198     out_password->len = output->length() - out_password->begin;
    199   } else {
    200     *out_password = url_parse::Component();
    201   }
    202 
    203   output->push_back('@');
    204   return true;
    205 }
    206 
    207 // Helper functions for converting port integers to strings.
    208 inline void WritePortInt(char* output, int output_len, int port) {
    209   _itoa_s(port, output, output_len, 10);
    210 }
    211 inline void WritePortInt(char16* output, int output_len, int port) {
    212   _itow_s(port, output, output_len, 10);
    213 }
    214 
    215 // This function will prepend the colon if there will be a port.
    216 template<typename CHAR, typename UCHAR>
    217 bool DoPort(const CHAR* spec,
    218             const url_parse::Component& port,
    219             int default_port_for_scheme,
    220             CanonOutput* output,
    221             url_parse::Component* out_port) {
    222   int port_num = url_parse::ParsePort(spec, port);
    223   if (port_num == url_parse::PORT_UNSPECIFIED ||
    224       port_num == default_port_for_scheme) {
    225     *out_port = url_parse::Component();
    226     return true;  // Leave port empty.
    227   }
    228 
    229   if (port_num == url_parse::PORT_INVALID) {
    230     // Invalid port: We'll copy the text from the input so the user can see
    231     // what the error was, and mark the URL as invalid by returning false.
    232     output->push_back(':');
    233     out_port->begin = output->length();
    234     AppendInvalidNarrowString(spec, port.begin, port.end(), output);
    235     out_port->len = output->length() - out_port->begin;
    236     return false;
    237   }
    238 
    239   // Convert port number back to an integer. Max port value is 5 digits, and
    240   // the Parsed::ExtractPort will have made sure the integer is in range.
    241   const int buf_size = 6;
    242   char buf[buf_size];
    243   WritePortInt(buf, buf_size, port_num);
    244 
    245   // Append the port number to the output, preceeded by a colon.
    246   output->push_back(':');
    247   out_port->begin = output->length();
    248   for (int i = 0; i < buf_size && buf[i]; i++)
    249     output->push_back(buf[i]);
    250 
    251   out_port->len = output->length() - out_port->begin;
    252   return true;
    253 }
    254 
    255 template<typename CHAR, typename UCHAR>
    256 void DoCanonicalizeRef(const CHAR* spec,
    257                        const url_parse::Component& ref,
    258                        CanonOutput* output,
    259                        url_parse::Component* out_ref) {
    260   if (ref.len < 0) {
    261     // Common case of no ref.
    262     *out_ref = url_parse::Component();
    263     return;
    264   }
    265 
    266   // Append the ref separator. Note that we need to do this even when the ref
    267   // is empty but present.
    268   output->push_back('#');
    269   out_ref->begin = output->length();
    270 
    271   // Now iterate through all the characters, converting to UTF-8 and validating.
    272   int end = ref.end();
    273   for (int i = ref.begin; i < end; i++) {
    274     if (spec[i] == 0) {
    275       // IE just strips NULLs, so we do too.
    276       continue;
    277     } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
    278       // Unline IE seems to, we escape control characters. This will probably
    279       // make the reference fragment unusable on a web page, but people
    280       // shouldn't be using control characters in their anchor names.
    281       AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
    282     } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
    283       // Normal ASCII characters are just appended.
    284       output->push_back(static_cast<char>(spec[i]));
    285     } else {
    286       // Non-ASCII characters are appended unescaped, but only when they are
    287       // valid. Invalid Unicode characters are replaced with the "invalid
    288       // character" as IE seems to.
    289       unsigned code_point;
    290       if (!ReadUTFChar(spec, &i, end, &code_point))
    291         AppendUTF8Value(kUnicodeReplacementCharacter, output);
    292       else
    293         AppendUTF8Value(code_point, output);
    294     }
    295   }
    296 
    297   out_ref->len = output->length() - out_ref->begin;
    298 }
    299 
    300 }  // namespace
    301 
    302 const char* RemoveURLWhitespace(const char* input, int input_len,
    303                                 CanonOutputT<char>* buffer,
    304                                 int* output_len) {
    305   return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
    306 }
    307 
    308 const char16* RemoveURLWhitespace(const char16* input, int input_len,
    309                                   CanonOutputT<char16>* buffer,
    310                                   int* output_len) {
    311   return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
    312 }
    313 
    314 char CanonicalSchemeChar(char16 ch) {
    315   if (ch >= 0x80)
    316     return 0;  // Non-ASCII is not supported by schemes.
    317   return kSchemeCanonical[ch];
    318 }
    319 
    320 bool CanonicalizeScheme(const char* spec,
    321                         const url_parse::Component& scheme,
    322                         CanonOutput* output,
    323                         url_parse::Component* out_scheme) {
    324   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
    325 }
    326 
    327 bool CanonicalizeScheme(const char16* spec,
    328                         const url_parse::Component& scheme,
    329                         CanonOutput* output,
    330                         url_parse::Component* out_scheme) {
    331   return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
    332 }
    333 
    334 bool CanonicalizeUserInfo(const char* username_source,
    335                           const url_parse::Component& username,
    336                           const char* password_source,
    337                           const url_parse::Component& password,
    338                           CanonOutput* output,
    339                           url_parse::Component* out_username,
    340                           url_parse::Component* out_password) {
    341   return DoUserInfo<char, unsigned char>(
    342       username_source, username, password_source, password,
    343       output, out_username, out_password);
    344 }
    345 
    346 bool CanonicalizeUserInfo(const char16* username_source,
    347                           const url_parse::Component& username,
    348                           const char16* password_source,
    349                           const url_parse::Component& password,
    350                           CanonOutput* output,
    351                           url_parse::Component* out_username,
    352                           url_parse::Component* out_password) {
    353   return DoUserInfo<char16, char16>(
    354       username_source, username, password_source, password,
    355       output, out_username, out_password);
    356 }
    357 
    358 bool CanonicalizePort(const char* spec,
    359                       const url_parse::Component& port,
    360                       int default_port_for_scheme,
    361                       CanonOutput* output,
    362                       url_parse::Component* out_port) {
    363   return DoPort<char, unsigned char>(spec, port,
    364                                      default_port_for_scheme,
    365                                      output, out_port);
    366 }
    367 
    368 bool CanonicalizePort(const char16* spec,
    369                       const url_parse::Component& port,
    370                       int default_port_for_scheme,
    371                       CanonOutput* output,
    372                       url_parse::Component* out_port) {
    373   return DoPort<char16, char16>(spec, port, default_port_for_scheme,
    374                                       output, out_port);
    375 }
    376 
    377 void CanonicalizeRef(const char* spec,
    378                      const url_parse::Component& ref,
    379                      CanonOutput* output,
    380                      url_parse::Component* out_ref) {
    381   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
    382 }
    383 
    384 void CanonicalizeRef(const char16* spec,
    385                      const url_parse::Component& ref,
    386                      CanonOutput* output,
    387                      url_parse::Component* out_ref) {
    388   DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
    389 }
    390 
    391 }  // namespace url_canon
    392