Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #include "base/logging.h"
     31 #include "googleurl/src/url_canon.h"
     32 #include "googleurl/src/url_canon_internal.h"
     33 
     34 namespace url_canon {
     35 
     36 namespace {
     37 
     38 // For reference, here's what IE supports:
     39 // Key: 0 (disallowed: failure if present in the input)
     40 //      + (allowed either escaped or unescaped, and unmodified)
     41 //      U (allowed escaped or unescaped but always unescaped if present in
     42 //         escaped form)
     43 //      E (allowed escaped or unescaped but always escaped if present in
     44 //         unescaped form)
     45 //      % (only allowed escaped in the input, will be unmodified).
     46 //      I left blank alpha numeric characters.
     47 //
     48 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
     49 //    -----------------------------------------------
     50 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     51 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     52 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
     53 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
     54 // 4   %
     55 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
     56 // 6   E                                               <-- That's  `
     57 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
     58 //
     59 // NOTE: I didn't actually test all the control characters. Some may be
     60 // disallowed in the input, but they are all accepted escaped except for 0.
     61 // I also didn't test if characters affecting HTML parsing are allowed
     62 // unescaped, eg. (") or (#), which would indicate the beginning of the path.
     63 // Surprisingly, space is accepted in the input and always escaped.
     64 
// This table lists the canonical version of all characters we allow in the
// input, with 0 indicating it is disallowed. We use the magic kEsc value to
// indicate that this character should be escaped. We are a little more
// restrictive than IE, but less restrictive than Firefox.
//
// Note that we disallow the % character. We will allow it when part of an
// escape sequence, of course, but this disallows "%25". Even though IE allows
// it, allowing it would put us in a funny state. If there was an invalid
// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
// Allowing percents means we'll succeed a second time, so validity would change
// based on how many times you run the canonicalizer. We prefer to always report
// the same validity, so reject this.
// Sentinel stored in kHostCharLookup for characters that are accepted in the
// input but must be written percent-escaped to the output.
const unsigned char kEsc = 0xff;
// Lookup table indexed by 7-bit character code. Each entry is one of:
//   0     - the character is not allowed in a host name,
//   kEsc  - the character is allowed but must be percent-escaped,
//   other - the canonical form of the character (letters are lower-cased).
const unsigned char kHostCharLookup[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
     94 
// Size argument used for the temporary host buffers declared in this file.
// NOTE(review): presumably the number of characters the RawCanonOutputT
// buffers hold inline before falling back to the heap - confirm against
// url_canon.h.
const int kTempHostBufferLen = 1024;
// Temporary 8-bit output buffer of kTempHostBufferLen characters.
typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
// Temporary 16-bit output buffer of kTempHostBufferLen characters.
typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
     98 
     99 // Scans a host name and fills in the output flags according to what we find.
    100 // |has_non_ascii| will be true if there are any non-7-bit characters, and
    101 // |has_escaped| will be true if there is a percent sign.
    102 template<typename CHAR, typename UCHAR>
    103 void ScanHostname(const CHAR* spec, const url_parse::Component& host,
    104                   bool* has_non_ascii, bool* has_escaped) {
    105   int end = host.end();
    106   *has_non_ascii = false;
    107   *has_escaped = false;
    108   for (int i = host.begin; i < end; i++) {
    109     if (static_cast<UCHAR>(spec[i]) >= 0x80)
    110       *has_non_ascii = true;
    111     else if (spec[i] == '%')
    112       *has_escaped = true;
    113   }
    114 }
    115 
    116 // Canonicalizes a host name that is entirely 8-bit characters (even though
    117 // the type holding them may be 16 bits. Escaped characters will be unescaped.
    118 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
    119 //
    120 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
    121 // the output.
    122 //
    123 // This function is used in two situations:
    124 //
    125 //  * When the caller knows there is no non-ASCII or percent escaped
    126 //    characters. This is what DoHost does. The result will be a completely
    127 //    canonicalized host since we know nothing weird can happen (escaped
    128 //    characters could be unescaped to non-7-bit, so they have to be treated
    129 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
    130 //
    131 //  * When the caller has an 8-bit string that may need unescaping.
    132 //    DoComplexHost calls us this situation to do unescaping and validation.
    133 //    After this, it may do other IDN operations depending on the value of the
    134 //    |*has_non_ascii| flag.
    135 //
    136 // The return value indicates if the output is a potentially valid host name.
    137 template<typename INCHAR, typename OUTCHAR>
    138 bool DoSimpleHost(const INCHAR* host,
    139                   int host_len,
    140                   CanonOutputT<OUTCHAR>* output,
    141                   bool* has_non_ascii) {
    142   *has_non_ascii = false;
    143 
    144   bool success = true;
    145   for (int i = 0; i < host_len; ++i) {
    146     unsigned int source = host[i];
    147     if (source == '%') {
    148       // Unescape first, if possible.
    149       // Source will be used only if decode operation was successful.
    150       if (!DecodeEscaped(host, &i, host_len,
    151                          reinterpret_cast<unsigned char*>(&source))) {
    152         // Invalid escaped character. There is nothing that can make this
    153         // host valid. We append an escaped percent so the URL looks reasonable
    154         // and mark as failed.
    155         AppendEscapedChar('%', output);
    156         success = false;
    157         continue;
    158       }
    159     }
    160 
    161     if (source < 0x80) {
    162       // We have ASCII input, we can use our lookup table.
    163       unsigned char replacement = kHostCharLookup[source];
    164       if (!replacement) {
    165         // Invalid character, add it as percent-escaped and mark as failed.
    166         AppendEscapedChar(source, output);
    167         success = false;
    168       } else if (replacement == kEsc) {
    169         // This character is valid but should be escaped.
    170         AppendEscapedChar(source, output);
    171       } else {
    172         // Common case, the given character is valid in a hostname, the lookup
    173         // table tells us the canonical representation of that character (lower
    174         // cased).
    175         output->push_back(replacement);
    176       }
    177     } else {
    178       // It's a non-ascii char. Just push it to the output.
    179       // In case where we have char16 input, and char output it's safe to
    180       // cast char16->char only if input string was converted to ASCII.
    181       output->push_back(static_cast<OUTCHAR>(source));
    182       *has_non_ascii = true;
    183     }
    184   }
    185 
    186   return success;
    187 }
    188 
    189 // Canonicalizes a host that requires IDN conversion. Returns true on success
    190 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
    191   // We need to escape URL before doing IDN conversion, since punicode strings
    192   // cannot be escaped after they are created.
    193   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
    194   bool has_non_ascii;
    195   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
    196 
    197   StackBufferW wide_output;
    198   if (!IDNToASCII(url_escaped_host.data(),
    199                   url_escaped_host.length(),
    200                   &wide_output)) {
    201     // Some error, give up. This will write some reasonable looking
    202     // representation of the string to the output.
    203     AppendInvalidNarrowString(src, 0, src_len, output);
    204     return false;
    205   }
    206 
    207   // Now we check the ASCII output like a normal host. It will also handle
    208   // unescaping. Although we unescaped everything before this function call, if
    209   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
    210   bool success = DoSimpleHost(wide_output.data(),
    211                               wide_output.length(),
    212                               output, &has_non_ascii);
    213   DCHECK(!has_non_ascii);
    214   return success;
    215 }
    216 
    217 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
    218 // UTF-16. The has_escaped flag should be set if the input string requires
    219 // unescaping.
    220 bool DoComplexHost(const char* host, int host_len,
    221                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    222   // Save the current position in the output. We may write stuff and rewind it
    223   // below, so we need to know where to rewind to.
    224   int begin_length = output->length();
    225 
    226   // Points to the UTF-8 data we want to convert. This will either be the
    227   // input or the unescaped version written to |*output| if necessary.
    228   const char* utf8_source;
    229   int utf8_source_len;
    230   if (has_escaped) {
    231     // Unescape before converting to UTF-16 for IDN. We write this into the
    232     // output because it most likely does not require IDNization, and we can
    233     // save another huge stack buffer. It will be replaced below if it requires
    234     // IDN. This will also update our non-ASCII flag so we know whether the
    235     // unescaped input requires IDN.
    236     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
    237       // Error with some escape sequence. We'll call the current output
    238       // complete. DoSimpleHost will have written some "reasonable" output.
    239       return false;
    240     }
    241 
    242     // Unescaping may have left us with ASCII input, in which case the
    243     // unescaped version we wrote to output is complete.
    244     if (!has_non_ascii) {
    245       return true;
    246     }
    247 
    248     // Save the pointer into the data was just converted (it may be appended to
    249     // other data in the output buffer).
    250     utf8_source = &output->data()[begin_length];
    251     utf8_source_len = output->length() - begin_length;
    252   } else {
    253     // We don't need to unescape, use input for IDNization later. (We know the
    254     // input has non-ASCII, or the simple version would have been called
    255     // instead of us.)
    256     utf8_source = host;
    257     utf8_source_len = host_len;
    258   }
    259 
    260   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
    261   // Above, we may have used the output to write the unescaped values to, so
    262   // we have to rewind it to where we started after we convert it to UTF-16.
    263   StackBufferW utf16;
    264   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
    265     // In this error case, the input may or may not be the output.
    266     StackBuffer utf8;
    267     for (int i = 0; i < utf8_source_len; i++)
    268       utf8.push_back(utf8_source[i]);
    269     output->set_length(begin_length);
    270     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
    271     return false;
    272   }
    273   output->set_length(begin_length);
    274 
    275   // This will call DoSimpleHost which will do normal ASCII canonicalization
    276   // and also check for IP addresses in the outpt.
    277   return DoIDNHost(utf16.data(), utf16.length(), output);
    278 }
    279 
    280 // UTF-16 convert host to its ASCII version. The set up is already ready for
    281 // the backend, so we just pass through. The has_escaped flag should be set if
    282 // the input string requires unescaping.
    283 bool DoComplexHost(const char16* host, int host_len,
    284                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    285   if (has_escaped) {
    286     // Yikes, we have escaped characters with wide input. The escaped
    287     // characters should be interpreted as UTF-8. To solve this problem,
    288     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
    289     //
    290     // We don't bother to optimize the conversion in the ASCII case (which
    291     // *could* just be a copy) and use the UTF-8 path, because it should be
    292     // very rare that host names have escaped characters, and it is relatively
    293     // fast to do the conversion anyway.
    294     StackBuffer utf8;
    295     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
    296       AppendInvalidNarrowString(host, 0, host_len, output);
    297       return false;
    298     }
    299 
    300     // Once we convert to UTF-8, we can use the 8-bit version of the complex
    301     // host handling code above.
    302     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
    303                          has_escaped, output);
    304   }
    305 
    306   // No unescaping necessary, we can safely pass the input to ICU. This
    307   // function will only get called if we either have escaped or non-ascii
    308   // input, so it's safe to just use ICU now. Even if the input is ASCII,
    309   // this function will do the right thing (just slower than we could).
    310   return DoIDNHost(host, host_len, output);
    311 }
    312 
    313 template<typename CHAR, typename UCHAR>
    314 void DoHost(const CHAR* spec,
    315             const url_parse::Component& host,
    316             CanonOutput* output,
    317             CanonHostInfo* host_info) {
    318   if (host.len <= 0) {
    319     // Empty hosts don't need anything.
    320     host_info->family = CanonHostInfo::NEUTRAL;
    321     host_info->out_host = url_parse::Component();
    322     return;
    323   }
    324 
    325   bool has_non_ascii, has_escaped;
    326   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
    327 
    328   // Keep track of output's initial length, so we can rewind later.
    329   const int output_begin = output->length();
    330 
    331   bool success;
    332   if (!has_non_ascii && !has_escaped) {
    333     success = DoSimpleHost(&spec[host.begin], host.len,
    334                            output, &has_non_ascii);
    335     DCHECK(!has_non_ascii);
    336   } else {
    337     success = DoComplexHost(&spec[host.begin], host.len,
    338                             has_non_ascii, has_escaped, output);
    339   }
    340 
    341   if (!success) {
    342     // Canonicalization failed.  Set BROKEN to notify the caller.
    343     host_info->family = CanonHostInfo::BROKEN;
    344   } else {
    345     // After all the other canonicalization, check if we ended up with an IP
    346     // address.  IP addresses are small, so writing into this temporary buffer
    347     // should not cause an allocation.
    348     RawCanonOutput<64> canon_ip;
    349     CanonicalizeIPAddress(output->data(),
    350                           url_parse::MakeRange(output_begin, output->length()),
    351                           &canon_ip, host_info);
    352 
    353     // If we got an IPv4/IPv6 address, copy the canonical form back to the
    354     // real buffer.  Otherwise, it's a hostname or broken IP, in which case
    355     // we just leave it in place.
    356     if (host_info->IsIPAddress()) {
    357       output->set_length(output_begin);
    358       output->Append(canon_ip.data(), canon_ip.length());
    359     }
    360   }
    361 
    362   host_info->out_host = url_parse::MakeRange(output_begin, output->length());
    363 }
    364 
    365 }  // namespace
    366 
    367 bool CanonicalizeHost(const char* spec,
    368                       const url_parse::Component& host,
    369                       CanonOutput* output,
    370                       url_parse::Component* out_host) {
    371   CanonHostInfo host_info;
    372   DoHost<char, unsigned char>(spec, host, output, &host_info);
    373   *out_host = host_info.out_host;
    374   return (host_info.family != CanonHostInfo::BROKEN);
    375 }
    376 
    377 bool CanonicalizeHost(const char16* spec,
    378                       const url_parse::Component& host,
    379                       CanonOutput* output,
    380                       url_parse::Component* out_host) {
    381   CanonHostInfo host_info;
    382   DoHost<char16, char16>(spec, host, output, &host_info);
    383   *out_host = host_info.out_host;
    384   return (host_info.family != CanonHostInfo::BROKEN);
    385 }
    386 
    387 void CanonicalizeHostVerbose(const char* spec,
    388                              const url_parse::Component& host,
    389                              CanonOutput* output,
    390                              CanonHostInfo *host_info) {
    391   DoHost<char, unsigned char>(spec, host, output, host_info);
    392 }
    393 
    394 void CanonicalizeHostVerbose(const char16* spec,
    395                              const url_parse::Component& host,
    396                              CanonOutput* output,
    397                              CanonHostInfo *host_info) {
    398   DoHost<char16, char16>(spec, host, output, host_info);
    399 }
    400 
    401 }  // namespace url_canon
    402