Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/logging.h"
      6 #include "url/url_canon.h"
      7 #include "url/url_canon_internal.h"
      8 
      9 namespace url_canon {
     10 
     11 namespace {
     12 
     13 // For reference, here's what IE supports:
     14 // Key: 0 (disallowed: failure if present in the input)
     15 //      + (allowed either escaped or unescaped, and unmodified)
     16 //      U (allowed escaped or unescaped but always unescaped if present in
     17 //         escaped form)
     18 //      E (allowed escaped or unescaped but always escaped if present in
     19 //         unescaped form)
     20 //      % (only allowed escaped in the input, will be unmodified).
     21 //      Alphanumeric characters are left blank in the table below.
     22 //
     23 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
     24 //    -----------------------------------------------
     25 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     26 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     27 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
     28 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
     29 // 4   %
     30 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
     31 // 6   E                                               <-- That's  `
     32 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
     33 //
     34 // NOTE: I didn't actually test all the control characters. Some may be
     35 // disallowed in the input, but they are all accepted escaped except for 0.
     36 // I also didn't test if characters affecting HTML parsing are allowed
     37 // unescaped, eg. (") or (#), which would indicate the beginning of the path.
     38 // Surprisingly, space is accepted in the input and always escaped.
     39 
     40 // This table lists the canonical version of all characters we allow in the
     41 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar
     42 // value to indicate that this character should be escaped. We are a little more
     43 // restrictive than IE, but less restrictive than Firefox.
     44 //
     45 // Note that we disallow the % character. We will allow it when part of an
     46 // escape sequence, of course, but this disallows "%25". Even though IE allows
     47 // it, allowing it would put us in a funny state. If there was an invalid
     48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.
     49 // Allowing percents means we'll succeed a second time, so validity would change
     50 // based on how many times you run the canonicalizer. We prefer to always report
     51 // the same vailidity, so reject this.
     52 const unsigned char kEsc = 0xff;
     53 const unsigned char kHostCharLookup[0x80] = {
     54 // 00-1f: all are invalid
     55      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     56      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     57 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     58    kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
     59 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
     60     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
     61 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     62    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     63 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
     64     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
     65 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     66    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
     67 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
     68     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
     69 
     70 const int kTempHostBufferLen = 1024;
     71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
     72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
     73 
     74 // Scans a host name and fills in the output flags according to what we find.
     75 // |has_non_ascii| will be true if there are any non-7-bit characters, and
     76 // |has_escaped| will be true if there is a percent sign.
     77 template<typename CHAR, typename UCHAR>
     78 void ScanHostname(const CHAR* spec, const url_parse::Component& host,
     79                   bool* has_non_ascii, bool* has_escaped) {
     80   int end = host.end();
     81   *has_non_ascii = false;
     82   *has_escaped = false;
     83   for (int i = host.begin; i < end; i++) {
     84     if (static_cast<UCHAR>(spec[i]) >= 0x80)
     85       *has_non_ascii = true;
     86     else if (spec[i] == '%')
     87       *has_escaped = true;
     88   }
     89 }
     90 
     91 // Canonicalizes a host name that is entirely 8-bit characters (even though
     92 // the type holding them may be 16 bits. Escaped characters will be unescaped.
     93 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
     94 //
     95 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
     96 // the output.
     97 //
     98 // This function is used in two situations:
     99 //
    100 //  * When the caller knows there is no non-ASCII or percent escaped
    101 //    characters. This is what DoHost does. The result will be a completely
    102 //    canonicalized host since we know nothing weird can happen (escaped
    103 //    characters could be unescaped to non-7-bit, so they have to be treated
    104 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
    105 //
    106 //  * When the caller has an 8-bit string that may need unescaping.
    107 //    DoComplexHost calls us this situation to do unescaping and validation.
    108 //    After this, it may do other IDN operations depending on the value of the
    109 //    |*has_non_ascii| flag.
    110 //
    111 // The return value indicates if the output is a potentially valid host name.
    112 template<typename INCHAR, typename OUTCHAR>
    113 bool DoSimpleHost(const INCHAR* host,
    114                   int host_len,
    115                   CanonOutputT<OUTCHAR>* output,
    116                   bool* has_non_ascii) {
    117   *has_non_ascii = false;
    118 
    119   bool success = true;
    120   for (int i = 0; i < host_len; ++i) {
    121     unsigned int source = host[i];
    122     if (source == '%') {
    123       // Unescape first, if possible.
    124       // Source will be used only if decode operation was successful.
    125       if (!DecodeEscaped(host, &i, host_len,
    126                          reinterpret_cast<unsigned char*>(&source))) {
    127         // Invalid escaped character. There is nothing that can make this
    128         // host valid. We append an escaped percent so the URL looks reasonable
    129         // and mark as failed.
    130         AppendEscapedChar('%', output);
    131         success = false;
    132         continue;
    133       }
    134     }
    135 
    136     if (source < 0x80) {
    137       // We have ASCII input, we can use our lookup table.
    138       unsigned char replacement = kHostCharLookup[source];
    139       if (!replacement) {
    140         // Invalid character, add it as percent-escaped and mark as failed.
    141         AppendEscapedChar(source, output);
    142         success = false;
    143       } else if (replacement == kEsc) {
    144         // This character is valid but should be escaped.
    145         AppendEscapedChar(source, output);
    146       } else {
    147         // Common case, the given character is valid in a hostname, the lookup
    148         // table tells us the canonical representation of that character (lower
    149         // cased).
    150         output->push_back(replacement);
    151       }
    152     } else {
    153       // It's a non-ascii char. Just push it to the output.
    154       // In case where we have char16 input, and char output it's safe to
    155       // cast char16->char only if input string was converted to ASCII.
    156       output->push_back(static_cast<OUTCHAR>(source));
    157       *has_non_ascii = true;
    158     }
    159   }
    160 
    161   return success;
    162 }
    163 
    164 // Canonicalizes a host that requires IDN conversion. Returns true on success
    165 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
    166   // We need to escape URL before doing IDN conversion, since punicode strings
    167   // cannot be escaped after they are created.
    168   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
    169   bool has_non_ascii;
    170   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
    171 
    172   StackBufferW wide_output;
    173   if (!IDNToASCII(url_escaped_host.data(),
    174                   url_escaped_host.length(),
    175                   &wide_output)) {
    176     // Some error, give up. This will write some reasonable looking
    177     // representation of the string to the output.
    178     AppendInvalidNarrowString(src, 0, src_len, output);
    179     return false;
    180   }
    181 
    182   // Now we check the ASCII output like a normal host. It will also handle
    183   // unescaping. Although we unescaped everything before this function call, if
    184   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
    185   bool success = DoSimpleHost(wide_output.data(),
    186                               wide_output.length(),
    187                               output, &has_non_ascii);
    188   DCHECK(!has_non_ascii);
    189   return success;
    190 }
    191 
    192 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
    193 // UTF-16. The has_escaped flag should be set if the input string requires
    194 // unescaping.
    195 bool DoComplexHost(const char* host, int host_len,
    196                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    197   // Save the current position in the output. We may write stuff and rewind it
    198   // below, so we need to know where to rewind to.
    199   int begin_length = output->length();
    200 
    201   // Points to the UTF-8 data we want to convert. This will either be the
    202   // input or the unescaped version written to |*output| if necessary.
    203   const char* utf8_source;
    204   int utf8_source_len;
    205   if (has_escaped) {
    206     // Unescape before converting to UTF-16 for IDN. We write this into the
    207     // output because it most likely does not require IDNization, and we can
    208     // save another huge stack buffer. It will be replaced below if it requires
    209     // IDN. This will also update our non-ASCII flag so we know whether the
    210     // unescaped input requires IDN.
    211     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
    212       // Error with some escape sequence. We'll call the current output
    213       // complete. DoSimpleHost will have written some "reasonable" output.
    214       return false;
    215     }
    216 
    217     // Unescaping may have left us with ASCII input, in which case the
    218     // unescaped version we wrote to output is complete.
    219     if (!has_non_ascii) {
    220       return true;
    221     }
    222 
    223     // Save the pointer into the data was just converted (it may be appended to
    224     // other data in the output buffer).
    225     utf8_source = &output->data()[begin_length];
    226     utf8_source_len = output->length() - begin_length;
    227   } else {
    228     // We don't need to unescape, use input for IDNization later. (We know the
    229     // input has non-ASCII, or the simple version would have been called
    230     // instead of us.)
    231     utf8_source = host;
    232     utf8_source_len = host_len;
    233   }
    234 
    235   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
    236   // Above, we may have used the output to write the unescaped values to, so
    237   // we have to rewind it to where we started after we convert it to UTF-16.
    238   StackBufferW utf16;
    239   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
    240     // In this error case, the input may or may not be the output.
    241     StackBuffer utf8;
    242     for (int i = 0; i < utf8_source_len; i++)
    243       utf8.push_back(utf8_source[i]);
    244     output->set_length(begin_length);
    245     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
    246     return false;
    247   }
    248   output->set_length(begin_length);
    249 
    250   // This will call DoSimpleHost which will do normal ASCII canonicalization
    251   // and also check for IP addresses in the outpt.
    252   return DoIDNHost(utf16.data(), utf16.length(), output);
    253 }
    254 
    255 // UTF-16 convert host to its ASCII version. The set up is already ready for
    256 // the backend, so we just pass through. The has_escaped flag should be set if
    257 // the input string requires unescaping.
    258 bool DoComplexHost(const base::char16* host, int host_len,
    259                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    260   if (has_escaped) {
    261     // Yikes, we have escaped characters with wide input. The escaped
    262     // characters should be interpreted as UTF-8. To solve this problem,
    263     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
    264     //
    265     // We don't bother to optimize the conversion in the ASCII case (which
    266     // *could* just be a copy) and use the UTF-8 path, because it should be
    267     // very rare that host names have escaped characters, and it is relatively
    268     // fast to do the conversion anyway.
    269     StackBuffer utf8;
    270     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
    271       AppendInvalidNarrowString(host, 0, host_len, output);
    272       return false;
    273     }
    274 
    275     // Once we convert to UTF-8, we can use the 8-bit version of the complex
    276     // host handling code above.
    277     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
    278                          has_escaped, output);
    279   }
    280 
    281   // No unescaping necessary, we can safely pass the input to ICU. This
    282   // function will only get called if we either have escaped or non-ascii
    283   // input, so it's safe to just use ICU now. Even if the input is ASCII,
    284   // this function will do the right thing (just slower than we could).
    285   return DoIDNHost(host, host_len, output);
    286 }
    287 
    288 template<typename CHAR, typename UCHAR>
    289 void DoHost(const CHAR* spec,
    290             const url_parse::Component& host,
    291             CanonOutput* output,
    292             CanonHostInfo* host_info) {
    293   if (host.len <= 0) {
    294     // Empty hosts don't need anything.
    295     host_info->family = CanonHostInfo::NEUTRAL;
    296     host_info->out_host = url_parse::Component();
    297     return;
    298   }
    299 
    300   bool has_non_ascii, has_escaped;
    301   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
    302 
    303   // Keep track of output's initial length, so we can rewind later.
    304   const int output_begin = output->length();
    305 
    306   bool success;
    307   if (!has_non_ascii && !has_escaped) {
    308     success = DoSimpleHost(&spec[host.begin], host.len,
    309                            output, &has_non_ascii);
    310     DCHECK(!has_non_ascii);
    311   } else {
    312     success = DoComplexHost(&spec[host.begin], host.len,
    313                             has_non_ascii, has_escaped, output);
    314   }
    315 
    316   if (!success) {
    317     // Canonicalization failed.  Set BROKEN to notify the caller.
    318     host_info->family = CanonHostInfo::BROKEN;
    319   } else {
    320     // After all the other canonicalization, check if we ended up with an IP
    321     // address.  IP addresses are small, so writing into this temporary buffer
    322     // should not cause an allocation.
    323     RawCanonOutput<64> canon_ip;
    324     CanonicalizeIPAddress(output->data(),
    325                           url_parse::MakeRange(output_begin, output->length()),
    326                           &canon_ip, host_info);
    327 
    328     // If we got an IPv4/IPv6 address, copy the canonical form back to the
    329     // real buffer.  Otherwise, it's a hostname or broken IP, in which case
    330     // we just leave it in place.
    331     if (host_info->IsIPAddress()) {
    332       output->set_length(output_begin);
    333       output->Append(canon_ip.data(), canon_ip.length());
    334     }
    335   }
    336 
    337   host_info->out_host = url_parse::MakeRange(output_begin, output->length());
    338 }
    339 
    340 }  // namespace
    341 
    342 bool CanonicalizeHost(const char* spec,
    343                       const url_parse::Component& host,
    344                       CanonOutput* output,
    345                       url_parse::Component* out_host) {
    346   CanonHostInfo host_info;
    347   DoHost<char, unsigned char>(spec, host, output, &host_info);
    348   *out_host = host_info.out_host;
    349   return (host_info.family != CanonHostInfo::BROKEN);
    350 }
    351 
    352 bool CanonicalizeHost(const base::char16* spec,
    353                       const url_parse::Component& host,
    354                       CanonOutput* output,
    355                       url_parse::Component* out_host) {
    356   CanonHostInfo host_info;
    357   DoHost<base::char16, base::char16>(spec, host, output, &host_info);
    358   *out_host = host_info.out_host;
    359   return (host_info.family != CanonHostInfo::BROKEN);
    360 }
    361 
    362 void CanonicalizeHostVerbose(const char* spec,
    363                              const url_parse::Component& host,
    364                              CanonOutput* output,
    365                              CanonHostInfo *host_info) {
    366   DoHost<char, unsigned char>(spec, host, output, host_info);
    367 }
    368 
    369 void CanonicalizeHostVerbose(const base::char16* spec,
    370                              const url_parse::Component& host,
    371                              CanonOutput* output,
    372                              CanonHostInfo *host_info) {
    373   DoHost<base::char16, base::char16>(spec, host, output, host_info);
    374 }
    375 
    376 }  // namespace url_canon
    377