Home | History | Annotate | Download | only in url
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/logging.h"
      6 #include "url/url_canon.h"
      7 #include "url/url_canon_internal.h"
      8 
      9 namespace url {
     10 
     11 namespace {
     12 
     13 // For reference, here's what IE supports:
     14 // Key: 0 (disallowed: failure if present in the input)
     15 //      + (allowed either escaped or unescaped, and unmodified)
     16 //      U (allowed escaped or unescaped but always unescaped if present in
     17 //         escaped form)
     18 //      E (allowed escaped or unescaped but always escaped if present in
     19 //         unescaped form)
     20 //      % (only allowed escaped in the input, will be unmodified).
     21 //      I left blank alpha numeric characters.
     22 //
     23 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
     24 //    -----------------------------------------------
     25 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     26 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
     27 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
     28 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
     29 // 4   %
     30 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
     31 // 6   E                                               <-- That's  `
     32 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
     33 //
     34 // NOTE: I didn't actually test all the control characters. Some may be
     35 // disallowed in the input, but they are all accepted escaped except for 0.
     36 // I also didn't test if characters affecting HTML parsing are allowed
     37 // unescaped, eg. (") or (#), which would indicate the beginning of the path.
     38 // Surprisingly, space is accepted in the input and always escaped.
     39 
// This table lists the canonical version of all characters we allow in the
// input, with 0 indicating it is disallowed. We use the magic kEsc value to
// indicate that this character should be escaped. We are a little more
// restrictive than IE, but less restrictive than Firefox.
//
// The table is indexed by the 7-bit ASCII code of the input character
// (0x00-0x7f). Upper-case letters map to their lower-case equivalents, which
// is how host names get lower-cased.
//
// Note that we disallow the % character. We will allow it when part of an
// escape sequence, of course, but this disallows "%25". Even though IE allows
// it, allowing it would put us in a funny state. If there was an invalid
// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
// Allowing percents means we'll succeed a second time, so validity would change
// based on how many times you run the canonicalizer. We prefer to always report
// the same validity, so reject this.
const unsigned char kEsc = 0xff;
const unsigned char kHostCharLookup[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  (0x7f)
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
     69 
// Size argument shared by the temporary buffers used while canonicalizing a
// host in this file.
const int kTempHostBufferLen = 1024;
// Temporary buffer of 8-bit characters (used for UTF-8 host data).
typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
// Temporary buffer of 16-bit characters (used for UTF-16 host data).
typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
     73 
     74 // Scans a host name and fills in the output flags according to what we find.
     75 // |has_non_ascii| will be true if there are any non-7-bit characters, and
     76 // |has_escaped| will be true if there is a percent sign.
     77 template<typename CHAR, typename UCHAR>
     78 void ScanHostname(const CHAR* spec,
     79                   const Component& host,
     80                   bool* has_non_ascii,
     81                   bool* has_escaped) {
     82   int end = host.end();
     83   *has_non_ascii = false;
     84   *has_escaped = false;
     85   for (int i = host.begin; i < end; i++) {
     86     if (static_cast<UCHAR>(spec[i]) >= 0x80)
     87       *has_non_ascii = true;
     88     else if (spec[i] == '%')
     89       *has_escaped = true;
     90   }
     91 }
     92 
     93 // Canonicalizes a host name that is entirely 8-bit characters (even though
     94 // the type holding them may be 16 bits. Escaped characters will be unescaped.
     95 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
     96 //
     97 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
     98 // the output.
     99 //
    100 // This function is used in two situations:
    101 //
    102 //  * When the caller knows there is no non-ASCII or percent escaped
    103 //    characters. This is what DoHost does. The result will be a completely
    104 //    canonicalized host since we know nothing weird can happen (escaped
    105 //    characters could be unescaped to non-7-bit, so they have to be treated
    106 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
    107 //
    108 //  * When the caller has an 8-bit string that may need unescaping.
    109 //    DoComplexHost calls us this situation to do unescaping and validation.
    110 //    After this, it may do other IDN operations depending on the value of the
    111 //    |*has_non_ascii| flag.
    112 //
    113 // The return value indicates if the output is a potentially valid host name.
    114 template<typename INCHAR, typename OUTCHAR>
    115 bool DoSimpleHost(const INCHAR* host,
    116                   int host_len,
    117                   CanonOutputT<OUTCHAR>* output,
    118                   bool* has_non_ascii) {
    119   *has_non_ascii = false;
    120 
    121   bool success = true;
    122   for (int i = 0; i < host_len; ++i) {
    123     unsigned int source = host[i];
    124     if (source == '%') {
    125       // Unescape first, if possible.
    126       // Source will be used only if decode operation was successful.
    127       if (!DecodeEscaped(host, &i, host_len,
    128                          reinterpret_cast<unsigned char*>(&source))) {
    129         // Invalid escaped character. There is nothing that can make this
    130         // host valid. We append an escaped percent so the URL looks reasonable
    131         // and mark as failed.
    132         AppendEscapedChar('%', output);
    133         success = false;
    134         continue;
    135       }
    136     }
    137 
    138     if (source < 0x80) {
    139       // We have ASCII input, we can use our lookup table.
    140       unsigned char replacement = kHostCharLookup[source];
    141       if (!replacement) {
    142         // Invalid character, add it as percent-escaped and mark as failed.
    143         AppendEscapedChar(source, output);
    144         success = false;
    145       } else if (replacement == kEsc) {
    146         // This character is valid but should be escaped.
    147         AppendEscapedChar(source, output);
    148       } else {
    149         // Common case, the given character is valid in a hostname, the lookup
    150         // table tells us the canonical representation of that character (lower
    151         // cased).
    152         output->push_back(replacement);
    153       }
    154     } else {
    155       // It's a non-ascii char. Just push it to the output.
    156       // In case where we have char16 input, and char output it's safe to
    157       // cast char16->char only if input string was converted to ASCII.
    158       output->push_back(static_cast<OUTCHAR>(source));
    159       *has_non_ascii = true;
    160     }
    161   }
    162 
    163   return success;
    164 }
    165 
    166 // Canonicalizes a host that requires IDN conversion. Returns true on success
    167 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
    168   // We need to escape URL before doing IDN conversion, since punicode strings
    169   // cannot be escaped after they are created.
    170   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
    171   bool has_non_ascii;
    172   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
    173 
    174   StackBufferW wide_output;
    175   if (!IDNToASCII(url_escaped_host.data(),
    176                   url_escaped_host.length(),
    177                   &wide_output)) {
    178     // Some error, give up. This will write some reasonable looking
    179     // representation of the string to the output.
    180     AppendInvalidNarrowString(src, 0, src_len, output);
    181     return false;
    182   }
    183 
    184   // Now we check the ASCII output like a normal host. It will also handle
    185   // unescaping. Although we unescaped everything before this function call, if
    186   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
    187   bool success = DoSimpleHost(wide_output.data(),
    188                               wide_output.length(),
    189                               output, &has_non_ascii);
    190   DCHECK(!has_non_ascii);
    191   return success;
    192 }
    193 
    194 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
    195 // UTF-16. The has_escaped flag should be set if the input string requires
    196 // unescaping.
    197 bool DoComplexHost(const char* host, int host_len,
    198                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    199   // Save the current position in the output. We may write stuff and rewind it
    200   // below, so we need to know where to rewind to.
    201   int begin_length = output->length();
    202 
    203   // Points to the UTF-8 data we want to convert. This will either be the
    204   // input or the unescaped version written to |*output| if necessary.
    205   const char* utf8_source;
    206   int utf8_source_len;
    207   if (has_escaped) {
    208     // Unescape before converting to UTF-16 for IDN. We write this into the
    209     // output because it most likely does not require IDNization, and we can
    210     // save another huge stack buffer. It will be replaced below if it requires
    211     // IDN. This will also update our non-ASCII flag so we know whether the
    212     // unescaped input requires IDN.
    213     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
    214       // Error with some escape sequence. We'll call the current output
    215       // complete. DoSimpleHost will have written some "reasonable" output.
    216       return false;
    217     }
    218 
    219     // Unescaping may have left us with ASCII input, in which case the
    220     // unescaped version we wrote to output is complete.
    221     if (!has_non_ascii) {
    222       return true;
    223     }
    224 
    225     // Save the pointer into the data was just converted (it may be appended to
    226     // other data in the output buffer).
    227     utf8_source = &output->data()[begin_length];
    228     utf8_source_len = output->length() - begin_length;
    229   } else {
    230     // We don't need to unescape, use input for IDNization later. (We know the
    231     // input has non-ASCII, or the simple version would have been called
    232     // instead of us.)
    233     utf8_source = host;
    234     utf8_source_len = host_len;
    235   }
    236 
    237   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
    238   // Above, we may have used the output to write the unescaped values to, so
    239   // we have to rewind it to where we started after we convert it to UTF-16.
    240   StackBufferW utf16;
    241   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
    242     // In this error case, the input may or may not be the output.
    243     StackBuffer utf8;
    244     for (int i = 0; i < utf8_source_len; i++)
    245       utf8.push_back(utf8_source[i]);
    246     output->set_length(begin_length);
    247     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
    248     return false;
    249   }
    250   output->set_length(begin_length);
    251 
    252   // This will call DoSimpleHost which will do normal ASCII canonicalization
    253   // and also check for IP addresses in the outpt.
    254   return DoIDNHost(utf16.data(), utf16.length(), output);
    255 }
    256 
    257 // UTF-16 convert host to its ASCII version. The set up is already ready for
    258 // the backend, so we just pass through. The has_escaped flag should be set if
    259 // the input string requires unescaping.
    260 bool DoComplexHost(const base::char16* host, int host_len,
    261                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
    262   if (has_escaped) {
    263     // Yikes, we have escaped characters with wide input. The escaped
    264     // characters should be interpreted as UTF-8. To solve this problem,
    265     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
    266     //
    267     // We don't bother to optimize the conversion in the ASCII case (which
    268     // *could* just be a copy) and use the UTF-8 path, because it should be
    269     // very rare that host names have escaped characters, and it is relatively
    270     // fast to do the conversion anyway.
    271     StackBuffer utf8;
    272     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
    273       AppendInvalidNarrowString(host, 0, host_len, output);
    274       return false;
    275     }
    276 
    277     // Once we convert to UTF-8, we can use the 8-bit version of the complex
    278     // host handling code above.
    279     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
    280                          has_escaped, output);
    281   }
    282 
    283   // No unescaping necessary, we can safely pass the input to ICU. This
    284   // function will only get called if we either have escaped or non-ascii
    285   // input, so it's safe to just use ICU now. Even if the input is ASCII,
    286   // this function will do the right thing (just slower than we could).
    287   return DoIDNHost(host, host_len, output);
    288 }
    289 
    290 template<typename CHAR, typename UCHAR>
    291 void DoHost(const CHAR* spec,
    292             const Component& host,
    293             CanonOutput* output,
    294             CanonHostInfo* host_info) {
    295   if (host.len <= 0) {
    296     // Empty hosts don't need anything.
    297     host_info->family = CanonHostInfo::NEUTRAL;
    298     host_info->out_host = Component();
    299     return;
    300   }
    301 
    302   bool has_non_ascii, has_escaped;
    303   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
    304 
    305   // Keep track of output's initial length, so we can rewind later.
    306   const int output_begin = output->length();
    307 
    308   bool success;
    309   if (!has_non_ascii && !has_escaped) {
    310     success = DoSimpleHost(&spec[host.begin], host.len,
    311                            output, &has_non_ascii);
    312     DCHECK(!has_non_ascii);
    313   } else {
    314     success = DoComplexHost(&spec[host.begin], host.len,
    315                             has_non_ascii, has_escaped, output);
    316   }
    317 
    318   if (!success) {
    319     // Canonicalization failed.  Set BROKEN to notify the caller.
    320     host_info->family = CanonHostInfo::BROKEN;
    321   } else {
    322     // After all the other canonicalization, check if we ended up with an IP
    323     // address.  IP addresses are small, so writing into this temporary buffer
    324     // should not cause an allocation.
    325     RawCanonOutput<64> canon_ip;
    326     CanonicalizeIPAddress(output->data(),
    327                           MakeRange(output_begin, output->length()),
    328                           &canon_ip, host_info);
    329 
    330     // If we got an IPv4/IPv6 address, copy the canonical form back to the
    331     // real buffer.  Otherwise, it's a hostname or broken IP, in which case
    332     // we just leave it in place.
    333     if (host_info->IsIPAddress()) {
    334       output->set_length(output_begin);
    335       output->Append(canon_ip.data(), canon_ip.length());
    336     }
    337   }
    338 
    339   host_info->out_host = MakeRange(output_begin, output->length());
    340 }
    341 
    342 }  // namespace
    343 
    344 bool CanonicalizeHost(const char* spec,
    345                       const Component& host,
    346                       CanonOutput* output,
    347                       Component* out_host) {
    348   CanonHostInfo host_info;
    349   DoHost<char, unsigned char>(spec, host, output, &host_info);
    350   *out_host = host_info.out_host;
    351   return (host_info.family != CanonHostInfo::BROKEN);
    352 }
    353 
    354 bool CanonicalizeHost(const base::char16* spec,
    355                       const Component& host,
    356                       CanonOutput* output,
    357                       Component* out_host) {
    358   CanonHostInfo host_info;
    359   DoHost<base::char16, base::char16>(spec, host, output, &host_info);
    360   *out_host = host_info.out_host;
    361   return (host_info.family != CanonHostInfo::BROKEN);
    362 }
    363 
// Same canonicalization as CanonicalizeHost for 8-bit input, but instead of a
// bool and a Component it exposes the full |*host_info| filled in by DoHost
// (the host family and the output range of the host).
void CanonicalizeHostVerbose(const char* spec,
                             const Component& host,
                             CanonOutput* output,
                             CanonHostInfo* host_info) {
  DoHost<char, unsigned char>(spec, host, output, host_info);
}
    370 
// Same canonicalization as CanonicalizeHost for 16-bit input, but instead of a
// bool and a Component it exposes the full |*host_info| filled in by DoHost
// (the host family and the output range of the host).
void CanonicalizeHostVerbose(const base::char16* spec,
                             const Component& host,
                             CanonOutput* output,
                             CanonHostInfo* host_info) {
  DoHost<base::char16, base::char16>(spec, host, output, host_info);
}
    377 
    378 }  // namespace url
    379