Home | History | Annotate | Download | only in src
      1 // Copyright 2007, Google Inc.
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are
      6 // met:
      7 //
      8 //     * Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 //     * Redistributions in binary form must reproduce the above
     11 // copyright notice, this list of conditions and the following disclaimer
     12 // in the documentation and/or other materials provided with the
     13 // distribution.
     14 //     * Neither the name of Google Inc. nor the names of its
     15 // contributors may be used to endorse or promote products derived from
     16 // this software without specific prior written permission.
     17 //
     18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 #include <cstdio>
     31 #include <errno.h>
     32 #include <stdlib.h>
     33 #include <string>
     34 
     35 #include "googleurl/src/url_canon_internal.h"
     36 
     37 namespace url_canon {
     38 
     39 namespace {
     40 
     41 template<typename CHAR, typename UCHAR>
     42 void DoAppendStringOfType(const CHAR* source, int length,
     43                           SharedCharTypes type,
     44                           CanonOutput* output) {
     45   for (int i = 0; i < length; i++) {
     46     if (static_cast<UCHAR>(source[i]) >= 0x80) {
     47       // ReadChar will fill the code point with kUnicodeReplacementCharacter
     48       // when the input is invalid, which is what we want.
     49       unsigned code_point;
     50       ReadUTFChar(source, &i, length, &code_point);
     51       AppendUTF8EscapedValue(code_point, output);
     52     } else {
     53       // Just append the 7-bit character, possibly escaping it.
     54       unsigned char uch = static_cast<unsigned char>(source[i]);
     55       if (!IsCharOfType(uch, type))
     56         AppendEscapedChar(uch, output);
     57       else
     58         output->push_back(uch);
     59     }
     60   }
     61 }
     62 
     63 // This function assumes the input values are all contained in 8-bit,
     64 // although it allows any type. Returns true if input is valid, false if not.
     65 template<typename CHAR, typename UCHAR>
     66 void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
     67                                  CanonOutput* output) {
     68   for (int i = begin; i < end; i++) {
     69     UCHAR uch = static_cast<UCHAR>(spec[i]);
     70     if (uch >= 0x80) {
     71       // Handle UTF-8/16 encodings. This call will correctly handle the error
     72       // case by appending the invalid character.
     73       AppendUTF8EscapedChar(spec, &i, end, output);
     74     } else if (uch <= ' ' || uch == 0x7f) {
     75       // This function is for error handling, so we escape all control
     76       // characters and spaces, but not anything else since we lack
     77       // context to do something more specific.
     78       AppendEscapedChar(static_cast<unsigned char>(uch), output);
     79     } else {
     80       output->push_back(static_cast<char>(uch));
     81     }
     82   }
     83 }
     84 
     85 // Overrides one component, see the url_canon::Replacements structure for
     86 // what the various combionations of source pointer and component mean.
     87 void DoOverrideComponent(const char* override_source,
     88                          const url_parse::Component& override_component,
     89                          const char** dest,
     90                          url_parse::Component* dest_component) {
     91   if (override_source) {
     92     *dest = override_source;
     93     *dest_component = override_component;
     94   }
     95 }
     96 
     97 // Similar to DoOverrideComponent except that it takes a UTF-16 input and does
     98 // not actually set the output character pointer.
     99 //
    100 // The input is converted to UTF-8 at the end of the given buffer as a temporary
    101 // holding place. The component indentifying the portion of the buffer used in
    102 // the |utf8_buffer| will be specified in |*dest_component|.
    103 //
    104 // This will not actually set any |dest| pointer like DoOverrideComponent
    105 // does because all of the pointers will point into the |utf8_buffer|, which
    106 // may get resized while we're overriding a subsequent component. Instead, the
    107 // caller should use the beginning of the |utf8_buffer| as the string pointer
    108 // for all components once all overrides have been prepared.
    109 bool PrepareUTF16OverrideComponent(
    110     const char16* override_source,
    111     const url_parse::Component& override_component,
    112     CanonOutput* utf8_buffer,
    113     url_parse::Component* dest_component) {
    114   bool success = true;
    115   if (override_source) {
    116     if (!override_component.is_valid()) {
    117       // Non-"valid" component (means delete), so we need to preserve that.
    118       *dest_component = url_parse::Component();
    119     } else {
    120       // Convert to UTF-8.
    121       dest_component->begin = utf8_buffer->length();
    122       success = ConvertUTF16ToUTF8(&override_source[override_component.begin],
    123                                    override_component.len, utf8_buffer);
    124       dest_component->len = utf8_buffer->length() - dest_component->begin;
    125     }
    126   }
    127   return success;
    128 }
    129 
    130 }  // namespace
    131 
// See the header file for this array's declaration.
//
// One entry per 8-bit value (0x00 - 0xff). Each entry is a bitwise OR of the
// CHAR_* flags that apply to that character; 0 means no flag is set. The
// trailing comment on every row gives the byte value (or range) the row
// describes. All control characters, DEL (0x7f) and everything at or above
// 0x80 carry no flags.
// NOTE(review): presumably this is the table IsCharOfType() consults -
// confirm against the header.
const unsigned char kSharedCharTypeTable[0x100] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1f
    0,                           // 0x20  ' ' (escape spaces in queries)
    CHAR_QUERY | CHAR_USERINFO,  // 0x21  !
    0,                           // 0x22  "
    0,                           // 0x23  #  (invalid in query since it marks the ref)
    CHAR_QUERY | CHAR_USERINFO,  // 0x24  $
    CHAR_QUERY | CHAR_USERINFO,  // 0x25  %
    CHAR_QUERY | CHAR_USERINFO,  // 0x26  &
    CHAR_QUERY | CHAR_USERINFO,  // 0x27  '
    CHAR_QUERY | CHAR_USERINFO,  // 0x28  (
    CHAR_QUERY | CHAR_USERINFO,  // 0x29  )
    CHAR_QUERY | CHAR_USERINFO,  // 0x2a  *
    CHAR_QUERY | CHAR_USERINFO,  // 0x2b  +
    CHAR_QUERY | CHAR_USERINFO,  // 0x2c  ,
    CHAR_QUERY | CHAR_USERINFO,  // 0x2d  -
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x2e  .
    CHAR_QUERY,                              // 0x2f  /
    // Digits: 0 - 7 are valid in octal, decimal and hex; 8 and 9 are not
    // octal digits.
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x30  0
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x31  1
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x32  2
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x33  3
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x34  4
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x35  5
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x36  6
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x37  7
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x38  8
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x39  9
    CHAR_QUERY,  // 0x3a  :
    CHAR_QUERY,  // 0x3b  ;
    0,           // 0x3c  <  (Try to prevent certain types of XSS.)
    CHAR_QUERY,  // 0x3d  =
    0,           // 0x3e  >  (Try to prevent certain types of XSS.)
    CHAR_QUERY,  // 0x3f  ?
    CHAR_QUERY,  // 0x40  @
    // Upper-case letters: A - F are additionally flagged as hex digits and
    // IPv4 characters; X is flagged as an IPv4 character but not a hex digit.
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x41  A
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x42  B
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x43  C
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x44  D
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x45  E
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x46  F
    CHAR_QUERY | CHAR_USERINFO,  // 0x47  G
    CHAR_QUERY | CHAR_USERINFO,  // 0x48  H
    CHAR_QUERY | CHAR_USERINFO,  // 0x49  I
    CHAR_QUERY | CHAR_USERINFO,  // 0x4a  J
    CHAR_QUERY | CHAR_USERINFO,  // 0x4b  K
    CHAR_QUERY | CHAR_USERINFO,  // 0x4c  L
    CHAR_QUERY | CHAR_USERINFO,  // 0x4d  M
    CHAR_QUERY | CHAR_USERINFO,  // 0x4e  N
    CHAR_QUERY | CHAR_USERINFO,  // 0x4f  O
    CHAR_QUERY | CHAR_USERINFO,  // 0x50  P
    CHAR_QUERY | CHAR_USERINFO,  // 0x51  Q
    CHAR_QUERY | CHAR_USERINFO,  // 0x52  R
    CHAR_QUERY | CHAR_USERINFO,  // 0x53  S
    CHAR_QUERY | CHAR_USERINFO,  // 0x54  T
    CHAR_QUERY | CHAR_USERINFO,  // 0x55  U
    CHAR_QUERY | CHAR_USERINFO,  // 0x56  V
    CHAR_QUERY | CHAR_USERINFO,  // 0x57  W
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58  X
    CHAR_QUERY | CHAR_USERINFO,  // 0x59  Y
    CHAR_QUERY | CHAR_USERINFO,  // 0x5a  Z
    CHAR_QUERY,  // 0x5b  [
    CHAR_QUERY,  // 0x5c  '\'
    CHAR_QUERY,  // 0x5d  ]
    CHAR_QUERY,  // 0x5e  ^
    CHAR_QUERY | CHAR_USERINFO,  // 0x5f  _
    CHAR_QUERY,  // 0x60  `
    // Lower-case letters mirror the upper-case flags (a - f hex, x IPv4).
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x61  a
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x62  b
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x63  c
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x64  d
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x65  e
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x66  f
    CHAR_QUERY | CHAR_USERINFO,  // 0x67  g
    CHAR_QUERY | CHAR_USERINFO,  // 0x68  h
    CHAR_QUERY | CHAR_USERINFO,  // 0x69  i
    CHAR_QUERY | CHAR_USERINFO,  // 0x6a  j
    CHAR_QUERY | CHAR_USERINFO,  // 0x6b  k
    CHAR_QUERY | CHAR_USERINFO,  // 0x6c  l
    CHAR_QUERY | CHAR_USERINFO,  // 0x6d  m
    CHAR_QUERY | CHAR_USERINFO,  // 0x6e  n
    CHAR_QUERY | CHAR_USERINFO,  // 0x6f  o
    CHAR_QUERY | CHAR_USERINFO,  // 0x70  p
    CHAR_QUERY | CHAR_USERINFO,  // 0x71  q
    CHAR_QUERY | CHAR_USERINFO,  // 0x72  r
    CHAR_QUERY | CHAR_USERINFO,  // 0x73  s
    CHAR_QUERY | CHAR_USERINFO,  // 0x74  t
    CHAR_QUERY | CHAR_USERINFO,  // 0x75  u
    CHAR_QUERY | CHAR_USERINFO,  // 0x76  v
    CHAR_QUERY | CHAR_USERINFO,  // 0x77  w
    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x78  x
    CHAR_QUERY | CHAR_USERINFO,  // 0x79  y
    CHAR_QUERY | CHAR_USERINFO,  // 0x7a  z
    CHAR_QUERY,  // 0x7b  {
    CHAR_QUERY,  // 0x7c  |
    CHAR_QUERY,  // 0x7d  }
    CHAR_QUERY | CHAR_USERINFO,  // 0x7e  ~
    0,           // 0x7f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0 - 0xaf
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0 - 0xbf
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xc0 - 0xcf
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xd0 - 0xdf
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xe0 - 0xef
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0 - 0xff
};
    241 
// Maps a value in the range 0 - 15 to its hexadecimal digit character.
// Letters are upper case.
const char kHexCharLookup[0x10] = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
};
    246 
// One entry per 0x20-wide band of the 8-bit character range (eight bands in
// total, as listed on each row). For the bands that contain hex digits, the
// entry is the offset between a digit character and its numeric value:
// '0' for 0 - 9, 'A' - 10 for A - F and 'a' - 10 for a - f. Bands without
// hex digits hold 0.
// NOTE(review): presumably used as ch - kCharToHexLookup[ch / 0x20] on
// characters already known to be hex digits - confirm against the header.
const char kCharToHexLookup[8] = {
    0,         // 0x00 - 0x1f
    '0',       // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
    'A' - 10,  // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
    'a' - 10,  // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
    0,         // 0x80 - 0x9F
    0,         // 0xA0 - 0xBF
    0,         // 0xC0 - 0xDF
    0,         // 0xE0 - 0xFF
};
    257 
// U+FFFD, the Unicode replacement character, as a single UTF-16 code unit.
const char16 kUnicodeReplacementCharacter = 0xfffd;
    259 
// 8-bit overload: forwards to the shared DoAppendStringOfType template,
// instantiated with char as the input type and unsigned char as the type
// used for the high-bit comparison.
void AppendStringOfType(const char* source, int length,
                        SharedCharTypes type,
                        CanonOutput* output) {
  DoAppendStringOfType<char, unsigned char>(source, length, type, output);
}
    265 
// UTF-16 overload: forwards to the shared DoAppendStringOfType template,
// using char16 both as the input type and as the comparison type.
void AppendStringOfType(const char16* source, int length,
                        SharedCharTypes type,
                        CanonOutput* output) {
  DoAppendStringOfType<char16, char16>(source, length, type, output);
}
    271 
// 8-bit overload: forwards spec[begin, end) to the shared
// DoAppendInvalidNarrowString template, instantiated with char as the input
// type and unsigned char as the comparison type.
void AppendInvalidNarrowString(const char* spec, int begin, int end,
                               CanonOutput* output) {
  DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
}
    276 
// UTF-16 overload: forwards spec[begin, end) to the shared
// DoAppendInvalidNarrowString template, using char16 both as the input type
// and as the comparison type.
void AppendInvalidNarrowString(const char16* spec, int begin, int end,
                               CanonOutput* output) {
  DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output);
}
    281 
    282 bool ConvertUTF16ToUTF8(const char16* input, int input_len,
    283                         CanonOutput* output) {
    284   bool success = true;
    285   for (int i = 0; i < input_len; i++) {
    286     unsigned code_point;
    287     success &= ReadUTFChar(input, &i, input_len, &code_point);
    288     AppendUTF8Value(code_point, output);
    289   }
    290   return success;
    291 }
    292 
    293 bool ConvertUTF8ToUTF16(const char* input, int input_len,
    294                         CanonOutputT<char16>* output) {
    295   bool success = true;
    296   for (int i = 0; i < input_len; i++) {
    297     unsigned code_point;
    298     success &= ReadUTFChar(input, &i, input_len, &code_point);
    299     AppendUTF16Value(code_point, output);
    300   }
    301   return success;
    302 }
    303 
    304 void SetupOverrideComponents(const char* base,
    305                              const Replacements<char>& repl,
    306                              URLComponentSource<char>* source,
    307                              url_parse::Parsed* parsed) {
    308   // Get the source and parsed structures of the things we are replacing.
    309   const URLComponentSource<char>& repl_source = repl.sources();
    310   const url_parse::Parsed& repl_parsed = repl.components();
    311 
    312   DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
    313                       &source->scheme, &parsed->scheme);
    314   DoOverrideComponent(repl_source.username, repl_parsed.username,
    315                       &source->username, &parsed->username);
    316   DoOverrideComponent(repl_source.password, repl_parsed.password,
    317                       &source->password, &parsed->password);
    318 
    319   // Our host should be empty if not present, so override the default setup.
    320   DoOverrideComponent(repl_source.host, repl_parsed.host,
    321                       &source->host, &parsed->host);
    322   if (parsed->host.len == -1)
    323     parsed->host.len = 0;
    324 
    325   DoOverrideComponent(repl_source.port, repl_parsed.port,
    326                       &source->port, &parsed->port);
    327   DoOverrideComponent(repl_source.path, repl_parsed.path,
    328                       &source->path, &parsed->path);
    329   DoOverrideComponent(repl_source.query, repl_parsed.query,
    330                       &source->query, &parsed->query);
    331   DoOverrideComponent(repl_source.ref, repl_parsed.ref,
    332                       &source->ref, &parsed->ref);
    333 }
    334 
    335 bool SetupUTF16OverrideComponents(const char* base,
    336                                   const Replacements<char16>& repl,
    337                                   CanonOutput* utf8_buffer,
    338                                   URLComponentSource<char>* source,
    339                                   url_parse::Parsed* parsed) {
    340   bool success = true;
    341 
    342   // Get the source and parsed structures of the things we are replacing.
    343   const URLComponentSource<char16>& repl_source = repl.sources();
    344   const url_parse::Parsed& repl_parsed = repl.components();
    345 
    346   success &= PrepareUTF16OverrideComponent(
    347       repl_source.scheme, repl_parsed.scheme,
    348       utf8_buffer, &parsed->scheme);
    349   success &= PrepareUTF16OverrideComponent(
    350       repl_source.username, repl_parsed.username,
    351       utf8_buffer, &parsed->username);
    352   success &= PrepareUTF16OverrideComponent(
    353       repl_source.password, repl_parsed.password,
    354       utf8_buffer, &parsed->password);
    355   success &= PrepareUTF16OverrideComponent(
    356       repl_source.host, repl_parsed.host,
    357       utf8_buffer, &parsed->host);
    358   success &= PrepareUTF16OverrideComponent(
    359       repl_source.port, repl_parsed.port,
    360       utf8_buffer, &parsed->port);
    361   success &= PrepareUTF16OverrideComponent(
    362       repl_source.path, repl_parsed.path,
    363       utf8_buffer, &parsed->path);
    364   success &= PrepareUTF16OverrideComponent(
    365       repl_source.query, repl_parsed.query,
    366       utf8_buffer, &parsed->query);
    367   success &= PrepareUTF16OverrideComponent(
    368       repl_source.ref, repl_parsed.ref,
    369       utf8_buffer, &parsed->ref);
    370 
    371   // PrepareUTF16OverrideComponent will not have set the data pointer since the
    372   // buffer could be resized, invalidating the pointers. We set the data
    373   // pointers for affected components now that the buffer is finalized.
    374   if (repl_source.scheme)   source->scheme = utf8_buffer->data();
    375   if (repl_source.username) source->username = utf8_buffer->data();
    376   if (repl_source.password) source->password = utf8_buffer->data();
    377   if (repl_source.host)     source->host = utf8_buffer->data();
    378   if (repl_source.port)     source->port = utf8_buffer->data();
    379   if (repl_source.path)     source->path = utf8_buffer->data();
    380   if (repl_source.query)    source->query = utf8_buffer->data();
    381   if (repl_source.ref)      source->ref = utf8_buffer->data();
    382 
    383   return success;
    384 }
    385 
    386 #ifndef WIN32
    387 
    388 int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
    389   const char* format_str;
    390   if (radix == 10)
    391     format_str = "%d";
    392   else if (radix == 16)
    393     format_str = "%x";
    394   else
    395     return EINVAL;
    396 
    397   int written = snprintf(buffer, size_in_chars, format_str, value);
    398   if (static_cast<size_t>(written) >= size_in_chars) {
    399     // Output was truncated, or written was negative.
    400     return EINVAL;
    401   }
    402   return 0;
    403 }
    404 
    405 int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
    406   if (radix != 10)
    407     return EINVAL;
    408 
    409   // No more than 12 characters will be required for a 32-bit integer.
    410   // Add an extra byte for the terminating null.
    411   char temp[13];
    412   int written = snprintf(temp, sizeof(temp), "%d", value);
    413   if (static_cast<size_t>(written) >= size_in_chars) {
    414     // Output was truncated, or written was negative.
    415     return EINVAL;
    416   }
    417 
    418   for (int i = 0; i < written; ++i) {
    419     buffer[i] = static_cast<char16>(temp[i]);
    420   }
    421   buffer[written] = '\0';
    422   return 0;
    423 }
    424 
    425 #endif  // !WIN32
    426 
    427 }  // namespace url_canon
    428