Home | History | Annotate | Download | only in src
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/uri.h"
      6 
      7 #include "src/char-predicates-inl.h"
      8 #include "src/handles.h"
      9 #include "src/isolate-inl.h"
     10 #include "src/list.h"
     11 #include "src/string-search.h"
     12 
     13 namespace v8 {
     14 namespace internal {
     15 
     16 namespace {  // anonymous namespace for DecodeURI helper functions
     17 bool IsReservedPredicate(uc16 c) {
     18   switch (c) {
     19     case '#':
     20     case '$':
     21     case '&':
     22     case '+':
     23     case ',':
     24     case '/':
     25     case ':':
     26     case ';':
     27     case '=':
     28     case '?':
     29     case '@':
     30       return true;
     31     default:
     32       return false;
     33   }
     34 }
     35 
     36 bool IsReplacementCharacter(const uint8_t* octets, int length) {
     37   // The replacement character is at codepoint U+FFFD in the Unicode Specials
     38   // table. Its UTF-8 encoding is 0xEF 0xBF 0xBD.
     39   if (length != 3 || octets[0] != 0xef || octets[1] != 0xbf ||
     40       octets[2] != 0xbd) {
     41     return false;
     42   }
     43   return true;
     44 }
     45 
     46 bool DecodeOctets(const uint8_t* octets, int length, List<uc16>* buffer) {
     47   size_t cursor = 0;
     48   uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
     49   if (value == unibrow::Utf8::kBadChar &&
     50       !IsReplacementCharacter(octets, length)) {
     51     return false;
     52   }
     53 
     54   if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
     55     buffer->Add(value);
     56   } else {
     57     buffer->Add(unibrow::Utf16::LeadSurrogate(value));
     58     buffer->Add(unibrow::Utf16::TrailSurrogate(value));
     59   }
     60   return true;
     61 }
     62 
     63 int TwoDigitHex(uc16 character1, uc16 character2) {
     64   if (character1 > 'f') return -1;
     65   int high = HexValue(character1);
     66   if (high == -1) return -1;
     67   if (character2 > 'f') return -1;
     68   int low = HexValue(character2);
     69   if (low == -1) return -1;
     70   return (high << 4) + low;
     71 }
     72 
     73 template <typename T>
     74 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
     75                  bool is_uri, List<T>* buffer) {
     76   if (is_uri && IsReservedPredicate(decoded)) {
     77     buffer->Add('%');
     78     uc16 first = uri_content->Get(index + 1);
     79     uc16 second = uri_content->Get(index + 2);
     80     DCHECK_GT(std::numeric_limits<T>::max(), first);
     81     DCHECK_GT(std::numeric_limits<T>::max(), second);
     82 
     83     buffer->Add(first);
     84     buffer->Add(second);
     85   } else {
     86     buffer->Add(decoded);
     87   }
     88 }
     89 
     90 bool IntoTwoByte(int index, bool is_uri, int uri_length,
     91                  String::FlatContent* uri_content, List<uc16>* buffer) {
     92   for (int k = index; k < uri_length; k++) {
     93     uc16 code = uri_content->Get(k);
     94     if (code == '%') {
     95       int two_digits;
     96       if (k + 2 >= uri_length ||
     97           (two_digits = TwoDigitHex(uri_content->Get(k + 1),
     98                                     uri_content->Get(k + 2))) < 0) {
     99         return false;
    100       }
    101       k += 2;
    102       uc16 decoded = static_cast<uc16>(two_digits);
    103       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
    104         uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
    105         octets[0] = decoded;
    106 
    107         int number_of_continuation_bytes = 0;
    108         while ((decoded << ++number_of_continuation_bytes) & 0x80) {
    109           if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
    110             return false;
    111           }
    112           if (uri_content->Get(++k) != '%' ||
    113               (two_digits = TwoDigitHex(uri_content->Get(k + 1),
    114                                         uri_content->Get(k + 2))) < 0) {
    115             return false;
    116           }
    117           k += 2;
    118           uc16 continuation_byte = static_cast<uc16>(two_digits);
    119           octets[number_of_continuation_bytes] = continuation_byte;
    120         }
    121 
    122         if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
    123           return false;
    124         }
    125       } else {
    126         AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
    127       }
    128     } else {
    129       buffer->Add(code);
    130     }
    131   }
    132   return true;
    133 }
    134 
    135 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
    136                        List<uint8_t>* one_byte_buffer,
    137                        List<uc16>* two_byte_buffer) {
    138   DisallowHeapAllocation no_gc;
    139   String::FlatContent uri_content = uri->GetFlatContent();
    140 
    141   int uri_length = uri->length();
    142   for (int k = 0; k < uri_length; k++) {
    143     uc16 code = uri_content.Get(k);
    144     if (code == '%') {
    145       int two_digits;
    146       if (k + 2 >= uri_length ||
    147           (two_digits = TwoDigitHex(uri_content.Get(k + 1),
    148                                     uri_content.Get(k + 2))) < 0) {
    149         return false;
    150       }
    151 
    152       uc16 decoded = static_cast<uc16>(two_digits);
    153       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
    154         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
    155                            two_byte_buffer);
    156       }
    157 
    158       AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
    159       k += 2;
    160     } else {
    161       if (code > unibrow::Utf8::kMaxOneByteChar) {
    162         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
    163                            two_byte_buffer);
    164       }
    165       one_byte_buffer->Add(code);
    166     }
    167   }
    168   return true;
    169 }
    170 
    171 }  // anonymous namespace
    172 
    173 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
    174                                 bool is_uri) {
    175   uri = String::Flatten(uri);
    176   List<uint8_t> one_byte_buffer;
    177   List<uc16> two_byte_buffer;
    178 
    179   if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
    180     THROW_NEW_ERROR(isolate, NewURIError(), String);
    181   }
    182 
    183   if (two_byte_buffer.is_empty()) {
    184     return isolate->factory()->NewStringFromOneByte(
    185         one_byte_buffer.ToConstVector());
    186   }
    187 
    188   Handle<SeqTwoByteString> result;
    189   ASSIGN_RETURN_ON_EXCEPTION(
    190       isolate, result, isolate->factory()->NewRawTwoByteString(
    191                            one_byte_buffer.length() + two_byte_buffer.length()),
    192       String);
    193 
    194   CopyChars(result->GetChars(), one_byte_buffer.ToConstVector().start(),
    195             one_byte_buffer.length());
    196   CopyChars(result->GetChars() + one_byte_buffer.length(),
    197             two_byte_buffer.ToConstVector().start(), two_byte_buffer.length());
    198 
    199   return result;
    200 }
    201 
    202 namespace {  // anonymous namespace for EncodeURI helper functions
    203 bool IsUnescapePredicateInUriComponent(uc16 c) {
    204   if (IsAlphaNumeric(c)) {
    205     return true;
    206   }
    207 
    208   switch (c) {
    209     case '!':
    210     case '\'':
    211     case '(':
    212     case ')':
    213     case '*':
    214     case '-':
    215     case '.':
    216     case '_':
    217     case '~':
    218       return true;
    219     default:
    220       return false;
    221   }
    222 }
    223 
    224 bool IsUriSeparator(uc16 c) {
    225   switch (c) {
    226     case '#':
    227     case ':':
    228     case ';':
    229     case '/':
    230     case '?':
    231     case '$':
    232     case '&':
    233     case '+':
    234     case ',':
    235     case '@':
    236     case '=':
    237       return true;
    238     default:
    239       return false;
    240   }
    241 }
    242 
    243 void AddEncodedOctetToBuffer(uint8_t octet, List<uint8_t>* buffer) {
    244   buffer->Add('%');
    245   buffer->Add(HexCharOfValue(octet >> 4));
    246   buffer->Add(HexCharOfValue(octet & 0x0F));
    247 }
    248 
    249 void EncodeSingle(uc16 c, List<uint8_t>* buffer) {
    250   char s[4] = {};
    251   int number_of_bytes;
    252   number_of_bytes =
    253       unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
    254   for (int k = 0; k < number_of_bytes; k++) {
    255     AddEncodedOctetToBuffer(s[k], buffer);
    256   }
    257 }
    258 
    259 void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {
    260   char s[4] = {};
    261   int number_of_bytes =
    262       unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
    263                             unibrow::Utf16::kNoPreviousCharacter, false);
    264   for (int k = 0; k < number_of_bytes; k++) {
    265     AddEncodedOctetToBuffer(s[k], buffer);
    266   }
    267 }
    268 
    269 }  // anonymous namespace
    270 
    271 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
    272                                 bool is_uri) {
    273   uri = String::Flatten(uri);
    274   int uri_length = uri->length();
    275   List<uint8_t> buffer(uri_length);
    276 
    277   {
    278     DisallowHeapAllocation no_gc;
    279     String::FlatContent uri_content = uri->GetFlatContent();
    280 
    281     for (int k = 0; k < uri_length; k++) {
    282       uc16 cc1 = uri_content.Get(k);
    283       if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
    284         k++;
    285         if (k < uri_length) {
    286           uc16 cc2 = uri->Get(k);
    287           if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
    288             EncodePair(cc1, cc2, &buffer);
    289             continue;
    290           }
    291         }
    292       } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
    293         if (IsUnescapePredicateInUriComponent(cc1) ||
    294             (is_uri && IsUriSeparator(cc1))) {
    295           buffer.Add(cc1);
    296         } else {
    297           EncodeSingle(cc1, &buffer);
    298         }
    299         continue;
    300       }
    301 
    302       AllowHeapAllocation allocate_error_and_return;
    303       THROW_NEW_ERROR(isolate, NewURIError(), String);
    304     }
    305   }
    306 
    307   return isolate->factory()->NewStringFromOneByte(buffer.ToConstVector());
    308 }
    309 
    310 namespace {  // Anonymous namespace for Escape and Unescape
    311 
    312 template <typename Char>
    313 int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
    314   uint16_t character = vector[i];
    315   int32_t hi = 0;
    316   int32_t lo = 0;
    317   if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
    318       (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
    319       (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
    320     *step = 6;
    321     return (hi << 8) + lo;
    322   } else if (character == '%' && i <= length - 3 &&
    323              (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
    324     *step = 3;
    325     return lo;
    326   } else {
    327     *step = 1;
    328     return character;
    329   }
    330 }
    331 
    332 template <typename Char>
    333 MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
    334                                  int start_index) {
    335   bool one_byte = true;
    336   int length = string->length();
    337 
    338   int unescaped_length = 0;
    339   {
    340     DisallowHeapAllocation no_allocation;
    341     Vector<const Char> vector = string->GetCharVector<Char>();
    342     for (int i = start_index; i < length; unescaped_length++) {
    343       int step;
    344       if (UnescapeChar(vector, i, length, &step) >
    345           String::kMaxOneByteCharCode) {
    346         one_byte = false;
    347       }
    348       i += step;
    349     }
    350   }
    351 
    352   DCHECK(start_index < length);
    353   Handle<String> first_part =
    354       isolate->factory()->NewProperSubString(string, 0, start_index);
    355 
    356   int dest_position = 0;
    357   Handle<String> second_part;
    358   DCHECK(unescaped_length <= String::kMaxLength);
    359   if (one_byte) {
    360     Handle<SeqOneByteString> dest = isolate->factory()
    361                                         ->NewRawOneByteString(unescaped_length)
    362                                         .ToHandleChecked();
    363     DisallowHeapAllocation no_allocation;
    364     Vector<const Char> vector = string->GetCharVector<Char>();
    365     for (int i = start_index; i < length; dest_position++) {
    366       int step;
    367       dest->SeqOneByteStringSet(dest_position,
    368                                 UnescapeChar(vector, i, length, &step));
    369       i += step;
    370     }
    371     second_part = dest;
    372   } else {
    373     Handle<SeqTwoByteString> dest = isolate->factory()
    374                                         ->NewRawTwoByteString(unescaped_length)
    375                                         .ToHandleChecked();
    376     DisallowHeapAllocation no_allocation;
    377     Vector<const Char> vector = string->GetCharVector<Char>();
    378     for (int i = start_index; i < length; dest_position++) {
    379       int step;
    380       dest->SeqTwoByteStringSet(dest_position,
    381                                 UnescapeChar(vector, i, length, &step));
    382       i += step;
    383     }
    384     second_part = dest;
    385   }
    386   return isolate->factory()->NewConsString(first_part, second_part);
    387 }
    388 
    389 bool IsNotEscaped(uint16_t c) {
    390   if (IsAlphaNumeric(c)) {
    391     return true;
    392   }
    393   //  @*_+-./
    394   switch (c) {
    395     case '@':
    396     case '*':
    397     case '_':
    398     case '+':
    399     case '-':
    400     case '.':
    401     case '/':
    402       return true;
    403     default:
    404       return false;
    405   }
    406 }
    407 
    408 template <typename Char>
    409 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
    410                                            Handle<String> source) {
    411   int index;
    412   {
    413     DisallowHeapAllocation no_allocation;
    414     StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
    415     index = search.Search(source->GetCharVector<Char>(), 0);
    416     if (index < 0) return source;
    417   }
    418   return UnescapeSlow<Char>(isolate, source, index);
    419 }
    420 
    421 template <typename Char>
    422 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
    423                                          Handle<String> string) {
    424   DCHECK(string->IsFlat());
    425   int escaped_length = 0;
    426   int length = string->length();
    427 
    428   {
    429     DisallowHeapAllocation no_allocation;
    430     Vector<const Char> vector = string->GetCharVector<Char>();
    431     for (int i = 0; i < length; i++) {
    432       uint16_t c = vector[i];
    433       if (c >= 256) {
    434         escaped_length += 6;
    435       } else if (IsNotEscaped(c)) {
    436         escaped_length++;
    437       } else {
    438         escaped_length += 3;
    439       }
    440 
    441       // We don't allow strings that are longer than a maximal length.
    442       DCHECK(String::kMaxLength < 0x7fffffff - 6);     // Cannot overflow.
    443       if (escaped_length > String::kMaxLength) break;  // Provoke exception.
    444     }
    445   }
    446 
    447   // No length change implies no change.  Return original string if no change.
    448   if (escaped_length == length) return string;
    449 
    450   Handle<SeqOneByteString> dest;
    451   ASSIGN_RETURN_ON_EXCEPTION(
    452       isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
    453       String);
    454   int dest_position = 0;
    455 
    456   {
    457     DisallowHeapAllocation no_allocation;
    458     Vector<const Char> vector = string->GetCharVector<Char>();
    459     for (int i = 0; i < length; i++) {
    460       uint16_t c = vector[i];
    461       if (c >= 256) {
    462         dest->SeqOneByteStringSet(dest_position, '%');
    463         dest->SeqOneByteStringSet(dest_position + 1, 'u');
    464         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
    465         dest->SeqOneByteStringSet(dest_position + 3,
    466                                   HexCharOfValue((c >> 8) & 0xf));
    467         dest->SeqOneByteStringSet(dest_position + 4,
    468                                   HexCharOfValue((c >> 4) & 0xf));
    469         dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf));
    470         dest_position += 6;
    471       } else if (IsNotEscaped(c)) {
    472         dest->SeqOneByteStringSet(dest_position, c);
    473         dest_position++;
    474       } else {
    475         dest->SeqOneByteStringSet(dest_position, '%');
    476         dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
    477         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf));
    478         dest_position += 3;
    479       }
    480     }
    481   }
    482 
    483   return dest;
    484 }
    485 
    486 }  // Anonymous namespace
    487 
    488 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
    489   Handle<String> result;
    490   string = String::Flatten(string);
    491   return string->IsOneByteRepresentationUnderneath()
    492              ? EscapePrivate<uint8_t>(isolate, string)
    493              : EscapePrivate<uc16>(isolate, string);
    494 }
    495 
    496 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
    497   Handle<String> result;
    498   string = String::Flatten(string);
    499   return string->IsOneByteRepresentationUnderneath()
    500              ? UnescapePrivate<uint8_t>(isolate, string)
    501              : UnescapePrivate<uc16>(isolate, string);
    502 }
    503 
    504 }  // namespace internal
    505 }  // namespace v8
    506