Home | History | Annotate | Download | only in cctest
      1 // Copyright 2008 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 
     29 #include <stdlib.h>
     30 
     31 #include "v8.h"
     32 
     33 #include "string-stream.h"
     34 #include "cctest.h"
     35 #include "zone-inl.h"
     36 #include "parser.h"
     37 #include "ast.h"
     38 #include "jsregexp.h"
     39 #include "regexp-macro-assembler.h"
     40 #include "regexp-macro-assembler-irregexp.h"
     41 #ifdef V8_NATIVE_REGEXP
     42 #ifdef V8_TARGET_ARCH_ARM
     43 #include "arm/macro-assembler-arm.h"
     44 #include "arm/regexp-macro-assembler-arm.h"
     45 #endif
     46 #ifdef V8_TARGET_ARCH_X64
     47 #include "x64/macro-assembler-x64.h"
     48 #include "x64/regexp-macro-assembler-x64.h"
     49 #endif
     50 #ifdef V8_TARGET_ARCH_IA32
     51 #include "ia32/macro-assembler-ia32.h"
     52 #include "ia32/regexp-macro-assembler-ia32.h"
     53 #endif
     54 #else
     55 #include "interpreter-irregexp.h"
     56 #endif
     57 
     58 using namespace v8::internal;
     59 
     60 
     61 static bool CheckParse(const char* input) {
     62   V8::Initialize(NULL);
     63   v8::HandleScope scope;
     64   ZoneScope zone_scope(DELETE_ON_EXIT);
     65   FlatStringReader reader(CStrVector(input));
     66   RegExpCompileData result;
     67   return v8::internal::ParseRegExp(&reader, false, &result);
     68 }
     69 
     70 
     71 static SmartPointer<const char> Parse(const char* input) {
     72   V8::Initialize(NULL);
     73   v8::HandleScope scope;
     74   ZoneScope zone_scope(DELETE_ON_EXIT);
     75   FlatStringReader reader(CStrVector(input));
     76   RegExpCompileData result;
     77   CHECK(v8::internal::ParseRegExp(&reader, false, &result));
     78   CHECK(result.tree != NULL);
     79   CHECK(result.error.is_null());
     80   SmartPointer<const char> output = result.tree->ToString();
     81   return output;
     82 }
     83 
     84 static bool CheckSimple(const char* input) {
     85   V8::Initialize(NULL);
     86   v8::HandleScope scope;
     87   unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
     88   ZoneScope zone_scope(DELETE_ON_EXIT);
     89   FlatStringReader reader(CStrVector(input));
     90   RegExpCompileData result;
     91   CHECK(v8::internal::ParseRegExp(&reader, false, &result));
     92   CHECK(result.tree != NULL);
     93   CHECK(result.error.is_null());
     94   return result.simple;
     95 }
     96 
     97 struct MinMaxPair {
     98   int min_match;
     99   int max_match;
    100 };
    101 
    102 static MinMaxPair CheckMinMaxMatch(const char* input) {
    103   V8::Initialize(NULL);
    104   v8::HandleScope scope;
    105   unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
    106   ZoneScope zone_scope(DELETE_ON_EXIT);
    107   FlatStringReader reader(CStrVector(input));
    108   RegExpCompileData result;
    109   CHECK(v8::internal::ParseRegExp(&reader, false, &result));
    110   CHECK(result.tree != NULL);
    111   CHECK(result.error.is_null());
    112   int min_match = result.tree->min_match();
    113   int max_match = result.tree->max_match();
    114   MinMaxPair pair = { min_match, max_match };
    115   return pair;
    116 }
    117 
    118 
    119 #define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
    120 #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
    121 #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
    122 #define CHECK_MIN_MAX(input, min, max)                                         \
    123   { MinMaxPair min_max = CheckMinMaxMatch(input);                              \
    124     CHECK_EQ(min, min_max.min_match);                                          \
    125     CHECK_EQ(max, min_max.max_match);                                          \
    126   }
    127 
    128 TEST(Parser) {
    129   V8::Initialize(NULL);
    130 
    131   CHECK_PARSE_ERROR("?");
    132 
    133   CHECK_PARSE_EQ("abc", "'abc'");
    134   CHECK_PARSE_EQ("", "%");
    135   CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')");
    136   CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
    137   CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)");
    138   CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
    139   CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
    140   CHECK_PARSE_EQ("a*", "(# 0 - g 'a')");
    141   CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')");
    142   CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))");
    143   CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))");
    144   CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))");
    145   CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))");
    146   CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
    147   CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
    148   CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
    149   CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
    150   CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
    151   CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
    152   CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
    153   CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
    154   CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
    155   CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
    156   CHECK_PARSE_EQ("(?:foo)", "'foo'");
    157   CHECK_PARSE_EQ("(?: foo )", "' foo '");
    158   CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
    159   CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
    160   CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
    161   CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
    162   CHECK_PARSE_EQ("()", "(^ %)");
    163   CHECK_PARSE_EQ("(?=)", "(-> + %)");
    164   CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]");   // Doesn't compile on windows
    165   CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]");   // \uffff isn't in codepage 1252
    166   CHECK_PARSE_EQ("[x]", "[x]");
    167   CHECK_PARSE_EQ("[xyz]", "[x y z]");
    168   CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
    169   CHECK_PARSE_EQ("[-123]", "[- 1 2 3]");
    170   CHECK_PARSE_EQ("[^123]", "^[1 2 3]");
    171   CHECK_PARSE_EQ("]", "']'");
    172   CHECK_PARSE_EQ("}", "'}'");
    173   CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
    174   CHECK_PARSE_EQ("[\\d]", "[0-9]");
    175   CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
    176   CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
    177   CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
    178   CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
    179   CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
    180                  "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
    181   CHECK_PARSE_EQ("\\c!", "'c!'");
    182   CHECK_PARSE_EQ("\\c_", "'c_'");
    183   CHECK_PARSE_EQ("\\c~", "'c~'");
    184   CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
    185   CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
    186   CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
    187   CHECK_PARSE_EQ("\\0", "'\\x00'");
    188   CHECK_PARSE_EQ("\\8", "'8'");
    189   CHECK_PARSE_EQ("\\9", "'9'");
    190   CHECK_PARSE_EQ("\\11", "'\\x09'");
    191   CHECK_PARSE_EQ("\\11a", "'\\x09a'");
    192   CHECK_PARSE_EQ("\\011", "'\\x09'");
    193   CHECK_PARSE_EQ("\\00011", "'\\x0011'");
    194   CHECK_PARSE_EQ("\\118", "'\\x098'");
    195   CHECK_PARSE_EQ("\\111", "'I'");
    196   CHECK_PARSE_EQ("\\1111", "'I1'");
    197   CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
    198   CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
    199   CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
    200   CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
    201   CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')"
    202                                " (# 0 - g (<- 1)))");
    203   CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')"
    204                                " (# 0 - g (<- 2)))");
    205   CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')"
    206                                " (# 0 - g (<- 3)))");
    207   CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')"
    208                                " (# 0 - g '\\x04'))");
    209   CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
    210               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
    211               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
    212   CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
    213               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
    214               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
    215   CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
    216   CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
    217   CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
    218   CHECK_PARSE_EQ("(?=a)?a", "'a'");
    219   CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
    220   CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
    221   CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
    222   CHECK_PARSE_EQ("(?!a)?a", "'a'");
    223   CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
    224   CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
    225   CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
    226   CHECK_PARSE_EQ("[\\0]", "[\\x00]");
    227   CHECK_PARSE_EQ("[\\11]", "[\\x09]");
    228   CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
    229   CHECK_PARSE_EQ("[\\011]", "[\\x09]");
    230   CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]");
    231   CHECK_PARSE_EQ("[\\118]", "[\\x09 8]");
    232   CHECK_PARSE_EQ("[\\111]", "[I]");
    233   CHECK_PARSE_EQ("[\\1111]", "[I 1]");
    234   CHECK_PARSE_EQ("\\x34", "'\x34'");
    235   CHECK_PARSE_EQ("\\x60", "'\x60'");
    236   CHECK_PARSE_EQ("\\x3z", "'x3z'");
    237   CHECK_PARSE_EQ("\\c", "'c'");
    238   CHECK_PARSE_EQ("\\u0034", "'\x34'");
    239   CHECK_PARSE_EQ("\\u003z", "'u003z'");
    240   CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
    241 
    242   CHECK_SIMPLE("a", true);
    243   CHECK_SIMPLE("a|b", false);
    244   CHECK_SIMPLE("a\\n", false);
    245   CHECK_SIMPLE("^a", false);
    246   CHECK_SIMPLE("a$", false);
    247   CHECK_SIMPLE("a\\b!", false);
    248   CHECK_SIMPLE("a\\Bb", false);
    249   CHECK_SIMPLE("a*", false);
    250   CHECK_SIMPLE("a*?", false);
    251   CHECK_SIMPLE("a?", false);
    252   CHECK_SIMPLE("a??", false);
    253   CHECK_SIMPLE("a{0,1}?", false);
    254   CHECK_SIMPLE("a{1,1}?", false);
    255   CHECK_SIMPLE("a{1,2}?", false);
    256   CHECK_SIMPLE("a+?", false);
    257   CHECK_SIMPLE("(a)", false);
    258   CHECK_SIMPLE("(a)\\1", false);
    259   CHECK_SIMPLE("(\\1a)", false);
    260   CHECK_SIMPLE("\\1(a)", false);
    261   CHECK_SIMPLE("a\\s", false);
    262   CHECK_SIMPLE("a\\S", false);
    263   CHECK_SIMPLE("a\\d", false);
    264   CHECK_SIMPLE("a\\D", false);
    265   CHECK_SIMPLE("a\\w", false);
    266   CHECK_SIMPLE("a\\W", false);
    267   CHECK_SIMPLE("a.", false);
    268   CHECK_SIMPLE("a\\q", false);
    269   CHECK_SIMPLE("a[a]", false);
    270   CHECK_SIMPLE("a[^a]", false);
    271   CHECK_SIMPLE("a[a-z]", false);
    272   CHECK_SIMPLE("a[\\q]", false);
    273   CHECK_SIMPLE("a(?:b)", false);
    274   CHECK_SIMPLE("a(?=b)", false);
    275   CHECK_SIMPLE("a(?!b)", false);
    276   CHECK_SIMPLE("\\x60", false);
    277   CHECK_SIMPLE("\\u0060", false);
    278   CHECK_SIMPLE("\\cA", false);
    279   CHECK_SIMPLE("\\q", false);
    280   CHECK_SIMPLE("\\1112", false);
    281   CHECK_SIMPLE("\\0", false);
    282   CHECK_SIMPLE("(a)\\1", false);
    283   CHECK_SIMPLE("(?=a)?a", false);
    284   CHECK_SIMPLE("(?!a)?a\\1", false);
    285   CHECK_SIMPLE("(?:(?=a))a\\1", false);
    286 
    287   CHECK_PARSE_EQ("a{}", "'a{}'");
    288   CHECK_PARSE_EQ("a{,}", "'a{,}'");
    289   CHECK_PARSE_EQ("a{", "'a{'");
    290   CHECK_PARSE_EQ("a{z}", "'a{z}'");
    291   CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
    292   CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
    293   CHECK_PARSE_EQ("a{12,", "'a{12,'");
    294   CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
    295   CHECK_PARSE_EQ("{}", "'{}'");
    296   CHECK_PARSE_EQ("{,}", "'{,}'");
    297   CHECK_PARSE_EQ("{", "'{'");
    298   CHECK_PARSE_EQ("{z}", "'{z}'");
    299   CHECK_PARSE_EQ("{1z}", "'{1z}'");
    300   CHECK_PARSE_EQ("{12z}", "'{12z}'");
    301   CHECK_PARSE_EQ("{12,", "'{12,'");
    302   CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
    303 
    304   CHECK_MIN_MAX("a", 1, 1);
    305   CHECK_MIN_MAX("abc", 3, 3);
    306   CHECK_MIN_MAX("a[bc]d", 3, 3);
    307   CHECK_MIN_MAX("a|bc", 1, 2);
    308   CHECK_MIN_MAX("ab|c", 1, 2);
    309   CHECK_MIN_MAX("a||bc", 0, 2);
    310   CHECK_MIN_MAX("|", 0, 0);
    311   CHECK_MIN_MAX("(?:ab)", 2, 2);
    312   CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
    313   CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
    314   CHECK_MIN_MAX("(ab)", 2, 2);
    315   CHECK_MIN_MAX("(ab|cde)", 2, 3);
    316   CHECK_MIN_MAX("(ab)\\1", 2, 4);
    317   CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
    318   CHECK_MIN_MAX("(?:ab)?", 0, 2);
    319   CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
    320   CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
    321   CHECK_MIN_MAX("a?", 0, 1);
    322   CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
    323   CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
    324   CHECK_MIN_MAX("a??", 0, 1);
    325   CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
    326   CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
    327   CHECK_MIN_MAX("(?:a?)?", 0, 1);
    328   CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
    329   CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
    330   CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
    331   CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
    332   CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
    333   CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
    334   CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
    335   CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
    336   CHECK_MIN_MAX("a{0}", 0, 0);
    337   CHECK_MIN_MAX("(?:a+){0}", 0, 0);
    338   CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
    339   CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
    340   CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
    341   CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
    342   CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
    343   CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
    344   CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
    345   CHECK_MIN_MAX("a\\bc", 2, 2);
    346   CHECK_MIN_MAX("a\\Bc", 2, 2);
    347   CHECK_MIN_MAX("a\\sc", 3, 3);
    348   CHECK_MIN_MAX("a\\Sc", 3, 3);
    349   CHECK_MIN_MAX("a(?=b)c", 2, 2);
    350   CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
    351   CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
    352 }
    353 
    354 TEST(ParserRegression) {
    355   CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
    356   CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
    357   CHECK_PARSE_EQ("{", "'{'");
    358   CHECK_PARSE_EQ("a|", "(| 'a' %)");
    359 }
    360 
    361 static void ExpectError(const char* input,
    362                         const char* expected) {
    363   V8::Initialize(NULL);
    364   v8::HandleScope scope;
    365   ZoneScope zone_scope(DELETE_ON_EXIT);
    366   FlatStringReader reader(CStrVector(input));
    367   RegExpCompileData result;
    368   CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result));
    369   CHECK(result.tree == NULL);
    370   CHECK(!result.error.is_null());
    371   SmartPointer<char> str = result.error->ToCString(ALLOW_NULLS);
    372   CHECK_EQ(expected, *str);
    373 }
    374 
    375 
    376 TEST(Errors) {
    377   V8::Initialize(NULL);
    378   const char* kEndBackslash = "\\ at end of pattern";
    379   ExpectError("\\", kEndBackslash);
    380   const char* kUnterminatedGroup = "Unterminated group";
    381   ExpectError("(foo", kUnterminatedGroup);
    382   const char* kInvalidGroup = "Invalid group";
    383   ExpectError("(?", kInvalidGroup);
    384   const char* kUnterminatedCharacterClass = "Unterminated character class";
    385   ExpectError("[", kUnterminatedCharacterClass);
    386   ExpectError("[a-", kUnterminatedCharacterClass);
    387   const char* kNothingToRepeat = "Nothing to repeat";
    388   ExpectError("*", kNothingToRepeat);
    389   ExpectError("?", kNothingToRepeat);
    390   ExpectError("+", kNothingToRepeat);
    391   ExpectError("{1}", kNothingToRepeat);
    392   ExpectError("{1,2}", kNothingToRepeat);
    393   ExpectError("{1,}", kNothingToRepeat);
    394 
    395   // Check that we don't allow more than kMaxCapture captures
    396   const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
    397   const char* kTooManyCaptures = "Too many captures";
    398   HeapStringAllocator allocator;
    399   StringStream accumulator(&allocator);
    400   for (int i = 0; i <= kMaxCaptures; i++) {
    401     accumulator.Add("()");
    402   }
    403   SmartPointer<const char> many_captures(accumulator.ToCString());
    404   ExpectError(*many_captures, kTooManyCaptures);
    405 }
    406 
    407 
    408 static bool IsDigit(uc16 c) {
    409   return ('0' <= c && c <= '9');
    410 }
    411 
    412 
    413 static bool NotDigit(uc16 c) {
    414   return !IsDigit(c);
    415 }
    416 
    417 
    418 static bool IsWhiteSpace(uc16 c) {
    419   switch (c) {
    420     case 0x09:
    421     case 0x0A:
    422     case 0x0B:
    423     case 0x0C:
    424     case 0x0d:
    425     case 0x20:
    426     case 0xA0:
    427     case 0x2028:
    428     case 0x2029:
    429       return true;
    430     default:
    431       return unibrow::Space::Is(c);
    432   }
    433 }
    434 
    435 
    436 static bool NotWhiteSpace(uc16 c) {
    437   return !IsWhiteSpace(c);
    438 }
    439 
    440 
    441 static bool NotWord(uc16 c) {
    442   return !IsRegExpWord(c);
    443 }
    444 
    445 
    446 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
    447   ZoneScope scope(DELETE_ON_EXIT);
    448   ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
    449   CharacterRange::AddClassEscape(c, ranges);
    450   for (unsigned i = 0; i < (1 << 16); i++) {
    451     bool in_class = false;
    452     for (int j = 0; !in_class && j < ranges->length(); j++) {
    453       CharacterRange& range = ranges->at(j);
    454       in_class = (range.from() <= i && i <= range.to());
    455     }
    456     CHECK_EQ(pred(i), in_class);
    457   }
    458 }
    459 
    460 
    461 TEST(CharacterClassEscapes) {
    462   TestCharacterClassEscapes('.', IsRegExpNewline);
    463   TestCharacterClassEscapes('d', IsDigit);
    464   TestCharacterClassEscapes('D', NotDigit);
    465   TestCharacterClassEscapes('s', IsWhiteSpace);
    466   TestCharacterClassEscapes('S', NotWhiteSpace);
    467   TestCharacterClassEscapes('w', IsRegExpWord);
    468   TestCharacterClassEscapes('W', NotWord);
    469 }
    470 
    471 
    472 static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
    473   V8::Initialize(NULL);
    474   FlatStringReader reader(CStrVector(input));
    475   RegExpCompileData compile_data;
    476   if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data))
    477     return NULL;
    478   Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input));
    479   RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
    480   return compile_data.node;
    481 }
    482 
    483 
    484 static void Execute(const char* input,
    485                     bool multiline,
    486                     bool is_ascii,
    487                     bool dot_output = false) {
    488   v8::HandleScope scope;
    489   ZoneScope zone_scope(DELETE_ON_EXIT);
    490   RegExpNode* node = Compile(input, multiline, is_ascii);
    491   USE(node);
    492 #ifdef DEBUG
    493   if (dot_output) {
    494     RegExpEngine::DotPrint(input, node, false);
    495     exit(0);
    496   }
    497 #endif  // DEBUG
    498 }
    499 
    500 
    501 class TestConfig {
    502  public:
    503   typedef int Key;
    504   typedef int Value;
    505   static const int kNoKey;
    506   static const int kNoValue;
    507   static inline int Compare(int a, int b) {
    508     if (a < b)
    509       return -1;
    510     else if (a > b)
    511       return 1;
    512     else
    513       return 0;
    514   }
    515 };
    516 
    517 
    518 const int TestConfig::kNoKey = 0;
    519 const int TestConfig::kNoValue = 0;
    520 
    521 
    522 static unsigned PseudoRandom(int i, int j) {
    523   return ~(~((i * 781) ^ (j * 329)));
    524 }
    525 
    526 
    527 TEST(SplayTreeSimple) {
    528   static const unsigned kLimit = 1000;
    529   ZoneScope zone_scope(DELETE_ON_EXIT);
    530   ZoneSplayTree<TestConfig> tree;
    531   bool seen[kLimit];
    532   for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
    533 #define CHECK_MAPS_EQUAL() do {                                      \
    534     for (unsigned k = 0; k < kLimit; k++)                            \
    535       CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
    536   } while (false)
    537   for (int i = 0; i < 50; i++) {
    538     for (int j = 0; j < 50; j++) {
    539       unsigned next = PseudoRandom(i, j) % kLimit;
    540       if (seen[next]) {
    541         // We've already seen this one.  Check the value and remove
    542         // it.
    543         ZoneSplayTree<TestConfig>::Locator loc;
    544         CHECK(tree.Find(next, &loc));
    545         CHECK_EQ(next, loc.key());
    546         CHECK_EQ(3 * next, loc.value());
    547         tree.Remove(next);
    548         seen[next] = false;
    549         CHECK_MAPS_EQUAL();
    550       } else {
    551         // Check that it wasn't there already and then add it.
    552         ZoneSplayTree<TestConfig>::Locator loc;
    553         CHECK(!tree.Find(next, &loc));
    554         CHECK(tree.Insert(next, &loc));
    555         CHECK_EQ(next, loc.key());
    556         loc.set_value(3 * next);
    557         seen[next] = true;
    558         CHECK_MAPS_EQUAL();
    559       }
    560       int val = PseudoRandom(j, i) % kLimit;
    561       if (seen[val]) {
    562         ZoneSplayTree<TestConfig>::Locator loc;
    563         CHECK(tree.FindGreatestLessThan(val, &loc));
    564         CHECK_EQ(loc.key(), val);
    565         break;
    566       }
    567       val = PseudoRandom(i + j, i - j) % kLimit;
    568       if (seen[val]) {
    569         ZoneSplayTree<TestConfig>::Locator loc;
    570         CHECK(tree.FindLeastGreaterThan(val, &loc));
    571         CHECK_EQ(loc.key(), val);
    572         break;
    573       }
    574     }
    575   }
    576 }
    577 
    578 
    579 TEST(DispatchTableConstruction) {
    580   // Initialize test data.
    581   static const int kLimit = 1000;
    582   static const int kRangeCount = 8;
    583   static const int kRangeSize = 16;
    584   uc16 ranges[kRangeCount][2 * kRangeSize];
    585   for (int i = 0; i < kRangeCount; i++) {
    586     Vector<uc16> range(ranges[i], 2 * kRangeSize);
    587     for (int j = 0; j < 2 * kRangeSize; j++) {
    588       range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
    589     }
    590     range.Sort();
    591     for (int j = 1; j < 2 * kRangeSize; j++) {
    592       CHECK(range[j-1] <= range[j]);
    593     }
    594   }
    595   // Enter test data into dispatch table.
    596   ZoneScope zone_scope(DELETE_ON_EXIT);
    597   DispatchTable table;
    598   for (int i = 0; i < kRangeCount; i++) {
    599     uc16* range = ranges[i];
    600     for (int j = 0; j < 2 * kRangeSize; j += 2)
    601       table.AddRange(CharacterRange(range[j], range[j + 1]), i);
    602   }
    603   // Check that the table looks as we would expect
    604   for (int p = 0; p < kLimit; p++) {
    605     OutSet* outs = table.Get(p);
    606     for (int j = 0; j < kRangeCount; j++) {
    607       uc16* range = ranges[j];
    608       bool is_on = false;
    609       for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
    610         is_on = (range[k] <= p && p <= range[k + 1]);
    611       CHECK_EQ(is_on, outs->Get(j));
    612     }
    613   }
    614 }
    615 
    616 // Test of debug-only syntax.
    617 #ifdef DEBUG
    618 
    619 TEST(ParsePossessiveRepetition) {
    620   bool old_flag_value = FLAG_regexp_possessive_quantifier;
    621 
    622   // Enable possessive quantifier syntax.
    623   FLAG_regexp_possessive_quantifier = true;
    624 
    625   CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
    626   CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
    627   CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
    628   CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
    629   CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
    630 
    631   // Disable possessive quantifier syntax.
    632   FLAG_regexp_possessive_quantifier = false;
    633 
    634   CHECK_PARSE_ERROR("a*+");
    635   CHECK_PARSE_ERROR("a++");
    636   CHECK_PARSE_ERROR("a?+");
    637   CHECK_PARSE_ERROR("a{10,20}+");
    638   CHECK_PARSE_ERROR("a{10,20}+b");
    639 
    640   FLAG_regexp_possessive_quantifier = old_flag_value;
    641 }
    642 
    643 #endif
    644 
    645 // Tests of interpreter.
    646 
    647 
    648 #ifdef V8_NATIVE_REGEXP
    649 
    650 #if V8_TARGET_ARCH_IA32
    651 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
    652 #elif V8_TARGET_ARCH_X64
    653 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
    654 #elif V8_TARGET_ARCH_ARM
    655 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
    656 #elif V8_TARGET_ARCH_MIPS
    657 typedef RegExpMacroAssembler ArchRegExpMacroAssembler;
    658 #endif
    659 
    660 class ContextInitializer {
    661  public:
    662   ContextInitializer()
    663       : env_(), scope_(), zone_(DELETE_ON_EXIT), stack_guard_() {
    664     env_ = v8::Context::New();
    665     env_->Enter();
    666   }
    667   ~ContextInitializer() {
    668     env_->Exit();
    669     env_.Dispose();
    670   }
    671  private:
    672   v8::Persistent<v8::Context> env_;
    673   v8::HandleScope scope_;
    674   v8::internal::ZoneScope zone_;
    675   v8::internal::StackGuard stack_guard_;
    676 };
    677 
    678 
    679 static ArchRegExpMacroAssembler::Result Execute(Code* code,
    680                                                 String* input,
    681                                                 int start_offset,
    682                                                 const byte* input_start,
    683                                                 const byte* input_end,
    684                                                 int* captures) {
    685   return NativeRegExpMacroAssembler::Execute(
    686       code,
    687       input,
    688       start_offset,
    689       input_start,
    690       input_end,
    691       captures);
    692 }
    693 
    694 
    695 TEST(MacroAssemblerNativeSuccess) {
    696   v8::V8::Initialize();
    697   ContextInitializer initializer;
    698 
    699   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
    700 
    701   m.Succeed();
    702 
    703   Handle<String> source = Factory::NewStringFromAscii(CStrVector(""));
    704   Handle<Object> code_object = m.GetCode(source);
    705   Handle<Code> code = Handle<Code>::cast(code_object);
    706 
    707   int captures[4] = {42, 37, 87, 117};
    708   Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
    709   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
    710   const byte* start_adr =
    711       reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
    712 
    713   NativeRegExpMacroAssembler::Result result =
    714       Execute(*code,
    715               *input,
    716               0,
    717               start_adr,
    718               start_adr + seq_input->length(),
    719               captures);
    720 
    721   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
    722   CHECK_EQ(-1, captures[0]);
    723   CHECK_EQ(-1, captures[1]);
    724   CHECK_EQ(-1, captures[2]);
    725   CHECK_EQ(-1, captures[3]);
    726 }
    727 
    728 
    729 TEST(MacroAssemblerNativeSimple) {
    730   v8::V8::Initialize();
    731   ContextInitializer initializer;
    732 
    733   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
    734 
    735   uc16 foo_chars[3] = {'f', 'o', 'o'};
    736   Vector<const uc16> foo(foo_chars, 3);
    737 
    738   Label fail;
    739   m.CheckCharacters(foo, 0, &fail, true);
    740   m.WriteCurrentPositionToRegister(0, 0);
    741   m.AdvanceCurrentPosition(3);
    742   m.WriteCurrentPositionToRegister(1, 0);
    743   m.Succeed();
    744   m.Bind(&fail);
    745   m.Fail();
    746 
    747   Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo"));
    748   Handle<Object> code_object = m.GetCode(source);
    749   Handle<Code> code = Handle<Code>::cast(code_object);
    750 
    751   int captures[4] = {42, 37, 87, 117};
    752   Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
    753   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
    754   Address start_adr = seq_input->GetCharsAddress();
    755 
    756   NativeRegExpMacroAssembler::Result result =
    757       Execute(*code,
    758               *input,
    759               0,
    760               start_adr,
    761               start_adr + input->length(),
    762               captures);
    763 
    764   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
    765   CHECK_EQ(0, captures[0]);
    766   CHECK_EQ(3, captures[1]);
    767   CHECK_EQ(-1, captures[2]);
    768   CHECK_EQ(-1, captures[3]);
    769 
    770   input = Factory::NewStringFromAscii(CStrVector("barbarbar"));
    771   seq_input = Handle<SeqAsciiString>::cast(input);
    772   start_adr = seq_input->GetCharsAddress();
    773 
    774   result = Execute(*code,
    775                    *input,
    776                    0,
    777                    start_adr,
    778                    start_adr + input->length(),
    779                    captures);
    780 
    781   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
    782 }
    783 
    784 
    785 TEST(MacroAssemblerNativeSimpleUC16) {
    786   v8::V8::Initialize();
    787   ContextInitializer initializer;
    788 
    789   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
    790 
    791   uc16 foo_chars[3] = {'f', 'o', 'o'};
    792   Vector<const uc16> foo(foo_chars, 3);
    793 
    794   Label fail;
    795   m.CheckCharacters(foo, 0, &fail, true);
    796   m.WriteCurrentPositionToRegister(0, 0);
    797   m.AdvanceCurrentPosition(3);
    798   m.WriteCurrentPositionToRegister(1, 0);
    799   m.Succeed();
    800   m.Bind(&fail);
    801   m.Fail();
    802 
    803   Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo"));
    804   Handle<Object> code_object = m.GetCode(source);
    805   Handle<Code> code = Handle<Code>::cast(code_object);
    806 
    807   int captures[4] = {42, 37, 87, 117};
    808   const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', '\xa0'};
    809   Handle<String> input =
    810       Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
    811   Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
    812   Address start_adr = seq_input->GetCharsAddress();
    813 
    814   NativeRegExpMacroAssembler::Result result =
    815       Execute(*code,
    816               *input,
    817               0,
    818               start_adr,
    819               start_adr + input->length(),
    820               captures);
    821 
    822   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
    823   CHECK_EQ(0, captures[0]);
    824   CHECK_EQ(3, captures[1]);
    825   CHECK_EQ(-1, captures[2]);
    826   CHECK_EQ(-1, captures[3]);
    827 
    828   const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', '\xa0'};
    829   input = Factory::NewStringFromTwoByte(Vector<const uc16>(input_data2, 9));
    830   seq_input = Handle<SeqTwoByteString>::cast(input);
    831   start_adr = seq_input->GetCharsAddress();
    832 
    833   result = Execute(*code,
    834                    *input,
    835                    0,
    836                    start_adr,
    837                    start_adr + input->length() * 2,
    838                    captures);
    839 
    840   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
    841 }
    842 
    843 
    844 TEST(MacroAssemblerNativeBacktrack) {
    845   v8::V8::Initialize();
    846   ContextInitializer initializer;
    847 
    848   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
    849 
    850   Label fail;
    851   Label backtrack;
    852   m.LoadCurrentCharacter(10, &fail);
    853   m.Succeed();
    854   m.Bind(&fail);
    855   m.PushBacktrack(&backtrack);
    856   m.LoadCurrentCharacter(10, NULL);
    857   m.Succeed();
    858   m.Bind(&backtrack);
    859   m.Fail();
    860 
    861   Handle<String> source = Factory::NewStringFromAscii(CStrVector(".........."));
    862   Handle<Object> code_object = m.GetCode(source);
    863   Handle<Code> code = Handle<Code>::cast(code_object);
    864 
    865   Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo"));
    866   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
    867   Address start_adr = seq_input->GetCharsAddress();
    868 
    869   NativeRegExpMacroAssembler::Result result =
    870       Execute(*code,
    871               *input,
    872               0,
    873               start_adr,
    874               start_adr + input->length(),
    875               NULL);
    876 
    877   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
    878 }
    879 
    880 
    881 TEST(MacroAssemblerNativeBackReferenceASCII) {
    882   v8::V8::Initialize();
    883   ContextInitializer initializer;
    884 
    885   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
    886 
    887   m.WriteCurrentPositionToRegister(0, 0);
    888   m.AdvanceCurrentPosition(2);
    889   m.WriteCurrentPositionToRegister(1, 0);
    890   Label nomatch;
    891   m.CheckNotBackReference(0, &nomatch);
    892   m.Fail();
    893   m.Bind(&nomatch);
    894   m.AdvanceCurrentPosition(2);
    895   Label missing_match;
    896   m.CheckNotBackReference(0, &missing_match);
    897   m.WriteCurrentPositionToRegister(2, 0);
    898   m.Succeed();
    899   m.Bind(&missing_match);
    900   m.Fail();
    901 
    902   Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1"));
    903   Handle<Object> code_object = m.GetCode(source);
    904   Handle<Code> code = Handle<Code>::cast(code_object);
    905 
    906   Handle<String> input = Factory::NewStringFromAscii(CStrVector("fooofo"));
    907   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
    908   Address start_adr = seq_input->GetCharsAddress();
    909 
    910   int output[4];
    911   NativeRegExpMacroAssembler::Result result =
    912       Execute(*code,
    913               *input,
    914               0,
    915               start_adr,
    916               start_adr + input->length(),
    917               output);
    918 
    919   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
    920   CHECK_EQ(0, output[0]);
    921   CHECK_EQ(2, output[1]);
    922   CHECK_EQ(6, output[2]);
    923   CHECK_EQ(-1, output[3]);
    924 }
    925 
    926 
    927 TEST(MacroAssemblerNativeBackReferenceUC16) {
    928   v8::V8::Initialize();
    929   ContextInitializer initializer;
    930 
    931   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
    932 
    933   m.WriteCurrentPositionToRegister(0, 0);
    934   m.AdvanceCurrentPosition(2);
    935   m.WriteCurrentPositionToRegister(1, 0);
    936   Label nomatch;
    937   m.CheckNotBackReference(0, &nomatch);
    938   m.Fail();
    939   m.Bind(&nomatch);
    940   m.AdvanceCurrentPosition(2);
    941   Label missing_match;
    942   m.CheckNotBackReference(0, &missing_match);
    943   m.WriteCurrentPositionToRegister(2, 0);
    944   m.Succeed();
    945   m.Bind(&missing_match);
    946   m.Fail();
    947 
    948   Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1"));
    949   Handle<Object> code_object = m.GetCode(source);
    950   Handle<Code> code = Handle<Code>::cast(code_object);
    951 
    952   const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
    953   Handle<String> input =
    954       Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
    955   Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
    956   Address start_adr = seq_input->GetCharsAddress();
    957 
    958   int output[4];
    959   NativeRegExpMacroAssembler::Result result =
    960       Execute(*code,
    961                   *input,
    962                   0,
    963                   start_adr,
    964                   start_adr + input->length() * 2,
    965                   output);
    966 
    967   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
    968   CHECK_EQ(0, output[0]);
    969   CHECK_EQ(2, output[1]);
    970   CHECK_EQ(6, output[2]);
    971   CHECK_EQ(-1, output[3]);
    972 }
    973 
    974 
    975 
    976 TEST(MacroAssemblernativeAtStart) {
    977   v8::V8::Initialize();
    978   ContextInitializer initializer;
    979 
    980   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
    981 
    982   Label not_at_start, newline, fail;
    983   m.CheckNotAtStart(&not_at_start);
    984   // Check that prevchar = '\n' and current = 'f'.
    985   m.CheckCharacter('\n', &newline);
    986   m.Bind(&fail);
    987   m.Fail();
    988   m.Bind(&newline);
    989   m.LoadCurrentCharacter(0, &fail);
    990   m.CheckNotCharacter('f', &fail);
    991   m.Succeed();
    992 
    993   m.Bind(&not_at_start);
    994   // Check that prevchar = 'o' and current = 'b'.
    995   Label prevo;
    996   m.CheckCharacter('o', &prevo);
    997   m.Fail();
    998   m.Bind(&prevo);
    999   m.LoadCurrentCharacter(0, &fail);
   1000   m.CheckNotCharacter('b', &fail);
   1001   m.Succeed();
   1002 
   1003   Handle<String> source = Factory::NewStringFromAscii(CStrVector("(^f|ob)"));
   1004   Handle<Object> code_object = m.GetCode(source);
   1005   Handle<Code> code = Handle<Code>::cast(code_object);
   1006 
   1007   Handle<String> input = Factory::NewStringFromAscii(CStrVector("foobar"));
   1008   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
   1009   Address start_adr = seq_input->GetCharsAddress();
   1010 
   1011   NativeRegExpMacroAssembler::Result result =
   1012       Execute(*code,
   1013               *input,
   1014               0,
   1015               start_adr,
   1016               start_adr + input->length(),
   1017               NULL);
   1018 
   1019   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
   1020 
   1021   result = Execute(*code,
   1022                    *input,
   1023                    3,
   1024                    start_adr + 3,
   1025                    start_adr + input->length(),
   1026                    NULL);
   1027 
   1028   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
   1029 }
   1030 
   1031 
   1032 TEST(MacroAssemblerNativeBackRefNoCase) {
   1033   v8::V8::Initialize();
   1034   ContextInitializer initializer;
   1035 
   1036   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
   1037 
   1038   Label fail, succ;
   1039 
   1040   m.WriteCurrentPositionToRegister(0, 0);
   1041   m.WriteCurrentPositionToRegister(2, 0);
   1042   m.AdvanceCurrentPosition(3);
   1043   m.WriteCurrentPositionToRegister(3, 0);
   1044   m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "AbC".
   1045   m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "ABC".
   1046   Label expected_fail;
   1047   m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
   1048   m.Bind(&fail);
   1049   m.Fail();
   1050 
   1051   m.Bind(&expected_fail);
   1052   m.AdvanceCurrentPosition(3);  // Skip "xYz"
   1053   m.CheckNotBackReferenceIgnoreCase(2, &succ);
   1054   m.Fail();
   1055 
   1056   m.Bind(&succ);
   1057   m.WriteCurrentPositionToRegister(1, 0);
   1058   m.Succeed();
   1059 
   1060   Handle<String> source =
   1061       Factory::NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)"));
   1062   Handle<Object> code_object = m.GetCode(source);
   1063   Handle<Code> code = Handle<Code>::cast(code_object);
   1064 
   1065   Handle<String> input =
   1066       Factory::NewStringFromAscii(CStrVector("aBcAbCABCxYzab"));
   1067   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
   1068   Address start_adr = seq_input->GetCharsAddress();
   1069 
   1070   int output[4];
   1071   NativeRegExpMacroAssembler::Result result =
   1072       Execute(*code,
   1073               *input,
   1074               0,
   1075               start_adr,
   1076               start_adr + input->length(),
   1077               output);
   1078 
   1079   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
   1080   CHECK_EQ(0, output[0]);
   1081   CHECK_EQ(12, output[1]);
   1082   CHECK_EQ(0, output[2]);
   1083   CHECK_EQ(3, output[3]);
   1084 }
   1085 
   1086 
   1087 
   1088 TEST(MacroAssemblerNativeRegisters) {
   1089   v8::V8::Initialize();
   1090   ContextInitializer initializer;
   1091 
   1092   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6);
   1093 
   1094   uc16 foo_chars[3] = {'f', 'o', 'o'};
   1095   Vector<const uc16> foo(foo_chars, 3);
   1096 
   1097   enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
   1098   Label fail;
   1099   Label backtrack;
   1100   m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
   1101   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
   1102   m.PushBacktrack(&backtrack);
   1103   m.WriteStackPointerToRegister(sp);
   1104   // Fill stack and registers
   1105   m.AdvanceCurrentPosition(2);
   1106   m.WriteCurrentPositionToRegister(out1, 0);
   1107   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
   1108   m.PushBacktrack(&fail);
   1109   // Drop backtrack stack frames.
   1110   m.ReadStackPointerFromRegister(sp);
   1111   // And take the first backtrack (to &backtrack)
   1112   m.Backtrack();
   1113 
   1114   m.PushCurrentPosition();
   1115   m.AdvanceCurrentPosition(2);
   1116   m.PopCurrentPosition();
   1117 
   1118   m.Bind(&backtrack);
   1119   m.PopRegister(out1);
   1120   m.ReadCurrentPositionFromRegister(out1);
   1121   m.AdvanceCurrentPosition(3);
   1122   m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
   1123 
   1124   Label loop;
   1125   m.SetRegister(loop_cnt, 0);  // loop counter
   1126   m.Bind(&loop);
   1127   m.AdvanceRegister(loop_cnt, 1);
   1128   m.AdvanceCurrentPosition(1);
   1129   m.IfRegisterLT(loop_cnt, 3, &loop);
   1130   m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
   1131 
   1132   Label loop2;
   1133   m.SetRegister(loop_cnt, 2);  // loop counter
   1134   m.Bind(&loop2);
   1135   m.AdvanceRegister(loop_cnt, -1);
   1136   m.AdvanceCurrentPosition(1);
   1137   m.IfRegisterGE(loop_cnt, 0, &loop2);
   1138   m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
   1139 
   1140   Label loop3;
   1141   Label exit_loop3;
   1142   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
   1143   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
   1144   m.ReadCurrentPositionFromRegister(out3);
   1145   m.Bind(&loop3);
   1146   m.AdvanceCurrentPosition(1);
   1147   m.CheckGreedyLoop(&exit_loop3);
   1148   m.GoTo(&loop3);
   1149   m.Bind(&exit_loop3);
   1150   m.PopCurrentPosition();
   1151   m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
   1152 
   1153   m.Succeed();
   1154 
   1155   m.Bind(&fail);
   1156   m.Fail();
   1157 
   1158   Handle<String> source =
   1159       Factory::NewStringFromAscii(CStrVector("<loop test>"));
   1160   Handle<Object> code_object = m.GetCode(source);
   1161   Handle<Code> code = Handle<Code>::cast(code_object);
   1162 
   1163   // String long enough for test (content doesn't matter).
   1164   Handle<String> input =
   1165       Factory::NewStringFromAscii(CStrVector("foofoofoofoofoo"));
   1166   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
   1167   Address start_adr = seq_input->GetCharsAddress();
   1168 
   1169   int output[6];
   1170   NativeRegExpMacroAssembler::Result result =
   1171       Execute(*code,
   1172               *input,
   1173               0,
   1174               start_adr,
   1175               start_adr + input->length(),
   1176               output);
   1177 
   1178   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
   1179   CHECK_EQ(0, output[0]);
   1180   CHECK_EQ(3, output[1]);
   1181   CHECK_EQ(6, output[2]);
   1182   CHECK_EQ(9, output[3]);
   1183   CHECK_EQ(9, output[4]);
   1184   CHECK_EQ(-1, output[5]);
   1185 }
   1186 
   1187 
   1188 TEST(MacroAssemblerStackOverflow) {
   1189   v8::V8::Initialize();
   1190   ContextInitializer initializer;
   1191 
   1192   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
   1193 
   1194   Label loop;
   1195   m.Bind(&loop);
   1196   m.PushBacktrack(&loop);
   1197   m.GoTo(&loop);
   1198 
   1199   Handle<String> source =
   1200       Factory::NewStringFromAscii(CStrVector("<stack overflow test>"));
   1201   Handle<Object> code_object = m.GetCode(source);
   1202   Handle<Code> code = Handle<Code>::cast(code_object);
   1203 
   1204   // String long enough for test (content doesn't matter).
   1205   Handle<String> input =
   1206       Factory::NewStringFromAscii(CStrVector("dummy"));
   1207   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
   1208   Address start_adr = seq_input->GetCharsAddress();
   1209 
   1210   NativeRegExpMacroAssembler::Result result =
   1211       Execute(*code,
   1212               *input,
   1213               0,
   1214               start_adr,
   1215               start_adr + input->length(),
   1216               NULL);
   1217 
   1218   CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
   1219   CHECK(Top::has_pending_exception());
   1220   Top::clear_pending_exception();
   1221 }
   1222 
   1223 
   1224 TEST(MacroAssemblerNativeLotsOfRegisters) {
   1225   v8::V8::Initialize();
   1226   ContextInitializer initializer;
   1227 
   1228   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2);
   1229 
   1230   // At least 2048, to ensure the allocated space for registers
   1231   // span one full page.
   1232   const int large_number = 8000;
   1233   m.WriteCurrentPositionToRegister(large_number, 42);
   1234   m.WriteCurrentPositionToRegister(0, 0);
   1235   m.WriteCurrentPositionToRegister(1, 1);
   1236   Label done;
   1237   m.CheckNotBackReference(0, &done);  // Performs a system-stack push.
   1238   m.Bind(&done);
   1239   m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
   1240   m.PopRegister(1);
   1241   m.Succeed();
   1242 
   1243   Handle<String> source =
   1244       Factory::NewStringFromAscii(CStrVector("<huge register space test>"));
   1245   Handle<Object> code_object = m.GetCode(source);
   1246   Handle<Code> code = Handle<Code>::cast(code_object);
   1247 
   1248   // String long enough for test (content doesn't matter).
   1249   Handle<String> input =
   1250       Factory::NewStringFromAscii(CStrVector("sample text"));
   1251   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
   1252   Address start_adr = seq_input->GetCharsAddress();
   1253 
   1254   int captures[2];
   1255   NativeRegExpMacroAssembler::Result result =
   1256       Execute(*code,
   1257               *input,
   1258               0,
   1259               start_adr,
   1260               start_adr + input->length(),
   1261               captures);
   1262 
   1263   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
   1264   CHECK_EQ(0, captures[0]);
   1265   CHECK_EQ(42, captures[1]);
   1266 
   1267   Top::clear_pending_exception();
   1268 }
   1269 
   1270 #else  // ! V8_REGEX_NATIVE
   1271 
   1272 TEST(MacroAssembler) {
   1273   V8::Initialize(NULL);
   1274   byte codes[1024];
   1275   RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024));
   1276   // ^f(o)o.
   1277   Label fail, fail2, start;
   1278   uc16 foo_chars[3];
   1279   foo_chars[0] = 'f';
   1280   foo_chars[1] = 'o';
   1281   foo_chars[2] = 'o';
   1282   Vector<const uc16> foo(foo_chars, 3);
   1283   m.SetRegister(4, 42);
   1284   m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
   1285   m.AdvanceRegister(4, 42);
   1286   m.GoTo(&start);
   1287   m.Fail();
   1288   m.Bind(&start);
   1289   m.PushBacktrack(&fail2);
   1290   m.CheckCharacters(foo, 0, &fail, true);
   1291   m.WriteCurrentPositionToRegister(0, 0);
   1292   m.PushCurrentPosition();
   1293   m.AdvanceCurrentPosition(3);
   1294   m.WriteCurrentPositionToRegister(1, 0);
   1295   m.PopCurrentPosition();
   1296   m.AdvanceCurrentPosition(1);
   1297   m.WriteCurrentPositionToRegister(2, 0);
   1298   m.AdvanceCurrentPosition(1);
   1299   m.WriteCurrentPositionToRegister(3, 0);
   1300   m.Succeed();
   1301 
   1302   m.Bind(&fail);
   1303   m.Backtrack();
   1304   m.Succeed();
   1305 
   1306   m.Bind(&fail2);
   1307   m.PopRegister(0);
   1308   m.Fail();
   1309 
   1310   v8::HandleScope scope;
   1311 
   1312   Handle<String> source = Factory::NewStringFromAscii(CStrVector("^f(o)o"));
   1313   Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
   1314   int captures[5];
   1315 
   1316   const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
   1317   Handle<String> f1_16 =
   1318       Factory::NewStringFromTwoByte(Vector<const uc16>(str1, 6));
   1319 
   1320   CHECK(IrregexpInterpreter::Match(array, f1_16, captures, 0));
   1321   CHECK_EQ(0, captures[0]);
   1322   CHECK_EQ(3, captures[1]);
   1323   CHECK_EQ(1, captures[2]);
   1324   CHECK_EQ(2, captures[3]);
   1325   CHECK_EQ(84, captures[4]);
   1326 
   1327   const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
   1328   Handle<String> f2_16 =
   1329       Factory::NewStringFromTwoByte(Vector<const uc16>(str2, 6));
   1330 
   1331   CHECK(!IrregexpInterpreter::Match(array, f2_16, captures, 0));
   1332   CHECK_EQ(42, captures[0]);
   1333 }
   1334 
   1335 #endif  // ! V8_REGEXP_NATIVE
   1336 
   1337 
   1338 TEST(AddInverseToTable) {
   1339   static const int kLimit = 1000;
   1340   static const int kRangeCount = 16;
   1341   for (int t = 0; t < 10; t++) {
   1342     ZoneScope zone_scope(DELETE_ON_EXIT);
   1343     ZoneList<CharacterRange>* ranges =
   1344         new ZoneList<CharacterRange>(kRangeCount);
   1345     for (int i = 0; i < kRangeCount; i++) {
   1346       int from = PseudoRandom(t + 87, i + 25) % kLimit;
   1347       int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
   1348       if (to > kLimit) to = kLimit;
   1349       ranges->Add(CharacterRange(from, to));
   1350     }
   1351     DispatchTable table;
   1352     DispatchTableConstructor cons(&table, false);
   1353     cons.set_choice_index(0);
   1354     cons.AddInverse(ranges);
   1355     for (int i = 0; i < kLimit; i++) {
   1356       bool is_on = false;
   1357       for (int j = 0; !is_on && j < kRangeCount; j++)
   1358         is_on = ranges->at(j).Contains(i);
   1359       OutSet* set = table.Get(i);
   1360       CHECK_EQ(is_on, set->Get(0) == false);
   1361     }
   1362   }
   1363   ZoneScope zone_scope(DELETE_ON_EXIT);
   1364   ZoneList<CharacterRange>* ranges =
   1365           new ZoneList<CharacterRange>(1);
   1366   ranges->Add(CharacterRange(0xFFF0, 0xFFFE));
   1367   DispatchTable table;
   1368   DispatchTableConstructor cons(&table, false);
   1369   cons.set_choice_index(0);
   1370   cons.AddInverse(ranges);
   1371   CHECK(!table.Get(0xFFFE)->Get(0));
   1372   CHECK(table.Get(0xFFFF)->Get(0));
   1373 }
   1374 
   1375 
   1376 static uc32 canonicalize(uc32 c) {
   1377   unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
   1378   int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
   1379   if (count == 0) {
   1380     return c;
   1381   } else {
   1382     CHECK_EQ(1, count);
   1383     return canon[0];
   1384   }
   1385 }
   1386 
   1387 
   1388 TEST(LatinCanonicalize) {
   1389   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
   1390   for (char lower = 'a'; lower <= 'z'; lower++) {
   1391     char upper = lower + ('A' - 'a');
   1392     CHECK_EQ(canonicalize(lower), canonicalize(upper));
   1393     unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1394     int length = un_canonicalize.get(lower, '\0', uncanon);
   1395     CHECK_EQ(2, length);
   1396     CHECK_EQ(upper, uncanon[0]);
   1397     CHECK_EQ(lower, uncanon[1]);
   1398   }
   1399   for (uc32 c = 128; c < (1 << 21); c++)
   1400     CHECK_GE(canonicalize(c), 128);
   1401   unibrow::Mapping<unibrow::ToUppercase> to_upper;
   1402   for (uc32 c = 0; c < (1 << 21); c++) {
   1403     unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
   1404     int length = to_upper.get(c, '\0', upper);
   1405     if (length == 0) {
   1406       length = 1;
   1407       upper[0] = c;
   1408     }
   1409     uc32 u = upper[0];
   1410     if (length > 1 || (c >= 128 && u < 128))
   1411       u = c;
   1412     CHECK_EQ(u, canonicalize(c));
   1413   }
   1414 }
   1415 
   1416 
   1417 static uc32 CanonRange(uc32 c) {
   1418   unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
   1419   int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
   1420   if (count == 0) {
   1421     return c;
   1422   } else {
   1423     CHECK_EQ(1, count);
   1424     return canon[0];
   1425   }
   1426 }
   1427 
   1428 
   1429 TEST(RangeCanonicalization) {
   1430   CHECK_NE(CanonRange(0) & CharacterRange::kStartMarker, 0);
   1431   // Check that we arrive at the same result when using the basic
   1432   // range canonicalization primitives as when using immediate
   1433   // canonicalization.
   1434   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
   1435   for (int i = 0; i < CharacterRange::kRangeCanonicalizeMax; i++) {
   1436     int range = CanonRange(i);
   1437     int indirect_length = 0;
   1438     unibrow::uchar indirect[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1439     if ((range & CharacterRange::kStartMarker) == 0) {
   1440       indirect_length = un_canonicalize.get(i - range, '\0', indirect);
   1441       for (int i = 0; i < indirect_length; i++)
   1442         indirect[i] += range;
   1443     } else {
   1444       indirect_length = un_canonicalize.get(i, '\0', indirect);
   1445     }
   1446     unibrow::uchar direct[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1447     int direct_length = un_canonicalize.get(i, '\0', direct);
   1448     CHECK_EQ(direct_length, indirect_length);
   1449   }
   1450   // Check that we arrive at the same results when skipping over
   1451   // canonicalization ranges.
   1452   int next_block = 0;
   1453   while (next_block < CharacterRange::kRangeCanonicalizeMax) {
   1454     uc32 start = CanonRange(next_block);
   1455     CHECK_NE((start & CharacterRange::kStartMarker), 0);
   1456     unsigned dist = start & CharacterRange::kPayloadMask;
   1457     unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1458     int first_length = un_canonicalize.get(next_block, '\0', first);
   1459     for (unsigned i = 1; i < dist; i++) {
   1460       CHECK_EQ(i, CanonRange(next_block + i));
   1461       unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1462       int succ_length = un_canonicalize.get(next_block + i, '\0', succ);
   1463       CHECK_EQ(first_length, succ_length);
   1464       for (int j = 0; j < succ_length; j++) {
   1465         int calc = first[j] + i;
   1466         int found = succ[j];
   1467         CHECK_EQ(calc, found);
   1468       }
   1469     }
   1470     next_block = next_block + dist;
   1471   }
   1472 }
   1473 
   1474 
   1475 TEST(UncanonicalizeEquivalence) {
   1476   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
   1477   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1478   for (int i = 0; i < (1 << 16); i++) {
   1479     int length = un_canonicalize.get(i, '\0', chars);
   1480     for (int j = 0; j < length; j++) {
   1481       unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1482       int length2 = un_canonicalize.get(chars[j], '\0', chars2);
   1483       CHECK_EQ(length, length2);
   1484       for (int k = 0; k < length; k++)
   1485         CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
   1486     }
   1487   }
   1488 }
   1489 
   1490 
   1491 static void TestRangeCaseIndependence(CharacterRange input,
   1492                                       Vector<CharacterRange> expected) {
   1493   ZoneScope zone_scope(DELETE_ON_EXIT);
   1494   int count = expected.length();
   1495   ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count);
   1496   input.AddCaseEquivalents(list, false);
   1497   CHECK_EQ(count, list->length());
   1498   for (int i = 0; i < list->length(); i++) {
   1499     CHECK_EQ(expected[i].from(), list->at(i).from());
   1500     CHECK_EQ(expected[i].to(), list->at(i).to());
   1501   }
   1502 }
   1503 
   1504 
   1505 static void TestSimpleRangeCaseIndependence(CharacterRange input,
   1506                                             CharacterRange expected) {
   1507   EmbeddedVector<CharacterRange, 1> vector;
   1508   vector[0] = expected;
   1509   TestRangeCaseIndependence(input, vector);
   1510 }
   1511 
   1512 
   1513 TEST(CharacterRangeCaseIndependence) {
   1514   TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
   1515                                   CharacterRange::Singleton('A'));
   1516   TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
   1517                                   CharacterRange::Singleton('Z'));
   1518   TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'),
   1519                                   CharacterRange('A', 'Z'));
   1520   TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'),
   1521                                   CharacterRange('C', 'F'));
   1522   TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'),
   1523                                   CharacterRange('A', 'B'));
   1524   TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'),
   1525                                   CharacterRange('Y', 'Z'));
   1526   TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1),
   1527                                   CharacterRange('A', 'Z'));
   1528   TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'),
   1529                                   CharacterRange('a', 'z'));
   1530   TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'),
   1531                                   CharacterRange('c', 'f'));
   1532   TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1),
   1533                                   CharacterRange('a', 'z'));
   1534   // Here we need to add [l-z] to complete the case independence of
   1535   // [A-Za-z] but we expect [a-z] to be added since we always add a
   1536   // whole block at a time.
   1537   TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'),
   1538                                   CharacterRange('a', 'z'));
   1539 }
   1540 
   1541 
   1542 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
   1543   if (ranges == NULL)
   1544     return false;
   1545   for (int i = 0; i < ranges->length(); i++) {
   1546     CharacterRange range = ranges->at(i);
   1547     if (range.from() <= c && c <= range.to())
   1548       return true;
   1549   }
   1550   return false;
   1551 }
   1552 
   1553 
   1554 TEST(CharClassDifference) {
   1555   ZoneScope zone_scope(DELETE_ON_EXIT);
   1556   ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1);
   1557   base->Add(CharacterRange::Everything());
   1558   Vector<const uc16> overlay = CharacterRange::GetWordBounds();
   1559   ZoneList<CharacterRange>* included = NULL;
   1560   ZoneList<CharacterRange>* excluded = NULL;
   1561   CharacterRange::Split(base, overlay, &included, &excluded);
   1562   for (int i = 0; i < (1 << 16); i++) {
   1563     bool in_base = InClass(i, base);
   1564     if (in_base) {
   1565       bool in_overlay = false;
   1566       for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
   1567         if (overlay[j] <= i && i <= overlay[j+1])
   1568           in_overlay = true;
   1569       }
   1570       CHECK_EQ(in_overlay, InClass(i, included));
   1571       CHECK_EQ(!in_overlay, InClass(i, excluded));
   1572     } else {
   1573       CHECK(!InClass(i, included));
   1574       CHECK(!InClass(i, excluded));
   1575     }
   1576   }
   1577 }
   1578 
   1579 
   1580 TEST(CanonicalizeCharacterSets) {
   1581   ZoneScope scope(DELETE_ON_EXIT);
   1582   ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4);
   1583   CharacterSet set(list);
   1584 
   1585   list->Add(CharacterRange(10, 20));
   1586   list->Add(CharacterRange(30, 40));
   1587   list->Add(CharacterRange(50, 60));
   1588   set.Canonicalize();
   1589   ASSERT_EQ(3, list->length());
   1590   ASSERT_EQ(10, list->at(0).from());
   1591   ASSERT_EQ(20, list->at(0).to());
   1592   ASSERT_EQ(30, list->at(1).from());
   1593   ASSERT_EQ(40, list->at(1).to());
   1594   ASSERT_EQ(50, list->at(2).from());
   1595   ASSERT_EQ(60, list->at(2).to());
   1596 
   1597   list->Rewind(0);
   1598   list->Add(CharacterRange(10, 20));
   1599   list->Add(CharacterRange(50, 60));
   1600   list->Add(CharacterRange(30, 40));
   1601   set.Canonicalize();
   1602   ASSERT_EQ(3, list->length());
   1603   ASSERT_EQ(10, list->at(0).from());
   1604   ASSERT_EQ(20, list->at(0).to());
   1605   ASSERT_EQ(30, list->at(1).from());
   1606   ASSERT_EQ(40, list->at(1).to());
   1607   ASSERT_EQ(50, list->at(2).from());
   1608   ASSERT_EQ(60, list->at(2).to());
   1609 
   1610   list->Rewind(0);
   1611   list->Add(CharacterRange(30, 40));
   1612   list->Add(CharacterRange(10, 20));
   1613   list->Add(CharacterRange(25, 25));
   1614   list->Add(CharacterRange(100, 100));
   1615   list->Add(CharacterRange(1, 1));
   1616   set.Canonicalize();
   1617   ASSERT_EQ(5, list->length());
   1618   ASSERT_EQ(1, list->at(0).from());
   1619   ASSERT_EQ(1, list->at(0).to());
   1620   ASSERT_EQ(10, list->at(1).from());
   1621   ASSERT_EQ(20, list->at(1).to());
   1622   ASSERT_EQ(25, list->at(2).from());
   1623   ASSERT_EQ(25, list->at(2).to());
   1624   ASSERT_EQ(30, list->at(3).from());
   1625   ASSERT_EQ(40, list->at(3).to());
   1626   ASSERT_EQ(100, list->at(4).from());
   1627   ASSERT_EQ(100, list->at(4).to());
   1628 
   1629   list->Rewind(0);
   1630   list->Add(CharacterRange(10, 19));
   1631   list->Add(CharacterRange(21, 30));
   1632   list->Add(CharacterRange(20, 20));
   1633   set.Canonicalize();
   1634   ASSERT_EQ(1, list->length());
   1635   ASSERT_EQ(10, list->at(0).from());
   1636   ASSERT_EQ(30, list->at(0).to());
   1637 }
   1638 
   1639 // Checks whether a character is in the set represented by a list of ranges.
   1640 static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) {
   1641   for (int i = 0; i < set->length(); i++) {
   1642     CharacterRange range = set->at(i);
   1643     if (range.from() <= value && value <= range.to()) {
   1644       return true;
   1645     }
   1646   }
   1647   return false;
   1648 }
   1649 
   1650 TEST(CharacterRangeMerge) {
   1651   ZoneScope zone_scope(DELETE_ON_EXIT);
   1652   ZoneList<CharacterRange> l1(4);
   1653   ZoneList<CharacterRange> l2(4);
   1654   // Create all combinations of intersections of ranges, both singletons and
   1655   // longer.
   1656 
   1657   int offset = 0;
   1658 
   1659   // The five kinds of singleton intersections:
   1660   //     X
   1661   //   Y      - outside before
   1662   //    Y     - outside touching start
   1663   //     Y    - overlap
   1664   //      Y   - outside touching end
   1665   //       Y  - outside after
   1666 
   1667   for (int i = 0; i < 5; i++) {
   1668     l1.Add(CharacterRange::Singleton(offset + 2));
   1669     l2.Add(CharacterRange::Singleton(offset + i));
   1670     offset += 6;
   1671   }
   1672 
   1673   // The seven kinds of singleton/non-singleton intersections:
   1674   //    XXX
   1675   //  Y        - outside before
   1676   //   Y       - outside touching start
   1677   //    Y      - inside touching start
   1678   //     Y     - entirely inside
   1679   //      Y    - inside touching end
   1680   //       Y   - outside touching end
   1681   //        Y  - disjoint after
   1682 
   1683   for (int i = 0; i < 7; i++) {
   1684     l1.Add(CharacterRange::Range(offset + 2, offset + 4));
   1685     l2.Add(CharacterRange::Singleton(offset + i));
   1686     offset += 8;
   1687   }
   1688 
   1689   // The eleven kinds of non-singleton intersections:
   1690   //
   1691   //       XXXXXXXX
   1692   // YYYY                  - outside before.
   1693   //   YYYY                - outside touching start.
   1694   //     YYYY              - overlapping start
   1695   //       YYYY            - inside touching start
   1696   //         YYYY          - entirely inside
   1697   //           YYYY        - inside touching end
   1698   //             YYYY      - overlapping end
   1699   //               YYYY    - outside touching end
   1700   //                 YYYY  - outside after
   1701   //       YYYYYYYY        - identical
   1702   //     YYYYYYYYYYYY      - containing entirely.
   1703 
   1704   for (int i = 0; i < 9; i++) {
   1705     l1.Add(CharacterRange::Range(offset + 6, offset + 15));  // Length 8.
   1706     l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3));
   1707     offset += 22;
   1708   }
   1709   l1.Add(CharacterRange::Range(offset + 6, offset + 15));
   1710   l2.Add(CharacterRange::Range(offset + 6, offset + 15));
   1711   offset += 22;
   1712   l1.Add(CharacterRange::Range(offset + 6, offset + 15));
   1713   l2.Add(CharacterRange::Range(offset + 4, offset + 17));
   1714   offset += 22;
   1715 
   1716   // Different kinds of multi-range overlap:
   1717   // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
   1718   //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y
   1719 
   1720   l1.Add(CharacterRange::Range(offset, offset + 21));
   1721   l1.Add(CharacterRange::Range(offset + 31, offset + 52));
   1722   for (int i = 0; i < 6; i++) {
   1723     l2.Add(CharacterRange::Range(offset + 2, offset + 5));
   1724     l2.Add(CharacterRange::Singleton(offset + 8));
   1725     offset += 9;
   1726   }
   1727 
   1728   ASSERT(CharacterRange::IsCanonical(&l1));
   1729   ASSERT(CharacterRange::IsCanonical(&l2));
   1730 
   1731   ZoneList<CharacterRange> first_only(4);
   1732   ZoneList<CharacterRange> second_only(4);
   1733   ZoneList<CharacterRange> both(4);
   1734 
   1735   // Merge one direction.
   1736   CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both);
   1737 
   1738   CHECK(CharacterRange::IsCanonical(&first_only));
   1739   CHECK(CharacterRange::IsCanonical(&second_only));
   1740   CHECK(CharacterRange::IsCanonical(&both));
   1741 
   1742   for (uc16 i = 0; i < offset; i++) {
   1743     bool in_first = CharacterInSet(&l1, i);
   1744     bool in_second = CharacterInSet(&l2, i);
   1745     CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
   1746     CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
   1747     CHECK((in_first && in_second) == CharacterInSet(&both, i));
   1748   }
   1749 
   1750   first_only.Clear();
   1751   second_only.Clear();
   1752   both.Clear();
   1753 
   1754   // Merge other direction.
   1755   CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both);
   1756 
   1757   CHECK(CharacterRange::IsCanonical(&first_only));
   1758   CHECK(CharacterRange::IsCanonical(&second_only));
   1759   CHECK(CharacterRange::IsCanonical(&both));
   1760 
   1761   for (uc16 i = 0; i < offset; i++) {
   1762     bool in_first = CharacterInSet(&l1, i);
   1763     bool in_second = CharacterInSet(&l2, i);
   1764     CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
   1765     CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
   1766     CHECK((in_first && in_second) == CharacterInSet(&both, i));
   1767   }
   1768 
   1769   first_only.Clear();
   1770   second_only.Clear();
   1771   both.Clear();
   1772 
   1773   // Merge but don't record all combinations.
   1774   CharacterRange::Merge(&l1, &l2, NULL, NULL, &both);
   1775 
   1776   CHECK(CharacterRange::IsCanonical(&both));
   1777 
   1778   for (uc16 i = 0; i < offset; i++) {
   1779     bool in_first = CharacterInSet(&l1, i);
   1780     bool in_second = CharacterInSet(&l2, i);
   1781     CHECK((in_first && in_second) == CharacterInSet(&both, i));
   1782   }
   1783 
   1784   // Merge into same set.
   1785   ZoneList<CharacterRange> all(4);
   1786   CharacterRange::Merge(&l1, &l2, &all, &all, &all);
   1787 
   1788   CHECK(CharacterRange::IsCanonical(&all));
   1789 
   1790   for (uc16 i = 0; i < offset; i++) {
   1791     bool in_first = CharacterInSet(&l1, i);
   1792     bool in_second = CharacterInSet(&l2, i);
   1793     CHECK((in_first || in_second) == CharacterInSet(&all, i));
   1794   }
   1795 }
   1796 
   1797 
   1798 TEST(Graph) {
   1799   V8::Initialize(NULL);
   1800   Execute("\\b\\w+\\b", false, true, true);
   1801 }
   1802