1 // Copyright 2008 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 29 #include <stdlib.h> 30 31 #include "v8.h" 32 33 #include "string-stream.h" 34 #include "cctest.h" 35 #include "zone-inl.h" 36 #include "parser.h" 37 #include "ast.h" 38 #include "jsregexp.h" 39 #include "regexp-macro-assembler.h" 40 #include "regexp-macro-assembler-irregexp.h" 41 #ifdef V8_NATIVE_REGEXP 42 #ifdef V8_TARGET_ARCH_ARM 43 #include "arm/macro-assembler-arm.h" 44 #include "arm/regexp-macro-assembler-arm.h" 45 #endif 46 #ifdef V8_TARGET_ARCH_X64 47 #include "x64/macro-assembler-x64.h" 48 #include "x64/regexp-macro-assembler-x64.h" 49 #endif 50 #ifdef V8_TARGET_ARCH_IA32 51 #include "ia32/macro-assembler-ia32.h" 52 #include "ia32/regexp-macro-assembler-ia32.h" 53 #endif 54 #else 55 #include "interpreter-irregexp.h" 56 #endif 57 58 using namespace v8::internal; 59 60 61 static bool CheckParse(const char* input) { 62 V8::Initialize(NULL); 63 v8::HandleScope scope; 64 ZoneScope zone_scope(DELETE_ON_EXIT); 65 FlatStringReader reader(CStrVector(input)); 66 RegExpCompileData result; 67 return v8::internal::ParseRegExp(&reader, false, &result); 68 } 69 70 71 static SmartPointer<const char> Parse(const char* input) { 72 V8::Initialize(NULL); 73 v8::HandleScope scope; 74 ZoneScope zone_scope(DELETE_ON_EXIT); 75 FlatStringReader reader(CStrVector(input)); 76 RegExpCompileData result; 77 CHECK(v8::internal::ParseRegExp(&reader, false, &result)); 78 CHECK(result.tree != NULL); 79 CHECK(result.error.is_null()); 80 SmartPointer<const char> output = result.tree->ToString(); 81 return output; 82 } 83 84 static bool CheckSimple(const char* input) { 85 V8::Initialize(NULL); 86 v8::HandleScope scope; 87 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 88 ZoneScope zone_scope(DELETE_ON_EXIT); 89 FlatStringReader reader(CStrVector(input)); 90 RegExpCompileData result; 91 CHECK(v8::internal::ParseRegExp(&reader, false, &result)); 92 CHECK(result.tree != NULL); 93 CHECK(result.error.is_null()); 94 return result.simple; 95 } 96 97 struct MinMaxPair { 98 int min_match; 99 int max_match; 100 }; 101 102 static MinMaxPair CheckMinMaxMatch(const char* input) { 103 V8::Initialize(NULL); 104 v8::HandleScope scope; 105 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 106 ZoneScope zone_scope(DELETE_ON_EXIT); 107 FlatStringReader reader(CStrVector(input)); 108 RegExpCompileData result; 109 CHECK(v8::internal::ParseRegExp(&reader, false, &result)); 110 CHECK(result.tree != NULL); 111 CHECK(result.error.is_null()); 112 int min_match = result.tree->min_match(); 113 int max_match = result.tree->max_match(); 114 MinMaxPair pair = { min_match, max_match }; 115 return pair; 116 } 117 118 119 #define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input)) 120 #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) 121 #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input)); 122 #define CHECK_MIN_MAX(input, min, max) \ 123 { MinMaxPair min_max = CheckMinMaxMatch(input); \ 124 CHECK_EQ(min, min_max.min_match); \ 125 CHECK_EQ(max, min_max.max_match); \ 126 } 127 128 TEST(Parser) { 129 V8::Initialize(NULL); 130 131 CHECK_PARSE_ERROR("?"); 132 133 CHECK_PARSE_EQ("abc", "'abc'"); 134 CHECK_PARSE_EQ("", "%"); 135 CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')"); 136 CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')"); 137 CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)"); 138 CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')"); 139 CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])"); 140 CHECK_PARSE_EQ("a*", "(# 0 - g 'a')"); 141 CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')"); 142 CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))"); 143 CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))"); 144 CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))"); 145 CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))"); 146 CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))"); 147 CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))"); 148 CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))"); 149 CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))"); 150 CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))"); 151 CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))"); 152 CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))"); 153 CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))"); 154 CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'"); 155 CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')"); 156 CHECK_PARSE_EQ("(?:foo)", "'foo'"); 157 CHECK_PARSE_EQ("(?: foo )", "' foo '"); 158 CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))"); 159 CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')"); 160 CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')"); 161 CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')"); 162 CHECK_PARSE_EQ("()", "(^ %)"); 163 CHECK_PARSE_EQ("(?=)", "(-> + %)"); 164 CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows 165 CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252 166 CHECK_PARSE_EQ("[x]", "[x]"); 167 CHECK_PARSE_EQ("[xyz]", "[x y z]"); 168 CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]"); 169 CHECK_PARSE_EQ("[-123]", "[- 1 2 3]"); 170 CHECK_PARSE_EQ("[^123]", "^[1 2 3]"); 171 CHECK_PARSE_EQ("]", "']'"); 172 CHECK_PARSE_EQ("}", "'}'"); 173 CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]"); 174 CHECK_PARSE_EQ("[\\d]", "[0-9]"); 175 CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]"); 176 CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); 177 CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); 178 CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); 179 CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", 180 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); 181 CHECK_PARSE_EQ("\\c!", "'c!'"); 182 CHECK_PARSE_EQ("\\c_", "'c_'"); 183 CHECK_PARSE_EQ("\\c~", "'c~'"); 184 CHECK_PARSE_EQ("[a\\]c]", "[a ] c]"); 185 CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"); 186 CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"); 187 CHECK_PARSE_EQ("\\0", "'\\x00'"); 188 CHECK_PARSE_EQ("\\8", "'8'"); 189 CHECK_PARSE_EQ("\\9", "'9'"); 190 CHECK_PARSE_EQ("\\11", "'\\x09'"); 191 CHECK_PARSE_EQ("\\11a", "'\\x09a'"); 192 CHECK_PARSE_EQ("\\011", "'\\x09'"); 193 CHECK_PARSE_EQ("\\00011", "'\\x0011'"); 194 CHECK_PARSE_EQ("\\118", "'\\x098'"); 195 CHECK_PARSE_EQ("\\111", "'I'"); 196 CHECK_PARSE_EQ("\\1111", "'I1'"); 197 CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))"); 198 CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))"); 199 CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))"); 200 CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')"); 201 CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')" 202 " (# 0 - g (<- 1)))"); 203 CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')" 204 " (# 0 - g (<- 2)))"); 205 CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')" 206 " (# 0 - g (<- 3)))"); 207 CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')" 208 " (# 0 - g '\\x04'))"); 209 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10", 210 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 211 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))"); 212 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", 213 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 214 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')"); 215 CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); 216 CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); 217 CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); 218 CHECK_PARSE_EQ("(?=a)?a", "'a'"); 219 CHECK_PARSE_EQ("(?=a){0,10}a", "'a'"); 220 CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')"); 221 CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); 222 CHECK_PARSE_EQ("(?!a)?a", "'a'"); 223 CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); 224 CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"); 225 CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"); 226 CHECK_PARSE_EQ("[\\0]", "[\\x00]"); 227 CHECK_PARSE_EQ("[\\11]", "[\\x09]"); 228 CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]"); 229 CHECK_PARSE_EQ("[\\011]", "[\\x09]"); 230 CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]"); 231 CHECK_PARSE_EQ("[\\118]", "[\\x09 8]"); 232 CHECK_PARSE_EQ("[\\111]", "[I]"); 233 CHECK_PARSE_EQ("[\\1111]", "[I 1]"); 234 CHECK_PARSE_EQ("\\x34", "'\x34'"); 235 CHECK_PARSE_EQ("\\x60", "'\x60'"); 236 CHECK_PARSE_EQ("\\x3z", "'x3z'"); 237 CHECK_PARSE_EQ("\\c", "'c'"); 238 CHECK_PARSE_EQ("\\u0034", "'\x34'"); 239 CHECK_PARSE_EQ("\\u003z", "'u003z'"); 240 CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); 241 242 CHECK_SIMPLE("a", true); 243 CHECK_SIMPLE("a|b", false); 244 CHECK_SIMPLE("a\\n", false); 245 CHECK_SIMPLE("^a", false); 246 CHECK_SIMPLE("a$", false); 247 CHECK_SIMPLE("a\\b!", false); 248 CHECK_SIMPLE("a\\Bb", false); 249 CHECK_SIMPLE("a*", false); 250 CHECK_SIMPLE("a*?", false); 251 CHECK_SIMPLE("a?", false); 252 CHECK_SIMPLE("a??", false); 253 CHECK_SIMPLE("a{0,1}?", false); 254 CHECK_SIMPLE("a{1,1}?", false); 255 CHECK_SIMPLE("a{1,2}?", false); 256 CHECK_SIMPLE("a+?", false); 257 CHECK_SIMPLE("(a)", false); 258 CHECK_SIMPLE("(a)\\1", false); 259 CHECK_SIMPLE("(\\1a)", false); 260 CHECK_SIMPLE("\\1(a)", false); 261 CHECK_SIMPLE("a\\s", false); 262 CHECK_SIMPLE("a\\S", false); 263 CHECK_SIMPLE("a\\d", false); 264 CHECK_SIMPLE("a\\D", false); 265 CHECK_SIMPLE("a\\w", false); 266 CHECK_SIMPLE("a\\W", false); 267 CHECK_SIMPLE("a.", false); 268 CHECK_SIMPLE("a\\q", false); 269 CHECK_SIMPLE("a[a]", false); 270 CHECK_SIMPLE("a[^a]", false); 271 CHECK_SIMPLE("a[a-z]", false); 272 CHECK_SIMPLE("a[\\q]", false); 273 CHECK_SIMPLE("a(?:b)", false); 274 CHECK_SIMPLE("a(?=b)", false); 275 CHECK_SIMPLE("a(?!b)", false); 276 CHECK_SIMPLE("\\x60", false); 277 CHECK_SIMPLE("\\u0060", false); 278 CHECK_SIMPLE("\\cA", false); 279 CHECK_SIMPLE("\\q", false); 280 CHECK_SIMPLE("\\1112", false); 281 CHECK_SIMPLE("\\0", false); 282 CHECK_SIMPLE("(a)\\1", false); 283 CHECK_SIMPLE("(?=a)?a", false); 284 CHECK_SIMPLE("(?!a)?a\\1", false); 285 CHECK_SIMPLE("(?:(?=a))a\\1", false); 286 287 CHECK_PARSE_EQ("a{}", "'a{}'"); 288 CHECK_PARSE_EQ("a{,}", "'a{,}'"); 289 CHECK_PARSE_EQ("a{", "'a{'"); 290 CHECK_PARSE_EQ("a{z}", "'a{z}'"); 291 CHECK_PARSE_EQ("a{1z}", "'a{1z}'"); 292 CHECK_PARSE_EQ("a{12z}", "'a{12z}'"); 293 CHECK_PARSE_EQ("a{12,", "'a{12,'"); 294 CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'"); 295 CHECK_PARSE_EQ("{}", "'{}'"); 296 CHECK_PARSE_EQ("{,}", "'{,}'"); 297 CHECK_PARSE_EQ("{", "'{'"); 298 CHECK_PARSE_EQ("{z}", "'{z}'"); 299 CHECK_PARSE_EQ("{1z}", "'{1z}'"); 300 CHECK_PARSE_EQ("{12z}", "'{12z}'"); 301 CHECK_PARSE_EQ("{12,", "'{12,'"); 302 CHECK_PARSE_EQ("{12,3b", "'{12,3b'"); 303 304 CHECK_MIN_MAX("a", 1, 1); 305 CHECK_MIN_MAX("abc", 3, 3); 306 CHECK_MIN_MAX("a[bc]d", 3, 3); 307 CHECK_MIN_MAX("a|bc", 1, 2); 308 CHECK_MIN_MAX("ab|c", 1, 2); 309 CHECK_MIN_MAX("a||bc", 0, 2); 310 CHECK_MIN_MAX("|", 0, 0); 311 CHECK_MIN_MAX("(?:ab)", 2, 2); 312 CHECK_MIN_MAX("(?:ab|cde)", 2, 3); 313 CHECK_MIN_MAX("(?:ab)|cde", 2, 3); 314 CHECK_MIN_MAX("(ab)", 2, 2); 315 CHECK_MIN_MAX("(ab|cde)", 2, 3); 316 CHECK_MIN_MAX("(ab)\\1", 2, 4); 317 CHECK_MIN_MAX("(ab|cde)\\1", 2, 6); 318 CHECK_MIN_MAX("(?:ab)?", 0, 2); 319 CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity); 320 CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity); 321 CHECK_MIN_MAX("a?", 0, 1); 322 CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity); 323 CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity); 324 CHECK_MIN_MAX("a??", 0, 1); 325 CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity); 326 CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity); 327 CHECK_MIN_MAX("(?:a?)?", 0, 1); 328 CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity); 329 CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity); 330 CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity); 331 CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity); 332 CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity); 333 CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity); 334 CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity); 335 CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity); 336 CHECK_MIN_MAX("a{0}", 0, 0); 337 CHECK_MIN_MAX("(?:a+){0}", 0, 0); 338 CHECK_MIN_MAX("(?:a+){0,0}", 0, 0); 339 CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity); 340 CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity); 341 CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity); 342 CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity); 343 CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity); 344 CHECK_MIN_MAX("(?:ab){4,7}", 8, 14); 345 CHECK_MIN_MAX("a\\bc", 2, 2); 346 CHECK_MIN_MAX("a\\Bc", 2, 2); 347 CHECK_MIN_MAX("a\\sc", 3, 3); 348 CHECK_MIN_MAX("a\\Sc", 3, 3); 349 CHECK_MIN_MAX("a(?=b)c", 2, 2); 350 CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2); 351 CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2); 352 } 353 354 TEST(ParserRegression) { 355 CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])"); 356 CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')"); 357 CHECK_PARSE_EQ("{", "'{'"); 358 CHECK_PARSE_EQ("a|", "(| 'a' %)"); 359 } 360 361 static void ExpectError(const char* input, 362 const char* expected) { 363 V8::Initialize(NULL); 364 v8::HandleScope scope; 365 ZoneScope zone_scope(DELETE_ON_EXIT); 366 FlatStringReader reader(CStrVector(input)); 367 RegExpCompileData result; 368 CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result)); 369 CHECK(result.tree == NULL); 370 CHECK(!result.error.is_null()); 371 SmartPointer<char> str = result.error->ToCString(ALLOW_NULLS); 372 CHECK_EQ(expected, *str); 373 } 374 375 376 TEST(Errors) { 377 V8::Initialize(NULL); 378 const char* kEndBackslash = "\\ at end of pattern"; 379 ExpectError("\\", kEndBackslash); 380 const char* kUnterminatedGroup = "Unterminated group"; 381 ExpectError("(foo", kUnterminatedGroup); 382 const char* kInvalidGroup = "Invalid group"; 383 ExpectError("(?", kInvalidGroup); 384 const char* kUnterminatedCharacterClass = "Unterminated character class"; 385 ExpectError("[", kUnterminatedCharacterClass); 386 ExpectError("[a-", kUnterminatedCharacterClass); 387 const char* kNothingToRepeat = "Nothing to repeat"; 388 ExpectError("*", kNothingToRepeat); 389 ExpectError("?", kNothingToRepeat); 390 ExpectError("+", kNothingToRepeat); 391 ExpectError("{1}", kNothingToRepeat); 392 ExpectError("{1,2}", kNothingToRepeat); 393 ExpectError("{1,}", kNothingToRepeat); 394 395 // Check that we don't allow more than kMaxCapture captures 396 const int kMaxCaptures = 1 << 16; // Must match RegExpParser::kMaxCaptures. 397 const char* kTooManyCaptures = "Too many captures"; 398 HeapStringAllocator allocator; 399 StringStream accumulator(&allocator); 400 for (int i = 0; i <= kMaxCaptures; i++) { 401 accumulator.Add("()"); 402 } 403 SmartPointer<const char> many_captures(accumulator.ToCString()); 404 ExpectError(*many_captures, kTooManyCaptures); 405 } 406 407 408 static bool IsDigit(uc16 c) { 409 return ('0' <= c && c <= '9'); 410 } 411 412 413 static bool NotDigit(uc16 c) { 414 return !IsDigit(c); 415 } 416 417 418 static bool IsWhiteSpace(uc16 c) { 419 switch (c) { 420 case 0x09: 421 case 0x0A: 422 case 0x0B: 423 case 0x0C: 424 case 0x0d: 425 case 0x20: 426 case 0xA0: 427 case 0x2028: 428 case 0x2029: 429 return true; 430 default: 431 return unibrow::Space::Is(c); 432 } 433 } 434 435 436 static bool NotWhiteSpace(uc16 c) { 437 return !IsWhiteSpace(c); 438 } 439 440 441 static bool NotWord(uc16 c) { 442 return !IsRegExpWord(c); 443 } 444 445 446 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { 447 ZoneScope scope(DELETE_ON_EXIT); 448 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); 449 CharacterRange::AddClassEscape(c, ranges); 450 for (unsigned i = 0; i < (1 << 16); i++) { 451 bool in_class = false; 452 for (int j = 0; !in_class && j < ranges->length(); j++) { 453 CharacterRange& range = ranges->at(j); 454 in_class = (range.from() <= i && i <= range.to()); 455 } 456 CHECK_EQ(pred(i), in_class); 457 } 458 } 459 460 461 TEST(CharacterClassEscapes) { 462 TestCharacterClassEscapes('.', IsRegExpNewline); 463 TestCharacterClassEscapes('d', IsDigit); 464 TestCharacterClassEscapes('D', NotDigit); 465 TestCharacterClassEscapes('s', IsWhiteSpace); 466 TestCharacterClassEscapes('S', NotWhiteSpace); 467 TestCharacterClassEscapes('w', IsRegExpWord); 468 TestCharacterClassEscapes('W', NotWord); 469 } 470 471 472 static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) { 473 V8::Initialize(NULL); 474 FlatStringReader reader(CStrVector(input)); 475 RegExpCompileData compile_data; 476 if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data)) 477 return NULL; 478 Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input)); 479 RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii); 480 return compile_data.node; 481 } 482 483 484 static void Execute(const char* input, 485 bool multiline, 486 bool is_ascii, 487 bool dot_output = false) { 488 v8::HandleScope scope; 489 ZoneScope zone_scope(DELETE_ON_EXIT); 490 RegExpNode* node = Compile(input, multiline, is_ascii); 491 USE(node); 492 #ifdef DEBUG 493 if (dot_output) { 494 RegExpEngine::DotPrint(input, node, false); 495 exit(0); 496 } 497 #endif // DEBUG 498 } 499 500 501 class TestConfig { 502 public: 503 typedef int Key; 504 typedef int Value; 505 static const int kNoKey; 506 static const int kNoValue; 507 static inline int Compare(int a, int b) { 508 if (a < b) 509 return -1; 510 else if (a > b) 511 return 1; 512 else 513 return 0; 514 } 515 }; 516 517 518 const int TestConfig::kNoKey = 0; 519 const int TestConfig::kNoValue = 0; 520 521 522 static unsigned PseudoRandom(int i, int j) { 523 return ~(~((i * 781) ^ (j * 329))); 524 } 525 526 527 TEST(SplayTreeSimple) { 528 static const unsigned kLimit = 1000; 529 ZoneScope zone_scope(DELETE_ON_EXIT); 530 ZoneSplayTree<TestConfig> tree; 531 bool seen[kLimit]; 532 for (unsigned i = 0; i < kLimit; i++) seen[i] = false; 533 #define CHECK_MAPS_EQUAL() do { \ 534 for (unsigned k = 0; k < kLimit; k++) \ 535 CHECK_EQ(seen[k], tree.Find(k, &loc)); \ 536 } while (false) 537 for (int i = 0; i < 50; i++) { 538 for (int j = 0; j < 50; j++) { 539 unsigned next = PseudoRandom(i, j) % kLimit; 540 if (seen[next]) { 541 // We've already seen this one. Check the value and remove 542 // it. 543 ZoneSplayTree<TestConfig>::Locator loc; 544 CHECK(tree.Find(next, &loc)); 545 CHECK_EQ(next, loc.key()); 546 CHECK_EQ(3 * next, loc.value()); 547 tree.Remove(next); 548 seen[next] = false; 549 CHECK_MAPS_EQUAL(); 550 } else { 551 // Check that it wasn't there already and then add it. 552 ZoneSplayTree<TestConfig>::Locator loc; 553 CHECK(!tree.Find(next, &loc)); 554 CHECK(tree.Insert(next, &loc)); 555 CHECK_EQ(next, loc.key()); 556 loc.set_value(3 * next); 557 seen[next] = true; 558 CHECK_MAPS_EQUAL(); 559 } 560 int val = PseudoRandom(j, i) % kLimit; 561 if (seen[val]) { 562 ZoneSplayTree<TestConfig>::Locator loc; 563 CHECK(tree.FindGreatestLessThan(val, &loc)); 564 CHECK_EQ(loc.key(), val); 565 break; 566 } 567 val = PseudoRandom(i + j, i - j) % kLimit; 568 if (seen[val]) { 569 ZoneSplayTree<TestConfig>::Locator loc; 570 CHECK(tree.FindLeastGreaterThan(val, &loc)); 571 CHECK_EQ(loc.key(), val); 572 break; 573 } 574 } 575 } 576 } 577 578 579 TEST(DispatchTableConstruction) { 580 // Initialize test data. 581 static const int kLimit = 1000; 582 static const int kRangeCount = 8; 583 static const int kRangeSize = 16; 584 uc16 ranges[kRangeCount][2 * kRangeSize]; 585 for (int i = 0; i < kRangeCount; i++) { 586 Vector<uc16> range(ranges[i], 2 * kRangeSize); 587 for (int j = 0; j < 2 * kRangeSize; j++) { 588 range[j] = PseudoRandom(i + 25, j + 87) % kLimit; 589 } 590 range.Sort(); 591 for (int j = 1; j < 2 * kRangeSize; j++) { 592 CHECK(range[j-1] <= range[j]); 593 } 594 } 595 // Enter test data into dispatch table. 596 ZoneScope zone_scope(DELETE_ON_EXIT); 597 DispatchTable table; 598 for (int i = 0; i < kRangeCount; i++) { 599 uc16* range = ranges[i]; 600 for (int j = 0; j < 2 * kRangeSize; j += 2) 601 table.AddRange(CharacterRange(range[j], range[j + 1]), i); 602 } 603 // Check that the table looks as we would expect 604 for (int p = 0; p < kLimit; p++) { 605 OutSet* outs = table.Get(p); 606 for (int j = 0; j < kRangeCount; j++) { 607 uc16* range = ranges[j]; 608 bool is_on = false; 609 for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2) 610 is_on = (range[k] <= p && p <= range[k + 1]); 611 CHECK_EQ(is_on, outs->Get(j)); 612 } 613 } 614 } 615 616 // Test of debug-only syntax. 617 #ifdef DEBUG 618 619 TEST(ParsePossessiveRepetition) { 620 bool old_flag_value = FLAG_regexp_possessive_quantifier; 621 622 // Enable possessive quantifier syntax. 623 FLAG_regexp_possessive_quantifier = true; 624 625 CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')"); 626 CHECK_PARSE_EQ("a++", "(# 1 - p 'a')"); 627 CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')"); 628 CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')"); 629 CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')"); 630 631 // Disable possessive quantifier syntax. 632 FLAG_regexp_possessive_quantifier = false; 633 634 CHECK_PARSE_ERROR("a*+"); 635 CHECK_PARSE_ERROR("a++"); 636 CHECK_PARSE_ERROR("a?+"); 637 CHECK_PARSE_ERROR("a{10,20}+"); 638 CHECK_PARSE_ERROR("a{10,20}+b"); 639 640 FLAG_regexp_possessive_quantifier = old_flag_value; 641 } 642 643 #endif 644 645 // Tests of interpreter. 646 647 648 #ifdef V8_NATIVE_REGEXP 649 650 #if V8_TARGET_ARCH_IA32 651 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler; 652 #elif V8_TARGET_ARCH_X64 653 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler; 654 #elif V8_TARGET_ARCH_ARM 655 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler; 656 #elif V8_TARGET_ARCH_MIPS 657 typedef RegExpMacroAssembler ArchRegExpMacroAssembler; 658 #endif 659 660 class ContextInitializer { 661 public: 662 ContextInitializer() 663 : env_(), scope_(), zone_(DELETE_ON_EXIT), stack_guard_() { 664 env_ = v8::Context::New(); 665 env_->Enter(); 666 } 667 ~ContextInitializer() { 668 env_->Exit(); 669 env_.Dispose(); 670 } 671 private: 672 v8::Persistent<v8::Context> env_; 673 v8::HandleScope scope_; 674 v8::internal::ZoneScope zone_; 675 v8::internal::StackGuard stack_guard_; 676 }; 677 678 679 static ArchRegExpMacroAssembler::Result Execute(Code* code, 680 String* input, 681 int start_offset, 682 const byte* input_start, 683 const byte* input_end, 684 int* captures) { 685 return NativeRegExpMacroAssembler::Execute( 686 code, 687 input, 688 start_offset, 689 input_start, 690 input_end, 691 captures); 692 } 693 694 695 TEST(MacroAssemblerNativeSuccess) { 696 v8::V8::Initialize(); 697 ContextInitializer initializer; 698 699 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 700 701 m.Succeed(); 702 703 Handle<String> source = Factory::NewStringFromAscii(CStrVector("")); 704 Handle<Object> code_object = m.GetCode(source); 705 Handle<Code> code = Handle<Code>::cast(code_object); 706 707 int captures[4] = {42, 37, 87, 117}; 708 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo")); 709 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 710 const byte* start_adr = 711 reinterpret_cast<const byte*>(seq_input->GetCharsAddress()); 712 713 NativeRegExpMacroAssembler::Result result = 714 Execute(*code, 715 *input, 716 0, 717 start_adr, 718 start_adr + seq_input->length(), 719 captures); 720 721 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 722 CHECK_EQ(-1, captures[0]); 723 CHECK_EQ(-1, captures[1]); 724 CHECK_EQ(-1, captures[2]); 725 CHECK_EQ(-1, captures[3]); 726 } 727 728 729 TEST(MacroAssemblerNativeSimple) { 730 v8::V8::Initialize(); 731 ContextInitializer initializer; 732 733 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 734 735 uc16 foo_chars[3] = {'f', 'o', 'o'}; 736 Vector<const uc16> foo(foo_chars, 3); 737 738 Label fail; 739 m.CheckCharacters(foo, 0, &fail, true); 740 m.WriteCurrentPositionToRegister(0, 0); 741 m.AdvanceCurrentPosition(3); 742 m.WriteCurrentPositionToRegister(1, 0); 743 m.Succeed(); 744 m.Bind(&fail); 745 m.Fail(); 746 747 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo")); 748 Handle<Object> code_object = m.GetCode(source); 749 Handle<Code> code = Handle<Code>::cast(code_object); 750 751 int captures[4] = {42, 37, 87, 117}; 752 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo")); 753 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 754 Address start_adr = seq_input->GetCharsAddress(); 755 756 NativeRegExpMacroAssembler::Result result = 757 Execute(*code, 758 *input, 759 0, 760 start_adr, 761 start_adr + input->length(), 762 captures); 763 764 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 765 CHECK_EQ(0, captures[0]); 766 CHECK_EQ(3, captures[1]); 767 CHECK_EQ(-1, captures[2]); 768 CHECK_EQ(-1, captures[3]); 769 770 input = Factory::NewStringFromAscii(CStrVector("barbarbar")); 771 seq_input = Handle<SeqAsciiString>::cast(input); 772 start_adr = seq_input->GetCharsAddress(); 773 774 result = Execute(*code, 775 *input, 776 0, 777 start_adr, 778 start_adr + input->length(), 779 captures); 780 781 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 782 } 783 784 785 TEST(MacroAssemblerNativeSimpleUC16) { 786 v8::V8::Initialize(); 787 ContextInitializer initializer; 788 789 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 790 791 uc16 foo_chars[3] = {'f', 'o', 'o'}; 792 Vector<const uc16> foo(foo_chars, 3); 793 794 Label fail; 795 m.CheckCharacters(foo, 0, &fail, true); 796 m.WriteCurrentPositionToRegister(0, 0); 797 m.AdvanceCurrentPosition(3); 798 m.WriteCurrentPositionToRegister(1, 0); 799 m.Succeed(); 800 m.Bind(&fail); 801 m.Fail(); 802 803 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^foo")); 804 Handle<Object> code_object = m.GetCode(source); 805 Handle<Code> code = Handle<Code>::cast(code_object); 806 807 int captures[4] = {42, 37, 87, 117}; 808 const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', '\xa0'}; 809 Handle<String> input = 810 Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 811 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 812 Address start_adr = seq_input->GetCharsAddress(); 813 814 NativeRegExpMacroAssembler::Result result = 815 Execute(*code, 816 *input, 817 0, 818 start_adr, 819 start_adr + input->length(), 820 captures); 821 822 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 823 CHECK_EQ(0, captures[0]); 824 CHECK_EQ(3, captures[1]); 825 CHECK_EQ(-1, captures[2]); 826 CHECK_EQ(-1, captures[3]); 827 828 const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', '\xa0'}; 829 input = Factory::NewStringFromTwoByte(Vector<const uc16>(input_data2, 9)); 830 seq_input = Handle<SeqTwoByteString>::cast(input); 831 start_adr = seq_input->GetCharsAddress(); 832 833 result = Execute(*code, 834 *input, 835 0, 836 start_adr, 837 start_adr + input->length() * 2, 838 captures); 839 840 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 841 } 842 843 844 TEST(MacroAssemblerNativeBacktrack) { 845 v8::V8::Initialize(); 846 ContextInitializer initializer; 847 848 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 849 850 Label fail; 851 Label backtrack; 852 m.LoadCurrentCharacter(10, &fail); 853 m.Succeed(); 854 m.Bind(&fail); 855 m.PushBacktrack(&backtrack); 856 m.LoadCurrentCharacter(10, NULL); 857 m.Succeed(); 858 m.Bind(&backtrack); 859 m.Fail(); 860 861 Handle<String> source = Factory::NewStringFromAscii(CStrVector("..........")); 862 Handle<Object> code_object = m.GetCode(source); 863 Handle<Code> code = Handle<Code>::cast(code_object); 864 865 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foofoo")); 866 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 867 Address start_adr = seq_input->GetCharsAddress(); 868 869 NativeRegExpMacroAssembler::Result result = 870 Execute(*code, 871 *input, 872 0, 873 start_adr, 874 start_adr + input->length(), 875 NULL); 876 877 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 878 } 879 880 881 TEST(MacroAssemblerNativeBackReferenceASCII) { 882 v8::V8::Initialize(); 883 ContextInitializer initializer; 884 885 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 886 887 m.WriteCurrentPositionToRegister(0, 0); 888 m.AdvanceCurrentPosition(2); 889 m.WriteCurrentPositionToRegister(1, 0); 890 Label nomatch; 891 m.CheckNotBackReference(0, &nomatch); 892 m.Fail(); 893 m.Bind(&nomatch); 894 m.AdvanceCurrentPosition(2); 895 Label missing_match; 896 m.CheckNotBackReference(0, &missing_match); 897 m.WriteCurrentPositionToRegister(2, 0); 898 m.Succeed(); 899 m.Bind(&missing_match); 900 m.Fail(); 901 902 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1")); 903 Handle<Object> code_object = m.GetCode(source); 904 Handle<Code> code = Handle<Code>::cast(code_object); 905 906 Handle<String> input = Factory::NewStringFromAscii(CStrVector("fooofo")); 907 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 908 Address start_adr = seq_input->GetCharsAddress(); 909 910 int output[4]; 911 NativeRegExpMacroAssembler::Result result = 912 Execute(*code, 913 *input, 914 0, 915 start_adr, 916 start_adr + input->length(), 917 output); 918 919 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 920 CHECK_EQ(0, output[0]); 921 CHECK_EQ(2, output[1]); 922 CHECK_EQ(6, output[2]); 923 CHECK_EQ(-1, output[3]); 924 } 925 926 927 TEST(MacroAssemblerNativeBackReferenceUC16) { 928 v8::V8::Initialize(); 929 ContextInitializer initializer; 930 931 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 932 933 m.WriteCurrentPositionToRegister(0, 0); 934 m.AdvanceCurrentPosition(2); 935 m.WriteCurrentPositionToRegister(1, 0); 936 Label nomatch; 937 m.CheckNotBackReference(0, &nomatch); 938 m.Fail(); 939 m.Bind(&nomatch); 940 m.AdvanceCurrentPosition(2); 941 Label missing_match; 942 m.CheckNotBackReference(0, &missing_match); 943 m.WriteCurrentPositionToRegister(2, 0); 944 m.Succeed(); 945 m.Bind(&missing_match); 946 m.Fail(); 947 948 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^(..)..\1")); 949 Handle<Object> code_object = m.GetCode(source); 950 Handle<Code> code = Handle<Code>::cast(code_object); 951 952 const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028}; 953 Handle<String> input = 954 Factory::NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 955 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 956 Address start_adr = seq_input->GetCharsAddress(); 957 958 int output[4]; 959 NativeRegExpMacroAssembler::Result result = 960 Execute(*code, 961 *input, 962 0, 963 start_adr, 964 start_adr + input->length() * 2, 965 output); 966 967 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 968 CHECK_EQ(0, output[0]); 969 CHECK_EQ(2, output[1]); 970 CHECK_EQ(6, output[2]); 971 CHECK_EQ(-1, output[3]); 972 } 973 974 975 976 TEST(MacroAssemblernativeAtStart) { 977 v8::V8::Initialize(); 978 ContextInitializer initializer; 979 980 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 981 982 Label not_at_start, newline, fail; 983 m.CheckNotAtStart(¬_at_start); 984 // Check that prevchar = '\n' and current = 'f'. 985 m.CheckCharacter('\n', &newline); 986 m.Bind(&fail); 987 m.Fail(); 988 m.Bind(&newline); 989 m.LoadCurrentCharacter(0, &fail); 990 m.CheckNotCharacter('f', &fail); 991 m.Succeed(); 992 993 m.Bind(¬_at_start); 994 // Check that prevchar = 'o' and current = 'b'. 995 Label prevo; 996 m.CheckCharacter('o', &prevo); 997 m.Fail(); 998 m.Bind(&prevo); 999 m.LoadCurrentCharacter(0, &fail); 1000 m.CheckNotCharacter('b', &fail); 1001 m.Succeed(); 1002 1003 Handle<String> source = Factory::NewStringFromAscii(CStrVector("(^f|ob)")); 1004 Handle<Object> code_object = m.GetCode(source); 1005 Handle<Code> code = Handle<Code>::cast(code_object); 1006 1007 Handle<String> input = Factory::NewStringFromAscii(CStrVector("foobar")); 1008 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1009 Address start_adr = seq_input->GetCharsAddress(); 1010 1011 NativeRegExpMacroAssembler::Result result = 1012 Execute(*code, 1013 *input, 1014 0, 1015 start_adr, 1016 start_adr + input->length(), 1017 NULL); 1018 1019 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1020 1021 result = Execute(*code, 1022 *input, 1023 3, 1024 start_adr + 3, 1025 start_adr + input->length(), 1026 NULL); 1027 1028 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1029 } 1030 1031 1032 TEST(MacroAssemblerNativeBackRefNoCase) { 1033 v8::V8::Initialize(); 1034 ContextInitializer initializer; 1035 1036 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 1037 1038 Label fail, succ; 1039 1040 m.WriteCurrentPositionToRegister(0, 0); 1041 m.WriteCurrentPositionToRegister(2, 0); 1042 m.AdvanceCurrentPosition(3); 1043 m.WriteCurrentPositionToRegister(3, 0); 1044 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "AbC". 1045 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "ABC". 1046 Label expected_fail; 1047 m.CheckNotBackReferenceIgnoreCase(2, &expected_fail); 1048 m.Bind(&fail); 1049 m.Fail(); 1050 1051 m.Bind(&expected_fail); 1052 m.AdvanceCurrentPosition(3); // Skip "xYz" 1053 m.CheckNotBackReferenceIgnoreCase(2, &succ); 1054 m.Fail(); 1055 1056 m.Bind(&succ); 1057 m.WriteCurrentPositionToRegister(1, 0); 1058 m.Succeed(); 1059 1060 Handle<String> source = 1061 Factory::NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)")); 1062 Handle<Object> code_object = m.GetCode(source); 1063 Handle<Code> code = Handle<Code>::cast(code_object); 1064 1065 Handle<String> input = 1066 Factory::NewStringFromAscii(CStrVector("aBcAbCABCxYzab")); 1067 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1068 Address start_adr = seq_input->GetCharsAddress(); 1069 1070 int output[4]; 1071 NativeRegExpMacroAssembler::Result result = 1072 Execute(*code, 1073 *input, 1074 0, 1075 start_adr, 1076 start_adr + input->length(), 1077 output); 1078 1079 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1080 CHECK_EQ(0, output[0]); 1081 CHECK_EQ(12, output[1]); 1082 CHECK_EQ(0, output[2]); 1083 CHECK_EQ(3, output[3]); 1084 } 1085 1086 1087 1088 TEST(MacroAssemblerNativeRegisters) { 1089 v8::V8::Initialize(); 1090 ContextInitializer initializer; 1091 1092 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6); 1093 1094 uc16 foo_chars[3] = {'f', 'o', 'o'}; 1095 Vector<const uc16> foo(foo_chars, 3); 1096 1097 enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt }; 1098 Label fail; 1099 Label backtrack; 1100 m.WriteCurrentPositionToRegister(out1, 0); // Output: [0] 1101 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1102 m.PushBacktrack(&backtrack); 1103 m.WriteStackPointerToRegister(sp); 1104 // Fill stack and registers 1105 m.AdvanceCurrentPosition(2); 1106 m.WriteCurrentPositionToRegister(out1, 0); 1107 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1108 m.PushBacktrack(&fail); 1109 // Drop backtrack stack frames. 1110 m.ReadStackPointerFromRegister(sp); 1111 // And take the first backtrack (to &backtrack) 1112 m.Backtrack(); 1113 1114 m.PushCurrentPosition(); 1115 m.AdvanceCurrentPosition(2); 1116 m.PopCurrentPosition(); 1117 1118 m.Bind(&backtrack); 1119 m.PopRegister(out1); 1120 m.ReadCurrentPositionFromRegister(out1); 1121 m.AdvanceCurrentPosition(3); 1122 m.WriteCurrentPositionToRegister(out2, 0); // [0,3] 1123 1124 Label loop; 1125 m.SetRegister(loop_cnt, 0); // loop counter 1126 m.Bind(&loop); 1127 m.AdvanceRegister(loop_cnt, 1); 1128 m.AdvanceCurrentPosition(1); 1129 m.IfRegisterLT(loop_cnt, 3, &loop); 1130 m.WriteCurrentPositionToRegister(out3, 0); // [0,3,6] 1131 1132 Label loop2; 1133 m.SetRegister(loop_cnt, 2); // loop counter 1134 m.Bind(&loop2); 1135 m.AdvanceRegister(loop_cnt, -1); 1136 m.AdvanceCurrentPosition(1); 1137 m.IfRegisterGE(loop_cnt, 0, &loop2); 1138 m.WriteCurrentPositionToRegister(out4, 0); // [0,3,6,9] 1139 1140 Label loop3; 1141 Label exit_loop3; 1142 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1143 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1144 m.ReadCurrentPositionFromRegister(out3); 1145 m.Bind(&loop3); 1146 m.AdvanceCurrentPosition(1); 1147 m.CheckGreedyLoop(&exit_loop3); 1148 m.GoTo(&loop3); 1149 m.Bind(&exit_loop3); 1150 m.PopCurrentPosition(); 1151 m.WriteCurrentPositionToRegister(out5, 0); // [0,3,6,9,9,-1] 1152 1153 m.Succeed(); 1154 1155 m.Bind(&fail); 1156 m.Fail(); 1157 1158 Handle<String> source = 1159 Factory::NewStringFromAscii(CStrVector("<loop test>")); 1160 Handle<Object> code_object = m.GetCode(source); 1161 Handle<Code> code = Handle<Code>::cast(code_object); 1162 1163 // String long enough for test (content doesn't matter). 1164 Handle<String> input = 1165 Factory::NewStringFromAscii(CStrVector("foofoofoofoofoo")); 1166 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1167 Address start_adr = seq_input->GetCharsAddress(); 1168 1169 int output[6]; 1170 NativeRegExpMacroAssembler::Result result = 1171 Execute(*code, 1172 *input, 1173 0, 1174 start_adr, 1175 start_adr + input->length(), 1176 output); 1177 1178 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1179 CHECK_EQ(0, output[0]); 1180 CHECK_EQ(3, output[1]); 1181 CHECK_EQ(6, output[2]); 1182 CHECK_EQ(9, output[3]); 1183 CHECK_EQ(9, output[4]); 1184 CHECK_EQ(-1, output[5]); 1185 } 1186 1187 1188 TEST(MacroAssemblerStackOverflow) { 1189 v8::V8::Initialize(); 1190 ContextInitializer initializer; 1191 1192 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 1193 1194 Label loop; 1195 m.Bind(&loop); 1196 m.PushBacktrack(&loop); 1197 m.GoTo(&loop); 1198 1199 Handle<String> source = 1200 Factory::NewStringFromAscii(CStrVector("<stack overflow test>")); 1201 Handle<Object> code_object = m.GetCode(source); 1202 Handle<Code> code = Handle<Code>::cast(code_object); 1203 1204 // String long enough for test (content doesn't matter). 1205 Handle<String> input = 1206 Factory::NewStringFromAscii(CStrVector("dummy")); 1207 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1208 Address start_adr = seq_input->GetCharsAddress(); 1209 1210 NativeRegExpMacroAssembler::Result result = 1211 Execute(*code, 1212 *input, 1213 0, 1214 start_adr, 1215 start_adr + input->length(), 1216 NULL); 1217 1218 CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result); 1219 CHECK(Top::has_pending_exception()); 1220 Top::clear_pending_exception(); 1221 } 1222 1223 1224 TEST(MacroAssemblerNativeLotsOfRegisters) { 1225 v8::V8::Initialize(); 1226 ContextInitializer initializer; 1227 1228 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2); 1229 1230 // At least 2048, to ensure the allocated space for registers 1231 // span one full page. 1232 const int large_number = 8000; 1233 m.WriteCurrentPositionToRegister(large_number, 42); 1234 m.WriteCurrentPositionToRegister(0, 0); 1235 m.WriteCurrentPositionToRegister(1, 1); 1236 Label done; 1237 m.CheckNotBackReference(0, &done); // Performs a system-stack push. 1238 m.Bind(&done); 1239 m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck); 1240 m.PopRegister(1); 1241 m.Succeed(); 1242 1243 Handle<String> source = 1244 Factory::NewStringFromAscii(CStrVector("<huge register space test>")); 1245 Handle<Object> code_object = m.GetCode(source); 1246 Handle<Code> code = Handle<Code>::cast(code_object); 1247 1248 // String long enough for test (content doesn't matter). 1249 Handle<String> input = 1250 Factory::NewStringFromAscii(CStrVector("sample text")); 1251 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1252 Address start_adr = seq_input->GetCharsAddress(); 1253 1254 int captures[2]; 1255 NativeRegExpMacroAssembler::Result result = 1256 Execute(*code, 1257 *input, 1258 0, 1259 start_adr, 1260 start_adr + input->length(), 1261 captures); 1262 1263 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1264 CHECK_EQ(0, captures[0]); 1265 CHECK_EQ(42, captures[1]); 1266 1267 Top::clear_pending_exception(); 1268 } 1269 1270 #else // ! V8_REGEX_NATIVE 1271 1272 TEST(MacroAssembler) { 1273 V8::Initialize(NULL); 1274 byte codes[1024]; 1275 RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024)); 1276 // ^f(o)o. 1277 Label fail, fail2, start; 1278 uc16 foo_chars[3]; 1279 foo_chars[0] = 'f'; 1280 foo_chars[1] = 'o'; 1281 foo_chars[2] = 'o'; 1282 Vector<const uc16> foo(foo_chars, 3); 1283 m.SetRegister(4, 42); 1284 m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck); 1285 m.AdvanceRegister(4, 42); 1286 m.GoTo(&start); 1287 m.Fail(); 1288 m.Bind(&start); 1289 m.PushBacktrack(&fail2); 1290 m.CheckCharacters(foo, 0, &fail, true); 1291 m.WriteCurrentPositionToRegister(0, 0); 1292 m.PushCurrentPosition(); 1293 m.AdvanceCurrentPosition(3); 1294 m.WriteCurrentPositionToRegister(1, 0); 1295 m.PopCurrentPosition(); 1296 m.AdvanceCurrentPosition(1); 1297 m.WriteCurrentPositionToRegister(2, 0); 1298 m.AdvanceCurrentPosition(1); 1299 m.WriteCurrentPositionToRegister(3, 0); 1300 m.Succeed(); 1301 1302 m.Bind(&fail); 1303 m.Backtrack(); 1304 m.Succeed(); 1305 1306 m.Bind(&fail2); 1307 m.PopRegister(0); 1308 m.Fail(); 1309 1310 v8::HandleScope scope; 1311 1312 Handle<String> source = Factory::NewStringFromAscii(CStrVector("^f(o)o")); 1313 Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source)); 1314 int captures[5]; 1315 1316 const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'}; 1317 Handle<String> f1_16 = 1318 Factory::NewStringFromTwoByte(Vector<const uc16>(str1, 6)); 1319 1320 CHECK(IrregexpInterpreter::Match(array, f1_16, captures, 0)); 1321 CHECK_EQ(0, captures[0]); 1322 CHECK_EQ(3, captures[1]); 1323 CHECK_EQ(1, captures[2]); 1324 CHECK_EQ(2, captures[3]); 1325 CHECK_EQ(84, captures[4]); 1326 1327 const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'}; 1328 Handle<String> f2_16 = 1329 Factory::NewStringFromTwoByte(Vector<const uc16>(str2, 6)); 1330 1331 CHECK(!IrregexpInterpreter::Match(array, f2_16, captures, 0)); 1332 CHECK_EQ(42, captures[0]); 1333 } 1334 1335 #endif // ! V8_REGEXP_NATIVE 1336 1337 1338 TEST(AddInverseToTable) { 1339 static const int kLimit = 1000; 1340 static const int kRangeCount = 16; 1341 for (int t = 0; t < 10; t++) { 1342 ZoneScope zone_scope(DELETE_ON_EXIT); 1343 ZoneList<CharacterRange>* ranges = 1344 new ZoneList<CharacterRange>(kRangeCount); 1345 for (int i = 0; i < kRangeCount; i++) { 1346 int from = PseudoRandom(t + 87, i + 25) % kLimit; 1347 int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20)); 1348 if (to > kLimit) to = kLimit; 1349 ranges->Add(CharacterRange(from, to)); 1350 } 1351 DispatchTable table; 1352 DispatchTableConstructor cons(&table, false); 1353 cons.set_choice_index(0); 1354 cons.AddInverse(ranges); 1355 for (int i = 0; i < kLimit; i++) { 1356 bool is_on = false; 1357 for (int j = 0; !is_on && j < kRangeCount; j++) 1358 is_on = ranges->at(j).Contains(i); 1359 OutSet* set = table.Get(i); 1360 CHECK_EQ(is_on, set->Get(0) == false); 1361 } 1362 } 1363 ZoneScope zone_scope(DELETE_ON_EXIT); 1364 ZoneList<CharacterRange>* ranges = 1365 new ZoneList<CharacterRange>(1); 1366 ranges->Add(CharacterRange(0xFFF0, 0xFFFE)); 1367 DispatchTable table; 1368 DispatchTableConstructor cons(&table, false); 1369 cons.set_choice_index(0); 1370 cons.AddInverse(ranges); 1371 CHECK(!table.Get(0xFFFE)->Get(0)); 1372 CHECK(table.Get(0xFFFF)->Get(0)); 1373 } 1374 1375 1376 static uc32 canonicalize(uc32 c) { 1377 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth]; 1378 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL); 1379 if (count == 0) { 1380 return c; 1381 } else { 1382 CHECK_EQ(1, count); 1383 return canon[0]; 1384 } 1385 } 1386 1387 1388 TEST(LatinCanonicalize) { 1389 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1390 for (char lower = 'a'; lower <= 'z'; lower++) { 1391 char upper = lower + ('A' - 'a'); 1392 CHECK_EQ(canonicalize(lower), canonicalize(upper)); 1393 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1394 int length = un_canonicalize.get(lower, '\0', uncanon); 1395 CHECK_EQ(2, length); 1396 CHECK_EQ(upper, uncanon[0]); 1397 CHECK_EQ(lower, uncanon[1]); 1398 } 1399 for (uc32 c = 128; c < (1 << 21); c++) 1400 CHECK_GE(canonicalize(c), 128); 1401 unibrow::Mapping<unibrow::ToUppercase> to_upper; 1402 for (uc32 c = 0; c < (1 << 21); c++) { 1403 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; 1404 int length = to_upper.get(c, '\0', upper); 1405 if (length == 0) { 1406 length = 1; 1407 upper[0] = c; 1408 } 1409 uc32 u = upper[0]; 1410 if (length > 1 || (c >= 128 && u < 128)) 1411 u = c; 1412 CHECK_EQ(u, canonicalize(c)); 1413 } 1414 } 1415 1416 1417 static uc32 CanonRange(uc32 c) { 1418 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; 1419 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); 1420 if (count == 0) { 1421 return c; 1422 } else { 1423 CHECK_EQ(1, count); 1424 return canon[0]; 1425 } 1426 } 1427 1428 1429 TEST(RangeCanonicalization) { 1430 CHECK_NE(CanonRange(0) & CharacterRange::kStartMarker, 0); 1431 // Check that we arrive at the same result when using the basic 1432 // range canonicalization primitives as when using immediate 1433 // canonicalization. 1434 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1435 for (int i = 0; i < CharacterRange::kRangeCanonicalizeMax; i++) { 1436 int range = CanonRange(i); 1437 int indirect_length = 0; 1438 unibrow::uchar indirect[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1439 if ((range & CharacterRange::kStartMarker) == 0) { 1440 indirect_length = un_canonicalize.get(i - range, '\0', indirect); 1441 for (int i = 0; i < indirect_length; i++) 1442 indirect[i] += range; 1443 } else { 1444 indirect_length = un_canonicalize.get(i, '\0', indirect); 1445 } 1446 unibrow::uchar direct[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1447 int direct_length = un_canonicalize.get(i, '\0', direct); 1448 CHECK_EQ(direct_length, indirect_length); 1449 } 1450 // Check that we arrive at the same results when skipping over 1451 // canonicalization ranges. 1452 int next_block = 0; 1453 while (next_block < CharacterRange::kRangeCanonicalizeMax) { 1454 uc32 start = CanonRange(next_block); 1455 CHECK_NE((start & CharacterRange::kStartMarker), 0); 1456 unsigned dist = start & CharacterRange::kPayloadMask; 1457 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1458 int first_length = un_canonicalize.get(next_block, '\0', first); 1459 for (unsigned i = 1; i < dist; i++) { 1460 CHECK_EQ(i, CanonRange(next_block + i)); 1461 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1462 int succ_length = un_canonicalize.get(next_block + i, '\0', succ); 1463 CHECK_EQ(first_length, succ_length); 1464 for (int j = 0; j < succ_length; j++) { 1465 int calc = first[j] + i; 1466 int found = succ[j]; 1467 CHECK_EQ(calc, found); 1468 } 1469 } 1470 next_block = next_block + dist; 1471 } 1472 } 1473 1474 1475 TEST(UncanonicalizeEquivalence) { 1476 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1477 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1478 for (int i = 0; i < (1 << 16); i++) { 1479 int length = un_canonicalize.get(i, '\0', chars); 1480 for (int j = 0; j < length; j++) { 1481 unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1482 int length2 = un_canonicalize.get(chars[j], '\0', chars2); 1483 CHECK_EQ(length, length2); 1484 for (int k = 0; k < length; k++) 1485 CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k])); 1486 } 1487 } 1488 } 1489 1490 1491 static void TestRangeCaseIndependence(CharacterRange input, 1492 Vector<CharacterRange> expected) { 1493 ZoneScope zone_scope(DELETE_ON_EXIT); 1494 int count = expected.length(); 1495 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count); 1496 input.AddCaseEquivalents(list, false); 1497 CHECK_EQ(count, list->length()); 1498 for (int i = 0; i < list->length(); i++) { 1499 CHECK_EQ(expected[i].from(), list->at(i).from()); 1500 CHECK_EQ(expected[i].to(), list->at(i).to()); 1501 } 1502 } 1503 1504 1505 static void TestSimpleRangeCaseIndependence(CharacterRange input, 1506 CharacterRange expected) { 1507 EmbeddedVector<CharacterRange, 1> vector; 1508 vector[0] = expected; 1509 TestRangeCaseIndependence(input, vector); 1510 } 1511 1512 1513 TEST(CharacterRangeCaseIndependence) { 1514 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'), 1515 CharacterRange::Singleton('A')); 1516 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'), 1517 CharacterRange::Singleton('Z')); 1518 TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'), 1519 CharacterRange('A', 'Z')); 1520 TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'), 1521 CharacterRange('C', 'F')); 1522 TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'), 1523 CharacterRange('A', 'B')); 1524 TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'), 1525 CharacterRange('Y', 'Z')); 1526 TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1), 1527 CharacterRange('A', 'Z')); 1528 TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'), 1529 CharacterRange('a', 'z')); 1530 TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'), 1531 CharacterRange('c', 'f')); 1532 TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1), 1533 CharacterRange('a', 'z')); 1534 // Here we need to add [l-z] to complete the case independence of 1535 // [A-Za-z] but we expect [a-z] to be added since we always add a 1536 // whole block at a time. 1537 TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'), 1538 CharacterRange('a', 'z')); 1539 } 1540 1541 1542 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { 1543 if (ranges == NULL) 1544 return false; 1545 for (int i = 0; i < ranges->length(); i++) { 1546 CharacterRange range = ranges->at(i); 1547 if (range.from() <= c && c <= range.to()) 1548 return true; 1549 } 1550 return false; 1551 } 1552 1553 1554 TEST(CharClassDifference) { 1555 ZoneScope zone_scope(DELETE_ON_EXIT); 1556 ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1); 1557 base->Add(CharacterRange::Everything()); 1558 Vector<const uc16> overlay = CharacterRange::GetWordBounds(); 1559 ZoneList<CharacterRange>* included = NULL; 1560 ZoneList<CharacterRange>* excluded = NULL; 1561 CharacterRange::Split(base, overlay, &included, &excluded); 1562 for (int i = 0; i < (1 << 16); i++) { 1563 bool in_base = InClass(i, base); 1564 if (in_base) { 1565 bool in_overlay = false; 1566 for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { 1567 if (overlay[j] <= i && i <= overlay[j+1]) 1568 in_overlay = true; 1569 } 1570 CHECK_EQ(in_overlay, InClass(i, included)); 1571 CHECK_EQ(!in_overlay, InClass(i, excluded)); 1572 } else { 1573 CHECK(!InClass(i, included)); 1574 CHECK(!InClass(i, excluded)); 1575 } 1576 } 1577 } 1578 1579 1580 TEST(CanonicalizeCharacterSets) { 1581 ZoneScope scope(DELETE_ON_EXIT); 1582 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4); 1583 CharacterSet set(list); 1584 1585 list->Add(CharacterRange(10, 20)); 1586 list->Add(CharacterRange(30, 40)); 1587 list->Add(CharacterRange(50, 60)); 1588 set.Canonicalize(); 1589 ASSERT_EQ(3, list->length()); 1590 ASSERT_EQ(10, list->at(0).from()); 1591 ASSERT_EQ(20, list->at(0).to()); 1592 ASSERT_EQ(30, list->at(1).from()); 1593 ASSERT_EQ(40, list->at(1).to()); 1594 ASSERT_EQ(50, list->at(2).from()); 1595 ASSERT_EQ(60, list->at(2).to()); 1596 1597 list->Rewind(0); 1598 list->Add(CharacterRange(10, 20)); 1599 list->Add(CharacterRange(50, 60)); 1600 list->Add(CharacterRange(30, 40)); 1601 set.Canonicalize(); 1602 ASSERT_EQ(3, list->length()); 1603 ASSERT_EQ(10, list->at(0).from()); 1604 ASSERT_EQ(20, list->at(0).to()); 1605 ASSERT_EQ(30, list->at(1).from()); 1606 ASSERT_EQ(40, list->at(1).to()); 1607 ASSERT_EQ(50, list->at(2).from()); 1608 ASSERT_EQ(60, list->at(2).to()); 1609 1610 list->Rewind(0); 1611 list->Add(CharacterRange(30, 40)); 1612 list->Add(CharacterRange(10, 20)); 1613 list->Add(CharacterRange(25, 25)); 1614 list->Add(CharacterRange(100, 100)); 1615 list->Add(CharacterRange(1, 1)); 1616 set.Canonicalize(); 1617 ASSERT_EQ(5, list->length()); 1618 ASSERT_EQ(1, list->at(0).from()); 1619 ASSERT_EQ(1, list->at(0).to()); 1620 ASSERT_EQ(10, list->at(1).from()); 1621 ASSERT_EQ(20, list->at(1).to()); 1622 ASSERT_EQ(25, list->at(2).from()); 1623 ASSERT_EQ(25, list->at(2).to()); 1624 ASSERT_EQ(30, list->at(3).from()); 1625 ASSERT_EQ(40, list->at(3).to()); 1626 ASSERT_EQ(100, list->at(4).from()); 1627 ASSERT_EQ(100, list->at(4).to()); 1628 1629 list->Rewind(0); 1630 list->Add(CharacterRange(10, 19)); 1631 list->Add(CharacterRange(21, 30)); 1632 list->Add(CharacterRange(20, 20)); 1633 set.Canonicalize(); 1634 ASSERT_EQ(1, list->length()); 1635 ASSERT_EQ(10, list->at(0).from()); 1636 ASSERT_EQ(30, list->at(0).to()); 1637 } 1638 1639 // Checks whether a character is in the set represented by a list of ranges. 1640 static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) { 1641 for (int i = 0; i < set->length(); i++) { 1642 CharacterRange range = set->at(i); 1643 if (range.from() <= value && value <= range.to()) { 1644 return true; 1645 } 1646 } 1647 return false; 1648 } 1649 1650 TEST(CharacterRangeMerge) { 1651 ZoneScope zone_scope(DELETE_ON_EXIT); 1652 ZoneList<CharacterRange> l1(4); 1653 ZoneList<CharacterRange> l2(4); 1654 // Create all combinations of intersections of ranges, both singletons and 1655 // longer. 1656 1657 int offset = 0; 1658 1659 // The five kinds of singleton intersections: 1660 // X 1661 // Y - outside before 1662 // Y - outside touching start 1663 // Y - overlap 1664 // Y - outside touching end 1665 // Y - outside after 1666 1667 for (int i = 0; i < 5; i++) { 1668 l1.Add(CharacterRange::Singleton(offset + 2)); 1669 l2.Add(CharacterRange::Singleton(offset + i)); 1670 offset += 6; 1671 } 1672 1673 // The seven kinds of singleton/non-singleton intersections: 1674 // XXX 1675 // Y - outside before 1676 // Y - outside touching start 1677 // Y - inside touching start 1678 // Y - entirely inside 1679 // Y - inside touching end 1680 // Y - outside touching end 1681 // Y - disjoint after 1682 1683 for (int i = 0; i < 7; i++) { 1684 l1.Add(CharacterRange::Range(offset + 2, offset + 4)); 1685 l2.Add(CharacterRange::Singleton(offset + i)); 1686 offset += 8; 1687 } 1688 1689 // The eleven kinds of non-singleton intersections: 1690 // 1691 // XXXXXXXX 1692 // YYYY - outside before. 1693 // YYYY - outside touching start. 1694 // YYYY - overlapping start 1695 // YYYY - inside touching start 1696 // YYYY - entirely inside 1697 // YYYY - inside touching end 1698 // YYYY - overlapping end 1699 // YYYY - outside touching end 1700 // YYYY - outside after 1701 // YYYYYYYY - identical 1702 // YYYYYYYYYYYY - containing entirely. 1703 1704 for (int i = 0; i < 9; i++) { 1705 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); // Length 8. 1706 l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3)); 1707 offset += 22; 1708 } 1709 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1710 l2.Add(CharacterRange::Range(offset + 6, offset + 15)); 1711 offset += 22; 1712 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1713 l2.Add(CharacterRange::Range(offset + 4, offset + 17)); 1714 offset += 22; 1715 1716 // Different kinds of multi-range overlap: 1717 // XXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXX 1718 // YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y 1719 1720 l1.Add(CharacterRange::Range(offset, offset + 21)); 1721 l1.Add(CharacterRange::Range(offset + 31, offset + 52)); 1722 for (int i = 0; i < 6; i++) { 1723 l2.Add(CharacterRange::Range(offset + 2, offset + 5)); 1724 l2.Add(CharacterRange::Singleton(offset + 8)); 1725 offset += 9; 1726 } 1727 1728 ASSERT(CharacterRange::IsCanonical(&l1)); 1729 ASSERT(CharacterRange::IsCanonical(&l2)); 1730 1731 ZoneList<CharacterRange> first_only(4); 1732 ZoneList<CharacterRange> second_only(4); 1733 ZoneList<CharacterRange> both(4); 1734 1735 // Merge one direction. 1736 CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both); 1737 1738 CHECK(CharacterRange::IsCanonical(&first_only)); 1739 CHECK(CharacterRange::IsCanonical(&second_only)); 1740 CHECK(CharacterRange::IsCanonical(&both)); 1741 1742 for (uc16 i = 0; i < offset; i++) { 1743 bool in_first = CharacterInSet(&l1, i); 1744 bool in_second = CharacterInSet(&l2, i); 1745 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1746 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1747 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1748 } 1749 1750 first_only.Clear(); 1751 second_only.Clear(); 1752 both.Clear(); 1753 1754 // Merge other direction. 1755 CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both); 1756 1757 CHECK(CharacterRange::IsCanonical(&first_only)); 1758 CHECK(CharacterRange::IsCanonical(&second_only)); 1759 CHECK(CharacterRange::IsCanonical(&both)); 1760 1761 for (uc16 i = 0; i < offset; i++) { 1762 bool in_first = CharacterInSet(&l1, i); 1763 bool in_second = CharacterInSet(&l2, i); 1764 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1765 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1766 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1767 } 1768 1769 first_only.Clear(); 1770 second_only.Clear(); 1771 both.Clear(); 1772 1773 // Merge but don't record all combinations. 1774 CharacterRange::Merge(&l1, &l2, NULL, NULL, &both); 1775 1776 CHECK(CharacterRange::IsCanonical(&both)); 1777 1778 for (uc16 i = 0; i < offset; i++) { 1779 bool in_first = CharacterInSet(&l1, i); 1780 bool in_second = CharacterInSet(&l2, i); 1781 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1782 } 1783 1784 // Merge into same set. 1785 ZoneList<CharacterRange> all(4); 1786 CharacterRange::Merge(&l1, &l2, &all, &all, &all); 1787 1788 CHECK(CharacterRange::IsCanonical(&all)); 1789 1790 for (uc16 i = 0; i < offset; i++) { 1791 bool in_first = CharacterInSet(&l1, i); 1792 bool in_second = CharacterInSet(&l2, i); 1793 CHECK((in_first || in_second) == CharacterInSet(&all, i)); 1794 } 1795 } 1796 1797 1798 TEST(Graph) { 1799 V8::Initialize(NULL); 1800 Execute("\\b\\w+\\b", false, true, true); 1801 } 1802