1 // Copyright 2008 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 29 #include <stdlib.h> 30 31 #include "v8.h" 32 33 #include "ast.h" 34 #include "char-predicates-inl.h" 35 #include "cctest.h" 36 #include "jsregexp.h" 37 #include "parser.h" 38 #include "regexp-macro-assembler.h" 39 #include "regexp-macro-assembler-irregexp.h" 40 #include "string-stream.h" 41 #include "zone-inl.h" 42 #ifdef V8_INTERPRETED_REGEXP 43 #include "interpreter-irregexp.h" 44 #else // V8_INTERPRETED_REGEXP 45 #include "macro-assembler.h" 46 #include "code.h" 47 #ifdef V8_TARGET_ARCH_ARM 48 #include "arm/assembler-arm.h" 49 #include "arm/macro-assembler-arm.h" 50 #include "arm/regexp-macro-assembler-arm.h" 51 #endif 52 #ifdef V8_TARGET_ARCH_MIPS 53 #include "mips/assembler-mips.h" 54 #include "mips/macro-assembler-mips.h" 55 #include "mips/regexp-macro-assembler-mips.h" 56 #endif 57 #ifdef V8_TARGET_ARCH_X64 58 #include "x64/assembler-x64.h" 59 #include "x64/macro-assembler-x64.h" 60 #include "x64/regexp-macro-assembler-x64.h" 61 #endif 62 #ifdef V8_TARGET_ARCH_IA32 63 #include "ia32/assembler-ia32.h" 64 #include "ia32/macro-assembler-ia32.h" 65 #include "ia32/regexp-macro-assembler-ia32.h" 66 #endif 67 #endif // V8_INTERPRETED_REGEXP 68 69 using namespace v8::internal; 70 71 72 static bool CheckParse(const char* input) { 73 V8::Initialize(NULL); 74 v8::HandleScope scope; 75 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 76 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 77 RegExpCompileData result; 78 return v8::internal::RegExpParser::ParseRegExp(&reader, false, &result); 79 } 80 81 82 static SmartArrayPointer<const char> Parse(const char* input) { 83 V8::Initialize(NULL); 84 v8::HandleScope scope; 85 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 86 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 87 RegExpCompileData result; 88 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 89 CHECK(result.tree != NULL); 90 CHECK(result.error.is_null()); 91 SmartArrayPointer<const char> output = result.tree->ToString(); 92 return output; 93 } 94 95 static bool CheckSimple(const char* input) { 96 V8::Initialize(NULL); 97 v8::HandleScope scope; 98 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 99 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 100 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 101 RegExpCompileData result; 102 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 103 CHECK(result.tree != NULL); 104 CHECK(result.error.is_null()); 105 return result.simple; 106 } 107 108 struct MinMaxPair { 109 int min_match; 110 int max_match; 111 }; 112 113 static MinMaxPair CheckMinMaxMatch(const char* input) { 114 V8::Initialize(NULL); 115 v8::HandleScope scope; 116 unibrow::Utf8InputBuffer<> buffer(input, StrLength(input)); 117 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 118 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 119 RegExpCompileData result; 120 CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 121 CHECK(result.tree != NULL); 122 CHECK(result.error.is_null()); 123 int min_match = result.tree->min_match(); 124 int max_match = result.tree->max_match(); 125 MinMaxPair pair = { min_match, max_match }; 126 return pair; 127 } 128 129 130 #define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input)) 131 #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) 132 #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input)); 133 #define CHECK_MIN_MAX(input, min, max) \ 134 { MinMaxPair min_max = CheckMinMaxMatch(input); \ 135 CHECK_EQ(min, min_max.min_match); \ 136 CHECK_EQ(max, min_max.max_match); \ 137 } 138 139 TEST(Parser) { 140 V8::Initialize(NULL); 141 142 CHECK_PARSE_ERROR("?"); 143 144 CHECK_PARSE_EQ("abc", "'abc'"); 145 CHECK_PARSE_EQ("", "%"); 146 CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')"); 147 CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')"); 148 CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)"); 149 CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')"); 150 CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])"); 151 CHECK_PARSE_EQ("a*", "(# 0 - g 'a')"); 152 CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')"); 153 CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))"); 154 CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))"); 155 CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))"); 156 CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))"); 157 CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))"); 158 CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))"); 159 CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))"); 160 CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))"); 161 CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))"); 162 CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))"); 163 CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))"); 164 CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))"); 165 CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'"); 166 CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')"); 167 CHECK_PARSE_EQ("(?:foo)", "'foo'"); 168 CHECK_PARSE_EQ("(?: foo )", "' foo '"); 169 CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))"); 170 CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')"); 171 CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')"); 172 CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')"); 173 CHECK_PARSE_EQ("()", "(^ %)"); 174 CHECK_PARSE_EQ("(?=)", "(-> + %)"); 175 CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows 176 CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252 177 CHECK_PARSE_EQ("[x]", "[x]"); 178 CHECK_PARSE_EQ("[xyz]", "[x y z]"); 179 CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]"); 180 CHECK_PARSE_EQ("[-123]", "[- 1 2 3]"); 181 CHECK_PARSE_EQ("[^123]", "^[1 2 3]"); 182 CHECK_PARSE_EQ("]", "']'"); 183 CHECK_PARSE_EQ("}", "'}'"); 184 CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]"); 185 CHECK_PARSE_EQ("[\\d]", "[0-9]"); 186 CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]"); 187 CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); 188 CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); 189 CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); 190 // Control character outside character class. 191 CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", 192 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); 193 CHECK_PARSE_EQ("\\c!", "'\\c!'"); 194 CHECK_PARSE_EQ("\\c_", "'\\c_'"); 195 CHECK_PARSE_EQ("\\c~", "'\\c~'"); 196 CHECK_PARSE_EQ("\\c1", "'\\c1'"); 197 // Control character inside character class. 198 CHECK_PARSE_EQ("[\\c!]", "[\\ c !]"); 199 CHECK_PARSE_EQ("[\\c_]", "[\\x1f]"); 200 CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]"); 201 CHECK_PARSE_EQ("[\\ca]", "[\\x01]"); 202 CHECK_PARSE_EQ("[\\cz]", "[\\x1a]"); 203 CHECK_PARSE_EQ("[\\cA]", "[\\x01]"); 204 CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]"); 205 CHECK_PARSE_EQ("[\\c1]", "[\\x11]"); 206 207 CHECK_PARSE_EQ("[a\\]c]", "[a ] c]"); 208 CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"); 209 CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"); 210 CHECK_PARSE_EQ("\\0", "'\\x00'"); 211 CHECK_PARSE_EQ("\\8", "'8'"); 212 CHECK_PARSE_EQ("\\9", "'9'"); 213 CHECK_PARSE_EQ("\\11", "'\\x09'"); 214 CHECK_PARSE_EQ("\\11a", "'\\x09a'"); 215 CHECK_PARSE_EQ("\\011", "'\\x09'"); 216 CHECK_PARSE_EQ("\\00011", "'\\x0011'"); 217 CHECK_PARSE_EQ("\\118", "'\\x098'"); 218 CHECK_PARSE_EQ("\\111", "'I'"); 219 CHECK_PARSE_EQ("\\1111", "'I1'"); 220 CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))"); 221 CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))"); 222 CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))"); 223 CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')"); 224 CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')" 225 " (# 0 - g (<- 1)))"); 226 CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')" 227 " (# 0 - g (<- 2)))"); 228 CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')" 229 " (# 0 - g (<- 3)))"); 230 CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')" 231 " (# 0 - g '\\x04'))"); 232 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10", 233 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 234 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))"); 235 CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", 236 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')" 237 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')"); 238 CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); 239 CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); 240 CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); 241 CHECK_PARSE_EQ("(?=a)?a", "'a'"); 242 CHECK_PARSE_EQ("(?=a){0,10}a", "'a'"); 243 CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')"); 244 CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); 245 CHECK_PARSE_EQ("(?!a)?a", "'a'"); 246 CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); 247 CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"); 248 CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"); 249 CHECK_PARSE_EQ("[\\0]", "[\\x00]"); 250 CHECK_PARSE_EQ("[\\11]", "[\\x09]"); 251 CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]"); 252 CHECK_PARSE_EQ("[\\011]", "[\\x09]"); 253 CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]"); 254 CHECK_PARSE_EQ("[\\118]", "[\\x09 8]"); 255 CHECK_PARSE_EQ("[\\111]", "[I]"); 256 CHECK_PARSE_EQ("[\\1111]", "[I 1]"); 257 CHECK_PARSE_EQ("\\x34", "'\x34'"); 258 CHECK_PARSE_EQ("\\x60", "'\x60'"); 259 CHECK_PARSE_EQ("\\x3z", "'x3z'"); 260 CHECK_PARSE_EQ("\\c", "'\\c'"); 261 CHECK_PARSE_EQ("\\u0034", "'\x34'"); 262 CHECK_PARSE_EQ("\\u003z", "'u003z'"); 263 CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); 264 265 CHECK_SIMPLE("a", true); 266 CHECK_SIMPLE("a|b", false); 267 CHECK_SIMPLE("a\\n", false); 268 CHECK_SIMPLE("^a", false); 269 CHECK_SIMPLE("a$", false); 270 CHECK_SIMPLE("a\\b!", false); 271 CHECK_SIMPLE("a\\Bb", false); 272 CHECK_SIMPLE("a*", false); 273 CHECK_SIMPLE("a*?", false); 274 CHECK_SIMPLE("a?", false); 275 CHECK_SIMPLE("a??", false); 276 CHECK_SIMPLE("a{0,1}?", false); 277 CHECK_SIMPLE("a{1,1}?", false); 278 CHECK_SIMPLE("a{1,2}?", false); 279 CHECK_SIMPLE("a+?", false); 280 CHECK_SIMPLE("(a)", false); 281 CHECK_SIMPLE("(a)\\1", false); 282 CHECK_SIMPLE("(\\1a)", false); 283 CHECK_SIMPLE("\\1(a)", false); 284 CHECK_SIMPLE("a\\s", false); 285 CHECK_SIMPLE("a\\S", false); 286 CHECK_SIMPLE("a\\d", false); 287 CHECK_SIMPLE("a\\D", false); 288 CHECK_SIMPLE("a\\w", false); 289 CHECK_SIMPLE("a\\W", false); 290 CHECK_SIMPLE("a.", false); 291 CHECK_SIMPLE("a\\q", false); 292 CHECK_SIMPLE("a[a]", false); 293 CHECK_SIMPLE("a[^a]", false); 294 CHECK_SIMPLE("a[a-z]", false); 295 CHECK_SIMPLE("a[\\q]", false); 296 CHECK_SIMPLE("a(?:b)", false); 297 CHECK_SIMPLE("a(?=b)", false); 298 CHECK_SIMPLE("a(?!b)", false); 299 CHECK_SIMPLE("\\x60", false); 300 CHECK_SIMPLE("\\u0060", false); 301 CHECK_SIMPLE("\\cA", false); 302 CHECK_SIMPLE("\\q", false); 303 CHECK_SIMPLE("\\1112", false); 304 CHECK_SIMPLE("\\0", false); 305 CHECK_SIMPLE("(a)\\1", false); 306 CHECK_SIMPLE("(?=a)?a", false); 307 CHECK_SIMPLE("(?!a)?a\\1", false); 308 CHECK_SIMPLE("(?:(?=a))a\\1", false); 309 310 CHECK_PARSE_EQ("a{}", "'a{}'"); 311 CHECK_PARSE_EQ("a{,}", "'a{,}'"); 312 CHECK_PARSE_EQ("a{", "'a{'"); 313 CHECK_PARSE_EQ("a{z}", "'a{z}'"); 314 CHECK_PARSE_EQ("a{1z}", "'a{1z}'"); 315 CHECK_PARSE_EQ("a{12z}", "'a{12z}'"); 316 CHECK_PARSE_EQ("a{12,", "'a{12,'"); 317 CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'"); 318 CHECK_PARSE_EQ("{}", "'{}'"); 319 CHECK_PARSE_EQ("{,}", "'{,}'"); 320 CHECK_PARSE_EQ("{", "'{'"); 321 CHECK_PARSE_EQ("{z}", "'{z}'"); 322 CHECK_PARSE_EQ("{1z}", "'{1z}'"); 323 CHECK_PARSE_EQ("{12z}", "'{12z}'"); 324 CHECK_PARSE_EQ("{12,", "'{12,'"); 325 CHECK_PARSE_EQ("{12,3b", "'{12,3b'"); 326 327 CHECK_MIN_MAX("a", 1, 1); 328 CHECK_MIN_MAX("abc", 3, 3); 329 CHECK_MIN_MAX("a[bc]d", 3, 3); 330 CHECK_MIN_MAX("a|bc", 1, 2); 331 CHECK_MIN_MAX("ab|c", 1, 2); 332 CHECK_MIN_MAX("a||bc", 0, 2); 333 CHECK_MIN_MAX("|", 0, 0); 334 CHECK_MIN_MAX("(?:ab)", 2, 2); 335 CHECK_MIN_MAX("(?:ab|cde)", 2, 3); 336 CHECK_MIN_MAX("(?:ab)|cde", 2, 3); 337 CHECK_MIN_MAX("(ab)", 2, 2); 338 CHECK_MIN_MAX("(ab|cde)", 2, 3); 339 CHECK_MIN_MAX("(ab)\\1", 2, 4); 340 CHECK_MIN_MAX("(ab|cde)\\1", 2, 6); 341 CHECK_MIN_MAX("(?:ab)?", 0, 2); 342 CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity); 343 CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity); 344 CHECK_MIN_MAX("a?", 0, 1); 345 CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity); 346 CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity); 347 CHECK_MIN_MAX("a??", 0, 1); 348 CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity); 349 CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity); 350 CHECK_MIN_MAX("(?:a?)?", 0, 1); 351 CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity); 352 CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity); 353 CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity); 354 CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity); 355 CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity); 356 CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity); 357 CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity); 358 CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity); 359 CHECK_MIN_MAX("a{0}", 0, 0); 360 CHECK_MIN_MAX("(?:a+){0}", 0, 0); 361 CHECK_MIN_MAX("(?:a+){0,0}", 0, 0); 362 CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity); 363 CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity); 364 CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity); 365 CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity); 366 CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity); 367 CHECK_MIN_MAX("(?:ab){4,7}", 8, 14); 368 CHECK_MIN_MAX("a\\bc", 2, 2); 369 CHECK_MIN_MAX("a\\Bc", 2, 2); 370 CHECK_MIN_MAX("a\\sc", 3, 3); 371 CHECK_MIN_MAX("a\\Sc", 3, 3); 372 CHECK_MIN_MAX("a(?=b)c", 2, 2); 373 CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2); 374 CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2); 375 } 376 377 TEST(ParserRegression) { 378 CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])"); 379 CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')"); 380 CHECK_PARSE_EQ("{", "'{'"); 381 CHECK_PARSE_EQ("a|", "(| 'a' %)"); 382 } 383 384 static void ExpectError(const char* input, 385 const char* expected) { 386 V8::Initialize(NULL); 387 v8::HandleScope scope; 388 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 389 FlatStringReader reader(Isolate::Current(), CStrVector(input)); 390 RegExpCompileData result; 391 CHECK(!v8::internal::RegExpParser::ParseRegExp(&reader, false, &result)); 392 CHECK(result.tree == NULL); 393 CHECK(!result.error.is_null()); 394 SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS); 395 CHECK_EQ(expected, *str); 396 } 397 398 399 TEST(Errors) { 400 V8::Initialize(NULL); 401 const char* kEndBackslash = "\\ at end of pattern"; 402 ExpectError("\\", kEndBackslash); 403 const char* kUnterminatedGroup = "Unterminated group"; 404 ExpectError("(foo", kUnterminatedGroup); 405 const char* kInvalidGroup = "Invalid group"; 406 ExpectError("(?", kInvalidGroup); 407 const char* kUnterminatedCharacterClass = "Unterminated character class"; 408 ExpectError("[", kUnterminatedCharacterClass); 409 ExpectError("[a-", kUnterminatedCharacterClass); 410 const char* kNothingToRepeat = "Nothing to repeat"; 411 ExpectError("*", kNothingToRepeat); 412 ExpectError("?", kNothingToRepeat); 413 ExpectError("+", kNothingToRepeat); 414 ExpectError("{1}", kNothingToRepeat); 415 ExpectError("{1,2}", kNothingToRepeat); 416 ExpectError("{1,}", kNothingToRepeat); 417 418 // Check that we don't allow more than kMaxCapture captures 419 const int kMaxCaptures = 1 << 16; // Must match RegExpParser::kMaxCaptures. 420 const char* kTooManyCaptures = "Too many captures"; 421 HeapStringAllocator allocator; 422 StringStream accumulator(&allocator); 423 for (int i = 0; i <= kMaxCaptures; i++) { 424 accumulator.Add("()"); 425 } 426 SmartArrayPointer<const char> many_captures(accumulator.ToCString()); 427 ExpectError(*many_captures, kTooManyCaptures); 428 } 429 430 431 static bool IsDigit(uc16 c) { 432 return ('0' <= c && c <= '9'); 433 } 434 435 436 static bool NotDigit(uc16 c) { 437 return !IsDigit(c); 438 } 439 440 441 static bool IsWhiteSpace(uc16 c) { 442 switch (c) { 443 case 0x09: 444 case 0x0A: 445 case 0x0B: 446 case 0x0C: 447 case 0x0d: 448 case 0x20: 449 case 0xA0: 450 case 0x2028: 451 case 0x2029: 452 case 0xFEFF: 453 return true; 454 default: 455 return unibrow::Space::Is(c); 456 } 457 } 458 459 460 static bool NotWhiteSpace(uc16 c) { 461 return !IsWhiteSpace(c); 462 } 463 464 465 static bool NotWord(uc16 c) { 466 return !IsRegExpWord(c); 467 } 468 469 470 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { 471 ZoneScope scope(Isolate::Current(), DELETE_ON_EXIT); 472 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2); 473 CharacterRange::AddClassEscape(c, ranges); 474 for (unsigned i = 0; i < (1 << 16); i++) { 475 bool in_class = false; 476 for (int j = 0; !in_class && j < ranges->length(); j++) { 477 CharacterRange& range = ranges->at(j); 478 in_class = (range.from() <= i && i <= range.to()); 479 } 480 CHECK_EQ(pred(i), in_class); 481 } 482 } 483 484 485 TEST(CharacterClassEscapes) { 486 v8::internal::V8::Initialize(NULL); 487 TestCharacterClassEscapes('.', IsRegExpNewline); 488 TestCharacterClassEscapes('d', IsDigit); 489 TestCharacterClassEscapes('D', NotDigit); 490 TestCharacterClassEscapes('s', IsWhiteSpace); 491 TestCharacterClassEscapes('S', NotWhiteSpace); 492 TestCharacterClassEscapes('w', IsRegExpWord); 493 TestCharacterClassEscapes('W', NotWord); 494 } 495 496 497 static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) { 498 V8::Initialize(NULL); 499 Isolate* isolate = Isolate::Current(); 500 FlatStringReader reader(isolate, CStrVector(input)); 501 RegExpCompileData compile_data; 502 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline, 503 &compile_data)) 504 return NULL; 505 Handle<String> pattern = isolate->factory()-> 506 NewStringFromUtf8(CStrVector(input)); 507 RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii); 508 return compile_data.node; 509 } 510 511 512 static void Execute(const char* input, 513 bool multiline, 514 bool is_ascii, 515 bool dot_output = false) { 516 v8::HandleScope scope; 517 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 518 RegExpNode* node = Compile(input, multiline, is_ascii); 519 USE(node); 520 #ifdef DEBUG 521 if (dot_output) { 522 RegExpEngine::DotPrint(input, node, false); 523 exit(0); 524 } 525 #endif // DEBUG 526 } 527 528 529 class TestConfig { 530 public: 531 typedef int Key; 532 typedef int Value; 533 static const int kNoKey; 534 static int NoValue() { return 0; } 535 static inline int Compare(int a, int b) { 536 if (a < b) 537 return -1; 538 else if (a > b) 539 return 1; 540 else 541 return 0; 542 } 543 }; 544 545 546 const int TestConfig::kNoKey = 0; 547 548 549 static unsigned PseudoRandom(int i, int j) { 550 return ~(~((i * 781) ^ (j * 329))); 551 } 552 553 554 TEST(SplayTreeSimple) { 555 v8::internal::V8::Initialize(NULL); 556 static const unsigned kLimit = 1000; 557 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 558 ZoneSplayTree<TestConfig> tree; 559 bool seen[kLimit]; 560 for (unsigned i = 0; i < kLimit; i++) seen[i] = false; 561 #define CHECK_MAPS_EQUAL() do { \ 562 for (unsigned k = 0; k < kLimit; k++) \ 563 CHECK_EQ(seen[k], tree.Find(k, &loc)); \ 564 } while (false) 565 for (int i = 0; i < 50; i++) { 566 for (int j = 0; j < 50; j++) { 567 unsigned next = PseudoRandom(i, j) % kLimit; 568 if (seen[next]) { 569 // We've already seen this one. Check the value and remove 570 // it. 571 ZoneSplayTree<TestConfig>::Locator loc; 572 CHECK(tree.Find(next, &loc)); 573 CHECK_EQ(next, loc.key()); 574 CHECK_EQ(3 * next, loc.value()); 575 tree.Remove(next); 576 seen[next] = false; 577 CHECK_MAPS_EQUAL(); 578 } else { 579 // Check that it wasn't there already and then add it. 580 ZoneSplayTree<TestConfig>::Locator loc; 581 CHECK(!tree.Find(next, &loc)); 582 CHECK(tree.Insert(next, &loc)); 583 CHECK_EQ(next, loc.key()); 584 loc.set_value(3 * next); 585 seen[next] = true; 586 CHECK_MAPS_EQUAL(); 587 } 588 int val = PseudoRandom(j, i) % kLimit; 589 if (seen[val]) { 590 ZoneSplayTree<TestConfig>::Locator loc; 591 CHECK(tree.FindGreatestLessThan(val, &loc)); 592 CHECK_EQ(loc.key(), val); 593 break; 594 } 595 val = PseudoRandom(i + j, i - j) % kLimit; 596 if (seen[val]) { 597 ZoneSplayTree<TestConfig>::Locator loc; 598 CHECK(tree.FindLeastGreaterThan(val, &loc)); 599 CHECK_EQ(loc.key(), val); 600 break; 601 } 602 } 603 } 604 } 605 606 607 TEST(DispatchTableConstruction) { 608 v8::internal::V8::Initialize(NULL); 609 // Initialize test data. 610 static const int kLimit = 1000; 611 static const int kRangeCount = 8; 612 static const int kRangeSize = 16; 613 uc16 ranges[kRangeCount][2 * kRangeSize]; 614 for (int i = 0; i < kRangeCount; i++) { 615 Vector<uc16> range(ranges[i], 2 * kRangeSize); 616 for (int j = 0; j < 2 * kRangeSize; j++) { 617 range[j] = PseudoRandom(i + 25, j + 87) % kLimit; 618 } 619 range.Sort(); 620 for (int j = 1; j < 2 * kRangeSize; j++) { 621 CHECK(range[j-1] <= range[j]); 622 } 623 } 624 // Enter test data into dispatch table. 625 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 626 DispatchTable table; 627 for (int i = 0; i < kRangeCount; i++) { 628 uc16* range = ranges[i]; 629 for (int j = 0; j < 2 * kRangeSize; j += 2) 630 table.AddRange(CharacterRange(range[j], range[j + 1]), i); 631 } 632 // Check that the table looks as we would expect 633 for (int p = 0; p < kLimit; p++) { 634 OutSet* outs = table.Get(p); 635 for (int j = 0; j < kRangeCount; j++) { 636 uc16* range = ranges[j]; 637 bool is_on = false; 638 for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2) 639 is_on = (range[k] <= p && p <= range[k + 1]); 640 CHECK_EQ(is_on, outs->Get(j)); 641 } 642 } 643 } 644 645 // Test of debug-only syntax. 646 #ifdef DEBUG 647 648 TEST(ParsePossessiveRepetition) { 649 bool old_flag_value = FLAG_regexp_possessive_quantifier; 650 651 // Enable possessive quantifier syntax. 652 FLAG_regexp_possessive_quantifier = true; 653 654 CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')"); 655 CHECK_PARSE_EQ("a++", "(# 1 - p 'a')"); 656 CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')"); 657 CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')"); 658 CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')"); 659 660 // Disable possessive quantifier syntax. 661 FLAG_regexp_possessive_quantifier = false; 662 663 CHECK_PARSE_ERROR("a*+"); 664 CHECK_PARSE_ERROR("a++"); 665 CHECK_PARSE_ERROR("a?+"); 666 CHECK_PARSE_ERROR("a{10,20}+"); 667 CHECK_PARSE_ERROR("a{10,20}+b"); 668 669 FLAG_regexp_possessive_quantifier = old_flag_value; 670 } 671 672 #endif 673 674 // Tests of interpreter. 675 676 677 #ifndef V8_INTERPRETED_REGEXP 678 679 #if V8_TARGET_ARCH_IA32 680 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler; 681 #elif V8_TARGET_ARCH_X64 682 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler; 683 #elif V8_TARGET_ARCH_ARM 684 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler; 685 #elif V8_TARGET_ARCH_MIPS 686 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler; 687 #endif 688 689 class ContextInitializer { 690 public: 691 ContextInitializer() 692 : env_(), scope_(), zone_(Isolate::Current(), DELETE_ON_EXIT) { 693 env_ = v8::Context::New(); 694 env_->Enter(); 695 } 696 ~ContextInitializer() { 697 env_->Exit(); 698 env_.Dispose(); 699 } 700 private: 701 v8::Persistent<v8::Context> env_; 702 v8::HandleScope scope_; 703 v8::internal::ZoneScope zone_; 704 }; 705 706 707 static ArchRegExpMacroAssembler::Result Execute(Code* code, 708 String* input, 709 int start_offset, 710 const byte* input_start, 711 const byte* input_end, 712 int* captures) { 713 return NativeRegExpMacroAssembler::Execute( 714 code, 715 input, 716 start_offset, 717 input_start, 718 input_end, 719 captures, 720 Isolate::Current()); 721 } 722 723 724 TEST(MacroAssemblerNativeSuccess) { 725 v8::V8::Initialize(); 726 ContextInitializer initializer; 727 Factory* factory = Isolate::Current()->factory(); 728 729 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 730 731 m.Succeed(); 732 733 Handle<String> source = factory->NewStringFromAscii(CStrVector("")); 734 Handle<Object> code_object = m.GetCode(source); 735 Handle<Code> code = Handle<Code>::cast(code_object); 736 737 int captures[4] = {42, 37, 87, 117}; 738 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 739 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 740 const byte* start_adr = 741 reinterpret_cast<const byte*>(seq_input->GetCharsAddress()); 742 743 NativeRegExpMacroAssembler::Result result = 744 Execute(*code, 745 *input, 746 0, 747 start_adr, 748 start_adr + seq_input->length(), 749 captures); 750 751 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 752 CHECK_EQ(-1, captures[0]); 753 CHECK_EQ(-1, captures[1]); 754 CHECK_EQ(-1, captures[2]); 755 CHECK_EQ(-1, captures[3]); 756 } 757 758 759 TEST(MacroAssemblerNativeSimple) { 760 v8::V8::Initialize(); 761 ContextInitializer initializer; 762 Factory* factory = Isolate::Current()->factory(); 763 764 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 765 766 uc16 foo_chars[3] = {'f', 'o', 'o'}; 767 Vector<const uc16> foo(foo_chars, 3); 768 769 Label fail; 770 m.CheckCharacters(foo, 0, &fail, true); 771 m.WriteCurrentPositionToRegister(0, 0); 772 m.AdvanceCurrentPosition(3); 773 m.WriteCurrentPositionToRegister(1, 0); 774 m.Succeed(); 775 m.Bind(&fail); 776 m.Fail(); 777 778 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 779 Handle<Object> code_object = m.GetCode(source); 780 Handle<Code> code = Handle<Code>::cast(code_object); 781 782 int captures[4] = {42, 37, 87, 117}; 783 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 784 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 785 Address start_adr = seq_input->GetCharsAddress(); 786 787 NativeRegExpMacroAssembler::Result result = 788 Execute(*code, 789 *input, 790 0, 791 start_adr, 792 start_adr + input->length(), 793 captures); 794 795 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 796 CHECK_EQ(0, captures[0]); 797 CHECK_EQ(3, captures[1]); 798 CHECK_EQ(-1, captures[2]); 799 CHECK_EQ(-1, captures[3]); 800 801 input = factory->NewStringFromAscii(CStrVector("barbarbar")); 802 seq_input = Handle<SeqAsciiString>::cast(input); 803 start_adr = seq_input->GetCharsAddress(); 804 805 result = Execute(*code, 806 *input, 807 0, 808 start_adr, 809 start_adr + input->length(), 810 captures); 811 812 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 813 } 814 815 816 TEST(MacroAssemblerNativeSimpleUC16) { 817 v8::V8::Initialize(); 818 ContextInitializer initializer; 819 Factory* factory = Isolate::Current()->factory(); 820 821 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 822 823 uc16 foo_chars[3] = {'f', 'o', 'o'}; 824 Vector<const uc16> foo(foo_chars, 3); 825 826 Label fail; 827 m.CheckCharacters(foo, 0, &fail, true); 828 m.WriteCurrentPositionToRegister(0, 0); 829 m.AdvanceCurrentPosition(3); 830 m.WriteCurrentPositionToRegister(1, 0); 831 m.Succeed(); 832 m.Bind(&fail); 833 m.Fail(); 834 835 Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo")); 836 Handle<Object> code_object = m.GetCode(source); 837 Handle<Code> code = Handle<Code>::cast(code_object); 838 839 int captures[4] = {42, 37, 87, 117}; 840 const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', 841 static_cast<uc16>('\xa0')}; 842 Handle<String> input = 843 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 844 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 845 Address start_adr = seq_input->GetCharsAddress(); 846 847 NativeRegExpMacroAssembler::Result result = 848 Execute(*code, 849 *input, 850 0, 851 start_adr, 852 start_adr + input->length(), 853 captures); 854 855 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 856 CHECK_EQ(0, captures[0]); 857 CHECK_EQ(3, captures[1]); 858 CHECK_EQ(-1, captures[2]); 859 CHECK_EQ(-1, captures[3]); 860 861 const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', 862 static_cast<uc16>('\xa0')}; 863 input = factory->NewStringFromTwoByte(Vector<const uc16>(input_data2, 9)); 864 seq_input = Handle<SeqTwoByteString>::cast(input); 865 start_adr = seq_input->GetCharsAddress(); 866 867 result = Execute(*code, 868 *input, 869 0, 870 start_adr, 871 start_adr + input->length() * 2, 872 captures); 873 874 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 875 } 876 877 878 TEST(MacroAssemblerNativeBacktrack) { 879 v8::V8::Initialize(); 880 ContextInitializer initializer; 881 Factory* factory = Isolate::Current()->factory(); 882 883 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 884 885 Label fail; 886 Label backtrack; 887 m.LoadCurrentCharacter(10, &fail); 888 m.Succeed(); 889 m.Bind(&fail); 890 m.PushBacktrack(&backtrack); 891 m.LoadCurrentCharacter(10, NULL); 892 m.Succeed(); 893 m.Bind(&backtrack); 894 m.Fail(); 895 896 Handle<String> source = factory->NewStringFromAscii(CStrVector("..........")); 897 Handle<Object> code_object = m.GetCode(source); 898 Handle<Code> code = Handle<Code>::cast(code_object); 899 900 Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo")); 901 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 902 Address start_adr = seq_input->GetCharsAddress(); 903 904 NativeRegExpMacroAssembler::Result result = 905 Execute(*code, 906 *input, 907 0, 908 start_adr, 909 start_adr + input->length(), 910 NULL); 911 912 CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result); 913 } 914 915 916 TEST(MacroAssemblerNativeBackReferenceASCII) { 917 v8::V8::Initialize(); 918 ContextInitializer initializer; 919 Factory* factory = Isolate::Current()->factory(); 920 921 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 922 923 m.WriteCurrentPositionToRegister(0, 0); 924 m.AdvanceCurrentPosition(2); 925 m.WriteCurrentPositionToRegister(1, 0); 926 Label nomatch; 927 m.CheckNotBackReference(0, &nomatch); 928 m.Fail(); 929 m.Bind(&nomatch); 930 m.AdvanceCurrentPosition(2); 931 Label missing_match; 932 m.CheckNotBackReference(0, &missing_match); 933 m.WriteCurrentPositionToRegister(2, 0); 934 m.Succeed(); 935 m.Bind(&missing_match); 936 m.Fail(); 937 938 Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1")); 939 Handle<Object> code_object = m.GetCode(source); 940 Handle<Code> code = Handle<Code>::cast(code_object); 941 942 Handle<String> input = factory->NewStringFromAscii(CStrVector("fooofo")); 943 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 944 Address start_adr = seq_input->GetCharsAddress(); 945 946 int output[4]; 947 NativeRegExpMacroAssembler::Result result = 948 Execute(*code, 949 *input, 950 0, 951 start_adr, 952 start_adr + input->length(), 953 output); 954 955 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 956 CHECK_EQ(0, output[0]); 957 CHECK_EQ(2, output[1]); 958 CHECK_EQ(6, output[2]); 959 CHECK_EQ(-1, output[3]); 960 } 961 962 963 TEST(MacroAssemblerNativeBackReferenceUC16) { 964 v8::V8::Initialize(); 965 ContextInitializer initializer; 966 Factory* factory = Isolate::Current()->factory(); 967 968 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4); 969 970 m.WriteCurrentPositionToRegister(0, 0); 971 m.AdvanceCurrentPosition(2); 972 m.WriteCurrentPositionToRegister(1, 0); 973 Label nomatch; 974 m.CheckNotBackReference(0, &nomatch); 975 m.Fail(); 976 m.Bind(&nomatch); 977 m.AdvanceCurrentPosition(2); 978 Label missing_match; 979 m.CheckNotBackReference(0, &missing_match); 980 m.WriteCurrentPositionToRegister(2, 0); 981 m.Succeed(); 982 m.Bind(&missing_match); 983 m.Fail(); 984 985 Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1")); 986 Handle<Object> code_object = m.GetCode(source); 987 Handle<Code> code = Handle<Code>::cast(code_object); 988 989 const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028}; 990 Handle<String> input = 991 factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6)); 992 Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input); 993 Address start_adr = seq_input->GetCharsAddress(); 994 995 int output[4]; 996 NativeRegExpMacroAssembler::Result result = 997 Execute(*code, 998 *input, 999 0, 1000 start_adr, 1001 start_adr + input->length() * 2, 1002 output); 1003 1004 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1005 CHECK_EQ(0, output[0]); 1006 CHECK_EQ(2, output[1]); 1007 CHECK_EQ(6, output[2]); 1008 CHECK_EQ(-1, output[3]); 1009 } 1010 1011 1012 1013 TEST(MacroAssemblernativeAtStart) { 1014 v8::V8::Initialize(); 1015 ContextInitializer initializer; 1016 Factory* factory = Isolate::Current()->factory(); 1017 1018 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 1019 1020 Label not_at_start, newline, fail; 1021 m.CheckNotAtStart(¬_at_start); 1022 // Check that prevchar = '\n' and current = 'f'. 1023 m.CheckCharacter('\n', &newline); 1024 m.Bind(&fail); 1025 m.Fail(); 1026 m.Bind(&newline); 1027 m.LoadCurrentCharacter(0, &fail); 1028 m.CheckNotCharacter('f', &fail); 1029 m.Succeed(); 1030 1031 m.Bind(¬_at_start); 1032 // Check that prevchar = 'o' and current = 'b'. 1033 Label prevo; 1034 m.CheckCharacter('o', &prevo); 1035 m.Fail(); 1036 m.Bind(&prevo); 1037 m.LoadCurrentCharacter(0, &fail); 1038 m.CheckNotCharacter('b', &fail); 1039 m.Succeed(); 1040 1041 Handle<String> source = factory->NewStringFromAscii(CStrVector("(^f|ob)")); 1042 Handle<Object> code_object = m.GetCode(source); 1043 Handle<Code> code = Handle<Code>::cast(code_object); 1044 1045 Handle<String> input = factory->NewStringFromAscii(CStrVector("foobar")); 1046 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1047 Address start_adr = seq_input->GetCharsAddress(); 1048 1049 NativeRegExpMacroAssembler::Result result = 1050 Execute(*code, 1051 *input, 1052 0, 1053 start_adr, 1054 start_adr + input->length(), 1055 NULL); 1056 1057 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1058 1059 result = Execute(*code, 1060 *input, 1061 3, 1062 start_adr + 3, 1063 start_adr + input->length(), 1064 NULL); 1065 1066 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1067 } 1068 1069 1070 TEST(MacroAssemblerNativeBackRefNoCase) { 1071 v8::V8::Initialize(); 1072 ContextInitializer initializer; 1073 Factory* factory = Isolate::Current()->factory(); 1074 1075 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4); 1076 1077 Label fail, succ; 1078 1079 m.WriteCurrentPositionToRegister(0, 0); 1080 m.WriteCurrentPositionToRegister(2, 0); 1081 m.AdvanceCurrentPosition(3); 1082 m.WriteCurrentPositionToRegister(3, 0); 1083 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "AbC". 1084 m.CheckNotBackReferenceIgnoreCase(2, &fail); // Match "ABC". 1085 Label expected_fail; 1086 m.CheckNotBackReferenceIgnoreCase(2, &expected_fail); 1087 m.Bind(&fail); 1088 m.Fail(); 1089 1090 m.Bind(&expected_fail); 1091 m.AdvanceCurrentPosition(3); // Skip "xYz" 1092 m.CheckNotBackReferenceIgnoreCase(2, &succ); 1093 m.Fail(); 1094 1095 m.Bind(&succ); 1096 m.WriteCurrentPositionToRegister(1, 0); 1097 m.Succeed(); 1098 1099 Handle<String> source = 1100 factory->NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)")); 1101 Handle<Object> code_object = m.GetCode(source); 1102 Handle<Code> code = Handle<Code>::cast(code_object); 1103 1104 Handle<String> input = 1105 factory->NewStringFromAscii(CStrVector("aBcAbCABCxYzab")); 1106 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1107 Address start_adr = seq_input->GetCharsAddress(); 1108 1109 int output[4]; 1110 NativeRegExpMacroAssembler::Result result = 1111 Execute(*code, 1112 *input, 1113 0, 1114 start_adr, 1115 start_adr + input->length(), 1116 output); 1117 1118 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1119 CHECK_EQ(0, output[0]); 1120 CHECK_EQ(12, output[1]); 1121 CHECK_EQ(0, output[2]); 1122 CHECK_EQ(3, output[3]); 1123 } 1124 1125 1126 1127 TEST(MacroAssemblerNativeRegisters) { 1128 v8::V8::Initialize(); 1129 ContextInitializer initializer; 1130 Factory* factory = Isolate::Current()->factory(); 1131 1132 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6); 1133 1134 uc16 foo_chars[3] = {'f', 'o', 'o'}; 1135 Vector<const uc16> foo(foo_chars, 3); 1136 1137 enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt }; 1138 Label fail; 1139 Label backtrack; 1140 m.WriteCurrentPositionToRegister(out1, 0); // Output: [0] 1141 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1142 m.PushBacktrack(&backtrack); 1143 m.WriteStackPointerToRegister(sp); 1144 // Fill stack and registers 1145 m.AdvanceCurrentPosition(2); 1146 m.WriteCurrentPositionToRegister(out1, 0); 1147 m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck); 1148 m.PushBacktrack(&fail); 1149 // Drop backtrack stack frames. 1150 m.ReadStackPointerFromRegister(sp); 1151 // And take the first backtrack (to &backtrack) 1152 m.Backtrack(); 1153 1154 m.PushCurrentPosition(); 1155 m.AdvanceCurrentPosition(2); 1156 m.PopCurrentPosition(); 1157 1158 m.Bind(&backtrack); 1159 m.PopRegister(out1); 1160 m.ReadCurrentPositionFromRegister(out1); 1161 m.AdvanceCurrentPosition(3); 1162 m.WriteCurrentPositionToRegister(out2, 0); // [0,3] 1163 1164 Label loop; 1165 m.SetRegister(loop_cnt, 0); // loop counter 1166 m.Bind(&loop); 1167 m.AdvanceRegister(loop_cnt, 1); 1168 m.AdvanceCurrentPosition(1); 1169 m.IfRegisterLT(loop_cnt, 3, &loop); 1170 m.WriteCurrentPositionToRegister(out3, 0); // [0,3,6] 1171 1172 Label loop2; 1173 m.SetRegister(loop_cnt, 2); // loop counter 1174 m.Bind(&loop2); 1175 m.AdvanceRegister(loop_cnt, -1); 1176 m.AdvanceCurrentPosition(1); 1177 m.IfRegisterGE(loop_cnt, 0, &loop2); 1178 m.WriteCurrentPositionToRegister(out4, 0); // [0,3,6,9] 1179 1180 Label loop3; 1181 Label exit_loop3; 1182 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1183 m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck); 1184 m.ReadCurrentPositionFromRegister(out3); 1185 m.Bind(&loop3); 1186 m.AdvanceCurrentPosition(1); 1187 m.CheckGreedyLoop(&exit_loop3); 1188 m.GoTo(&loop3); 1189 m.Bind(&exit_loop3); 1190 m.PopCurrentPosition(); 1191 m.WriteCurrentPositionToRegister(out5, 0); // [0,3,6,9,9,-1] 1192 1193 m.Succeed(); 1194 1195 m.Bind(&fail); 1196 m.Fail(); 1197 1198 Handle<String> source = 1199 factory->NewStringFromAscii(CStrVector("<loop test>")); 1200 Handle<Object> code_object = m.GetCode(source); 1201 Handle<Code> code = Handle<Code>::cast(code_object); 1202 1203 // String long enough for test (content doesn't matter). 1204 Handle<String> input = 1205 factory->NewStringFromAscii(CStrVector("foofoofoofoofoo")); 1206 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1207 Address start_adr = seq_input->GetCharsAddress(); 1208 1209 int output[6]; 1210 NativeRegExpMacroAssembler::Result result = 1211 Execute(*code, 1212 *input, 1213 0, 1214 start_adr, 1215 start_adr + input->length(), 1216 output); 1217 1218 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1219 CHECK_EQ(0, output[0]); 1220 CHECK_EQ(3, output[1]); 1221 CHECK_EQ(6, output[2]); 1222 CHECK_EQ(9, output[3]); 1223 CHECK_EQ(9, output[4]); 1224 CHECK_EQ(-1, output[5]); 1225 } 1226 1227 1228 TEST(MacroAssemblerStackOverflow) { 1229 v8::V8::Initialize(); 1230 ContextInitializer initializer; 1231 Isolate* isolate = Isolate::Current(); 1232 Factory* factory = isolate->factory(); 1233 1234 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0); 1235 1236 Label loop; 1237 m.Bind(&loop); 1238 m.PushBacktrack(&loop); 1239 m.GoTo(&loop); 1240 1241 Handle<String> source = 1242 factory->NewStringFromAscii(CStrVector("<stack overflow test>")); 1243 Handle<Object> code_object = m.GetCode(source); 1244 Handle<Code> code = Handle<Code>::cast(code_object); 1245 1246 // String long enough for test (content doesn't matter). 1247 Handle<String> input = 1248 factory->NewStringFromAscii(CStrVector("dummy")); 1249 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1250 Address start_adr = seq_input->GetCharsAddress(); 1251 1252 NativeRegExpMacroAssembler::Result result = 1253 Execute(*code, 1254 *input, 1255 0, 1256 start_adr, 1257 start_adr + input->length(), 1258 NULL); 1259 1260 CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result); 1261 CHECK(isolate->has_pending_exception()); 1262 isolate->clear_pending_exception(); 1263 } 1264 1265 1266 TEST(MacroAssemblerNativeLotsOfRegisters) { 1267 v8::V8::Initialize(); 1268 ContextInitializer initializer; 1269 Isolate* isolate = Isolate::Current(); 1270 Factory* factory = isolate->factory(); 1271 1272 ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2); 1273 1274 // At least 2048, to ensure the allocated space for registers 1275 // span one full page. 1276 const int large_number = 8000; 1277 m.WriteCurrentPositionToRegister(large_number, 42); 1278 m.WriteCurrentPositionToRegister(0, 0); 1279 m.WriteCurrentPositionToRegister(1, 1); 1280 Label done; 1281 m.CheckNotBackReference(0, &done); // Performs a system-stack push. 1282 m.Bind(&done); 1283 m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck); 1284 m.PopRegister(1); 1285 m.Succeed(); 1286 1287 Handle<String> source = 1288 factory->NewStringFromAscii(CStrVector("<huge register space test>")); 1289 Handle<Object> code_object = m.GetCode(source); 1290 Handle<Code> code = Handle<Code>::cast(code_object); 1291 1292 // String long enough for test (content doesn't matter). 1293 Handle<String> input = 1294 factory->NewStringFromAscii(CStrVector("sample text")); 1295 Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input); 1296 Address start_adr = seq_input->GetCharsAddress(); 1297 1298 int captures[2]; 1299 NativeRegExpMacroAssembler::Result result = 1300 Execute(*code, 1301 *input, 1302 0, 1303 start_adr, 1304 start_adr + input->length(), 1305 captures); 1306 1307 CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result); 1308 CHECK_EQ(0, captures[0]); 1309 CHECK_EQ(42, captures[1]); 1310 1311 isolate->clear_pending_exception(); 1312 } 1313 1314 #else // V8_INTERPRETED_REGEXP 1315 1316 TEST(MacroAssembler) { 1317 V8::Initialize(NULL); 1318 byte codes[1024]; 1319 RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024)); 1320 // ^f(o)o. 1321 Label fail, fail2, start; 1322 uc16 foo_chars[3]; 1323 foo_chars[0] = 'f'; 1324 foo_chars[1] = 'o'; 1325 foo_chars[2] = 'o'; 1326 Vector<const uc16> foo(foo_chars, 3); 1327 m.SetRegister(4, 42); 1328 m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck); 1329 m.AdvanceRegister(4, 42); 1330 m.GoTo(&start); 1331 m.Fail(); 1332 m.Bind(&start); 1333 m.PushBacktrack(&fail2); 1334 m.CheckCharacters(foo, 0, &fail, true); 1335 m.WriteCurrentPositionToRegister(0, 0); 1336 m.PushCurrentPosition(); 1337 m.AdvanceCurrentPosition(3); 1338 m.WriteCurrentPositionToRegister(1, 0); 1339 m.PopCurrentPosition(); 1340 m.AdvanceCurrentPosition(1); 1341 m.WriteCurrentPositionToRegister(2, 0); 1342 m.AdvanceCurrentPosition(1); 1343 m.WriteCurrentPositionToRegister(3, 0); 1344 m.Succeed(); 1345 1346 m.Bind(&fail); 1347 m.Backtrack(); 1348 m.Succeed(); 1349 1350 m.Bind(&fail2); 1351 m.PopRegister(0); 1352 m.Fail(); 1353 1354 Isolate* isolate = Isolate::Current(); 1355 Factory* factory = isolate->factory(); 1356 HandleScope scope(isolate); 1357 1358 Handle<String> source = factory->NewStringFromAscii(CStrVector("^f(o)o")); 1359 Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source)); 1360 int captures[5]; 1361 1362 const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'}; 1363 Handle<String> f1_16 = 1364 factory->NewStringFromTwoByte(Vector<const uc16>(str1, 6)); 1365 1366 CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0)); 1367 CHECK_EQ(0, captures[0]); 1368 CHECK_EQ(3, captures[1]); 1369 CHECK_EQ(1, captures[2]); 1370 CHECK_EQ(2, captures[3]); 1371 CHECK_EQ(84, captures[4]); 1372 1373 const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'}; 1374 Handle<String> f2_16 = 1375 factory->NewStringFromTwoByte(Vector<const uc16>(str2, 6)); 1376 1377 CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0)); 1378 CHECK_EQ(42, captures[0]); 1379 } 1380 1381 #endif // V8_INTERPRETED_REGEXP 1382 1383 1384 TEST(AddInverseToTable) { 1385 v8::internal::V8::Initialize(NULL); 1386 static const int kLimit = 1000; 1387 static const int kRangeCount = 16; 1388 for (int t = 0; t < 10; t++) { 1389 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1390 ZoneList<CharacterRange>* ranges = 1391 new ZoneList<CharacterRange>(kRangeCount); 1392 for (int i = 0; i < kRangeCount; i++) { 1393 int from = PseudoRandom(t + 87, i + 25) % kLimit; 1394 int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20)); 1395 if (to > kLimit) to = kLimit; 1396 ranges->Add(CharacterRange(from, to)); 1397 } 1398 DispatchTable table; 1399 DispatchTableConstructor cons(&table, false); 1400 cons.set_choice_index(0); 1401 cons.AddInverse(ranges); 1402 for (int i = 0; i < kLimit; i++) { 1403 bool is_on = false; 1404 for (int j = 0; !is_on && j < kRangeCount; j++) 1405 is_on = ranges->at(j).Contains(i); 1406 OutSet* set = table.Get(i); 1407 CHECK_EQ(is_on, set->Get(0) == false); 1408 } 1409 } 1410 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1411 ZoneList<CharacterRange>* ranges = 1412 new ZoneList<CharacterRange>(1); 1413 ranges->Add(CharacterRange(0xFFF0, 0xFFFE)); 1414 DispatchTable table; 1415 DispatchTableConstructor cons(&table, false); 1416 cons.set_choice_index(0); 1417 cons.AddInverse(ranges); 1418 CHECK(!table.Get(0xFFFE)->Get(0)); 1419 CHECK(table.Get(0xFFFF)->Get(0)); 1420 } 1421 1422 1423 static uc32 canonicalize(uc32 c) { 1424 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth]; 1425 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL); 1426 if (count == 0) { 1427 return c; 1428 } else { 1429 CHECK_EQ(1, count); 1430 return canon[0]; 1431 } 1432 } 1433 1434 1435 TEST(LatinCanonicalize) { 1436 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1437 for (char lower = 'a'; lower <= 'z'; lower++) { 1438 char upper = lower + ('A' - 'a'); 1439 CHECK_EQ(canonicalize(lower), canonicalize(upper)); 1440 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1441 int length = un_canonicalize.get(lower, '\0', uncanon); 1442 CHECK_EQ(2, length); 1443 CHECK_EQ(upper, uncanon[0]); 1444 CHECK_EQ(lower, uncanon[1]); 1445 } 1446 for (uc32 c = 128; c < (1 << 21); c++) 1447 CHECK_GE(canonicalize(c), 128); 1448 unibrow::Mapping<unibrow::ToUppercase> to_upper; 1449 // Canonicalization is only defined for the Basic Multilingual Plane. 1450 for (uc32 c = 0; c < (1 << 16); c++) { 1451 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; 1452 int length = to_upper.get(c, '\0', upper); 1453 if (length == 0) { 1454 length = 1; 1455 upper[0] = c; 1456 } 1457 uc32 u = upper[0]; 1458 if (length > 1 || (c >= 128 && u < 128)) 1459 u = c; 1460 CHECK_EQ(u, canonicalize(c)); 1461 } 1462 } 1463 1464 1465 static uc32 CanonRangeEnd(uc32 c) { 1466 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; 1467 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); 1468 if (count == 0) { 1469 return c; 1470 } else { 1471 CHECK_EQ(1, count); 1472 return canon[0]; 1473 } 1474 } 1475 1476 1477 TEST(RangeCanonicalization) { 1478 // Check that we arrive at the same result when using the basic 1479 // range canonicalization primitives as when using immediate 1480 // canonicalization. 1481 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1482 int block_start = 0; 1483 while (block_start <= 0xFFFF) { 1484 uc32 block_end = CanonRangeEnd(block_start); 1485 unsigned block_length = block_end - block_start + 1; 1486 if (block_length > 1) { 1487 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1488 int first_length = un_canonicalize.get(block_start, '\0', first); 1489 for (unsigned i = 1; i < block_length; i++) { 1490 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1491 int succ_length = un_canonicalize.get(block_start + i, '\0', succ); 1492 CHECK_EQ(first_length, succ_length); 1493 for (int j = 0; j < succ_length; j++) { 1494 int calc = first[j] + i; 1495 int found = succ[j]; 1496 CHECK_EQ(calc, found); 1497 } 1498 } 1499 } 1500 block_start = block_start + block_length; 1501 } 1502 } 1503 1504 1505 TEST(UncanonicalizeEquivalence) { 1506 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; 1507 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1508 for (int i = 0; i < (1 << 16); i++) { 1509 int length = un_canonicalize.get(i, '\0', chars); 1510 for (int j = 0; j < length; j++) { 1511 unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1512 int length2 = un_canonicalize.get(chars[j], '\0', chars2); 1513 CHECK_EQ(length, length2); 1514 for (int k = 0; k < length; k++) 1515 CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k])); 1516 } 1517 } 1518 } 1519 1520 1521 static void TestRangeCaseIndependence(CharacterRange input, 1522 Vector<CharacterRange> expected) { 1523 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1524 int count = expected.length(); 1525 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count); 1526 input.AddCaseEquivalents(list, false); 1527 CHECK_EQ(count, list->length()); 1528 for (int i = 0; i < list->length(); i++) { 1529 CHECK_EQ(expected[i].from(), list->at(i).from()); 1530 CHECK_EQ(expected[i].to(), list->at(i).to()); 1531 } 1532 } 1533 1534 1535 static void TestSimpleRangeCaseIndependence(CharacterRange input, 1536 CharacterRange expected) { 1537 EmbeddedVector<CharacterRange, 1> vector; 1538 vector[0] = expected; 1539 TestRangeCaseIndependence(input, vector); 1540 } 1541 1542 1543 TEST(CharacterRangeCaseIndependence) { 1544 v8::internal::V8::Initialize(NULL); 1545 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'), 1546 CharacterRange::Singleton('A')); 1547 TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'), 1548 CharacterRange::Singleton('Z')); 1549 TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'), 1550 CharacterRange('A', 'Z')); 1551 TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'), 1552 CharacterRange('C', 'F')); 1553 TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'), 1554 CharacterRange('A', 'B')); 1555 TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'), 1556 CharacterRange('Y', 'Z')); 1557 TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1), 1558 CharacterRange('A', 'Z')); 1559 TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'), 1560 CharacterRange('a', 'z')); 1561 TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'), 1562 CharacterRange('c', 'f')); 1563 TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1), 1564 CharacterRange('a', 'z')); 1565 // Here we need to add [l-z] to complete the case independence of 1566 // [A-Za-z] but we expect [a-z] to be added since we always add a 1567 // whole block at a time. 1568 TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'), 1569 CharacterRange('a', 'z')); 1570 } 1571 1572 1573 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { 1574 if (ranges == NULL) 1575 return false; 1576 for (int i = 0; i < ranges->length(); i++) { 1577 CharacterRange range = ranges->at(i); 1578 if (range.from() <= c && c <= range.to()) 1579 return true; 1580 } 1581 return false; 1582 } 1583 1584 1585 TEST(CharClassDifference) { 1586 v8::internal::V8::Initialize(NULL); 1587 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1588 ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1); 1589 base->Add(CharacterRange::Everything()); 1590 Vector<const uc16> overlay = CharacterRange::GetWordBounds(); 1591 ZoneList<CharacterRange>* included = NULL; 1592 ZoneList<CharacterRange>* excluded = NULL; 1593 CharacterRange::Split(base, overlay, &included, &excluded); 1594 for (int i = 0; i < (1 << 16); i++) { 1595 bool in_base = InClass(i, base); 1596 if (in_base) { 1597 bool in_overlay = false; 1598 for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { 1599 if (overlay[j] <= i && i <= overlay[j+1]) 1600 in_overlay = true; 1601 } 1602 CHECK_EQ(in_overlay, InClass(i, included)); 1603 CHECK_EQ(!in_overlay, InClass(i, excluded)); 1604 } else { 1605 CHECK(!InClass(i, included)); 1606 CHECK(!InClass(i, excluded)); 1607 } 1608 } 1609 } 1610 1611 1612 TEST(CanonicalizeCharacterSets) { 1613 v8::internal::V8::Initialize(NULL); 1614 ZoneScope scope(Isolate::Current(), DELETE_ON_EXIT); 1615 ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4); 1616 CharacterSet set(list); 1617 1618 list->Add(CharacterRange(10, 20)); 1619 list->Add(CharacterRange(30, 40)); 1620 list->Add(CharacterRange(50, 60)); 1621 set.Canonicalize(); 1622 ASSERT_EQ(3, list->length()); 1623 ASSERT_EQ(10, list->at(0).from()); 1624 ASSERT_EQ(20, list->at(0).to()); 1625 ASSERT_EQ(30, list->at(1).from()); 1626 ASSERT_EQ(40, list->at(1).to()); 1627 ASSERT_EQ(50, list->at(2).from()); 1628 ASSERT_EQ(60, list->at(2).to()); 1629 1630 list->Rewind(0); 1631 list->Add(CharacterRange(10, 20)); 1632 list->Add(CharacterRange(50, 60)); 1633 list->Add(CharacterRange(30, 40)); 1634 set.Canonicalize(); 1635 ASSERT_EQ(3, list->length()); 1636 ASSERT_EQ(10, list->at(0).from()); 1637 ASSERT_EQ(20, list->at(0).to()); 1638 ASSERT_EQ(30, list->at(1).from()); 1639 ASSERT_EQ(40, list->at(1).to()); 1640 ASSERT_EQ(50, list->at(2).from()); 1641 ASSERT_EQ(60, list->at(2).to()); 1642 1643 list->Rewind(0); 1644 list->Add(CharacterRange(30, 40)); 1645 list->Add(CharacterRange(10, 20)); 1646 list->Add(CharacterRange(25, 25)); 1647 list->Add(CharacterRange(100, 100)); 1648 list->Add(CharacterRange(1, 1)); 1649 set.Canonicalize(); 1650 ASSERT_EQ(5, list->length()); 1651 ASSERT_EQ(1, list->at(0).from()); 1652 ASSERT_EQ(1, list->at(0).to()); 1653 ASSERT_EQ(10, list->at(1).from()); 1654 ASSERT_EQ(20, list->at(1).to()); 1655 ASSERT_EQ(25, list->at(2).from()); 1656 ASSERT_EQ(25, list->at(2).to()); 1657 ASSERT_EQ(30, list->at(3).from()); 1658 ASSERT_EQ(40, list->at(3).to()); 1659 ASSERT_EQ(100, list->at(4).from()); 1660 ASSERT_EQ(100, list->at(4).to()); 1661 1662 list->Rewind(0); 1663 list->Add(CharacterRange(10, 19)); 1664 list->Add(CharacterRange(21, 30)); 1665 list->Add(CharacterRange(20, 20)); 1666 set.Canonicalize(); 1667 ASSERT_EQ(1, list->length()); 1668 ASSERT_EQ(10, list->at(0).from()); 1669 ASSERT_EQ(30, list->at(0).to()); 1670 } 1671 1672 // Checks whether a character is in the set represented by a list of ranges. 1673 static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) { 1674 for (int i = 0; i < set->length(); i++) { 1675 CharacterRange range = set->at(i); 1676 if (range.from() <= value && value <= range.to()) { 1677 return true; 1678 } 1679 } 1680 return false; 1681 } 1682 1683 TEST(CharacterRangeMerge) { 1684 v8::internal::V8::Initialize(NULL); 1685 ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); 1686 ZoneList<CharacterRange> l1(4); 1687 ZoneList<CharacterRange> l2(4); 1688 // Create all combinations of intersections of ranges, both singletons and 1689 // longer. 1690 1691 int offset = 0; 1692 1693 // The five kinds of singleton intersections: 1694 // X 1695 // Y - outside before 1696 // Y - outside touching start 1697 // Y - overlap 1698 // Y - outside touching end 1699 // Y - outside after 1700 1701 for (int i = 0; i < 5; i++) { 1702 l1.Add(CharacterRange::Singleton(offset + 2)); 1703 l2.Add(CharacterRange::Singleton(offset + i)); 1704 offset += 6; 1705 } 1706 1707 // The seven kinds of singleton/non-singleton intersections: 1708 // XXX 1709 // Y - outside before 1710 // Y - outside touching start 1711 // Y - inside touching start 1712 // Y - entirely inside 1713 // Y - inside touching end 1714 // Y - outside touching end 1715 // Y - disjoint after 1716 1717 for (int i = 0; i < 7; i++) { 1718 l1.Add(CharacterRange::Range(offset + 2, offset + 4)); 1719 l2.Add(CharacterRange::Singleton(offset + i)); 1720 offset += 8; 1721 } 1722 1723 // The eleven kinds of non-singleton intersections: 1724 // 1725 // XXXXXXXX 1726 // YYYY - outside before. 1727 // YYYY - outside touching start. 1728 // YYYY - overlapping start 1729 // YYYY - inside touching start 1730 // YYYY - entirely inside 1731 // YYYY - inside touching end 1732 // YYYY - overlapping end 1733 // YYYY - outside touching end 1734 // YYYY - outside after 1735 // YYYYYYYY - identical 1736 // YYYYYYYYYYYY - containing entirely. 1737 1738 for (int i = 0; i < 9; i++) { 1739 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); // Length 8. 1740 l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3)); 1741 offset += 22; 1742 } 1743 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1744 l2.Add(CharacterRange::Range(offset + 6, offset + 15)); 1745 offset += 22; 1746 l1.Add(CharacterRange::Range(offset + 6, offset + 15)); 1747 l2.Add(CharacterRange::Range(offset + 4, offset + 17)); 1748 offset += 22; 1749 1750 // Different kinds of multi-range overlap: 1751 // XXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXX 1752 // YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y YYYY Y 1753 1754 l1.Add(CharacterRange::Range(offset, offset + 21)); 1755 l1.Add(CharacterRange::Range(offset + 31, offset + 52)); 1756 for (int i = 0; i < 6; i++) { 1757 l2.Add(CharacterRange::Range(offset + 2, offset + 5)); 1758 l2.Add(CharacterRange::Singleton(offset + 8)); 1759 offset += 9; 1760 } 1761 1762 ASSERT(CharacterRange::IsCanonical(&l1)); 1763 ASSERT(CharacterRange::IsCanonical(&l2)); 1764 1765 ZoneList<CharacterRange> first_only(4); 1766 ZoneList<CharacterRange> second_only(4); 1767 ZoneList<CharacterRange> both(4); 1768 1769 // Merge one direction. 1770 CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both); 1771 1772 CHECK(CharacterRange::IsCanonical(&first_only)); 1773 CHECK(CharacterRange::IsCanonical(&second_only)); 1774 CHECK(CharacterRange::IsCanonical(&both)); 1775 1776 for (uc16 i = 0; i < offset; i++) { 1777 bool in_first = CharacterInSet(&l1, i); 1778 bool in_second = CharacterInSet(&l2, i); 1779 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1780 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1781 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1782 } 1783 1784 first_only.Clear(); 1785 second_only.Clear(); 1786 both.Clear(); 1787 1788 // Merge other direction. 1789 CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both); 1790 1791 CHECK(CharacterRange::IsCanonical(&first_only)); 1792 CHECK(CharacterRange::IsCanonical(&second_only)); 1793 CHECK(CharacterRange::IsCanonical(&both)); 1794 1795 for (uc16 i = 0; i < offset; i++) { 1796 bool in_first = CharacterInSet(&l1, i); 1797 bool in_second = CharacterInSet(&l2, i); 1798 CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); 1799 CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); 1800 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1801 } 1802 1803 first_only.Clear(); 1804 second_only.Clear(); 1805 both.Clear(); 1806 1807 // Merge but don't record all combinations. 1808 CharacterRange::Merge(&l1, &l2, NULL, NULL, &both); 1809 1810 CHECK(CharacterRange::IsCanonical(&both)); 1811 1812 for (uc16 i = 0; i < offset; i++) { 1813 bool in_first = CharacterInSet(&l1, i); 1814 bool in_second = CharacterInSet(&l2, i); 1815 CHECK((in_first && in_second) == CharacterInSet(&both, i)); 1816 } 1817 1818 // Merge into same set. 1819 ZoneList<CharacterRange> all(4); 1820 CharacterRange::Merge(&l1, &l2, &all, &all, &all); 1821 1822 CHECK(CharacterRange::IsCanonical(&all)); 1823 1824 for (uc16 i = 0; i < offset; i++) { 1825 bool in_first = CharacterInSet(&l1, i); 1826 bool in_second = CharacterInSet(&l2, i); 1827 CHECK((in_first || in_second) == CharacterInSet(&all, i)); 1828 } 1829 } 1830 1831 1832 TEST(Graph) { 1833 V8::Initialize(NULL); 1834 Execute("\\b\\w+\\b", false, true, true); 1835 } 1836