1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_ 7 8 #include "src/objects.h" 9 #include "src/utils.h" 10 #include "src/zone/zone-containers.h" 11 #include "src/zone/zone.h" 12 13 namespace v8 { 14 namespace internal { 15 16 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 17 VISIT(Disjunction) \ 18 VISIT(Alternative) \ 19 VISIT(Assertion) \ 20 VISIT(CharacterClass) \ 21 VISIT(Atom) \ 22 VISIT(Quantifier) \ 23 VISIT(Capture) \ 24 VISIT(Group) \ 25 VISIT(Lookaround) \ 26 VISIT(BackReference) \ 27 VISIT(Empty) \ 28 VISIT(Text) 29 30 #define FORWARD_DECLARE(Name) class RegExp##Name; 31 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 32 #undef FORWARD_DECLARE 33 34 class RegExpCompiler; 35 class RegExpNode; 36 class RegExpTree; 37 38 39 class RegExpVisitor BASE_EMBEDDED { 40 public: 41 virtual ~RegExpVisitor() {} 42 #define MAKE_CASE(Name) \ 43 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 44 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 45 #undef MAKE_CASE 46 }; 47 48 49 // A simple closed interval. 50 class Interval { 51 public: 52 Interval() : from_(kNone), to_(kNone) {} 53 Interval(int from, int to) : from_(from), to_(to) {} 54 Interval Union(Interval that) { 55 if (that.from_ == kNone) 56 return *this; 57 else if (from_ == kNone) 58 return that; 59 else 60 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 61 } 62 bool Contains(int value) { return (from_ <= value) && (value <= to_); } 63 bool is_empty() { return from_ == kNone; } 64 int from() const { return from_; } 65 int to() const { return to_; } 66 static Interval Empty() { return Interval(); } 67 static const int kNone = -1; 68 69 private: 70 int from_; 71 int to_; 72 }; 73 74 75 // Represents code units in the range from from_ to to_, both ends are 76 // inclusive. 77 class CharacterRange { 78 public: 79 CharacterRange() : from_(0), to_(0) {} 80 // For compatibility with the CHECK_OK macro 81 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 82 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, 83 Zone* zone); 84 static Vector<const int> GetWordBounds(); 85 static inline CharacterRange Singleton(uc32 value) { 86 return CharacterRange(value, value); 87 } 88 static inline CharacterRange Range(uc32 from, uc32 to) { 89 DCHECK(0 <= from && to <= String::kMaxCodePoint); 90 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); 91 return CharacterRange(from, to); 92 } 93 static inline CharacterRange Everything() { 94 return CharacterRange(0, String::kMaxCodePoint); 95 } 96 static inline ZoneList<CharacterRange>* List(Zone* zone, 97 CharacterRange range) { 98 ZoneList<CharacterRange>* list = 99 new (zone) ZoneList<CharacterRange>(1, zone); 100 list->Add(range, zone); 101 return list; 102 } 103 bool Contains(uc32 i) { return from_ <= i && i <= to_; } 104 uc32 from() const { return from_; } 105 void set_from(uc32 value) { from_ = value; } 106 uc32 to() const { return to_; } 107 void set_to(uc32 value) { to_ = value; } 108 bool is_valid() { return from_ <= to_; } 109 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 110 bool IsSingleton() { return (from_ == to_); } 111 static void AddCaseEquivalents(Isolate* isolate, Zone* zone, 112 ZoneList<CharacterRange>* ranges, 113 bool is_one_byte); 114 // Whether a range list is in canonical form: Ranges ordered by from value, 115 // and ranges non-overlapping and non-adjacent. 116 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 117 // Convert range list to canonical form. The characters covered by the ranges 118 // will still be the same, but no character is in more than one range, and 119 // adjacent ranges are merged. The resulting list may be shorter than the 120 // original, but cannot be longer. 121 static void Canonicalize(ZoneList<CharacterRange>* ranges); 122 // Negate the contents of a character range in canonical form. 123 static void Negate(ZoneList<CharacterRange>* src, 124 ZoneList<CharacterRange>* dst, Zone* zone); 125 static const int kStartMarker = (1 << 24); 126 static const int kPayloadMask = (1 << 24) - 1; 127 128 private: 129 CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {} 130 131 uc32 from_; 132 uc32 to_; 133 }; 134 135 136 class CharacterSet final BASE_EMBEDDED { 137 public: 138 explicit CharacterSet(uc16 standard_set_type) 139 : ranges_(NULL), standard_set_type_(standard_set_type) {} 140 explicit CharacterSet(ZoneList<CharacterRange>* ranges) 141 : ranges_(ranges), standard_set_type_(0) {} 142 ZoneList<CharacterRange>* ranges(Zone* zone); 143 uc16 standard_set_type() { return standard_set_type_; } 144 void set_standard_set_type(uc16 special_set_type) { 145 standard_set_type_ = special_set_type; 146 } 147 bool is_standard() { return standard_set_type_ != 0; } 148 void Canonicalize(); 149 150 private: 151 ZoneList<CharacterRange>* ranges_; 152 // If non-zero, the value represents a standard set (e.g., all whitespace 153 // characters) without having to expand the ranges. 154 uc16 standard_set_type_; 155 }; 156 157 158 class TextElement final BASE_EMBEDDED { 159 public: 160 enum TextType { ATOM, CHAR_CLASS }; 161 162 static TextElement Atom(RegExpAtom* atom); 163 static TextElement CharClass(RegExpCharacterClass* char_class); 164 165 int cp_offset() const { return cp_offset_; } 166 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 167 int length() const; 168 169 TextType text_type() const { return text_type_; } 170 171 RegExpTree* tree() const { return tree_; } 172 173 RegExpAtom* atom() const { 174 DCHECK(text_type() == ATOM); 175 return reinterpret_cast<RegExpAtom*>(tree()); 176 } 177 178 RegExpCharacterClass* char_class() const { 179 DCHECK(text_type() == CHAR_CLASS); 180 return reinterpret_cast<RegExpCharacterClass*>(tree()); 181 } 182 183 private: 184 TextElement(TextType text_type, RegExpTree* tree) 185 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 186 187 int cp_offset_; 188 TextType text_type_; 189 RegExpTree* tree_; 190 }; 191 192 193 class RegExpTree : public ZoneObject { 194 public: 195 static const int kInfinity = kMaxInt; 196 virtual ~RegExpTree() {} 197 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 198 virtual RegExpNode* ToNode(RegExpCompiler* compiler, 199 RegExpNode* on_success) = 0; 200 virtual bool IsTextElement() { return false; } 201 virtual bool IsAnchoredAtStart() { return false; } 202 virtual bool IsAnchoredAtEnd() { return false; } 203 virtual int min_match() = 0; 204 virtual int max_match() = 0; 205 // Returns the interval of registers used for captures within this 206 // expression. 207 virtual Interval CaptureRegisters() { return Interval::Empty(); } 208 virtual void AppendToText(RegExpText* text, Zone* zone); 209 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT 210 #define MAKE_ASTYPE(Name) \ 211 virtual RegExp##Name* As##Name(); \ 212 virtual bool Is##Name(); 213 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 214 #undef MAKE_ASTYPE 215 }; 216 217 218 class RegExpDisjunction final : public RegExpTree { 219 public: 220 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 221 void* Accept(RegExpVisitor* visitor, void* data) override; 222 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 223 RegExpDisjunction* AsDisjunction() override; 224 Interval CaptureRegisters() override; 225 bool IsDisjunction() override; 226 bool IsAnchoredAtStart() override; 227 bool IsAnchoredAtEnd() override; 228 int min_match() override { return min_match_; } 229 int max_match() override { return max_match_; } 230 ZoneList<RegExpTree*>* alternatives() { return alternatives_; } 231 232 private: 233 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 234 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 235 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 236 ZoneList<RegExpTree*>* alternatives_; 237 int min_match_; 238 int max_match_; 239 }; 240 241 242 class RegExpAlternative final : public RegExpTree { 243 public: 244 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 245 void* Accept(RegExpVisitor* visitor, void* data) override; 246 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 247 RegExpAlternative* AsAlternative() override; 248 Interval CaptureRegisters() override; 249 bool IsAlternative() override; 250 bool IsAnchoredAtStart() override; 251 bool IsAnchoredAtEnd() override; 252 int min_match() override { return min_match_; } 253 int max_match() override { return max_match_; } 254 ZoneList<RegExpTree*>* nodes() { return nodes_; } 255 256 private: 257 ZoneList<RegExpTree*>* nodes_; 258 int min_match_; 259 int max_match_; 260 }; 261 262 263 class RegExpAssertion final : public RegExpTree { 264 public: 265 enum AssertionType { 266 START_OF_LINE, 267 START_OF_INPUT, 268 END_OF_LINE, 269 END_OF_INPUT, 270 BOUNDARY, 271 NON_BOUNDARY 272 }; 273 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} 274 void* Accept(RegExpVisitor* visitor, void* data) override; 275 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 276 RegExpAssertion* AsAssertion() override; 277 bool IsAssertion() override; 278 bool IsAnchoredAtStart() override; 279 bool IsAnchoredAtEnd() override; 280 int min_match() override { return 0; } 281 int max_match() override { return 0; } 282 AssertionType assertion_type() { return assertion_type_; } 283 284 private: 285 AssertionType assertion_type_; 286 }; 287 288 289 class RegExpCharacterClass final : public RegExpTree { 290 public: 291 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) 292 : set_(ranges), is_negated_(is_negated) {} 293 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} 294 void* Accept(RegExpVisitor* visitor, void* data) override; 295 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 296 RegExpCharacterClass* AsCharacterClass() override; 297 bool IsCharacterClass() override; 298 bool IsTextElement() override { return true; } 299 int min_match() override { return 1; } 300 // The character class may match two code units for unicode regexps. 301 // TODO(yangguo): we should split this class for usage in TextElement, and 302 // make max_match() dependent on the character class content. 303 int max_match() override { return 2; } 304 void AppendToText(RegExpText* text, Zone* zone) override; 305 CharacterSet character_set() { return set_; } 306 // TODO(lrn): Remove need for complex version if is_standard that 307 // recognizes a mangled standard set and just do { return set_.is_special(); } 308 bool is_standard(Zone* zone); 309 // Returns a value representing the standard character set if is_standard() 310 // returns true. 311 // Currently used values are: 312 // s : unicode whitespace 313 // S : unicode non-whitespace 314 // w : ASCII word character (digit, letter, underscore) 315 // W : non-ASCII word character 316 // d : ASCII digit 317 // D : non-ASCII digit 318 // . : non-newline 319 // * : All characters, for advancing unanchored regexp 320 uc16 standard_type() { return set_.standard_set_type(); } 321 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } 322 bool is_negated() { return is_negated_; } 323 324 private: 325 CharacterSet set_; 326 bool is_negated_; 327 }; 328 329 330 class RegExpAtom final : public RegExpTree { 331 public: 332 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} 333 void* Accept(RegExpVisitor* visitor, void* data) override; 334 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 335 RegExpAtom* AsAtom() override; 336 bool IsAtom() override; 337 bool IsTextElement() override { return true; } 338 int min_match() override { return data_.length(); } 339 int max_match() override { return data_.length(); } 340 void AppendToText(RegExpText* text, Zone* zone) override; 341 Vector<const uc16> data() { return data_; } 342 int length() { return data_.length(); } 343 344 private: 345 Vector<const uc16> data_; 346 }; 347 348 349 class RegExpText final : public RegExpTree { 350 public: 351 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} 352 void* Accept(RegExpVisitor* visitor, void* data) override; 353 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 354 RegExpText* AsText() override; 355 bool IsText() override; 356 bool IsTextElement() override { return true; } 357 int min_match() override { return length_; } 358 int max_match() override { return length_; } 359 void AppendToText(RegExpText* text, Zone* zone) override; 360 void AddElement(TextElement elm, Zone* zone) { 361 elements_.Add(elm, zone); 362 length_ += elm.length(); 363 } 364 ZoneList<TextElement>* elements() { return &elements_; } 365 366 private: 367 ZoneList<TextElement> elements_; 368 int length_; 369 }; 370 371 372 class RegExpQuantifier final : public RegExpTree { 373 public: 374 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; 375 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) 376 : body_(body), 377 min_(min), 378 max_(max), 379 min_match_(min * body->min_match()), 380 quantifier_type_(type) { 381 if (max > 0 && body->max_match() > kInfinity / max) { 382 max_match_ = kInfinity; 383 } else { 384 max_match_ = max * body->max_match(); 385 } 386 } 387 void* Accept(RegExpVisitor* visitor, void* data) override; 388 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 389 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 390 RegExpCompiler* compiler, RegExpNode* on_success, 391 bool not_at_start = false); 392 RegExpQuantifier* AsQuantifier() override; 393 Interval CaptureRegisters() override; 394 bool IsQuantifier() override; 395 int min_match() override { return min_match_; } 396 int max_match() override { return max_match_; } 397 int min() { return min_; } 398 int max() { return max_; } 399 bool is_possessive() { return quantifier_type_ == POSSESSIVE; } 400 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } 401 bool is_greedy() { return quantifier_type_ == GREEDY; } 402 RegExpTree* body() { return body_; } 403 404 private: 405 RegExpTree* body_; 406 int min_; 407 int max_; 408 int min_match_; 409 int max_match_; 410 QuantifierType quantifier_type_; 411 }; 412 413 414 class RegExpCapture final : public RegExpTree { 415 public: 416 explicit RegExpCapture(int index) 417 : body_(NULL), index_(index), name_(nullptr) {} 418 void* Accept(RegExpVisitor* visitor, void* data) override; 419 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 420 static RegExpNode* ToNode(RegExpTree* body, int index, 421 RegExpCompiler* compiler, RegExpNode* on_success); 422 RegExpCapture* AsCapture() override; 423 bool IsAnchoredAtStart() override; 424 bool IsAnchoredAtEnd() override; 425 Interval CaptureRegisters() override; 426 bool IsCapture() override; 427 int min_match() override { return body_->min_match(); } 428 int max_match() override { return body_->max_match(); } 429 RegExpTree* body() { return body_; } 430 void set_body(RegExpTree* body) { body_ = body; } 431 int index() { return index_; } 432 const ZoneVector<uc16>* name() const { return name_; } 433 void set_name(const ZoneVector<uc16>* name) { name_ = name; } 434 static int StartRegister(int index) { return index * 2; } 435 static int EndRegister(int index) { return index * 2 + 1; } 436 437 private: 438 RegExpTree* body_; 439 int index_; 440 const ZoneVector<uc16>* name_; 441 }; 442 443 class RegExpGroup final : public RegExpTree { 444 public: 445 explicit RegExpGroup(RegExpTree* body) : body_(body) {} 446 void* Accept(RegExpVisitor* visitor, void* data) override; 447 RegExpNode* ToNode(RegExpCompiler* compiler, 448 RegExpNode* on_success) override { 449 return body_->ToNode(compiler, on_success); 450 } 451 RegExpGroup* AsGroup() override; 452 bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } 453 bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } 454 bool IsGroup() override; 455 int min_match() override { return body_->min_match(); } 456 int max_match() override { return body_->max_match(); } 457 Interval CaptureRegisters() override { return body_->CaptureRegisters(); } 458 RegExpTree* body() { return body_; } 459 460 private: 461 RegExpTree* body_; 462 }; 463 464 class RegExpLookaround final : public RegExpTree { 465 public: 466 enum Type { LOOKAHEAD, LOOKBEHIND }; 467 468 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 469 int capture_from, Type type) 470 : body_(body), 471 is_positive_(is_positive), 472 capture_count_(capture_count), 473 capture_from_(capture_from), 474 type_(type) {} 475 476 void* Accept(RegExpVisitor* visitor, void* data) override; 477 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 478 RegExpLookaround* AsLookaround() override; 479 Interval CaptureRegisters() override; 480 bool IsLookaround() override; 481 bool IsAnchoredAtStart() override; 482 int min_match() override { return 0; } 483 int max_match() override { return 0; } 484 RegExpTree* body() { return body_; } 485 bool is_positive() { return is_positive_; } 486 int capture_count() { return capture_count_; } 487 int capture_from() { return capture_from_; } 488 Type type() { return type_; } 489 490 class Builder { 491 public: 492 Builder(bool is_positive, RegExpNode* on_success, 493 int stack_pointer_register, int position_register, 494 int capture_register_count = 0, int capture_register_start = 0); 495 RegExpNode* on_match_success() { return on_match_success_; } 496 RegExpNode* ForMatch(RegExpNode* match); 497 498 private: 499 bool is_positive_; 500 RegExpNode* on_match_success_; 501 RegExpNode* on_success_; 502 int stack_pointer_register_; 503 int position_register_; 504 }; 505 506 private: 507 RegExpTree* body_; 508 bool is_positive_; 509 int capture_count_; 510 int capture_from_; 511 Type type_; 512 }; 513 514 515 class RegExpBackReference final : public RegExpTree { 516 public: 517 RegExpBackReference() : capture_(nullptr), name_(nullptr) {} 518 explicit RegExpBackReference(RegExpCapture* capture) 519 : capture_(capture), name_(nullptr) {} 520 void* Accept(RegExpVisitor* visitor, void* data) override; 521 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 522 RegExpBackReference* AsBackReference() override; 523 bool IsBackReference() override; 524 int min_match() override { return 0; } 525 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 526 // recursion, we give up. Ignorance is bliss. 527 int max_match() override { return kInfinity; } 528 int index() { return capture_->index(); } 529 RegExpCapture* capture() { return capture_; } 530 void set_capture(RegExpCapture* capture) { capture_ = capture; } 531 const ZoneVector<uc16>* name() const { return name_; } 532 void set_name(const ZoneVector<uc16>* name) { name_ = name; } 533 534 private: 535 RegExpCapture* capture_; 536 const ZoneVector<uc16>* name_; 537 }; 538 539 540 class RegExpEmpty final : public RegExpTree { 541 public: 542 RegExpEmpty() {} 543 void* Accept(RegExpVisitor* visitor, void* data) override; 544 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 545 RegExpEmpty* AsEmpty() override; 546 bool IsEmpty() override; 547 int min_match() override { return 0; } 548 int max_match() override { return 0; } 549 }; 550 551 } // namespace internal 552 } // namespace v8 553 554 #endif // V8_REGEXP_REGEXP_AST_H_ 555