1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_ 7 8 #include "src/utils.h" 9 #include "src/zone.h" 10 11 namespace v8 { 12 namespace internal { 13 14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 15 VISIT(Disjunction) \ 16 VISIT(Alternative) \ 17 VISIT(Assertion) \ 18 VISIT(CharacterClass) \ 19 VISIT(Atom) \ 20 VISIT(Quantifier) \ 21 VISIT(Capture) \ 22 VISIT(Lookaround) \ 23 VISIT(BackReference) \ 24 VISIT(Empty) \ 25 VISIT(Text) 26 27 28 #define FORWARD_DECLARE(Name) class RegExp##Name; 29 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 30 #undef FORWARD_DECLARE 31 32 class RegExpCompiler; 33 class RegExpNode; 34 class RegExpTree; 35 36 37 class RegExpVisitor BASE_EMBEDDED { 38 public: 39 virtual ~RegExpVisitor() {} 40 #define MAKE_CASE(Name) \ 41 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 42 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 43 #undef MAKE_CASE 44 }; 45 46 47 // A simple closed interval. 48 class Interval { 49 public: 50 Interval() : from_(kNone), to_(kNone) {} 51 Interval(int from, int to) : from_(from), to_(to) {} 52 Interval Union(Interval that) { 53 if (that.from_ == kNone) 54 return *this; 55 else if (from_ == kNone) 56 return that; 57 else 58 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 59 } 60 bool Contains(int value) { return (from_ <= value) && (value <= to_); } 61 bool is_empty() { return from_ == kNone; } 62 int from() const { return from_; } 63 int to() const { return to_; } 64 static Interval Empty() { return Interval(); } 65 static const int kNone = -1; 66 67 private: 68 int from_; 69 int to_; 70 }; 71 72 73 // Represents code units in the range from from_ to to_, both ends are 74 // inclusive. 75 class CharacterRange { 76 public: 77 CharacterRange() : from_(0), to_(0) {} 78 // For compatibility with the CHECK_OK macro 79 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 80 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} 81 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, 82 Zone* zone); 83 static Vector<const int> GetWordBounds(); 84 static inline CharacterRange Singleton(uc16 value) { 85 return CharacterRange(value, value); 86 } 87 static inline CharacterRange Range(uc16 from, uc16 to) { 88 DCHECK(from <= to); 89 return CharacterRange(from, to); 90 } 91 static inline CharacterRange Everything() { 92 return CharacterRange(0, 0xFFFF); 93 } 94 bool Contains(uc16 i) { return from_ <= i && i <= to_; } 95 uc16 from() const { return from_; } 96 void set_from(uc16 value) { from_ = value; } 97 uc16 to() const { return to_; } 98 void set_to(uc16 value) { to_ = value; } 99 bool is_valid() { return from_ <= to_; } 100 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } 101 bool IsSingleton() { return (from_ == to_); } 102 void AddCaseEquivalents(Isolate* isolate, Zone* zone, 103 ZoneList<CharacterRange>* ranges, bool is_one_byte); 104 static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay, 105 ZoneList<CharacterRange>** included, 106 ZoneList<CharacterRange>** excluded, Zone* zone); 107 // Whether a range list is in canonical form: Ranges ordered by from value, 108 // and ranges non-overlapping and non-adjacent. 109 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 110 // Convert range list to canonical form. The characters covered by the ranges 111 // will still be the same, but no character is in more than one range, and 112 // adjacent ranges are merged. The resulting list may be shorter than the 113 // original, but cannot be longer. 114 static void Canonicalize(ZoneList<CharacterRange>* ranges); 115 // Negate the contents of a character range in canonical form. 116 static void Negate(ZoneList<CharacterRange>* src, 117 ZoneList<CharacterRange>* dst, Zone* zone); 118 static const int kStartMarker = (1 << 24); 119 static const int kPayloadMask = (1 << 24) - 1; 120 121 private: 122 uc16 from_; 123 uc16 to_; 124 }; 125 126 127 class CharacterSet final BASE_EMBEDDED { 128 public: 129 explicit CharacterSet(uc16 standard_set_type) 130 : ranges_(NULL), standard_set_type_(standard_set_type) {} 131 explicit CharacterSet(ZoneList<CharacterRange>* ranges) 132 : ranges_(ranges), standard_set_type_(0) {} 133 ZoneList<CharacterRange>* ranges(Zone* zone); 134 uc16 standard_set_type() { return standard_set_type_; } 135 void set_standard_set_type(uc16 special_set_type) { 136 standard_set_type_ = special_set_type; 137 } 138 bool is_standard() { return standard_set_type_ != 0; } 139 void Canonicalize(); 140 141 private: 142 ZoneList<CharacterRange>* ranges_; 143 // If non-zero, the value represents a standard set (e.g., all whitespace 144 // characters) without having to expand the ranges. 145 uc16 standard_set_type_; 146 }; 147 148 149 class TextElement final BASE_EMBEDDED { 150 public: 151 enum TextType { ATOM, CHAR_CLASS }; 152 153 static TextElement Atom(RegExpAtom* atom); 154 static TextElement CharClass(RegExpCharacterClass* char_class); 155 156 int cp_offset() const { return cp_offset_; } 157 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 158 int length() const; 159 160 TextType text_type() const { return text_type_; } 161 162 RegExpTree* tree() const { return tree_; } 163 164 RegExpAtom* atom() const { 165 DCHECK(text_type() == ATOM); 166 return reinterpret_cast<RegExpAtom*>(tree()); 167 } 168 169 RegExpCharacterClass* char_class() const { 170 DCHECK(text_type() == CHAR_CLASS); 171 return reinterpret_cast<RegExpCharacterClass*>(tree()); 172 } 173 174 private: 175 TextElement(TextType text_type, RegExpTree* tree) 176 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 177 178 int cp_offset_; 179 TextType text_type_; 180 RegExpTree* tree_; 181 }; 182 183 184 class RegExpTree : public ZoneObject { 185 public: 186 static const int kInfinity = kMaxInt; 187 virtual ~RegExpTree() {} 188 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 189 virtual RegExpNode* ToNode(RegExpCompiler* compiler, 190 RegExpNode* on_success) = 0; 191 virtual bool IsTextElement() { return false; } 192 virtual bool IsAnchoredAtStart() { return false; } 193 virtual bool IsAnchoredAtEnd() { return false; } 194 virtual int min_match() = 0; 195 virtual int max_match() = 0; 196 // Returns the interval of registers used for captures within this 197 // expression. 198 virtual Interval CaptureRegisters() { return Interval::Empty(); } 199 virtual void AppendToText(RegExpText* text, Zone* zone); 200 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT 201 #define MAKE_ASTYPE(Name) \ 202 virtual RegExp##Name* As##Name(); \ 203 virtual bool Is##Name(); 204 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 205 #undef MAKE_ASTYPE 206 }; 207 208 209 class RegExpDisjunction final : public RegExpTree { 210 public: 211 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 212 void* Accept(RegExpVisitor* visitor, void* data) override; 213 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 214 RegExpDisjunction* AsDisjunction() override; 215 Interval CaptureRegisters() override; 216 bool IsDisjunction() override; 217 bool IsAnchoredAtStart() override; 218 bool IsAnchoredAtEnd() override; 219 int min_match() override { return min_match_; } 220 int max_match() override { return max_match_; } 221 ZoneList<RegExpTree*>* alternatives() { return alternatives_; } 222 223 private: 224 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 225 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 226 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 227 ZoneList<RegExpTree*>* alternatives_; 228 int min_match_; 229 int max_match_; 230 }; 231 232 233 class RegExpAlternative final : public RegExpTree { 234 public: 235 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 236 void* Accept(RegExpVisitor* visitor, void* data) override; 237 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 238 RegExpAlternative* AsAlternative() override; 239 Interval CaptureRegisters() override; 240 bool IsAlternative() override; 241 bool IsAnchoredAtStart() override; 242 bool IsAnchoredAtEnd() override; 243 int min_match() override { return min_match_; } 244 int max_match() override { return max_match_; } 245 ZoneList<RegExpTree*>* nodes() { return nodes_; } 246 247 private: 248 ZoneList<RegExpTree*>* nodes_; 249 int min_match_; 250 int max_match_; 251 }; 252 253 254 class RegExpAssertion final : public RegExpTree { 255 public: 256 enum AssertionType { 257 START_OF_LINE, 258 START_OF_INPUT, 259 END_OF_LINE, 260 END_OF_INPUT, 261 BOUNDARY, 262 NON_BOUNDARY 263 }; 264 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} 265 void* Accept(RegExpVisitor* visitor, void* data) override; 266 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 267 RegExpAssertion* AsAssertion() override; 268 bool IsAssertion() override; 269 bool IsAnchoredAtStart() override; 270 bool IsAnchoredAtEnd() override; 271 int min_match() override { return 0; } 272 int max_match() override { return 0; } 273 AssertionType assertion_type() { return assertion_type_; } 274 275 private: 276 AssertionType assertion_type_; 277 }; 278 279 280 class RegExpCharacterClass final : public RegExpTree { 281 public: 282 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) 283 : set_(ranges), is_negated_(is_negated) {} 284 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} 285 void* Accept(RegExpVisitor* visitor, void* data) override; 286 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 287 RegExpCharacterClass* AsCharacterClass() override; 288 bool IsCharacterClass() override; 289 bool IsTextElement() override { return true; } 290 int min_match() override { return 1; } 291 int max_match() override { return 1; } 292 void AppendToText(RegExpText* text, Zone* zone) override; 293 CharacterSet character_set() { return set_; } 294 // TODO(lrn): Remove need for complex version if is_standard that 295 // recognizes a mangled standard set and just do { return set_.is_special(); } 296 bool is_standard(Zone* zone); 297 // Returns a value representing the standard character set if is_standard() 298 // returns true. 299 // Currently used values are: 300 // s : unicode whitespace 301 // S : unicode non-whitespace 302 // w : ASCII word character (digit, letter, underscore) 303 // W : non-ASCII word character 304 // d : ASCII digit 305 // D : non-ASCII digit 306 // . : non-unicode non-newline 307 // * : All characters 308 uc16 standard_type() { return set_.standard_set_type(); } 309 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } 310 bool is_negated() { return is_negated_; } 311 312 private: 313 CharacterSet set_; 314 bool is_negated_; 315 }; 316 317 318 class RegExpAtom final : public RegExpTree { 319 public: 320 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} 321 void* Accept(RegExpVisitor* visitor, void* data) override; 322 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 323 RegExpAtom* AsAtom() override; 324 bool IsAtom() override; 325 bool IsTextElement() override { return true; } 326 int min_match() override { return data_.length(); } 327 int max_match() override { return data_.length(); } 328 void AppendToText(RegExpText* text, Zone* zone) override; 329 Vector<const uc16> data() { return data_; } 330 int length() { return data_.length(); } 331 332 private: 333 Vector<const uc16> data_; 334 }; 335 336 337 class RegExpText final : public RegExpTree { 338 public: 339 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} 340 void* Accept(RegExpVisitor* visitor, void* data) override; 341 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 342 RegExpText* AsText() override; 343 bool IsText() override; 344 bool IsTextElement() override { return true; } 345 int min_match() override { return length_; } 346 int max_match() override { return length_; } 347 void AppendToText(RegExpText* text, Zone* zone) override; 348 void AddElement(TextElement elm, Zone* zone) { 349 elements_.Add(elm, zone); 350 length_ += elm.length(); 351 } 352 ZoneList<TextElement>* elements() { return &elements_; } 353 354 private: 355 ZoneList<TextElement> elements_; 356 int length_; 357 }; 358 359 360 class RegExpQuantifier final : public RegExpTree { 361 public: 362 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; 363 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) 364 : body_(body), 365 min_(min), 366 max_(max), 367 min_match_(min * body->min_match()), 368 quantifier_type_(type) { 369 if (max > 0 && body->max_match() > kInfinity / max) { 370 max_match_ = kInfinity; 371 } else { 372 max_match_ = max * body->max_match(); 373 } 374 } 375 void* Accept(RegExpVisitor* visitor, void* data) override; 376 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 377 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 378 RegExpCompiler* compiler, RegExpNode* on_success, 379 bool not_at_start = false); 380 RegExpQuantifier* AsQuantifier() override; 381 Interval CaptureRegisters() override; 382 bool IsQuantifier() override; 383 int min_match() override { return min_match_; } 384 int max_match() override { return max_match_; } 385 int min() { return min_; } 386 int max() { return max_; } 387 bool is_possessive() { return quantifier_type_ == POSSESSIVE; } 388 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } 389 bool is_greedy() { return quantifier_type_ == GREEDY; } 390 RegExpTree* body() { return body_; } 391 392 private: 393 RegExpTree* body_; 394 int min_; 395 int max_; 396 int min_match_; 397 int max_match_; 398 QuantifierType quantifier_type_; 399 }; 400 401 402 class RegExpCapture final : public RegExpTree { 403 public: 404 explicit RegExpCapture(int index) : body_(NULL), index_(index) {} 405 void* Accept(RegExpVisitor* visitor, void* data) override; 406 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 407 static RegExpNode* ToNode(RegExpTree* body, int index, 408 RegExpCompiler* compiler, RegExpNode* on_success); 409 RegExpCapture* AsCapture() override; 410 bool IsAnchoredAtStart() override; 411 bool IsAnchoredAtEnd() override; 412 Interval CaptureRegisters() override; 413 bool IsCapture() override; 414 int min_match() override { return body_->min_match(); } 415 int max_match() override { return body_->max_match(); } 416 RegExpTree* body() { return body_; } 417 void set_body(RegExpTree* body) { body_ = body; } 418 int index() { return index_; } 419 static int StartRegister(int index) { return index * 2; } 420 static int EndRegister(int index) { return index * 2 + 1; } 421 422 private: 423 RegExpTree* body_; 424 int index_; 425 }; 426 427 428 class RegExpLookaround final : public RegExpTree { 429 public: 430 enum Type { LOOKAHEAD, LOOKBEHIND }; 431 432 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 433 int capture_from, Type type) 434 : body_(body), 435 is_positive_(is_positive), 436 capture_count_(capture_count), 437 capture_from_(capture_from), 438 type_(type) {} 439 440 void* Accept(RegExpVisitor* visitor, void* data) override; 441 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 442 RegExpLookaround* AsLookaround() override; 443 Interval CaptureRegisters() override; 444 bool IsLookaround() override; 445 bool IsAnchoredAtStart() override; 446 int min_match() override { return 0; } 447 int max_match() override { return 0; } 448 RegExpTree* body() { return body_; } 449 bool is_positive() { return is_positive_; } 450 int capture_count() { return capture_count_; } 451 int capture_from() { return capture_from_; } 452 Type type() { return type_; } 453 454 private: 455 RegExpTree* body_; 456 bool is_positive_; 457 int capture_count_; 458 int capture_from_; 459 Type type_; 460 }; 461 462 463 class RegExpBackReference final : public RegExpTree { 464 public: 465 explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {} 466 void* Accept(RegExpVisitor* visitor, void* data) override; 467 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 468 RegExpBackReference* AsBackReference() override; 469 bool IsBackReference() override; 470 int min_match() override { return 0; } 471 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 472 // recursion, we give up. Ignorance is bliss. 473 int max_match() override { return kInfinity; } 474 int index() { return capture_->index(); } 475 RegExpCapture* capture() { return capture_; } 476 477 private: 478 RegExpCapture* capture_; 479 }; 480 481 482 class RegExpEmpty final : public RegExpTree { 483 public: 484 RegExpEmpty() {} 485 void* Accept(RegExpVisitor* visitor, void* data) override; 486 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 487 RegExpEmpty* AsEmpty() override; 488 bool IsEmpty() override; 489 int min_match() override { return 0; } 490 int max_match() override { return 0; } 491 }; 492 493 } // namespace internal 494 } // namespace v8 495 496 #endif // V8_REGEXP_REGEXP_AST_H_ 497