1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Peter Varga (pvarga (at) inf.u-szeged.hu), University of Szeged 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #ifndef YarrPattern_h 28 #define YarrPattern_h 29 30 #include <runtime/UString.h> 31 #include <wtf/Vector.h> 32 #include <wtf/unicode/Unicode.h> 33 34 namespace JSC { namespace Yarr { 35 36 struct PatternDisjunction; 37 38 struct CharacterRange { 39 UChar begin; 40 UChar end; 41 42 CharacterRange(UChar begin, UChar end) 43 : begin(begin) 44 , end(end) 45 { 46 } 47 }; 48 49 struct CharacterClassTable : RefCounted<CharacterClassTable> { 50 const char* m_table; 51 bool m_inverted; 52 static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted) 53 { 54 return adoptRef(new CharacterClassTable(table, inverted)); 55 } 56 57 private: 58 CharacterClassTable(const char* table, bool inverted) 59 : m_table(table) 60 , m_inverted(inverted) 61 { 62 } 63 }; 64 65 struct CharacterClass { 66 WTF_MAKE_FAST_ALLOCATED; 67 public: 68 // All CharacterClass instances have to have the full set of matches and ranges, 69 // they may have an optional table for faster lookups (which must match the 70 // specified matches and ranges) 71 CharacterClass(PassRefPtr<CharacterClassTable> table) 72 : m_table(table) 73 { 74 } 75 Vector<UChar> m_matches; 76 Vector<CharacterRange> m_ranges; 77 Vector<UChar> m_matchesUnicode; 78 Vector<CharacterRange> m_rangesUnicode; 79 RefPtr<CharacterClassTable> m_table; 80 }; 81 82 enum QuantifierType { 83 QuantifierFixedCount, 84 QuantifierGreedy, 85 QuantifierNonGreedy, 86 }; 87 88 struct PatternTerm { 89 enum Type { 90 TypeAssertionBOL, 91 TypeAssertionEOL, 92 TypeAssertionWordBoundary, 93 TypePatternCharacter, 94 TypeCharacterClass, 95 TypeBackReference, 96 TypeForwardReference, 97 TypeParenthesesSubpattern, 98 TypeParentheticalAssertion, 99 } type; 100 bool m_capture :1; 101 bool m_invert :1; 102 union { 103 UChar patternCharacter; 104 CharacterClass* characterClass; 105 unsigned backReferenceSubpatternId; 106 struct { 107 PatternDisjunction* disjunction; 108 unsigned subpatternId; 109 unsigned lastSubpatternId; 110 bool isCopy; 111 bool isTerminal; 112 } parentheses; 113 }; 114 QuantifierType quantityType; 115 unsigned quantityCount; 116 int inputPosition; 117 unsigned frameLocation; 118 119 PatternTerm(UChar ch) 120 : type(PatternTerm::TypePatternCharacter) 121 , m_capture(false) 122 , m_invert(false) 123 { 124 patternCharacter = ch; 125 quantityType = QuantifierFixedCount; 126 quantityCount = 1; 127 } 128 129 PatternTerm(CharacterClass* charClass, bool invert) 130 : type(PatternTerm::TypeCharacterClass) 131 , m_capture(false) 132 , m_invert(invert) 133 { 134 characterClass = charClass; 135 quantityType = QuantifierFixedCount; 136 quantityCount = 1; 137 } 138 139 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) 140 : type(type) 141 , m_capture(capture) 142 , m_invert(invert) 143 { 144 parentheses.disjunction = disjunction; 145 parentheses.subpatternId = subpatternId; 146 parentheses.isCopy = false; 147 parentheses.isTerminal = false; 148 quantityType = QuantifierFixedCount; 149 quantityCount = 1; 150 } 151 152 PatternTerm(Type type, bool invert = false) 153 : type(type) 154 , m_capture(false) 155 , m_invert(invert) 156 { 157 quantityType = QuantifierFixedCount; 158 quantityCount = 1; 159 } 160 161 PatternTerm(unsigned spatternId) 162 : type(TypeBackReference) 163 , m_capture(false) 164 , m_invert(false) 165 { 166 backReferenceSubpatternId = spatternId; 167 quantityType = QuantifierFixedCount; 168 quantityCount = 1; 169 } 170 171 static PatternTerm ForwardReference() 172 { 173 return PatternTerm(TypeForwardReference); 174 } 175 176 static PatternTerm BOL() 177 { 178 return PatternTerm(TypeAssertionBOL); 179 } 180 181 static PatternTerm EOL() 182 { 183 return PatternTerm(TypeAssertionEOL); 184 } 185 186 static PatternTerm WordBoundary(bool invert) 187 { 188 return PatternTerm(TypeAssertionWordBoundary, invert); 189 } 190 191 bool invert() 192 { 193 return m_invert; 194 } 195 196 bool capture() 197 { 198 return m_capture; 199 } 200 201 void quantify(unsigned count, QuantifierType type) 202 { 203 quantityCount = count; 204 quantityType = type; 205 } 206 }; 207 208 struct PatternAlternative { 209 WTF_MAKE_FAST_ALLOCATED; 210 public: 211 PatternAlternative(PatternDisjunction* disjunction) 212 : m_parent(disjunction) 213 , m_onceThrough(false) 214 , m_hasFixedSize(false) 215 , m_startsWithBOL(false) 216 , m_containsBOL(false) 217 { 218 } 219 220 PatternTerm& lastTerm() 221 { 222 ASSERT(m_terms.size()); 223 return m_terms[m_terms.size() - 1]; 224 } 225 226 void removeLastTerm() 227 { 228 ASSERT(m_terms.size()); 229 m_terms.shrink(m_terms.size() - 1); 230 } 231 232 void setOnceThrough() 233 { 234 m_onceThrough = true; 235 } 236 237 bool onceThrough() 238 { 239 return m_onceThrough; 240 } 241 242 Vector<PatternTerm> m_terms; 243 PatternDisjunction* m_parent; 244 unsigned m_minimumSize; 245 bool m_onceThrough : 1; 246 bool m_hasFixedSize : 1; 247 bool m_startsWithBOL : 1; 248 bool m_containsBOL : 1; 249 }; 250 251 struct PatternDisjunction { 252 WTF_MAKE_FAST_ALLOCATED; 253 public: 254 PatternDisjunction(PatternAlternative* parent = 0) 255 : m_parent(parent) 256 , m_hasFixedSize(false) 257 { 258 } 259 260 ~PatternDisjunction() 261 { 262 deleteAllValues(m_alternatives); 263 } 264 265 PatternAlternative* addNewAlternative() 266 { 267 PatternAlternative* alternative = new PatternAlternative(this); 268 m_alternatives.append(alternative); 269 return alternative; 270 } 271 272 Vector<PatternAlternative*> m_alternatives; 273 PatternAlternative* m_parent; 274 unsigned m_minimumSize; 275 unsigned m_callFrameSize; 276 bool m_hasFixedSize; 277 }; 278 279 // You probably don't want to be calling these functions directly 280 // (please to be calling newlineCharacterClass() et al on your 281 // friendly neighborhood YarrPattern instance to get nicely 282 // cached copies). 283 CharacterClass* newlineCreate(); 284 CharacterClass* digitsCreate(); 285 CharacterClass* spacesCreate(); 286 CharacterClass* wordcharCreate(); 287 CharacterClass* nondigitsCreate(); 288 CharacterClass* nonspacesCreate(); 289 CharacterClass* nonwordcharCreate(); 290 291 struct TermChain { 292 TermChain(PatternTerm term) 293 : term(term) 294 {} 295 296 PatternTerm term; 297 Vector<TermChain> hotTerms; 298 }; 299 300 struct BeginChar { 301 BeginChar() 302 : value(0) 303 , mask(0) 304 {} 305 306 BeginChar(unsigned value, unsigned mask) 307 : value(value) 308 , mask(mask) 309 {} 310 311 unsigned value; 312 unsigned mask; 313 }; 314 315 struct YarrPattern { 316 YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error); 317 318 ~YarrPattern() 319 { 320 deleteAllValues(m_disjunctions); 321 deleteAllValues(m_userCharacterClasses); 322 } 323 324 void reset() 325 { 326 m_numSubpatterns = 0; 327 m_maxBackReference = 0; 328 329 m_containsBackreferences = false; 330 m_containsBeginChars = false; 331 m_containsBOL = false; 332 333 newlineCached = 0; 334 digitsCached = 0; 335 spacesCached = 0; 336 wordcharCached = 0; 337 nondigitsCached = 0; 338 nonspacesCached = 0; 339 nonwordcharCached = 0; 340 341 deleteAllValues(m_disjunctions); 342 m_disjunctions.clear(); 343 deleteAllValues(m_userCharacterClasses); 344 m_userCharacterClasses.clear(); 345 m_beginChars.clear(); 346 } 347 348 bool containsIllegalBackReference() 349 { 350 return m_maxBackReference > m_numSubpatterns; 351 } 352 353 CharacterClass* newlineCharacterClass() 354 { 355 if (!newlineCached) 356 m_userCharacterClasses.append(newlineCached = newlineCreate()); 357 return newlineCached; 358 } 359 CharacterClass* digitsCharacterClass() 360 { 361 if (!digitsCached) 362 m_userCharacterClasses.append(digitsCached = digitsCreate()); 363 return digitsCached; 364 } 365 CharacterClass* spacesCharacterClass() 366 { 367 if (!spacesCached) 368 m_userCharacterClasses.append(spacesCached = spacesCreate()); 369 return spacesCached; 370 } 371 CharacterClass* wordcharCharacterClass() 372 { 373 if (!wordcharCached) 374 m_userCharacterClasses.append(wordcharCached = wordcharCreate()); 375 return wordcharCached; 376 } 377 CharacterClass* nondigitsCharacterClass() 378 { 379 if (!nondigitsCached) 380 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); 381 return nondigitsCached; 382 } 383 CharacterClass* nonspacesCharacterClass() 384 { 385 if (!nonspacesCached) 386 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); 387 return nonspacesCached; 388 } 389 CharacterClass* nonwordcharCharacterClass() 390 { 391 if (!nonwordcharCached) 392 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); 393 return nonwordcharCached; 394 } 395 396 bool m_ignoreCase : 1; 397 bool m_multiline : 1; 398 bool m_containsBackreferences : 1; 399 bool m_containsBeginChars : 1; 400 bool m_containsBOL : 1; 401 unsigned m_numSubpatterns; 402 unsigned m_maxBackReference; 403 PatternDisjunction* m_body; 404 Vector<PatternDisjunction*, 4> m_disjunctions; 405 Vector<CharacterClass*> m_userCharacterClasses; 406 Vector<BeginChar> m_beginChars; 407 408 private: 409 const char* compile(const UString& patternString); 410 411 CharacterClass* newlineCached; 412 CharacterClass* digitsCached; 413 CharacterClass* spacesCached; 414 CharacterClass* wordcharCached; 415 CharacterClass* nondigitsCached; 416 CharacterClass* nonspacesCached; 417 CharacterClass* nonwordcharCached; 418 }; 419 420 } } // namespace JSC::Yarr 421 422 #endif // YarrPattern_h 423