1 /* 2 * Copyright (C) 2008, 2009 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "CharacterClassConstructor.h" 28 29 #if ENABLE(WREC) 30 31 #include "pcre_internal.h" 32 #include <wtf/ASCIICType.h> 33 34 using namespace WTF; 35 36 namespace JSC { namespace WREC { 37 38 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch) 39 { 40 unsigned pos = 0; 41 unsigned range = matches.size(); 42 43 // binary chop, find position to insert char. 44 while (range) { 45 unsigned index = range >> 1; 46 47 int val = matches[pos+index] - ch; 48 if (!val) 49 return; 50 else if (val > 0) 51 range = index; 52 else { 53 pos += (index+1); 54 range -= (index+1); 55 } 56 } 57 58 if (pos == matches.size()) 59 matches.append(ch); 60 else 61 matches.insert(pos, ch); 62 } 63 64 void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) 65 { 66 unsigned end = ranges.size(); 67 68 // Simple linear scan - I doubt there are that many ranges anyway... 69 // feel free to fix this with something faster (eg binary chop). 70 for (unsigned i = 0; i < end; ++i) { 71 // does the new range fall before the current position in the array 72 if (hi < ranges[i].begin) { 73 // optional optimization: concatenate appending ranges? - may not be worthwhile. 74 if (hi == (ranges[i].begin - 1)) { 75 ranges[i].begin = lo; 76 return; 77 } 78 CharacterRange r = {lo, hi}; 79 ranges.insert(i, r); 80 return; 81 } 82 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 83 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 84 // end of the last range they concatenate, which is just as good. 85 if (lo <= (ranges[i].end + 1)) { 86 // found an intersect! we'll replace this entry in the array. 87 ranges[i].begin = std::min(ranges[i].begin, lo); 88 ranges[i].end = std::max(ranges[i].end, hi); 89 90 // now check if the new range can subsume any subsequent ranges. 91 unsigned next = i+1; 92 // each iteration of the loop we will either remove something from the list, or break the loop. 93 while (next < ranges.size()) { 94 if (ranges[next].begin <= (ranges[i].end + 1)) { 95 // the next entry now overlaps / concatenates this one. 96 ranges[i].end = std::max(ranges[i].end, ranges[next].end); 97 ranges.remove(next); 98 } else 99 break; 100 } 101 102 return; 103 } 104 } 105 106 // CharacterRange comes after all existing ranges. 107 CharacterRange r = {lo, hi}; 108 ranges.append(r); 109 } 110 111 void CharacterClassConstructor::put(UChar ch) 112 { 113 // Parsing a regular expression like [a-z], we start in an initial empty state: 114 // ((m_charBuffer == -1) && !m_isPendingDash) 115 // When buffer the 'a' sice it may be (and is in this case) part of a range: 116 // ((m_charBuffer != -1) && !m_isPendingDash) 117 // Having parsed the hyphen we then record that the dash is also pending: 118 // ((m_charBuffer != -1) && m_isPendingDash) 119 // The next change will always take us back to the initial state - either because 120 // a complete range has been parsed (such as [a-z]), or because a flush is forced, 121 // due to an early end in the regexp ([a-]), or a character class escape being added 122 // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. 123 ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); 124 125 if (m_charBuffer != -1) { 126 if (m_isPendingDash) { 127 // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. 128 UChar lo = m_charBuffer; 129 UChar hi = ch; 130 // Reset back to the inital state. 131 m_charBuffer = -1; 132 m_isPendingDash = false; 133 134 // This is an error, detected lazily. Do not proceed. 135 if (lo > hi) { 136 m_isUpsideDown = true; 137 return; 138 } 139 140 if (lo <= 0x7f) { 141 char asciiLo = lo; 142 char asciiHi = std::min(hi, (UChar)0x7f); 143 addSortedRange(m_ranges, lo, asciiHi); 144 145 if (m_isCaseInsensitive) { 146 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) 147 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); 148 if ((asciiLo <= 'z') && (asciiHi >= 'a')) 149 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); 150 } 151 } 152 if (hi >= 0x80) { 153 UChar unicodeCurr = std::max(lo, (UChar)0x80); 154 addSortedRange(m_rangesUnicode, unicodeCurr, hi); 155 156 if (m_isCaseInsensitive) { 157 // we're going to scan along, updating the start of the range 158 while (unicodeCurr <= hi) { 159 // Spin forwards over any characters that don't have two cases. 160 for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { 161 // if this was the last character in the range, we're done. 162 if (unicodeCurr == hi) 163 return; 164 } 165 // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. 166 UChar rangeStart = unicodeCurr; 167 UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr); 168 169 // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, 170 // and if it's other case value is one greater then the othercase value for the current last 171 // character included in the range, we can include next into the range. 172 while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { 173 // increment unicodeCurr; it points to the end of the range. 174 // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. 175 ++unicodeCurr; 176 ++otherCurr; 177 } 178 179 // otherChar is the last in the range of other case chars, calculate offset to get back to the start. 180 addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); 181 182 // unicodeCurr has been added, move on to the next char. 183 ++unicodeCurr; 184 } 185 } 186 } 187 } else if (ch == '-') 188 // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. 189 m_isPendingDash = true; 190 else { 191 // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. 192 flush(); 193 m_charBuffer = ch; 194 } 195 } else 196 // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character 197 // (the hyphen not treated as a special character in this case, same handling for any char). 198 m_charBuffer = ch; 199 } 200 201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. 202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate 203 // array (either ascii or unicode). 204 // If the pattern is case insensitive we add entries for both cases. 205 void CharacterClassConstructor::flush() 206 { 207 if (m_charBuffer != -1) { 208 if (m_charBuffer <= 0x7f) { 209 if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) 210 addSorted(m_matches, toASCIIUpper(m_charBuffer)); 211 addSorted(m_matches, m_charBuffer); 212 if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) 213 addSorted(m_matches, toASCIILower(m_charBuffer)); 214 } else { 215 addSorted(m_matchesUnicode, m_charBuffer); 216 if (m_isCaseInsensitive) { 217 int other = jsc_pcre_ucp_othercase(m_charBuffer); 218 if (other != -1) 219 addSorted(m_matchesUnicode, other); 220 } 221 } 222 m_charBuffer = -1; 223 } 224 225 if (m_isPendingDash) { 226 addSorted(m_matches, '-'); 227 m_isPendingDash = false; 228 } 229 } 230 231 void CharacterClassConstructor::append(const CharacterClass& other) 232 { 233 // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). 234 // Need to check the spec, really, but think this matches PCRE behaviour. 235 flush(); 236 237 if (other.numMatches) { 238 for (size_t i = 0; i < other.numMatches; ++i) 239 addSorted(m_matches, other.matches[i]); 240 } 241 if (other.numRanges) { 242 for (size_t i = 0; i < other.numRanges; ++i) 243 addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); 244 } 245 if (other.numMatchesUnicode) { 246 for (size_t i = 0; i < other.numMatchesUnicode; ++i) 247 addSorted(m_matchesUnicode, other.matchesUnicode[i]); 248 } 249 if (other.numRangesUnicode) { 250 for (size_t i = 0; i < other.numRangesUnicode; ++i) 251 addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); 252 } 253 } 254 255 } } // namespace JSC::WREC 256 257 #endif // ENABLE(WREC) 258