Home | History | Annotate | Download | only in wrec
      1 /*
      2  * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "CharacterClassConstructor.h"
     28 
     29 #if ENABLE(WREC)
     30 
     31 #include "pcre_internal.h"
     32 #include <wtf/ASCIICType.h>
     33 
     34 using namespace WTF;
     35 
     36 namespace JSC { namespace WREC {
     37 
     38 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
     39 {
     40     unsigned pos = 0;
     41     unsigned range = matches.size();
     42 
     43     // binary chop, find position to insert char.
     44     while (range) {
     45         unsigned index = range >> 1;
     46 
     47         int val = matches[pos+index] - ch;
     48         if (!val)
     49             return;
     50         else if (val > 0)
     51             range = index;
     52         else {
     53             pos += (index+1);
     54             range -= (index+1);
     55         }
     56     }
     57 
     58     if (pos == matches.size())
     59         matches.append(ch);
     60     else
     61         matches.insert(pos, ch);
     62 }
     63 
     64 void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
     65 {
     66     unsigned end = ranges.size();
     67 
     68     // Simple linear scan - I doubt there are that many ranges anyway...
     69     // feel free to fix this with something faster (eg binary chop).
     70     for (unsigned i = 0; i < end; ++i) {
     71         // does the new range fall before the current position in the array
     72         if (hi < ranges[i].begin) {
     73             // optional optimization: concatenate appending ranges? - may not be worthwhile.
     74             if (hi == (ranges[i].begin - 1)) {
     75                 ranges[i].begin = lo;
     76                 return;
     77             }
     78             CharacterRange r = {lo, hi};
     79             ranges.insert(i, r);
     80             return;
     81         }
     82         // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
     83         // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
     84         // end of the last range they concatenate, which is just as good.
     85         if (lo <= (ranges[i].end + 1)) {
     86             // found an intersect! we'll replace this entry in the array.
     87             ranges[i].begin = std::min(ranges[i].begin, lo);
     88             ranges[i].end = std::max(ranges[i].end, hi);
     89 
     90             // now check if the new range can subsume any subsequent ranges.
     91             unsigned next = i+1;
     92             // each iteration of the loop we will either remove something from the list, or break the loop.
     93             while (next < ranges.size()) {
     94                 if (ranges[next].begin <= (ranges[i].end + 1)) {
     95                     // the next entry now overlaps / concatenates this one.
     96                     ranges[i].end = std::max(ranges[i].end, ranges[next].end);
     97                     ranges.remove(next);
     98                 } else
     99                     break;
    100             }
    101 
    102             return;
    103         }
    104     }
    105 
    106     // CharacterRange comes after all existing ranges.
    107     CharacterRange r = {lo, hi};
    108     ranges.append(r);
    109 }
    110 
    111 void CharacterClassConstructor::put(UChar ch)
    112 {
    113     // Parsing a regular expression like [a-z], we start in an initial empty state:
    114     //     ((m_charBuffer == -1) && !m_isPendingDash)
    115     // When buffer the 'a' sice it may be (and is in this case) part of a range:
    116     //     ((m_charBuffer != -1) && !m_isPendingDash)
    117     // Having parsed the hyphen we then record that the dash is also pending:
    118     //     ((m_charBuffer != -1) && m_isPendingDash)
    119     // The next change will always take us back to the initial state - either because
    120     // a complete range has been parsed (such as [a-z]), or because a flush is forced,
    121     // due to an early end in the regexp ([a-]), or a character class escape being added
    122     // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
    123     ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
    124 
    125     if (m_charBuffer != -1) {
    126         if (m_isPendingDash) {
    127             // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
    128             UChar lo = m_charBuffer;
    129             UChar hi = ch;
    130             // Reset back to the inital state.
    131             m_charBuffer = -1;
    132             m_isPendingDash = false;
    133 
    134             // This is an error, detected lazily.  Do not proceed.
    135             if (lo > hi) {
    136                 m_isUpsideDown = true;
    137                 return;
    138             }
    139 
    140             if (lo <= 0x7f) {
    141                 char asciiLo = lo;
    142                 char asciiHi = std::min(hi, (UChar)0x7f);
    143                 addSortedRange(m_ranges, lo, asciiHi);
    144 
    145                 if (m_isCaseInsensitive) {
    146                     if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
    147                         addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
    148                     if ((asciiLo <= 'z') && (asciiHi >= 'a'))
    149                         addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
    150                 }
    151             }
    152             if (hi >= 0x80) {
    153                 UChar unicodeCurr = std::max(lo, (UChar)0x80);
    154                 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
    155 
    156                 if (m_isCaseInsensitive) {
    157                     // we're going to scan along, updating the start of the range
    158                     while (unicodeCurr <= hi) {
    159                         // Spin forwards over any characters that don't have two cases.
    160                         for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
    161                             // if this was the last character in the range, we're done.
    162                             if (unicodeCurr == hi)
    163                                 return;
    164                         }
    165                         // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
    166                         UChar rangeStart = unicodeCurr;
    167                         UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
    168 
    169                         // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
    170                         // and if it's other case value is one greater then the othercase value for the current last
    171                         // character included in the range, we can include next into the range.
    172                         while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
    173                             // increment unicodeCurr; it points to the end of the range.
    174                             // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
    175                             ++unicodeCurr;
    176                             ++otherCurr;
    177                         }
    178 
    179                         // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
    180                         addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
    181 
    182                         // unicodeCurr has been added, move on to the next char.
    183                         ++unicodeCurr;
    184                     }
    185                 }
    186             }
    187         } else if (ch == '-')
    188             // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
    189             m_isPendingDash = true;
    190         else {
    191             // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
    192             flush();
    193             m_charBuffer = ch;
    194         }
    195     } else
    196         // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
    197         // (the hyphen not treated as a special character in this case, same handling for any char).
    198         m_charBuffer = ch;
    199 }
    200 
    201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
    202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
    203 // array (either ascii or unicode).
    204 // If the pattern is case insensitive we add entries for both cases.
    205 void CharacterClassConstructor::flush()
    206 {
    207     if (m_charBuffer != -1) {
    208         if (m_charBuffer <= 0x7f) {
    209             if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
    210                 addSorted(m_matches, toASCIIUpper(m_charBuffer));
    211             addSorted(m_matches, m_charBuffer);
    212             if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
    213                 addSorted(m_matches, toASCIILower(m_charBuffer));
    214         } else {
    215             addSorted(m_matchesUnicode, m_charBuffer);
    216             if (m_isCaseInsensitive) {
    217                 int other = jsc_pcre_ucp_othercase(m_charBuffer);
    218                 if (other != -1)
    219                     addSorted(m_matchesUnicode, other);
    220             }
    221         }
    222         m_charBuffer = -1;
    223     }
    224 
    225     if (m_isPendingDash) {
    226         addSorted(m_matches, '-');
    227         m_isPendingDash = false;
    228     }
    229 }
    230 
    231 void CharacterClassConstructor::append(const CharacterClass& other)
    232 {
    233     // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
    234     // Need to check the spec, really, but think this matches PCRE behaviour.
    235     flush();
    236 
    237     if (other.numMatches) {
    238         for (size_t i = 0; i < other.numMatches; ++i)
    239             addSorted(m_matches, other.matches[i]);
    240     }
    241     if (other.numRanges) {
    242         for (size_t i = 0; i < other.numRanges; ++i)
    243             addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
    244     }
    245     if (other.numMatchesUnicode) {
    246         for (size_t i = 0; i < other.numMatchesUnicode; ++i)
    247             addSorted(m_matchesUnicode, other.matchesUnicode[i]);
    248     }
    249     if (other.numRangesUnicode) {
    250         for (size_t i = 0; i < other.numRangesUnicode; ++i)
    251             addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
    252     }
    253 }
    254 
    255 } } // namespace JSC::WREC
    256 
    257 #endif // ENABLE(WREC)
    258