Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2011 Daniel Bates (dbates (at) intudata.com). All Rights Reserved.
      3  * Copyright (c) 2012 Google, inc.  All Rights Reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  * 3. Neither the name of Google Inc. nor the names of its
     14  *    contributors may be used to endorse or promote products derived from
     15  *    this software without specific prior written permission.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     18  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     24  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     25  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28  */
     29 
     30 #ifndef DecodeEscapeSequences_h
     31 #define DecodeEscapeSequences_h
     32 
     33 #include "wtf/ASCIICType.h"
     34 #include "wtf/Assertions.h"
     35 #include "wtf/text/StringBuilder.h"
     36 #include "wtf/text/TextEncoding.h"
     37 
     38 namespace blink {
     39 
     40 // See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
     41 struct Unicode16BitEscapeSequence {
     42     enum { sequenceSize = 6 }; // e.g. %u26C4
     43     static size_t findInString(const String& string, size_t startPosition) { return string.find("%u", startPosition); }
     44     static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition)
     45     {
     46         size_t runEnd = startPosition;
     47         while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
     48                && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
     49                && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
     50             runEnd += sequenceSize;
     51         }
     52         return runEnd;
     53     }
     54 
     55     template<typename CharType>
     56     static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding&)
     57     {
     58         // Each %u-escape sequence represents a UTF-16 code unit.
     59         // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
     60         // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
     61         // without any intervening characters, so decode the run without additional checks.
     62         size_t numberOfSequences = runLength / sequenceSize;
     63         StringBuilder builder;
     64         builder.reserveCapacity(numberOfSequences);
     65         while (numberOfSequences--) {
     66             UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
     67             builder.append(codeUnit);
     68             run += sequenceSize;
     69         }
     70         return builder.toString();
     71     }
     72 };
     73 
     74 struct URLEscapeSequence {
     75     enum { sequenceSize = 3 }; // e.g. %41
     76     static size_t findInString(const String& string, size_t startPosition) { return string.find('%', startPosition); }
     77     static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition)
     78     {
     79         // Make the simplifying assumption that supported encodings may have up to two unescaped characters
     80         // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
     81         // decoder as part of the run. In other words, we end the run at the first value outside of the
     82         // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
     83         // escape sequence.
     84         size_t runEnd = startPosition;
     85         int numberOfTrailingCharacters = 0;
     86         while (runEnd < endPosition) {
     87             if (string[runEnd] == '%') {
     88                 if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
     89                     runEnd += sequenceSize;
     90                     numberOfTrailingCharacters = 0;
     91                 } else
     92                     break;
     93             } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
     94                 runEnd += 1;
     95                 numberOfTrailingCharacters += 1;
     96             } else
     97                 break;
     98         }
     99         return runEnd;
    100     }
    101 
    102     template<typename CharType>
    103     static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding& encoding)
    104     {
    105         // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
    106         // a valid escape sequence, but there may be characters between the sequences.
    107         Vector<char, 512> buffer;
    108         buffer.resize(runLength); // Unescaping hex sequences only makes the length smaller.
    109         char* p = buffer.data();
    110         const CharType* runEnd = run + runLength;
    111         while (run < runEnd) {
    112             if (run[0] == '%') {
    113                 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
    114                 run += sequenceSize;
    115             } else {
    116                 *p++ = run[0];
    117                 run += 1;
    118             }
    119         }
    120         ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
    121         return (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data());
    122     }
    123 };
    124 
    125 template<typename EscapeSequence>
    126 String decodeEscapeSequences(const String& string, const WTF::TextEncoding& encoding)
    127 {
    128     StringBuilder result;
    129     size_t length = string.length();
    130     size_t decodedPosition = 0;
    131     size_t searchPosition = 0;
    132     size_t encodedRunPosition;
    133     while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != kNotFound) {
    134         size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
    135         searchPosition = encodedRunEnd;
    136         if (encodedRunEnd == encodedRunPosition) {
    137             ++searchPosition;
    138             continue;
    139         }
    140 
    141         String decoded = string.is8Bit() ?
    142             EscapeSequence::decodeRun(string.characters8() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding) :
    143             EscapeSequence::decodeRun(string.characters16() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding);
    144 
    145         if (decoded.isEmpty())
    146             continue;
    147 
    148         result.append(string, decodedPosition, encodedRunPosition - decodedPosition);
    149         result.append(decoded);
    150         decodedPosition = encodedRunEnd;
    151     }
    152     result.append(string, decodedPosition, length - decodedPosition);
    153     return result.toString();
    154 }
    155 
    156 } // namespace blink
    157 
    158 #endif // DecodeEscapeSequences_h
    159