1 /* 2 * Copyright (C) 2011 Daniel Bates (dbates (at) intudata.com). All Rights Reserved. 3 * Copyright (c) 2012 Google, inc. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of Google Inc. nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30 #ifndef DecodeEscapeSequences_h 31 #define DecodeEscapeSequences_h 32 33 #include "wtf/ASCIICType.h" 34 #include "wtf/Assertions.h" 35 #include "wtf/text/StringBuilder.h" 36 #include "wtf/text/TextEncoding.h" 37 38 namespace WebCore { 39 40 // See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>. 41 struct Unicode16BitEscapeSequence { 42 enum { sequenceSize = 6 }; // e.g. %u26C4 43 static size_t findInString(const String& string, size_t startPosition) { return string.find("%u", startPosition); } 44 static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition) 45 { 46 size_t runEnd = startPosition; 47 while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u' 48 && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3]) 49 && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) { 50 runEnd += sequenceSize; 51 } 52 return runEnd; 53 } 54 55 template<typename CharType> 56 static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding&) 57 { 58 // Each %u-escape sequence represents a UTF-16 code unit. 59 // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>. 60 // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences 61 // without any intervening characters, so decode the run without additional checks. 62 size_t numberOfSequences = runLength / sequenceSize; 63 StringBuilder builder; 64 builder.reserveCapacity(numberOfSequences); 65 while (numberOfSequences--) { 66 UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]); 67 builder.append(codeUnit); 68 run += sequenceSize; 69 } 70 return builder.toString(); 71 } 72 }; 73 74 struct URLEscapeSequence { 75 enum { sequenceSize = 3 }; // e.g. %41 76 static size_t findInString(const String& string, size_t startPosition) { return string.find('%', startPosition); } 77 static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition) 78 { 79 // Make the simplifying assumption that supported encodings may have up to two unescaped characters 80 // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the 81 // decoder as part of the run. In other words, we end the run at the first value outside of the 82 // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid 83 // escape sequence. 84 size_t runEnd = startPosition; 85 int numberOfTrailingCharacters = 0; 86 while (runEnd < endPosition) { 87 if (string[runEnd] == '%') { 88 if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) { 89 runEnd += sequenceSize; 90 numberOfTrailingCharacters = 0; 91 } else 92 break; 93 } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) { 94 runEnd += 1; 95 numberOfTrailingCharacters += 1; 96 } else 97 break; 98 } 99 return runEnd; 100 } 101 102 template<typename CharType> 103 static String decodeRun(const CharType* run, size_t runLength, const WTF::TextEncoding& encoding) 104 { 105 // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces 106 // a valid escape sequence, but there may be characters between the sequences. 107 Vector<char, 512> buffer; 108 buffer.resize(runLength); // Unescaping hex sequences only makes the length smaller. 109 char* p = buffer.data(); 110 const CharType* runEnd = run + runLength; 111 while (run < runEnd) { 112 if (run[0] == '%') { 113 *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]); 114 run += sequenceSize; 115 } else { 116 *p++ = run[0]; 117 run += 1; 118 } 119 } 120 ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun. 121 return (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); 122 } 123 }; 124 125 template<typename EscapeSequence> 126 String decodeEscapeSequences(const String& string, const WTF::TextEncoding& encoding) 127 { 128 StringBuilder result; 129 size_t length = string.length(); 130 size_t decodedPosition = 0; 131 size_t searchPosition = 0; 132 size_t encodedRunPosition; 133 while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != kNotFound) { 134 size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length); 135 searchPosition = encodedRunEnd; 136 if (encodedRunEnd == encodedRunPosition) { 137 ++searchPosition; 138 continue; 139 } 140 141 String decoded = string.is8Bit() ? 142 EscapeSequence::decodeRun(string.characters8() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding) : 143 EscapeSequence::decodeRun(string.characters16() + encodedRunPosition, encodedRunEnd - encodedRunPosition, encoding); 144 145 if (decoded.isEmpty()) 146 continue; 147 148 result.append(string, decodedPosition, encodedRunPosition - decodedPosition); 149 result.append(decoded); 150 decodedPosition = encodedRunEnd; 151 } 152 result.append(string, decodedPosition, length - decodedPosition); 153 return result.toString(); 154 } 155 156 } // namespace WebCore 157 158 #endif // DecodeEscapeSequences_h 159