// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ICU integration functions.
31 32 #include <stdlib.h> 33 #include <string.h> 34 #include <unicode/ucnv.h> 35 #include <unicode/ucnv_cb.h> 36 #include <unicode/uidna.h> 37 38 #include "googleurl/src/url_canon_icu.h" 39 #include "googleurl/src/url_canon_internal.h" // for _itoa_s 40 41 #include "base/logging.h" 42 43 namespace url_canon { 44 45 namespace { 46 47 // Called when converting a character that can not be represented, this will 48 // append an escaped version of the numerical character reference for that code 49 // point. It is of the form "Ӓ" and we will escape the non-digits to 50 // "%26%231234%3B". Why? This is what Netscape did back in the olden days. 51 void appendURLEscapedChar(const void* context, 52 UConverterFromUnicodeArgs* from_args, 53 const UChar* code_units, 54 int32_t length, 55 UChar32 code_point, 56 UConverterCallbackReason reason, 57 UErrorCode* err) { 58 if (reason == UCNV_UNASSIGNED) { 59 *err = U_ZERO_ERROR; 60 61 const static int prefix_len = 6; 62 const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped 63 ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err); 64 65 DCHECK(code_point < 0x110000); 66 char number[8]; // Max Unicode code point is 7 digits. 67 _itoa_s(code_point, number, 10); 68 int number_len = static_cast<int>(strlen(number)); 69 ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err); 70 71 const static int postfix_len = 3; 72 const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped 73 ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err); 74 } 75 } 76 77 // A class for scoping the installation of the invalid character callback. 78 class AppendHandlerInstaller { 79 public: 80 // The owner of this object must ensure that the converter is alive for the 81 // duration of this object's lifetime. 
82 AppendHandlerInstaller(UConverter* converter) : converter_(converter) { 83 UErrorCode err = U_ZERO_ERROR; 84 ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0, 85 &old_callback_, &old_context_, &err); 86 } 87 88 ~AppendHandlerInstaller() { 89 UErrorCode err = U_ZERO_ERROR; 90 ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err); 91 } 92 93 private: 94 UConverter* converter_; 95 96 UConverterFromUCallback old_callback_; 97 const void* old_context_; 98 }; 99 100 } // namespace 101 102 ICUCharsetConverter::ICUCharsetConverter(UConverter* converter) 103 : converter_(converter) { 104 } 105 106 void ICUCharsetConverter::ConvertFromUTF16(const char16* input, 107 int input_len, 108 CanonOutput* output) { 109 // Install our error handler. It will be called for character that can not 110 // be represented in the destination character set. 111 AppendHandlerInstaller handler(converter_); 112 113 int begin_offset = output->length(); 114 int dest_capacity = output->capacity() - begin_offset; 115 output->set_length(output->length()); 116 117 do { 118 UErrorCode err = U_ZERO_ERROR; 119 char* dest = &output->data()[begin_offset]; 120 int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity, 121 input, input_len, &err); 122 if (err != U_BUFFER_OVERFLOW_ERROR) { 123 output->set_length(begin_offset + required_capacity); 124 return; 125 } 126 127 // Output didn't fit, expand 128 dest_capacity = required_capacity; 129 output->Resize(begin_offset + dest_capacity); 130 } while (true); 131 } 132 133 // Converts the Unicode input representing a hostname to ASCII using IDN rules. 134 // The output must be ASCII, but is represented as wide characters. 135 // 136 // On success, the output will be filled with the ASCII host name and it will 137 // return true. Unlike most other canonicalization functions, this assumes that 138 // the output is empty. 
The beginning of the host will be at offset 0, and 139 // the length of the output will be set to the length of the new host name. 140 // 141 // On error, this will return false. The output in this case is undefined. 142 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) { 143 DCHECK(output->length() == 0); // Output buffer is assumed empty. 144 while (true) { 145 // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate 146 // the spec (which do exist). This does not present any risk and is a 147 // little more future proof. 148 UErrorCode err = U_ZERO_ERROR; 149 int num_converted = uidna_IDNToASCII(src, src_len, output->data(), 150 output->capacity(), 151 UIDNA_ALLOW_UNASSIGNED, NULL, &err); 152 if (err == U_ZERO_ERROR) { 153 output->set_length(num_converted); 154 return true; 155 } 156 if (err != U_BUFFER_OVERFLOW_ERROR) 157 return false; // Unknown error, give up. 158 159 // Not enough room in our buffer, expand. 160 output->Resize(output->capacity() * 2); 161 } 162 } 163 164 bool ReadUTFChar(const char* str, int* begin, int length, 165 unsigned* code_point_out) { 166 int code_point; // Avoids warning when U8_NEXT writes -1 to it. 167 U8_NEXT(str, *begin, length, code_point); 168 *code_point_out = static_cast<unsigned>(code_point); 169 170 // The ICU macro above moves to the next char, we want to point to the last 171 // char consumed. 172 (*begin)--; 173 174 // Validate the decoded value. 175 if (U_IS_UNICODE_CHAR(code_point)) 176 return true; 177 *code_point_out = kUnicodeReplacementCharacter; 178 return false; 179 } 180 181 bool ReadUTFChar(const char16* str, int* begin, int length, 182 unsigned* code_point) { 183 if (U16_IS_SURROGATE(str[*begin])) { 184 if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || 185 !U16_IS_TRAIL(str[*begin + 1])) { 186 // Invalid surrogate pair. 187 *code_point = kUnicodeReplacementCharacter; 188 return false; 189 } else { 190 // Valid surrogate pair. 
191 *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); 192 (*begin)++; 193 } 194 } else { 195 // Not a surrogate, just one 16-bit word. 196 *code_point = str[*begin]; 197 } 198 199 if (U_IS_UNICODE_CHAR(*code_point)) 200 return true; 201 202 // Invalid code point. 203 *code_point = kUnicodeReplacementCharacter; 204 return false; 205 } 206 207 } // namespace url_canon 208