1 /* 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All rights reserved. 3 * Copyright (C) 2005 Alexey Proskuryakov. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "platform/text/UnicodeUtilities.h" 29 30 #include "wtf/text/StringBuffer.h" 31 #include "wtf/unicode/CharacterNames.h" 32 #include <unicode/unorm.h> 33 34 using namespace WTF::Unicode; 35 36 namespace WebCore { 37 38 enum VoicedSoundMarkType { 39 NoVoicedSoundMark, 40 VoicedSoundMark, 41 SemiVoicedSoundMark 42 }; 43 44 template <typename CharType> 45 static inline CharType foldQuoteMarkOrSoftHyphen(CharType c) 46 { 47 switch (static_cast<UChar>(c)) { 48 case hebrewPunctuationGershayim: 49 case leftDoubleQuotationMark: 50 case rightDoubleQuotationMark: 51 return '"'; 52 case hebrewPunctuationGeresh: 53 case leftSingleQuotationMark: 54 case rightSingleQuotationMark: 55 return '\''; 56 case softHyphen: 57 // Replace soft hyphen with an ignorable character so that their presence or absence will 58 // not affect string comparison. 59 return 0; 60 default: 61 return c; 62 } 63 } 64 65 void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) 66 { 67 for (size_t i = 0; i < length; ++i) 68 data[i] = foldQuoteMarkOrSoftHyphen(data[i]); 69 } 70 71 void foldQuoteMarksAndSoftHyphens(String& s) 72 { 73 s.replace(hebrewPunctuationGeresh, '\''); 74 s.replace(hebrewPunctuationGershayim, '"'); 75 s.replace(leftDoubleQuotationMark, '"'); 76 s.replace(leftSingleQuotationMark, '\''); 77 s.replace(rightDoubleQuotationMark, '"'); 78 s.replace(rightSingleQuotationMark, '\''); 79 // Replace soft hyphen with an ignorable character so that their presence or absence will 80 // not affect string comparison. 81 s.replace(softHyphen, 0); 82 } 83 84 static bool isNonLatin1Separator(UChar32 character) 85 { 86 ASSERT_ARG(character, character >= 256); 87 88 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); 89 } 90 91 bool isSeparator(UChar32 character) 92 { 93 static const bool latin1SeparatorTable[256] = { 94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . / 97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ? 98 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @ 99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _ 100 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ` 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~ 102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 105 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 110 }; 111 112 if (character < 256) 113 return latin1SeparatorTable[character]; 114 115 return isNonLatin1Separator(character); 116 } 117 118 // ICU's search ignores the distinction between small kana letters and ones 119 // that are not small, and also characters that differ only in the voicing 120 // marks when considering only primary collation strength differences. 121 // This is not helpful for end users, since these differences make words 122 // distinct, so for our purposes we need these to be considered. 123 // The Unicode folks do not think the collation algorithm should be 124 // changed. To work around this, we would like to tailor the ICU searcher, 125 // but we can't get that to work yet. So instead, we check for cases where 126 // these differences occur, and skip those matches. 127 128 // We refer to the above technique as the "kana workaround". The next few 129 // functions are helper functinos for the kana workaround. 130 131 bool isKanaLetter(UChar character) 132 { 133 // Hiragana letters. 134 if (character >= 0x3041 && character <= 0x3096) 135 return true; 136 137 // Katakana letters. 138 if (character >= 0x30A1 && character <= 0x30FA) 139 return true; 140 if (character >= 0x31F0 && character <= 0x31FF) 141 return true; 142 143 // Halfwidth katakana letters. 144 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) 145 return true; 146 147 return false; 148 } 149 150 bool isSmallKanaLetter(UChar character) 151 { 152 ASSERT(isKanaLetter(character)); 153 154 switch (character) { 155 case 0x3041: // HIRAGANA LETTER SMALL A 156 case 0x3043: // HIRAGANA LETTER SMALL I 157 case 0x3045: // HIRAGANA LETTER SMALL U 158 case 0x3047: // HIRAGANA LETTER SMALL E 159 case 0x3049: // HIRAGANA LETTER SMALL O 160 case 0x3063: // HIRAGANA LETTER SMALL TU 161 case 0x3083: // HIRAGANA LETTER SMALL YA 162 case 0x3085: // HIRAGANA LETTER SMALL YU 163 case 0x3087: // HIRAGANA LETTER SMALL YO 164 case 0x308E: // HIRAGANA LETTER SMALL WA 165 case 0x3095: // HIRAGANA LETTER SMALL KA 166 case 0x3096: // HIRAGANA LETTER SMALL KE 167 case 0x30A1: // KATAKANA LETTER SMALL A 168 case 0x30A3: // KATAKANA LETTER SMALL I 169 case 0x30A5: // KATAKANA LETTER SMALL U 170 case 0x30A7: // KATAKANA LETTER SMALL E 171 case 0x30A9: // KATAKANA LETTER SMALL O 172 case 0x30C3: // KATAKANA LETTER SMALL TU 173 case 0x30E3: // KATAKANA LETTER SMALL YA 174 case 0x30E5: // KATAKANA LETTER SMALL YU 175 case 0x30E7: // KATAKANA LETTER SMALL YO 176 case 0x30EE: // KATAKANA LETTER SMALL WA 177 case 0x30F5: // KATAKANA LETTER SMALL KA 178 case 0x30F6: // KATAKANA LETTER SMALL KE 179 case 0x31F0: // KATAKANA LETTER SMALL KU 180 case 0x31F1: // KATAKANA LETTER SMALL SI 181 case 0x31F2: // KATAKANA LETTER SMALL SU 182 case 0x31F3: // KATAKANA LETTER SMALL TO 183 case 0x31F4: // KATAKANA LETTER SMALL NU 184 case 0x31F5: // KATAKANA LETTER SMALL HA 185 case 0x31F6: // KATAKANA LETTER SMALL HI 186 case 0x31F7: // KATAKANA LETTER SMALL HU 187 case 0x31F8: // KATAKANA LETTER SMALL HE 188 case 0x31F9: // KATAKANA LETTER SMALL HO 189 case 0x31FA: // KATAKANA LETTER SMALL MU 190 case 0x31FB: // KATAKANA LETTER SMALL RA 191 case 0x31FC: // KATAKANA LETTER SMALL RI 192 case 0x31FD: // KATAKANA LETTER SMALL RU 193 case 0x31FE: // KATAKANA LETTER SMALL RE 194 case 0x31FF: // KATAKANA LETTER SMALL RO 195 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A 196 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I 197 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U 198 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E 199 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O 200 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA 201 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU 202 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO 203 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU 204 return true; 205 } 206 return false; 207 } 208 209 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) 210 { 211 ASSERT(isKanaLetter(character)); 212 213 switch (character) { 214 case 0x304C: // HIRAGANA LETTER GA 215 case 0x304E: // HIRAGANA LETTER GI 216 case 0x3050: // HIRAGANA LETTER GU 217 case 0x3052: // HIRAGANA LETTER GE 218 case 0x3054: // HIRAGANA LETTER GO 219 case 0x3056: // HIRAGANA LETTER ZA 220 case 0x3058: // HIRAGANA LETTER ZI 221 case 0x305A: // HIRAGANA LETTER ZU 222 case 0x305C: // HIRAGANA LETTER ZE 223 case 0x305E: // HIRAGANA LETTER ZO 224 case 0x3060: // HIRAGANA LETTER DA 225 case 0x3062: // HIRAGANA LETTER DI 226 case 0x3065: // HIRAGANA LETTER DU 227 case 0x3067: // HIRAGANA LETTER DE 228 case 0x3069: // HIRAGANA LETTER DO 229 case 0x3070: // HIRAGANA LETTER BA 230 case 0x3073: // HIRAGANA LETTER BI 231 case 0x3076: // HIRAGANA LETTER BU 232 case 0x3079: // HIRAGANA LETTER BE 233 case 0x307C: // HIRAGANA LETTER BO 234 case 0x3094: // HIRAGANA LETTER VU 235 case 0x30AC: // KATAKANA LETTER GA 236 case 0x30AE: // KATAKANA LETTER GI 237 case 0x30B0: // KATAKANA LETTER GU 238 case 0x30B2: // KATAKANA LETTER GE 239 case 0x30B4: // KATAKANA LETTER GO 240 case 0x30B6: // KATAKANA LETTER ZA 241 case 0x30B8: // KATAKANA LETTER ZI 242 case 0x30BA: // KATAKANA LETTER ZU 243 case 0x30BC: // KATAKANA LETTER ZE 244 case 0x30BE: // KATAKANA LETTER ZO 245 case 0x30C0: // KATAKANA LETTER DA 246 case 0x30C2: // KATAKANA LETTER DI 247 case 0x30C5: // KATAKANA LETTER DU 248 case 0x30C7: // KATAKANA LETTER DE 249 case 0x30C9: // KATAKANA LETTER DO 250 case 0x30D0: // KATAKANA LETTER BA 251 case 0x30D3: // KATAKANA LETTER BI 252 case 0x30D6: // KATAKANA LETTER BU 253 case 0x30D9: // KATAKANA LETTER BE 254 case 0x30DC: // KATAKANA LETTER BO 255 case 0x30F4: // KATAKANA LETTER VU 256 case 0x30F7: // KATAKANA LETTER VA 257 case 0x30F8: // KATAKANA LETTER VI 258 case 0x30F9: // KATAKANA LETTER VE 259 case 0x30FA: // KATAKANA LETTER VO 260 return VoicedSoundMark; 261 case 0x3071: // HIRAGANA LETTER PA 262 case 0x3074: // HIRAGANA LETTER PI 263 case 0x3077: // HIRAGANA LETTER PU 264 case 0x307A: // HIRAGANA LETTER PE 265 case 0x307D: // HIRAGANA LETTER PO 266 case 0x30D1: // KATAKANA LETTER PA 267 case 0x30D4: // KATAKANA LETTER PI 268 case 0x30D7: // KATAKANA LETTER PU 269 case 0x30DA: // KATAKANA LETTER PE 270 case 0x30DD: // KATAKANA LETTER PO 271 return SemiVoicedSoundMark; 272 } 273 return NoVoicedSoundMark; 274 } 275 276 static inline bool isCombiningVoicedSoundMark(UChar character) 277 { 278 switch (character) { 279 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK 280 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 281 return true; 282 } 283 return false; 284 } 285 286 bool containsKanaLetters(const String& pattern) 287 { 288 const unsigned length = pattern.length(); 289 for (unsigned i = 0; i < length; ++i) { 290 if (isKanaLetter(pattern[i])) 291 return true; 292 } 293 return false; 294 } 295 296 void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector<UChar>& buffer) 297 { 298 ASSERT(length); 299 300 buffer.resize(length); 301 302 UErrorCode status = U_ZERO_ERROR; 303 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status); 304 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR); 305 ASSERT(bufferSize); 306 307 buffer.resize(bufferSize); 308 309 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) 310 return; 311 312 status = U_ZERO_ERROR; 313 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status); 314 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); 315 } 316 317 // This function returns kNotFound if |first| and |second| contain different Kana letters. 318 // If |first| and |second| contain the same Kana letter 319 // then function returns offset in characters from |first|. 320 // Pointers to both strings increase simultaneously so so it is possible to use one offset value. 321 static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(const UChar* first, const UChar* firstEnd, const UChar* second, const UChar* secondEnd) 322 { 323 const UChar* start = first; 324 // Check for differences in the kana letter character itself. 325 if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second)) 326 return kNotFound; 327 if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second)) 328 return kNotFound; 329 ++first; 330 ++second; 331 332 // Check for differences in combining voiced sound marks found after the letter. 333 while (true) { 334 const bool secondIsNotSoundMark = second == secondEnd || !isCombiningVoicedSoundMark(*second); 335 if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) { 336 return secondIsNotSoundMark ? first - start : kNotFound; 337 } 338 if (secondIsNotSoundMark) 339 return kNotFound; 340 if (*first != *second) 341 return kNotFound; 342 ++first; 343 ++second; 344 } 345 } 346 347 bool checkOnlyKanaLettersInStrings(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength) 348 { 349 const UChar* a = firstData; 350 const UChar* aEnd = firstData + firstLength; 351 352 const UChar* b = secondData; 353 const UChar* bEnd = secondData + secondLength; 354 while (true) { 355 // Skip runs of non-kana-letter characters. This is necessary so we can 356 // correctly handle strings where the |firstData| and |secondData| have different-length 357 // runs of characters that match, while still double checking the correctness 358 // of matches of kana letters with other kana letters. 359 while (a != aEnd && !isKanaLetter(*a)) 360 ++a; 361 while (b != bEnd && !isKanaLetter(*b)) 362 ++b; 363 364 // If we reached the end of either the target or the match, we should have 365 // reached the end of both; both should have the same number of kana letters. 366 if (a == aEnd || b == bEnd) { 367 return a == aEnd && b == bEnd; 368 } 369 370 // Check that single Kana letters in |a| and |b| are the same. 371 const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); 372 if (offset == kNotFound) 373 return false; 374 375 // Update values of |a| and |b| after comparing. 376 a += offset; 377 b += offset; 378 } 379 } 380 381 bool checkKanaStringsEqual(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength) 382 { 383 const UChar* a = firstData; 384 const UChar* aEnd = firstData + firstLength; 385 386 const UChar* b = secondData; 387 const UChar* bEnd = secondData + secondLength; 388 while (true) { 389 // Check for non-kana-letter characters. 390 while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) { 391 if (*a++ != *b++) 392 return false; 393 } 394 395 // If we reached the end of either the target or the match, we should have 396 // reached the end of both; both should have the same number of kana letters. 397 if (a == aEnd || b == bEnd) { 398 return a == aEnd && b == bEnd; 399 } 400 401 if (isKanaLetter(*a) != isKanaLetter(*b)) 402 return false; 403 404 // Check that single Kana letters in |a| and |b| are the same. 405 const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); 406 if (offset == kNotFound) 407 return false; 408 409 // Update values of |a| and |b| after comparing. 410 a += offset; 411 b += offset; 412 } 413 } 414 415 } 416