1 /* 2 * Copyright (C) 2008 Jrg Billeter <j (at) bitron.ch> 3 * Copyright (C) 2008 Dominik Rttsches <dominik.roettsches (at) access-company.com> 4 * Copyright (C) 2010 Igalia S.L. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 * 21 */ 22 23 #include "config.h" 24 #include "UnicodeGLib.h" 25 26 #include <wtf/Vector.h> 27 #include <wtf/unicode/UTF8.h> 28 29 #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) 30 31 namespace WTF { 32 namespace Unicode { 33 34 UChar32 foldCase(UChar32 ch) 35 { 36 GOwnPtr<GError> gerror; 37 38 GOwnPtr<char> utf8char; 39 utf8char.set(g_ucs4_to_utf8(reinterpret_cast<gunichar*>(&ch), 1, 0, 0, &gerror.outPtr())); 40 if (gerror) 41 return ch; 42 43 GOwnPtr<char> utf8caseFolded; 44 utf8caseFolded.set(g_utf8_casefold(utf8char.get(), -1)); 45 46 GOwnPtr<gunichar> ucs4Result; 47 ucs4Result.set(g_utf8_to_ucs4_fast(utf8caseFolded.get(), -1, 0)); 48 49 return *ucs4Result; 50 } 51 52 static int getUTF16LengthFromUTF8(const gchar* utf8String, int length) 53 { 54 int utf16Length = 0; 55 const gchar* inputString = utf8String; 56 57 while ((utf8String + length - inputString > 0) && *inputString) { 58 gunichar character = g_utf8_get_char(inputString); 59 60 utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1; 61 inputString = g_utf8_next_char(inputString); 62 } 63 64 return utf16Length; 65 } 66 67 typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length); 68 69 static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction) 70 { 71 *error = false; 72 73 // Allocate a buffer big enough to hold all the characters. 74 Vector<char> buffer(srcLength * 3); 75 char* utf8Target = buffer.data(); 76 const UChar* utf16Source = src; 77 ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true); 78 if (conversionResult != conversionOK) { 79 *error = true; 80 return -1; 81 } 82 buffer.shrink(utf8Target - buffer.data()); 83 84 GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size())); 85 long utf8ResultLength = strlen(utf8Result.get()); 86 87 // Calculate the destination buffer size. 88 int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength); 89 if (realLength > resultLength) { 90 *error = true; 91 return realLength; 92 } 93 94 // Convert the result to UTF-16. 95 UChar* utf16Target = result; 96 const char* utf8Source = utf8Result.get(); 97 conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true); 98 long utf16ResultLength = utf16Target - result; 99 if (conversionResult != conversionOK) 100 *error = true; 101 102 return utf16ResultLength <= 0 ? -1 : utf16ResultLength; 103 } 104 int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) 105 { 106 return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold); 107 } 108 109 int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) 110 { 111 return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown); 112 } 113 114 int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error) 115 { 116 return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup); 117 } 118 119 Direction direction(UChar32 c) 120 { 121 PangoBidiType type = pango_bidi_type_for_unichar(c); 122 switch (type) { 123 case PANGO_BIDI_TYPE_L: 124 return LeftToRight; 125 case PANGO_BIDI_TYPE_R: 126 return RightToLeft; 127 case PANGO_BIDI_TYPE_AL: 128 return RightToLeftArabic; 129 case PANGO_BIDI_TYPE_LRE: 130 return LeftToRightEmbedding; 131 case PANGO_BIDI_TYPE_RLE: 132 return RightToLeftEmbedding; 133 case PANGO_BIDI_TYPE_LRO: 134 return LeftToRightOverride; 135 case PANGO_BIDI_TYPE_RLO: 136 return RightToLeftOverride; 137 case PANGO_BIDI_TYPE_PDF: 138 return PopDirectionalFormat; 139 case PANGO_BIDI_TYPE_EN: 140 return EuropeanNumber; 141 case PANGO_BIDI_TYPE_AN: 142 return ArabicNumber; 143 case PANGO_BIDI_TYPE_ES: 144 return EuropeanNumberSeparator; 145 case PANGO_BIDI_TYPE_ET: 146 return EuropeanNumberTerminator; 147 case PANGO_BIDI_TYPE_CS: 148 return CommonNumberSeparator; 149 case PANGO_BIDI_TYPE_NSM: 150 return NonSpacingMark; 151 case PANGO_BIDI_TYPE_BN: 152 return BoundaryNeutral; 153 case PANGO_BIDI_TYPE_B: 154 return BlockSeparator; 155 case PANGO_BIDI_TYPE_S: 156 return SegmentSeparator; 157 case PANGO_BIDI_TYPE_WS: 158 return WhiteSpaceNeutral; 159 default: 160 return OtherNeutral; 161 } 162 } 163 164 int umemcasecmp(const UChar* a, const UChar* b, int len) 165 { 166 GOwnPtr<char> utf8a; 167 GOwnPtr<char> utf8b; 168 169 utf8a.set(g_utf16_to_utf8(a, len, 0, 0, 0)); 170 utf8b.set(g_utf16_to_utf8(b, len, 0, 0, 0)); 171 172 GOwnPtr<char> foldedA; 173 GOwnPtr<char> foldedB; 174 175 foldedA.set(g_utf8_casefold(utf8a.get(), -1)); 176 foldedB.set(g_utf8_casefold(utf8b.get(), -1)); 177 178 // FIXME: umemcasecmp needs to mimic u_memcasecmp of icu 179 // from the ICU docs: 180 // "Compare two strings case-insensitively using full case folding. 181 // his is equivalent to u_strcmp(u_strFoldCase(s1, n, options), u_strFoldCase(s2, n, options))." 182 // 183 // So it looks like we don't need the full g_utf8_collate here, 184 // but really a bitwise comparison of casefolded unicode chars (not utf-8 bytes). 185 // As there is no direct equivalent to this icu function in GLib, for now 186 // we'll use g_utf8_collate(): 187 188 return g_utf8_collate(foldedA.get(), foldedB.get()); 189 } 190 191 } 192 } 193