Home | History | Annotate | Download | only in glib
      1 /*
      2  *  Copyright (C) 2008 Jürg Billeter <j (at) bitron.ch>
      3  *  Copyright (C) 2008 Dominik Röttsches <dominik.roettsches (at) access-company.com>
      4  *  Copyright (C) 2010 Igalia S.L.
      5  *
      6  *  This library is free software; you can redistribute it and/or
      7  *  modify it under the terms of the GNU Library General Public
      8  *  License as published by the Free Software Foundation; either
      9  *  version 2 of the License, or (at your option) any later version.
     10  *
     11  *  This library is distributed in the hope that it will be useful,
     12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14  *  Library General Public License for more details.
     15  *
     16  *  You should have received a copy of the GNU Library General Public License
     17  *  along with this library; see the file COPYING.LIB.  If not, write to
     18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     19  *  Boston, MA 02110-1301, USA.
     20  *
     21  */
     22 
     23 #include "config.h"
     24 #include "UnicodeGLib.h"
     25 
     26 #include <wtf/Vector.h>
     27 #include <wtf/unicode/UTF8.h>
     28 
     29 #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
     30 
     31 namespace WTF {
     32 namespace Unicode {
     33 
     34 UChar32 foldCase(UChar32 ch)
     35 {
     36     GOwnPtr<GError> gerror;
     37 
     38     GOwnPtr<char> utf8char;
     39     utf8char.set(g_ucs4_to_utf8(reinterpret_cast<gunichar*>(&ch), 1, 0, 0, &gerror.outPtr()));
     40     if (gerror)
     41         return ch;
     42 
     43     GOwnPtr<char> utf8caseFolded;
     44     utf8caseFolded.set(g_utf8_casefold(utf8char.get(), -1));
     45 
     46     GOwnPtr<gunichar> ucs4Result;
     47     ucs4Result.set(g_utf8_to_ucs4_fast(utf8caseFolded.get(), -1, 0));
     48 
     49     return *ucs4Result;
     50 }
     51 
     52 static int getUTF16LengthFromUTF8(const gchar* utf8String, int length)
     53 {
     54     int utf16Length = 0;
     55     const gchar* inputString = utf8String;
     56 
     57     while ((utf8String + length - inputString > 0) && *inputString) {
     58         gunichar character = g_utf8_get_char(inputString);
     59 
     60         utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1;
     61         inputString = g_utf8_next_char(inputString);
     62     }
     63 
     64     return utf16Length;
     65 }
     66 
     67 typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length);
     68 
     69 static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction)
     70 {
     71     *error = false;
     72 
     73     // Allocate a buffer big enough to hold all the characters.
     74     Vector<char> buffer(srcLength * 3);
     75     char* utf8Target = buffer.data();
     76     const UChar* utf16Source = src;
     77     ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true);
     78     if (conversionResult != conversionOK) {
     79         *error = true;
     80         return -1;
     81     }
     82     buffer.shrink(utf8Target - buffer.data());
     83 
     84     GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size()));
     85     long utf8ResultLength = strlen(utf8Result.get());
     86 
     87     // Calculate the destination buffer size.
     88     int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength);
     89     if (realLength > resultLength) {
     90         *error = true;
     91         return realLength;
     92     }
     93 
     94     // Convert the result to UTF-16.
     95     UChar* utf16Target = result;
     96     const char* utf8Source = utf8Result.get();
     97     conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true);
     98     long utf16ResultLength = utf16Target - result;
     99     if (conversionResult != conversionOK)
    100         *error = true;
    101 
    102     return utf16ResultLength <= 0 ? -1 : utf16ResultLength;
    103 }
    104 int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
    105 {
    106     return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold);
    107 }
    108 
    109 int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
    110 {
    111     return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown);
    112 }
    113 
    114 int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
    115 {
    116     return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup);
    117 }
    118 
    119 Direction direction(UChar32 c)
    120 {
    121     PangoBidiType type = pango_bidi_type_for_unichar(c);
    122     switch (type) {
    123     case PANGO_BIDI_TYPE_L:
    124         return LeftToRight;
    125     case PANGO_BIDI_TYPE_R:
    126         return RightToLeft;
    127     case PANGO_BIDI_TYPE_AL:
    128         return RightToLeftArabic;
    129     case PANGO_BIDI_TYPE_LRE:
    130         return LeftToRightEmbedding;
    131     case PANGO_BIDI_TYPE_RLE:
    132         return RightToLeftEmbedding;
    133     case PANGO_BIDI_TYPE_LRO:
    134         return LeftToRightOverride;
    135     case PANGO_BIDI_TYPE_RLO:
    136         return RightToLeftOverride;
    137     case PANGO_BIDI_TYPE_PDF:
    138         return PopDirectionalFormat;
    139     case PANGO_BIDI_TYPE_EN:
    140         return EuropeanNumber;
    141     case PANGO_BIDI_TYPE_AN:
    142         return ArabicNumber;
    143     case PANGO_BIDI_TYPE_ES:
    144         return EuropeanNumberSeparator;
    145     case PANGO_BIDI_TYPE_ET:
    146         return EuropeanNumberTerminator;
    147     case PANGO_BIDI_TYPE_CS:
    148         return CommonNumberSeparator;
    149     case PANGO_BIDI_TYPE_NSM:
    150         return NonSpacingMark;
    151     case PANGO_BIDI_TYPE_BN:
    152         return BoundaryNeutral;
    153     case PANGO_BIDI_TYPE_B:
    154         return BlockSeparator;
    155     case PANGO_BIDI_TYPE_S:
    156         return SegmentSeparator;
    157     case PANGO_BIDI_TYPE_WS:
    158         return WhiteSpaceNeutral;
    159     default:
    160         return OtherNeutral;
    161     }
    162 }
    163 
    164 int umemcasecmp(const UChar* a, const UChar* b, int len)
    165 {
    166     GOwnPtr<char> utf8a;
    167     GOwnPtr<char> utf8b;
    168 
    169     utf8a.set(g_utf16_to_utf8(a, len, 0, 0, 0));
    170     utf8b.set(g_utf16_to_utf8(b, len, 0, 0, 0));
    171 
    172     GOwnPtr<char> foldedA;
    173     GOwnPtr<char> foldedB;
    174 
    175     foldedA.set(g_utf8_casefold(utf8a.get(), -1));
    176     foldedB.set(g_utf8_casefold(utf8b.get(), -1));
    177 
    178     // FIXME: umemcasecmp needs to mimic u_memcasecmp of icu
    179     // from the ICU docs:
    180     // "Compare two strings case-insensitively using full case folding.
    181     // his is equivalent to u_strcmp(u_strFoldCase(s1, n, options), u_strFoldCase(s2, n, options))."
    182     //
    183     // So it looks like we don't need the full g_utf8_collate here,
    184     // but really a bitwise comparison of casefolded unicode chars (not utf-8 bytes).
    185     // As there is no direct equivalent to this icu function in GLib, for now
    186     // we'll use g_utf8_collate():
    187 
    188     return g_utf8_collate(foldedA.get(), foldedB.get());
    189 }
    190 
    191 }
    192 }
    193