Home | History | Annotate | Download | only in hb-ucdn
      1 /*
      2  * Copyright (C) 2012 Grigori Goronzy <greg (at) kinoho.net>
      3  *
      4  * Permission to use, copy, modify, and/or distribute this software for any
      5  * purpose with or without fee is hereby granted, provided that the above
      6  * copyright notice and this permission notice appear in all copies.
      7  *
      8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     15  */
     16 
     17 #include <stdio.h>
     18 #include <stdlib.h>
     19 #include <stdint.h>
     20 #include "ucdn.h"
     21 
     22 typedef struct {
     23     unsigned char category;
     24     unsigned char combining;
     25     unsigned char bidi_class;
     26     unsigned char mirrored;
     27     unsigned char east_asian_width;
     28     unsigned char normalization_check;
     29     unsigned char script;
     30 } UCDRecord;
     31 
     32 typedef struct {
     33     unsigned short from, to;
     34 } MirrorPair;
     35 
     36 typedef struct {
     37     int start;
     38     short count, index;
     39 } Reindex;
     40 
     41 #include "unicodedata_db.h"
     42 
     43 /* constants required for Hangul (de)composition */
     44 #define SBASE 0xAC00
     45 #define LBASE 0x1100
     46 #define VBASE 0x1161
     47 #define TBASE 0x11A7
     48 #define SCOUNT 11172
     49 #define LCOUNT 19
     50 #define VCOUNT 21
     51 #define TCOUNT 28
     52 #define NCOUNT (VCOUNT * TCOUNT)
     53 
     54 static const UCDRecord *get_ucd_record(uint32_t code)
     55 {
     56     int index, offset;
     57 
     58     if (code >= 0x110000)
     59         index = 0;
     60     else {
     61         index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
     62         offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
     63         index  = index1[index + offset] << SHIFT2;
     64         offset = code & ((1<<SHIFT2) - 1);
     65         index  = index2[index + offset];
     66     }
     67 
     68     return &ucd_records[index];
     69 }
     70 
     71 static const unsigned short *get_decomp_record(uint32_t code)
     72 {
     73     int index, offset;
     74 
     75     if (code >= 0x110000)
     76         index = 0;
     77     else {
     78         index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
     79             << DECOMP_SHIFT1;
     80         offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
     81         index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
     82         offset = code & ((1<<DECOMP_SHIFT2) - 1);
     83         index  = decomp_index2[index + offset];
     84     }
     85 
     86     return &decomp_data[index];
     87 }
     88 
     89 static int get_comp_index(uint32_t code, const Reindex *idx)
     90 {
     91     int i;
     92 
     93     for (i = 0; idx[i].start; i++) {
     94         const Reindex *cur = &idx[i];
     95         if (code < cur->start)
     96             return -1;
     97         if (code <= cur->start + cur->count) {
     98             return cur->index + (code - cur->start);
     99         }
    100     }
    101 
    102     return -1;
    103 }
    104 
    105 static int compare_mp(const void *a, const void *b)
    106 {
    107     MirrorPair *mpa = (MirrorPair *)a;
    108     MirrorPair *mpb = (MirrorPair *)b;
    109     return mpa->from - mpb->from;
    110 }
    111 
    112 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    113 {
    114     int si = code - SBASE;
    115 
    116     if (si < 0 || si >= SCOUNT)
    117         return 0;
    118 
    119     if (si % TCOUNT) {
    120         /* LV,T */
    121         *a = SBASE + (si / TCOUNT) * TCOUNT;
    122         *b = TBASE + (si % TCOUNT);
    123         return 3;
    124     } else {
    125         /* L,V */
    126         *a = LBASE + (si / NCOUNT);
    127         *b = VBASE + (si % NCOUNT) / TCOUNT;
    128         return 2;
    129     }
    130 }
    131 
    132 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
    133 {
    134     if (b < VBASE || b >= (TBASE + TCOUNT))
    135         return 0;
    136 
    137     if ((a < LBASE || a >= (LBASE + LCOUNT))
    138             && (a < SBASE || a >= (SBASE + SCOUNT)))
    139         return 0;
    140 
    141     if (a >= SBASE) {
    142         /* LV,T */
    143         *code = a + (b - TBASE);
    144         return 3;
    145     } else {
    146         /* L,V */
    147         int li = a - LBASE;
    148         int vi = b - VBASE;
    149         *code = SBASE + li * NCOUNT + vi * TCOUNT;
    150         return 2;
    151     }
    152 }
    153 
    154 static uint32_t decode_utf16(const unsigned short **code_ptr)
    155 {
    156     const unsigned short *code = *code_ptr;
    157 
    158     if ((code[0] & 0xd800) != 0xd800) {
    159         *code_ptr += 1;
    160         return (uint32_t)code[0];
    161     } else {
    162         *code_ptr += 2;
    163         return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
    164             (((uint32_t)code[0] - 0xd800) << 10);
    165     }
    166 }
    167 
    168 const char *ucdn_get_unicode_version(void)
    169 {
    170     return UNIDATA_VERSION;
    171 }
    172 
    173 int ucdn_get_combining_class(uint32_t code)
    174 {
    175     return get_ucd_record(code)->combining;
    176 }
    177 
    178 int ucdn_get_east_asian_width(uint32_t code)
    179 {
    180     return get_ucd_record(code)->east_asian_width;
    181 }
    182 
    183 int ucdn_get_general_category(uint32_t code)
    184 {
    185     return get_ucd_record(code)->category;
    186 }
    187 
    188 int ucdn_get_bidi_class(uint32_t code)
    189 {
    190     return get_ucd_record(code)->bidi_class;
    191 }
    192 
    193 int ucdn_get_mirrored(uint32_t code)
    194 {
    195     return get_ucd_record(code)->mirrored;
    196 }
    197 
    198 int ucdn_get_script(uint32_t code)
    199 {
    200     return get_ucd_record(code)->script;
    201 }
    202 
    203 uint32_t ucdn_mirror(uint32_t code)
    204 {
    205     MirrorPair mp = {0};
    206     MirrorPair *res;
    207 
    208     if (get_ucd_record(code)->mirrored == 0)
    209         return code;
    210 
    211     mp.from = code;
    212     res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
    213             compare_mp);
    214 
    215     if (res == NULL)
    216         return code;
    217     else
    218         return res->to;
    219 }
    220 
    221 int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    222 {
    223     const unsigned short *rec;
    224     int len;
    225 
    226     if (hangul_pair_decompose(code, a, b))
    227         return 1;
    228 
    229     rec = get_decomp_record(code);
    230     len = rec[0] >> 8;
    231 
    232     if ((rec[0] & 0xff) != 0 || len == 0)
    233         return 0;
    234 
    235     rec++;
    236     *a = decode_utf16(&rec);
    237     if (len > 1)
    238         *b = decode_utf16(&rec);
    239     else
    240         *b = 0;
    241 
    242     return 1;
    243 }
    244 
    245 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
    246 {
    247     int l, r, index, indexi, offset;
    248 
    249     if (hangul_pair_compose(code, a, b))
    250         return 1;
    251 
    252     l = get_comp_index(a, nfc_first);
    253     r = get_comp_index(b, nfc_last);
    254 
    255     if (l < 0 || r < 0)
    256         return 0;
    257 
    258     indexi = l * TOTAL_LAST + r;
    259     index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
    260     offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
    261     index  = comp_index1[index + offset] << COMP_SHIFT2;
    262     offset = indexi & ((1<<COMP_SHIFT2) - 1);
    263     *code  = comp_data[index + offset];
    264 
    265     return *code != 0;
    266 }
    267 
    268 int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
    269 {
    270     int i, len;
    271     const unsigned short *rec = get_decomp_record(code);
    272     len = rec[0] >> 8;
    273 
    274     if (len == 0)
    275         return 0;
    276 
    277     rec++;
    278     for (i = 0; i < len; i++)
    279         decomposed[i] = decode_utf16(&rec);
    280 
    281     return len;
    282 }
    283