Home | History | Annotate | Download | only in hb-ucdn
      1 /*
      2  * Copyright (C) 2012 Grigori Goronzy <greg (at) kinoho.net>
      3  *
      4  * Permission to use, copy, modify, and/or distribute this software for any
      5  * purpose with or without fee is hereby granted, provided that the above
      6  * copyright notice and this permission notice appear in all copies.
      7  *
      8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     15  */
     16 
     17 #include <stdio.h>
     18 #include <stdlib.h>
     19 #include "ucdn.h"
     20 
      21 typedef struct { /* per-code-point properties; get_ucd_record() returns a pointer to one of these from ucd_records */
      22     unsigned char category; /* value returned by ucdn_get_general_category() */
      23     unsigned char combining; /* value returned by ucdn_get_combining_class() */
      24     unsigned char bidi_class; /* value returned by ucdn_get_bidi_class() */
      25     unsigned char mirrored; /* non-zero lets ucdn_mirror() search mirror_pairs; returned by ucdn_get_mirrored() */
      26     unsigned char east_asian_width; /* value returned by ucdn_get_east_asian_width() */
      27     unsigned char normalization_check; /* not read by any function in this file */
      28     unsigned char script; /* value returned by ucdn_get_script() */
      29 } UCDRecord;
     30 
     31 typedef struct {
     32     unsigned short from, to;
     33 } MirrorPair;
     34 
     35 typedef struct {
     36     unsigned int start;
     37     short count, index;
     38 } Reindex;
     39 
     40 #include "unicodedata_db.h"
     41 
      42 /* Constants for the arithmetic Hangul syllable (de)composition done by hangul_pair_decompose() and hangul_pair_compose(). */
      43 #define SBASE 0xAC00 /* first precomposed syllable; syllable index = code - SBASE */
      44 #define LBASE 0x1100 /* first leading consonant (L) */
      45 #define VBASE 0x1161 /* first vowel (V) */
      46 #define TBASE 0x11A7 /* base for trailing consonants (T); offset 0 stands for "no trailing consonant" */
      47 #define SCOUNT 11172 /* number of precomposed syllables (LCOUNT * NCOUNT) */
      48 #define LCOUNT 19 /* number of leading consonants */
      49 #define VCOUNT 21 /* number of vowels */
      50 #define TCOUNT 28 /* number of trailing-consonant slots, including the empty one */
      51 #define NCOUNT (VCOUNT * TCOUNT) /* syllables per leading consonant */
     52 
     53 static const UCDRecord *get_ucd_record(uint32_t code)
     54 {
     55     int index, offset;
     56 
     57     if (code >= 0x110000)
     58         index = 0;
     59     else {
     60         index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
     61         offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
     62         index  = index1[index + offset] << SHIFT2;
     63         offset = code & ((1<<SHIFT2) - 1);
     64         index  = index2[index + offset];
     65     }
     66 
     67     return &ucd_records[index];
     68 }
     69 
     70 static const unsigned short *get_decomp_record(uint32_t code)
     71 {
     72     int index, offset;
     73 
     74     if (code >= 0x110000)
     75         index = 0;
     76     else {
     77         index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
     78             << DECOMP_SHIFT1;
     79         offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
     80         index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
     81         offset = code & ((1<<DECOMP_SHIFT2) - 1);
     82         index  = decomp_index2[index + offset];
     83     }
     84 
     85     return &decomp_data[index];
     86 }
     87 
     88 static int get_comp_index(uint32_t code, const Reindex *idx)
     89 {
     90     int i;
     91 
     92     for (i = 0; idx[i].start; i++) {
     93         const Reindex *cur = &idx[i];
     94         if (code < cur->start)
     95             return -1;
     96         if (code <= cur->start + cur->count) {
     97             return cur->index + (code - cur->start);
     98         }
     99     }
    100 
    101     return -1;
    102 }
    103 
    104 static int compare_mp(const void *a, const void *b)
    105 {
    106     MirrorPair *mpa = (MirrorPair *)a;
    107     MirrorPair *mpb = (MirrorPair *)b;
    108     return mpa->from - mpb->from;
    109 }
    110 
    111 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    112 {
    113     int si = code - SBASE;
    114 
    115     if (si < 0 || si >= SCOUNT)
    116         return 0;
    117 
    118     if (si % TCOUNT) {
    119         /* LV,T */
    120         *a = SBASE + (si / TCOUNT) * TCOUNT;
    121         *b = TBASE + (si % TCOUNT);
    122         return 3;
    123     } else {
    124         /* L,V */
    125         *a = LBASE + (si / NCOUNT);
    126         *b = VBASE + (si % NCOUNT) / TCOUNT;
    127         return 2;
    128     }
    129 }
    130 
    131 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
    132 {
    133     if (b < VBASE || b >= (TBASE + TCOUNT))
    134         return 0;
    135 
    136     if ((a < LBASE || a >= (LBASE + LCOUNT))
    137             && (a < SBASE || a >= (SBASE + SCOUNT)))
    138         return 0;
    139 
    140     if (a >= SBASE) {
    141         /* LV,T */
    142         *code = a + (b - TBASE);
    143         return 3;
    144     } else {
    145         /* L,V */
    146         int li = a - LBASE;
    147         int vi = b - VBASE;
    148         *code = SBASE + li * NCOUNT + vi * TCOUNT;
    149         return 2;
    150     }
    151 }
    152 
    153 static uint32_t decode_utf16(const unsigned short **code_ptr)
    154 {
    155     const unsigned short *code = *code_ptr;
    156 
    157     if ((code[0] & 0xd800) != 0xd800) {
    158         *code_ptr += 1;
    159         return (uint32_t)code[0];
    160     } else {
    161         *code_ptr += 2;
    162         return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
    163             (((uint32_t)code[0] - 0xd800) << 10);
    164     }
    165 }
    166 
    167 const char *ucdn_get_unicode_version(void)
    168 {
    169     return UNIDATA_VERSION;
    170 }
    171 
    172 int ucdn_get_combining_class(uint32_t code)
    173 {
    174     return get_ucd_record(code)->combining;
    175 }
    176 
    177 int ucdn_get_east_asian_width(uint32_t code)
    178 {
    179     return get_ucd_record(code)->east_asian_width;
    180 }
    181 
    182 int ucdn_get_general_category(uint32_t code)
    183 {
    184     return get_ucd_record(code)->category;
    185 }
    186 
    187 int ucdn_get_bidi_class(uint32_t code)
    188 {
    189     return get_ucd_record(code)->bidi_class;
    190 }
    191 
    192 int ucdn_get_mirrored(uint32_t code)
    193 {
    194     return get_ucd_record(code)->mirrored;
    195 }
    196 
    197 int ucdn_get_script(uint32_t code)
    198 {
    199     return get_ucd_record(code)->script;
    200 }
    201 
    202 uint32_t ucdn_mirror(uint32_t code)
    203 {
    204     MirrorPair mp = {0};
    205     MirrorPair *res;
    206 
    207     if (get_ucd_record(code)->mirrored == 0)
    208         return code;
    209 
    210     mp.from = code;
    211     res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
    212             compare_mp);
    213 
    214     if (res == NULL)
    215         return code;
    216     else
    217         return res->to;
    218 }
    219 
    220 int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    221 {
    222     const unsigned short *rec;
    223     int len;
    224 
    225     if (hangul_pair_decompose(code, a, b))
    226         return 1;
    227 
    228     rec = get_decomp_record(code);
    229     len = rec[0] >> 8;
    230 
    231     if ((rec[0] & 0xff) != 0 || len == 0)
    232         return 0;
    233 
    234     rec++;
    235     *a = decode_utf16(&rec);
    236     if (len > 1)
    237         *b = decode_utf16(&rec);
    238     else
    239         *b = 0;
    240 
    241     return 1;
    242 }
    243 
    244 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
    245 {
    246     int l, r, index, indexi, offset;
    247 
    248     if (hangul_pair_compose(code, a, b))
    249         return 1;
    250 
    251     l = get_comp_index(a, nfc_first);
    252     r = get_comp_index(b, nfc_last);
    253 
    254     if (l < 0 || r < 0)
    255         return 0;
    256 
    257     indexi = l * TOTAL_LAST + r;
    258     index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
    259     offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
    260     index  = comp_index1[index + offset] << COMP_SHIFT2;
    261     offset = indexi & ((1<<COMP_SHIFT2) - 1);
    262     *code  = comp_data[index + offset];
    263 
    264     return *code != 0;
    265 }
    266 
    267 int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
    268 {
    269     int i, len;
    270     const unsigned short *rec = get_decomp_record(code);
    271     len = rec[0] >> 8;
    272 
    273     if (len == 0)
    274         return 0;
    275 
    276     rec++;
    277     for (i = 0; i < len; i++)
    278         decomposed[i] = decode_utf16(&rec);
    279 
    280     return len;
    281 }
    282