Home | History | Annotate | Download | only in hb-ucdn
      1 /*
      2  * Copyright (C) 2012 Grigori Goronzy <greg (at) kinoho.net>
      3  *
      4  * Permission to use, copy, modify, and/or distribute this software for any
      5  * purpose with or without fee is hereby granted, provided that the above
      6  * copyright notice and this permission notice appear in all copies.
      7  *
      8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     15  */
     16 
     17 #include <stdio.h>
     18 #include <stdlib.h>
     19 #include <stdint.h>
     20 #include "ucdn.h"
     21 
     22 typedef struct {
     23     unsigned char category;
     24     unsigned char combining;
     25     unsigned char bidi_class;
     26     unsigned char mirrored;
     27     unsigned char east_asian_width;
     28     unsigned char script;
     29     unsigned char linebreak_class;
     30 } UCDRecord;
     31 
     32 typedef struct {
     33     unsigned short from, to;
     34 } MirrorPair;
     35 
     36 typedef struct {
     37   unsigned short from, to;
     38   unsigned char type;
     39 } BracketPair;
     40 
     41 typedef struct {
     42     unsigned int start;
     43     short count, index;
     44 } Reindex;
     45 
     46 #include "unicodedata_db.h"
     47 
     48 /* constants required for Hangul (de)composition */
     49 #define SBASE 0xAC00
     50 #define LBASE 0x1100
     51 #define VBASE 0x1161
     52 #define TBASE 0x11A7
     53 #define SCOUNT 11172
     54 #define LCOUNT 19
     55 #define VCOUNT 21
     56 #define TCOUNT 28
     57 #define NCOUNT (VCOUNT * TCOUNT)
     58 
     59 static const UCDRecord *get_ucd_record(uint32_t code)
     60 {
     61     int index, offset;
     62 
     63     if (code >= 0x110000)
     64         index = 0;
     65     else {
     66         index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
     67         offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
     68         index  = index1[index + offset] << SHIFT2;
     69         offset = code & ((1<<SHIFT2) - 1);
     70         index  = index2[index + offset];
     71     }
     72 
     73     return &ucd_records[index];
     74 }
     75 
     76 static const unsigned short *get_decomp_record(uint32_t code)
     77 {
     78     int index, offset;
     79 
     80     if (code >= 0x110000)
     81         index = 0;
     82     else {
     83         index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
     84             << DECOMP_SHIFT1;
     85         offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
     86         index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
     87         offset = code & ((1<<DECOMP_SHIFT2) - 1);
     88         index  = decomp_index2[index + offset];
     89     }
     90 
     91     return &decomp_data[index];
     92 }
     93 
     94 static int get_comp_index(uint32_t code, const Reindex *idx)
     95 {
     96     int i;
     97 
     98     for (i = 0; idx[i].start; i++) {
     99         const Reindex *cur = &idx[i];
    100         if (code < cur->start)
    101             return -1;
    102         if (code <= cur->start + cur->count) {
    103             return cur->index + (code - cur->start);
    104         }
    105     }
    106 
    107     return -1;
    108 }
    109 
    110 static int compare_mp(const void *a, const void *b)
    111 {
    112     MirrorPair *mpa = (MirrorPair *)a;
    113     MirrorPair *mpb = (MirrorPair *)b;
    114     return mpa->from - mpb->from;
    115 }
    116 
    117 static int compare_bp(const void *a, const void *b)
    118 {
    119     BracketPair *bpa = (BracketPair *)a;
    120     BracketPair *bpb = (BracketPair *)b;
    121     return bpa->from - bpb->from;
    122 }
    123 
    124 static BracketPair *search_bp(uint32_t code)
    125 {
    126     BracketPair bp = {0,0,2};
    127     BracketPair *res;
    128 
    129     bp.from = code;
    130     res = bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, sizeof(BracketPair),
    131             compare_bp);
    132     return res;
    133 }
    134 
    135 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    136 {
    137     int si = code - SBASE;
    138 
    139     if (si < 0 || si >= SCOUNT)
    140         return 0;
    141 
    142     if (si % TCOUNT) {
    143         /* LV,T */
    144         *a = SBASE + (si / TCOUNT) * TCOUNT;
    145         *b = TBASE + (si % TCOUNT);
    146         return 3;
    147     } else {
    148         /* L,V */
    149         *a = LBASE + (si / NCOUNT);
    150         *b = VBASE + (si % NCOUNT) / TCOUNT;
    151         return 2;
    152     }
    153 }
    154 
    155 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
    156 {
    157     if (b < VBASE || b >= (TBASE + TCOUNT))
    158         return 0;
    159 
    160     if ((a < LBASE || a >= (LBASE + LCOUNT))
    161             && (a < SBASE || a >= (SBASE + SCOUNT)))
    162         return 0;
    163 
    164     if (a >= SBASE) {
    165         /* LV,T */
    166         *code = a + (b - TBASE);
    167         return 3;
    168     } else {
    169         /* L,V */
    170         int li = a - LBASE;
    171         int vi = b - VBASE;
    172         *code = SBASE + li * NCOUNT + vi * TCOUNT;
    173         return 2;
    174     }
    175 }
    176 
    177 static uint32_t decode_utf16(const unsigned short **code_ptr)
    178 {
    179     const unsigned short *code = *code_ptr;
    180 
    181     if ((code[0] & 0xd800) != 0xd800) {
    182         *code_ptr += 1;
    183         return (uint32_t)code[0];
    184     } else {
    185         *code_ptr += 2;
    186         return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
    187             (((uint32_t)code[0] - 0xd800) << 10);
    188     }
    189 }
    190 
    191 const char *ucdn_get_unicode_version(void)
    192 {
    193     return UNIDATA_VERSION;
    194 }
    195 
    196 int ucdn_get_combining_class(uint32_t code)
    197 {
    198     return get_ucd_record(code)->combining;
    199 }
    200 
    201 int ucdn_get_east_asian_width(uint32_t code)
    202 {
    203     return get_ucd_record(code)->east_asian_width;
    204 }
    205 
    206 int ucdn_get_general_category(uint32_t code)
    207 {
    208     return get_ucd_record(code)->category;
    209 }
    210 
    211 int ucdn_get_bidi_class(uint32_t code)
    212 {
    213     return get_ucd_record(code)->bidi_class;
    214 }
    215 
    216 int ucdn_get_mirrored(uint32_t code)
    217 {
    218     return get_ucd_record(code)->mirrored;
    219 }
    220 
    221 int ucdn_get_script(uint32_t code)
    222 {
    223     return get_ucd_record(code)->script;
    224 }
    225 
    226 int ucdn_get_linebreak_class(uint32_t code)
    227 {
    228     return get_ucd_record(code)->linebreak_class;
    229 }
    230 
    231 int ucdn_get_resolved_linebreak_class(uint32_t code)
    232 {
    233     const UCDRecord *record = get_ucd_record(code);
    234 
    235     switch (record->linebreak_class)
    236     {
    237     case UCDN_LINEBREAK_CLASS_AI:
    238     case UCDN_LINEBREAK_CLASS_SG:
    239     case UCDN_LINEBREAK_CLASS_XX:
    240         return UCDN_LINEBREAK_CLASS_AL;
    241 
    242     case UCDN_LINEBREAK_CLASS_SA:
    243         if (record->category == UCDN_GENERAL_CATEGORY_MC ||
    244                 record->category == UCDN_GENERAL_CATEGORY_MN)
    245             return UCDN_LINEBREAK_CLASS_CM;
    246         return UCDN_LINEBREAK_CLASS_AL;
    247 
    248     case UCDN_LINEBREAK_CLASS_CJ:
    249         return UCDN_LINEBREAK_CLASS_NS;
    250 
    251     case UCDN_LINEBREAK_CLASS_CB:
    252         return UCDN_LINEBREAK_CLASS_B2;
    253 
    254     case UCDN_LINEBREAK_CLASS_NL:
    255         return UCDN_LINEBREAK_CLASS_BK;
    256 
    257     default:
    258         return record->linebreak_class;
    259     }
    260 }
    261 
    262 uint32_t ucdn_mirror(uint32_t code)
    263 {
    264     MirrorPair mp = {0};
    265     MirrorPair *res;
    266 
    267     if (get_ucd_record(code)->mirrored == 0)
    268         return code;
    269 
    270     mp.from = code;
    271     res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
    272             compare_mp);
    273 
    274     if (res == NULL)
    275         return code;
    276     else
    277         return res->to;
    278 }
    279 
    280 uint32_t ucdn_paired_bracket(uint32_t code)
    281 {
    282     BracketPair *res = search_bp(code);
    283     if (res == NULL)
    284         return code;
    285     else
    286         return res->to;
    287 }
    288 
    289 int ucdn_paired_bracket_type(uint32_t code)
    290 {
    291     BracketPair *res = search_bp(code);
    292     if (res == NULL)
    293         return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE;
    294     else
    295         return res->type;
    296 }
    297 
    298 int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
    299 {
    300     const unsigned short *rec;
    301     int len;
    302 
    303     if (hangul_pair_decompose(code, a, b))
    304         return 1;
    305 
    306     rec = get_decomp_record(code);
    307     len = rec[0] >> 8;
    308 
    309     if ((rec[0] & 0xff) != 0 || len == 0)
    310         return 0;
    311 
    312     rec++;
    313     *a = decode_utf16(&rec);
    314     if (len > 1)
    315         *b = decode_utf16(&rec);
    316     else
    317         *b = 0;
    318 
    319     return 1;
    320 }
    321 
    322 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
    323 {
    324     int l, r, index, indexi, offset;
    325 
    326     if (hangul_pair_compose(code, a, b))
    327         return 1;
    328 
    329     l = get_comp_index(a, nfc_first);
    330     r = get_comp_index(b, nfc_last);
    331 
    332     if (l < 0 || r < 0)
    333         return 0;
    334 
    335     indexi = l * TOTAL_LAST + r;
    336     index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
    337     offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
    338     index  = comp_index1[index + offset] << COMP_SHIFT2;
    339     offset = indexi & ((1<<COMP_SHIFT2) - 1);
    340     *code  = comp_data[index + offset];
    341 
    342     return *code != 0;
    343 }
    344 
    345 int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
    346 {
    347     int i, len;
    348     const unsigned short *rec = get_decomp_record(code);
    349     len = rec[0] >> 8;
    350 
    351     if (len == 0)
    352         return 0;
    353 
    354     rec++;
    355     for (i = 0; i < len; i++)
    356         decomposed[i] = decode_utf16(&rec);
    357 
    358     return len;
    359 }
    360