1 /* 2 * Copyright (C) 2012 Grigori Goronzy <greg (at) kinoho.net> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdio.h> 18 #include <stdlib.h> 19 #include "ucdn.h" 20 21 typedef struct { 22 unsigned char category; 23 unsigned char combining; 24 unsigned char bidi_class; 25 unsigned char mirrored; 26 unsigned char east_asian_width; 27 unsigned char normalization_check; 28 unsigned char script; 29 } UCDRecord; 30 31 typedef struct { 32 unsigned short from, to; 33 } MirrorPair; 34 35 typedef struct { 36 unsigned int start; 37 short count, index; 38 } Reindex; 39 40 #include "unicodedata_db.h" 41 42 /* constants required for Hangul (de)composition */ 43 #define SBASE 0xAC00 44 #define LBASE 0x1100 45 #define VBASE 0x1161 46 #define TBASE 0x11A7 47 #define SCOUNT 11172 48 #define LCOUNT 19 49 #define VCOUNT 21 50 #define TCOUNT 28 51 #define NCOUNT (VCOUNT * TCOUNT) 52 53 static const UCDRecord *get_ucd_record(uint32_t code) 54 { 55 int index, offset; 56 57 if (code >= 0x110000) 58 index = 0; 59 else { 60 index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; 61 offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); 62 index = index1[index + offset] << SHIFT2; 63 offset = code & ((1<<SHIFT2) - 1); 64 index = index2[index + offset]; 65 } 66 67 return &ucd_records[index]; 68 } 69 70 static const unsigned short *get_decomp_record(uint32_t code) 71 { 72 int index, offset; 73 74 if (code >= 0x110000) 75 index = 0; 76 else { 77 index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] 78 << DECOMP_SHIFT1; 79 offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); 80 index = decomp_index1[index + offset] << DECOMP_SHIFT2; 81 offset = code & ((1<<DECOMP_SHIFT2) - 1); 82 index = decomp_index2[index + offset]; 83 } 84 85 return &decomp_data[index]; 86 } 87 88 static int get_comp_index(uint32_t code, const Reindex *idx) 89 { 90 int i; 91 92 for (i = 0; idx[i].start; i++) { 93 const Reindex *cur = &idx[i]; 94 if (code < cur->start) 95 return -1; 96 if (code <= cur->start + cur->count) { 97 return cur->index + (code - cur->start); 98 } 99 } 100 101 return -1; 102 } 103 104 static int compare_mp(const void *a, const void *b) 105 { 106 MirrorPair *mpa = (MirrorPair *)a; 107 MirrorPair *mpb = (MirrorPair *)b; 108 return mpa->from - mpb->from; 109 } 110 111 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) 112 { 113 int si = code - SBASE; 114 115 if (si < 0 || si >= SCOUNT) 116 return 0; 117 118 if (si % TCOUNT) { 119 /* LV,T */ 120 *a = SBASE + (si / TCOUNT) * TCOUNT; 121 *b = TBASE + (si % TCOUNT); 122 return 3; 123 } else { 124 /* L,V */ 125 *a = LBASE + (si / NCOUNT); 126 *b = VBASE + (si % NCOUNT) / TCOUNT; 127 return 2; 128 } 129 } 130 131 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) 132 { 133 if (b < VBASE || b >= (TBASE + TCOUNT)) 134 return 0; 135 136 if ((a < LBASE || a >= (LBASE + LCOUNT)) 137 && (a < SBASE || a >= (SBASE + SCOUNT))) 138 return 0; 139 140 if (a >= SBASE) { 141 /* LV,T */ 142 *code = a + (b - TBASE); 143 return 3; 144 } else { 145 /* L,V */ 146 int li = a - LBASE; 147 int vi = b - VBASE; 148 *code = SBASE + li * NCOUNT + vi * TCOUNT; 149 return 2; 150 } 151 } 152 153 static uint32_t decode_utf16(const unsigned short **code_ptr) 154 { 155 const unsigned short *code = *code_ptr; 156 157 if ((code[0] & 0xd800) != 0xd800) { 158 *code_ptr += 1; 159 return (uint32_t)code[0]; 160 } else { 161 *code_ptr += 2; 162 return 0x10000 + ((uint32_t)code[1] - 0xdc00) + 163 (((uint32_t)code[0] - 0xd800) << 10); 164 } 165 } 166 167 const char *ucdn_get_unicode_version(void) 168 { 169 return UNIDATA_VERSION; 170 } 171 172 int ucdn_get_combining_class(uint32_t code) 173 { 174 return get_ucd_record(code)->combining; 175 } 176 177 int ucdn_get_east_asian_width(uint32_t code) 178 { 179 return get_ucd_record(code)->east_asian_width; 180 } 181 182 int ucdn_get_general_category(uint32_t code) 183 { 184 return get_ucd_record(code)->category; 185 } 186 187 int ucdn_get_bidi_class(uint32_t code) 188 { 189 return get_ucd_record(code)->bidi_class; 190 } 191 192 int ucdn_get_mirrored(uint32_t code) 193 { 194 return get_ucd_record(code)->mirrored; 195 } 196 197 int ucdn_get_script(uint32_t code) 198 { 199 return get_ucd_record(code)->script; 200 } 201 202 uint32_t ucdn_mirror(uint32_t code) 203 { 204 MirrorPair mp = {0}; 205 MirrorPair *res; 206 207 if (get_ucd_record(code)->mirrored == 0) 208 return code; 209 210 mp.from = code; 211 res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair), 212 compare_mp); 213 214 if (res == NULL) 215 return code; 216 else 217 return res->to; 218 } 219 220 int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b) 221 { 222 const unsigned short *rec; 223 int len; 224 225 if (hangul_pair_decompose(code, a, b)) 226 return 1; 227 228 rec = get_decomp_record(code); 229 len = rec[0] >> 8; 230 231 if ((rec[0] & 0xff) != 0 || len == 0) 232 return 0; 233 234 rec++; 235 *a = decode_utf16(&rec); 236 if (len > 1) 237 *b = decode_utf16(&rec); 238 else 239 *b = 0; 240 241 return 1; 242 } 243 244 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) 245 { 246 int l, r, index, indexi, offset; 247 248 if (hangul_pair_compose(code, a, b)) 249 return 1; 250 251 l = get_comp_index(a, nfc_first); 252 r = get_comp_index(b, nfc_last); 253 254 if (l < 0 || r < 0) 255 return 0; 256 257 indexi = l * TOTAL_LAST + r; 258 index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; 259 offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); 260 index = comp_index1[index + offset] << COMP_SHIFT2; 261 offset = indexi & ((1<<COMP_SHIFT2) - 1); 262 *code = comp_data[index + offset]; 263 264 return *code != 0; 265 } 266 267 int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed) 268 { 269 int i, len; 270 const unsigned short *rec = get_decomp_record(code); 271 len = rec[0] >> 8; 272 273 if (len == 0) 274 return 0; 275 276 rec++; 277 for (i = 0; i < len; i++) 278 decomposed[i] = decode_utf16(&rec); 279 280 return len; 281 } 282