1 /* $NetBSD: chartype.c,v 1.10 2011/08/16 16:25:15 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the NetBSD 18 * Foundation, Inc. and its contributors. 19 * 4. Neither the name of The NetBSD Foundation nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 * POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36 /* 37 * chartype.c: character classification and meta information 38 */ 39 #include "config.h" 40 #if !defined(lint) && !defined(SCCSID) 41 __RCSID("$NetBSD: chartype.c,v 1.10 2011/08/16 16:25:15 christos Exp $"); 42 #endif /* not lint && not SCCSID */ 43 #include "el.h" 44 #include <stdlib.h> 45 46 #define CT_BUFSIZ ((size_t)1024) 47 48 #ifdef WIDECHAR 49 protected void 50 ct_conv_buff_resize(ct_buffer_t *conv, size_t mincsize, size_t minwsize) 51 { 52 void *p; 53 if (mincsize > conv->csize) { 54 conv->csize = mincsize; 55 p = el_realloc(conv->cbuff, conv->csize * sizeof(*conv->cbuff)); 56 if (p == NULL) { 57 conv->csize = 0; 58 el_free(conv->cbuff); 59 conv->cbuff = NULL; 60 } else 61 conv->cbuff = p; 62 } 63 64 if (minwsize > conv->wsize) { 65 conv->wsize = minwsize; 66 p = el_realloc(conv->wbuff, conv->wsize * sizeof(*conv->wbuff)); 67 if (p == NULL) { 68 conv->wsize = 0; 69 el_free(conv->wbuff); 70 conv->wbuff = NULL; 71 } else 72 conv->wbuff = p; 73 } 74 } 75 76 77 public char * 78 ct_encode_string(const Char *s, ct_buffer_t *conv) 79 { 80 char *dst; 81 ssize_t used = 0; 82 83 if (!s) 84 return NULL; 85 if (!conv->cbuff) 86 ct_conv_buff_resize(conv, CT_BUFSIZ, (size_t)0); 87 if (!conv->cbuff) 88 return NULL; 89 90 dst = conv->cbuff; 91 while (*s) { 92 used = (ssize_t)(conv->csize - (size_t)(dst - conv->cbuff)); 93 if (used < 5) { 94 used = dst - conv->cbuff; 95 ct_conv_buff_resize(conv, conv->csize + CT_BUFSIZ, 96 (size_t)0); 97 if (!conv->cbuff) 98 return NULL; 99 dst = conv->cbuff + used; 100 } 101 used = ct_encode_char(dst, (size_t)5, *s); 102 if (used == -1) /* failed to encode, need more buffer space */ 103 abort(); 104 ++s; 105 dst += used; 106 } 107 *dst = '\0'; 108 return conv->cbuff; 109 } 110 111 public Char * 112 ct_decode_string(const char *s, ct_buffer_t *conv) 113 { 114 size_t len = 0; 115 116 if (!s) 117 return NULL; 118 if (!conv->wbuff) 119 ct_conv_buff_resize(conv, (size_t)0, CT_BUFSIZ); 120 if (!conv->wbuff) 121 return NULL; 122 123 len = ct_mbstowcs(NULL, s, (size_t)0); 124 if (len == (size_t)-1) 125 return NULL; 126 if (len > conv->wsize) 127 ct_conv_buff_resize(conv, (size_t)0, len + 1); 128 if (!conv->wbuff) 129 return NULL; 130 ct_mbstowcs(conv->wbuff, s, conv->wsize); 131 return conv->wbuff; 132 } 133 134 135 protected Char ** 136 ct_decode_argv(int argc, const char *argv[], ct_buffer_t *conv) 137 { 138 size_t bufspace; 139 int i; 140 Char *p; 141 Char **wargv; 142 ssize_t bytes; 143 144 /* Make sure we have enough space in the conversion buffer to store all 145 * the argv strings. */ 146 for (i = 0, bufspace = 0; i < argc; ++i) 147 bufspace += argv[i] ? strlen(argv[i]) + 1 : 0; 148 ct_conv_buff_resize(conv, (size_t)0, bufspace); 149 if (!conv->wsize) 150 return NULL; 151 152 wargv = el_malloc((size_t)argc * sizeof(*wargv)); 153 154 for (i = 0, p = conv->wbuff; i < argc; ++i) { 155 if (!argv[i]) { /* don't pass null pointers to mbstowcs */ 156 wargv[i] = NULL; 157 continue; 158 } else { 159 wargv[i] = p; 160 bytes = (ssize_t)mbstowcs(p, argv[i], bufspace); 161 } 162 if (bytes == -1) { 163 el_free(wargv); 164 return NULL; 165 } else 166 bytes++; /* include '\0' in the count */ 167 bufspace -= (size_t)bytes; 168 p += bytes; 169 } 170 171 return wargv; 172 } 173 174 175 protected size_t 176 ct_enc_width(Char c) 177 { 178 /* UTF-8 encoding specific values */ 179 if (c < 0x80) 180 return 1; 181 else if (c < 0x0800) 182 return 2; 183 else if (c < 0x10000) 184 return 3; 185 else if (c < 0x110000) 186 return 4; 187 else 188 return 0; /* not a valid codepoint */ 189 } 190 191 protected ssize_t 192 ct_encode_char(char *dst, size_t len, Char c) 193 { 194 ssize_t l = 0; 195 if (len < ct_enc_width(c)) 196 return -1; 197 l = ct_wctomb(dst, c); 198 199 if (l < 0) { 200 ct_wctomb_reset; 201 l = 0; 202 } 203 return l; 204 } 205 #endif 206 207 protected const Char * 208 ct_visual_string(const Char *s) 209 { 210 static Char *buff = NULL; 211 static size_t buffsize = 0; 212 void *p; 213 Char *dst; 214 ssize_t used = 0; 215 216 if (!s) 217 return NULL; 218 if (!buff) { 219 buffsize = CT_BUFSIZ; 220 buff = el_malloc(buffsize * sizeof(*buff)); 221 } 222 dst = buff; 223 while (*s) { 224 used = ct_visual_char(dst, buffsize - (size_t)(dst - buff), *s); 225 if (used == -1) { /* failed to encode, need more buffer space */ 226 used = dst - buff; 227 buffsize += CT_BUFSIZ; 228 p = el_realloc(buff, buffsize * sizeof(*buff)); 229 if (p == NULL) 230 goto out; 231 buff = p; 232 dst = buff + used; 233 /* don't increment s here - we want to retry it! */ 234 } 235 else 236 ++s; 237 dst += used; 238 } 239 if (dst >= (buff + buffsize)) { /* sigh */ 240 buffsize += 1; 241 p = el_realloc(buff, buffsize * sizeof(*buff)); 242 if (p == NULL) 243 goto out; 244 buff = p; 245 dst = buff + buffsize - 1; 246 } 247 *dst = 0; 248 return buff; 249 out: 250 el_free(buff); 251 buffsize = 0; 252 return NULL; 253 } 254 255 256 257 protected int 258 ct_visual_width(Char c) 259 { 260 int t = ct_chr_class(c); 261 switch (t) { 262 case CHTYPE_ASCIICTL: 263 return 2; /* ^@ ^? etc. */ 264 case CHTYPE_TAB: 265 return 1; /* Hmm, this really need to be handled outside! */ 266 case CHTYPE_NL: 267 return 0; /* Should this be 1 instead? */ 268 #ifdef WIDECHAR 269 case CHTYPE_PRINT: 270 return wcwidth(c); 271 case CHTYPE_NONPRINT: 272 if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */ 273 return 8; /* \U+12345 */ 274 else 275 return 7; /* \U+1234 */ 276 #else 277 case CHTYPE_PRINT: 278 return 1; 279 case CHTYPE_NONPRINT: 280 return 4; /* \123 */ 281 #endif 282 default: 283 return 0; /* should not happen */ 284 } 285 } 286 287 288 protected ssize_t 289 ct_visual_char(Char *dst, size_t len, Char c) 290 { 291 int t = ct_chr_class(c); 292 switch (t) { 293 case CHTYPE_TAB: 294 case CHTYPE_NL: 295 case CHTYPE_ASCIICTL: 296 if (len < 2) 297 return -1; /* insufficient space */ 298 *dst++ = '^'; 299 if (c == '\177') 300 *dst = '?'; /* DEL -> ^? */ 301 else 302 *dst = c | 0100; /* uncontrolify it */ 303 return 2; 304 case CHTYPE_PRINT: 305 if (len < 1) 306 return -1; /* insufficient space */ 307 *dst = c; 308 return 1; 309 case CHTYPE_NONPRINT: 310 /* we only use single-width glyphs for display, 311 * so this is right */ 312 if ((ssize_t)len < ct_visual_width(c)) 313 return -1; /* insufficient space */ 314 #ifdef WIDECHAR 315 *dst++ = '\\'; 316 *dst++ = 'U'; 317 *dst++ = '+'; 318 #define tohexdigit(v) "0123456789ABCDEF"[v] 319 if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */ 320 *dst++ = tohexdigit(((unsigned int) c >> 16) & 0xf); 321 *dst++ = tohexdigit(((unsigned int) c >> 12) & 0xf); 322 *dst++ = tohexdigit(((unsigned int) c >> 8) & 0xf); 323 *dst++ = tohexdigit(((unsigned int) c >> 4) & 0xf); 324 *dst = tohexdigit(((unsigned int) c ) & 0xf); 325 return c > 0xffff ? 8 : 7; 326 #else 327 *dst++ = '\\'; 328 #define tooctaldigit(v) ((v) + '0') 329 *dst++ = tooctaldigit(((unsigned int) c >> 6) & 0x7); 330 *dst++ = tooctaldigit(((unsigned int) c >> 3) & 0x7); 331 *dst++ = tooctaldigit(((unsigned int) c ) & 0x7); 332 #endif 333 /*FALLTHROUGH*/ 334 /* these two should be handled outside this function */ 335 default: /* we should never hit the default */ 336 return 0; 337 } 338 } 339 340 341 342 343 protected int 344 ct_chr_class(Char c) 345 { 346 if (c == '\t') 347 return CHTYPE_TAB; 348 else if (c == '\n') 349 return CHTYPE_NL; 350 else if (IsASCII(c) && Iscntrl(c)) 351 return CHTYPE_ASCIICTL; 352 else if (Isprint(c)) 353 return CHTYPE_PRINT; 354 else 355 return CHTYPE_NONPRINT; 356 } 357