1 /* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */ 2 3 /*- 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <errno.h> 30 #include <sys/param.h> 31 #include <string.h> 32 #include <wchar.h> 33 #include <uchar.h> 34 35 #include "private/bionic_mbstate.h" 36 37 // 38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where 40 // mbstate_t was only 4 bytes. 41 // 42 // The state is the UTF-8 sequence. We only support <= 4-bytes sequences so LP32 43 // mbstate_t already has enough space (out of the 4 available bytes we only 44 // need 3 since we should never need to store the entire sequence in the 45 // intermediary state). 46 // 47 // The C standard leaves the conversion state undefined after a bad conversion. 48 // To avoid unexpected failures due to the possible use of the internal private 49 // state we always reset the conversion state when encountering illegal 50 // sequences. 51 // 52 // We also implement the POSIX interface directly rather than being accessed via 53 // function pointers. 54 // 55 56 int mbsinit(const mbstate_t* ps) { 57 return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); 58 } 59 60 size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { 61 static mbstate_t __private_state; 62 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 63 64 // Our wchar_t is UTF-32. 65 return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state); 66 } 67 68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { 69 static mbstate_t __private_state; 70 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 71 size_t i, o, r; 72 73 // The fast paths in the loops below are not safe if an ASCII 74 // character appears as anything but the first byte of a 75 // multibyte sequence. Check now to avoid doing it in the loops. 76 if (nmc > 0 && mbstate_bytes_so_far(state) > 0 && static_cast<uint8_t>((*src)[0]) < 0x80) { 77 return mbstate_reset_and_return_illegal(EILSEQ, state); 78 } 79 80 // Measure only? 81 if (dst == NULL) { 82 for (i = o = 0; i < nmc; i += r, o++) { 83 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 84 // Fast path for plain ASCII characters. 85 if ((*src)[i] == '\0') { 86 return mbstate_reset_and_return(o, state); 87 } 88 r = 1; 89 } else { 90 r = mbrtowc(NULL, *src + i, nmc - i, state); 91 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 92 return mbstate_reset_and_return_illegal(EILSEQ, state); 93 } 94 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 95 return mbstate_reset_and_return_illegal(EILSEQ, state); 96 } 97 if (r == 0) { 98 return mbstate_reset_and_return(o, state); 99 } 100 } 101 } 102 return mbstate_reset_and_return(o, state); 103 } 104 105 // Actually convert, updating `dst` and `src`. 106 for (i = o = 0; i < nmc && o < len; i += r, o++) { 107 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 108 // Fast path for plain ASCII characters. 109 dst[o] = (*src)[i]; 110 r = 1; 111 if ((*src)[i] == '\0') { 112 *src = nullptr; 113 return mbstate_reset_and_return(o, state); 114 } 115 } else { 116 r = mbrtowc(dst + o, *src + i, nmc - i, state); 117 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 118 *src += i; 119 return mbstate_reset_and_return_illegal(EILSEQ, state); 120 } 121 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 122 *src += nmc; 123 return mbstate_reset_and_return_illegal(EILSEQ, state); 124 } 125 if (r == 0) { 126 *src = NULL; 127 return mbstate_reset_and_return(o, state); 128 } 129 } 130 } 131 *src += i; 132 return mbstate_reset_and_return(o, state); 133 } 134 135 size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) { 136 return mbsnrtowcs(dst, src, SIZE_MAX, len, ps); 137 } 138 139 size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { 140 static mbstate_t __private_state; 141 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 142 143 // Our wchar_t is UTF-32. 144 return c32rtomb(s, static_cast<char32_t>(wc), state); 145 } 146 147 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { 148 static mbstate_t __private_state; 149 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 150 151 if (!mbsinit(state)) { 152 return mbstate_reset_and_return_illegal(EILSEQ, state); 153 } 154 155 char buf[MB_LEN_MAX]; 156 size_t i, o, r; 157 if (dst == NULL) { 158 for (i = o = 0; i < nwc; i++, o += r) { 159 wchar_t wc = (*src)[i]; 160 if (static_cast<uint32_t>(wc) < 0x80) { 161 // Fast path for plain ASCII characters. 162 if (wc == 0) { 163 return o; 164 } 165 r = 1; 166 } else { 167 r = wcrtomb(buf, wc, state); 168 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 169 return r; 170 } 171 } 172 } 173 return o; 174 } 175 176 for (i = o = 0; i < nwc && o < len; i++, o += r) { 177 wchar_t wc = (*src)[i]; 178 if (static_cast<uint32_t>(wc) < 0x80) { 179 // Fast path for plain ASCII characters. 180 dst[o] = wc; 181 if (wc == 0) { 182 *src = NULL; 183 return o; 184 } 185 r = 1; 186 } else if (len - o >= sizeof(buf)) { 187 // Enough space to translate in-place. 188 r = wcrtomb(dst + o, wc, state); 189 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 190 *src += i; 191 return r; 192 } 193 } else { 194 // May not be enough space; use temp buffer. 195 r = wcrtomb(buf, wc, state); 196 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 197 *src += i; 198 return r; 199 } 200 if (r > len - o) { 201 break; 202 } 203 memcpy(dst + o, buf, r); 204 } 205 } 206 *src += i; 207 return o; 208 } 209 210 size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) { 211 return wcsnrtombs(dst, src, SIZE_MAX, len, ps); 212 } 213