Home | History | Annotate | Download | only in bionic
      1 /*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2002-2004 Tim J. Robbins
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <errno.h>
     30 #include <sys/param.h>
     31 #include <string.h>
     32 #include <wchar.h>
     33 #include <uchar.h>
     34 
     35 #include "private/bionic_mbstate.h"
     36 
     37 //
     38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
     39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
     40 // mbstate_t was only 4 bytes.
     41 //
     42 // The state is the UTF-8 sequence. We only support <= 4-bytes sequences so LP32
     43 // mbstate_t already has enough space (out of the 4 available bytes we only
     44 // need 3 since we should never need to store the entire sequence in the
     45 // intermediary state).
     46 //
     47 // The C standard leaves the conversion state undefined after a bad conversion.
     48 // To avoid unexpected failures due to the possible use of the internal private
     49 // state we always reset the conversion state when encountering illegal
     50 // sequences.
     51 //
     52 // We also implement the POSIX interface directly rather than being accessed via
     53 // function pointers.
     54 //
     55 
     56 int mbsinit(const mbstate_t* ps) {
     57   return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
     58 }
     59 
     60 size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
     61   static mbstate_t __private_state;
     62   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
     63 
     64   // Our wchar_t is UTF-32
     65   return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state);
     66 }
     67 
     68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
     69   static mbstate_t __private_state;
     70   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
     71   size_t i, o, r;
     72 
     73   if (dst == NULL) {
     74     /*
     75      * The fast path in the loop below is not safe if an ASCII
     76      * character appears as anything but the first byte of a
     77      * multibyte sequence. Check now to avoid doing it in the loop.
     78      */
     79     if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
     80         && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
     81       return reset_and_return_illegal(EILSEQ, state);
     82     }
     83     for (i = o = 0; i < nmc; i += r, o++) {
     84       if (static_cast<uint8_t>((*src)[i]) < 0x80) {
     85         // Fast path for plain ASCII characters.
     86         if ((*src)[i] == '\0') {
     87           *src = nullptr;
     88           return reset_and_return(o, state);
     89         }
     90         r = 1;
     91       } else {
     92         r = mbrtowc(NULL, *src + i, nmc - i, state);
     93         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
     94           return reset_and_return_illegal(EILSEQ, state);
     95         }
     96         if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
     97           return reset_and_return_illegal(EILSEQ, state);
     98         }
     99         if (r == 0) {
    100           *src = nullptr;
    101           return reset_and_return(o, state);
    102         }
    103       }
    104     }
    105     return reset_and_return(o, state);
    106   }
    107 
    108   /*
    109    * The fast path in the loop below is not safe if an ASCII
    110    * character appears as anything but the first byte of a
    111    * multibyte sequence. Check now to avoid doing it in the loop.
    112    */
    113   if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
    114       && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
    115     return reset_and_return_illegal(EILSEQ, state);
    116   }
    117   for (i = o = 0; i < nmc && o < len; i += r, o++) {
    118     if (static_cast<uint8_t>((*src)[i]) < 0x80) {
    119       // Fast path for plain ASCII characters.
    120       dst[o] = (*src)[i];
    121       r = 1;
    122       if ((*src)[i] == '\0') {
    123         *src = nullptr;
    124         return reset_and_return(o, state);
    125       }
    126     } else {
    127       r = mbrtowc(dst + o, *src + i, nmc - i, state);
    128       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
    129         *src += i;
    130         return reset_and_return_illegal(EILSEQ, state);
    131       }
    132       if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
    133         *src += nmc;
    134         return reset_and_return(EILSEQ, state);
    135       }
    136       if (r == 0) {
    137         *src = NULL;
    138         return reset_and_return(o, state);
    139       }
    140     }
    141   }
    142   *src += i;
    143   return reset_and_return(o, state);
    144 }
    145 
    146 size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
    147   return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
    148 }
    149 
    150 size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
    151   static mbstate_t __private_state;
    152   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
    153 
    154   // Our wchar_t is UTF-32
    155   return c32rtomb(s, static_cast<char32_t>(wc), state);
    156 }
    157 
    158 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
    159   static mbstate_t __private_state;
    160   mbstate_t* state = (ps == NULL) ? &__private_state : ps;
    161 
    162   if (!mbsinit(state)) {
    163     return reset_and_return_illegal(EILSEQ, state);
    164   }
    165 
    166   char buf[MB_LEN_MAX];
    167   size_t i, o, r;
    168   if (dst == NULL) {
    169     for (i = o = 0; i < nwc; i++, o += r) {
    170       wchar_t wc = (*src)[i];
    171       if (static_cast<uint32_t>(wc) < 0x80) {
    172         // Fast path for plain ASCII characters.
    173         if (wc == 0) {
    174           return o;
    175         }
    176         r = 1;
    177       } else {
    178         r = wcrtomb(buf, wc, state);
    179         if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
    180           return r;
    181         }
    182       }
    183     }
    184     return o;
    185   }
    186 
    187   for (i = o = 0; i < nwc && o < len; i++, o += r) {
    188     wchar_t wc = (*src)[i];
    189     if (static_cast<uint32_t>(wc) < 0x80) {
    190       // Fast path for plain ASCII characters.
    191       dst[o] = wc;
    192       if (wc == 0) {
    193         *src = NULL;
    194         return o;
    195       }
    196       r = 1;
    197     } else if (len - o >= sizeof(buf)) {
    198       // Enough space to translate in-place.
    199       r = wcrtomb(dst + o, wc, state);
    200       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
    201         *src += i;
    202         return r;
    203       }
    204     } else {
    205       // May not be enough space; use temp buffer.
    206       r = wcrtomb(buf, wc, state);
    207       if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
    208         *src += i;
    209         return r;
    210       }
    211       if (r > len - o) {
    212         break;
    213       }
    214       memcpy(dst + o, buf, r);
    215     }
    216   }
    217   *src += i;
    218   return o;
    219 }
    220 
    221 size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
    222   return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
    223 }
    224 
    225 int wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t) {
    226   return wcscoll(ws1, ws2);
    227 }
    228 
    229 size_t wcsxfrm_l(wchar_t *dest, const wchar_t *src, size_t n, locale_t) {
    230   return wcsxfrm(dest, src, n);
    231 }
    232 
    233 long long wcstoll_l(const wchar_t *nptr, wchar_t **endptr, int base,
    234                     locale_t) {
    235   return wcstoll(nptr, endptr, base);
    236 }
    237 
    238 unsigned long long wcstoull_l(const wchar_t *nptr, wchar_t **endptr,
    239                               int base, locale_t) {
    240   return wcstoull(nptr, endptr, base);
    241 }
    242 
    243 long double wcstold_l(const wchar_t *nptr, wchar_t **endptr, locale_t) {
    244   return wcstold(nptr, endptr);
    245 }
    246