Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright  2012 Intel Corporation
      3  * Copyright  2014 Ran Benita <ran234 (at) gmail.com>
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9  * and/or sell copies of the Software, and to permit persons to whom the
     10  * Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     22  * DEALINGS IN THE SOFTWARE.
     23  *
     24  * Author: Rob Bradford <rob (at) linux.intel.com>
     25  */
     26 
     27 #include <stddef.h>
     28 #include <stdbool.h>
     29 #include <inttypes.h>
     30 
     31 #include "utf8.h"
     32 
     33 int
     34 utf32_to_utf8(uint32_t unichar, char *buffer)
     35 {
     36     int count, shift, length;
     37     uint8_t head;
     38 
     39     if (unichar <= 0x007f) {
     40         buffer[0] = unichar;
     41         buffer[1] = '\0';
     42         return 2;
     43     }
     44     else if (unichar <= 0x07FF) {
     45         length = 2;
     46         head = 0xc0;
     47     }
     48     else if (unichar <= 0xffff) {
     49         length = 3;
     50         head = 0xe0;
     51     }
     52     else if (unichar <= 0x1fffff) {
     53         length = 4;
     54         head = 0xf0;
     55     }
     56     else if (unichar <= 0x3ffffff) {
     57         length = 5;
     58         head = 0xf8;
     59     }
     60     else {
     61         length = 6;
     62         head = 0xfc;
     63     }
     64 
     65     for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
     66         buffer[count] = 0x80 | ((unichar >> shift) & 0x3f);
     67 
     68     buffer[0] = head | ((unichar >> shift) & 0x3f);
     69     buffer[length] = '\0';
     70 
     71     return length + 1;
     72 }
     73 
     74 bool
     75 is_valid_utf8(const char *ss, size_t len)
     76 {
     77     size_t i = 0;
     78     size_t tail_bytes = 0;
     79     const uint8_t *s = (const uint8_t *) ss;
     80 
     81     /* This beauty is from:
     82      *  The Unicode Standard Version 6.2 - Core Specification, Table 3.7
     83      *  http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G7404
     84      * We can optimize if needed. */
     85     while (i < len)
     86     {
     87         if (s[i] <= 0x7F) {
     88             tail_bytes = 0;
     89         }
     90         else if (s[i] >= 0xC2 && s[i] <= 0xDF) {
     91             tail_bytes = 1;
     92         }
     93         else if (s[i] == 0xE0) {
     94             i++;
     95             if (i >= len || !(s[i] >= 0xA0 && s[i] <= 0xBF))
     96                 return false;
     97             tail_bytes = 1;
     98         }
     99         else if (s[i] >= 0xE1 && s[i] <= 0xEC) {
    100             tail_bytes = 2;
    101         }
    102         else if (s[i] == 0xED) {
    103             i++;
    104             if (i >= len || !(s[i] >= 0x80 && s[i] <= 0x9F))
    105                 return false;
    106             tail_bytes = 1;
    107         }
    108         else if (s[i] >= 0xEE && s[i] <= 0xEF) {
    109             tail_bytes = 2;
    110         }
    111         else if (s[i] == 0xF0) {
    112             i++;
    113             if (i >= len || !(s[i] >= 0x90 && s[i] <= 0xBF))
    114                 return false;
    115             tail_bytes = 2;
    116         }
    117         else if (s[i] >= 0xF1 && s[i] <= 0xF3) {
    118             tail_bytes = 3;
    119         }
    120         else if (s[i] == 0xF4) {
    121             i++;
    122             if (i >= len || !(s[i] >= 0x80 && s[i] <= 0x8F))
    123                 return false;
    124             tail_bytes = 2;
    125         }
    126         else {
    127             return false;
    128         }
    129 
    130         i++;
    131 
    132         while (i < len && tail_bytes > 0 && s[i] >= 0x80 && s[i] <= 0xBF) {
    133             i++;
    134             tail_bytes--;
    135         }
    136 
    137         if (tail_bytes != 0)
    138             return false;
    139     }
    140 
    141     return true;
    142 }
    143