Home | History | Annotate | Download | only in enc
      1 /**********************************************************************
      2   utf16_le.c -  Oniguruma (regular expression library)
      3 **********************************************************************/
      4 /*-
      5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
      6  * All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 #include "regenc.h"
     31 
     32 static const int EncLen_UTF16[] = {
     33   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     34   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     35   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     36   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     37   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     38   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     39   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     40   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     41   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     42   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     43   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     44   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     45   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     46   2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
     47   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     48   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
     49 };
     50 
     51 static int
     52 utf16le_code_to_mbclen(OnigCodePoint code)
     53 {
     54   return (code > 0xffff ? 4 : 2);
     55 }
     56 
     57 static int
     58 utf16le_mbc_enc_len(const UChar* p)
     59 {
     60   return EncLen_UTF16[*(p+1)];
     61 }
     62 
     63 static int
     64 utf16le_is_mbc_newline(const UChar* p, const UChar* end)
     65 {
     66   if (p + 1 < end) {
     67     if (*p == 0x0a && *(p+1) == 0x00)
     68       return 1;
     69 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
     70     if ((
     71 #ifndef USE_CRNL_AS_LINE_TERMINATOR
     72 	 *p == 0x0d ||
     73 #endif
     74 	 *p == 0x85) && *(p+1) == 0x00)
     75       return 1;
     76     if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
     77       return 1;
     78 #endif
     79   }
     80   return 0;
     81 }
     82 
     83 static OnigCodePoint
     84 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
     85 {
     86   OnigCodePoint code;
     87   UChar c0 = *p;
     88   UChar c1 = *(p+1);
     89 
     90   if (UTF16_IS_SURROGATE_FIRST(c1)) {
     91     code = ((((c1 - 0xd8) << 2) + ((c0  & 0xc0) >> 6) + 1) << 16)
     92          + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)
     93          + p[2];
     94   }
     95   else {
     96     code = c1 * 256 + p[0];
     97   }
     98   return code;
     99 }
    100 
    101 static int
    102 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)
    103 {
    104   UChar* p = buf;
    105 
    106   if (code > 0xffff) {
    107     unsigned int plane, high;
    108 
    109     plane = (code >> 16) - 1;
    110     high = (code & 0xff00) >> 8;
    111 
    112     *p++ = (UChar)(((plane & 0x03) << 6) + (high >> 2));
    113     *p++ = (UChar)((plane >> 2) + 0xd8);
    114     *p++ = (UChar )(code & 0xff);
    115     *p   = (high & 0x03) + 0xdc;
    116     return 4;
    117   }
    118   else {
    119     *p++ = (UChar )(code & 0xff);
    120     *p++ = (UChar )((code & 0xff00) >> 8);
    121     return 2;
    122   }
    123 }
    124 
    125 static int
    126 utf16le_mbc_case_fold(OnigCaseFoldType flag,
    127 		      const UChar** pp, const UChar* end, UChar* fold)
    128 {
    129   const UChar* p = *pp;
    130 
    131   if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
    132 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
    133     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
    134       if (*p == 0x49) {
    135 	*fold++ = 0x31;
    136 	*fold   = 0x01;
    137 	(*pp) += 2;
    138 	return 2;
    139       }
    140     }
    141 #endif
    142 
    143     *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
    144     *fold   = 0;
    145     *pp += 2;
    146     return 2;
    147   }
    148   else
    149     return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE, flag, pp, end,
    150 					 fold);
    151 }
    152 
    153 #if 0
    154 static int
    155 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
    156 			 const UChar* end)
    157 {
    158   const UChar* p = *pp;
    159 
    160   (*pp) += EncLen_UTF16[*(p+1)];
    161 
    162   if (*(p+1) == 0) {
    163     int c, v;
    164 
    165     if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    166       return TRUE;
    167     }
    168 
    169     c = *p;
    170     v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
    171                        (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    172     if ((v | BIT_CTYPE_LOWER) != 0) {
    173       /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    174       if (c >= 0xaa && c <= 0xba)
    175 	return FALSE;
    176       else
    177 	return TRUE;
    178     }
    179     return (v != 0 ? TRUE : FALSE);
    180   }
    181 
    182   return FALSE;
    183 }
    184 #endif
    185 
    186 static UChar*
    187 utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
    188 {
    189   if (s <= start) return (UChar* )s;
    190 
    191   if ((s - start) % 2 == 1) {
    192     s--;
    193   }
    194 
    195   if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
    196     s -= 2;
    197 
    198   return (UChar* )s;
    199 }
    200 
    201 static int
    202 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
    203     const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
    204 {
    205   return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE,
    206 						    flag, p, end, items);
    207 }
    208 
/*
 * Encoding descriptor for UTF-16LE (non-static, so visible to other
 * translation units).  The initializer is positional: it combines the
 * utf16le_* functions defined in this file with shared onigenc_*
 * helpers, so the order of entries must match the field order of
 * OnigEncodingType.
 */
OnigEncodingType OnigEncodingUTF16_LE = {
  utf16le_mbc_enc_len,
  "UTF-16LE",   /* name */
  4,            /* max byte length */
  2,            /* min byte length */
  utf16le_is_mbc_newline,
  utf16le_mbc_to_code,
  utf16le_code_to_mbclen,
  utf16le_code_to_mbc,
  utf16le_mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  utf16le_get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  onigenc_utf16_32_get_ctype_code_range,
  utf16le_left_adjust_char_head,
  onigenc_always_false_is_allowed_reverse_match
};
    227