1 /********************************************************************** 2 utf16_le.c - Oniguruma (regular expression library) 3 **********************************************************************/ 4 /*- 5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include "regenc.h" 31 32 static const int EncLen_UTF16[] = { 33 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 34 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 35 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 36 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 37 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 49 }; 50 51 static int 52 utf16le_code_to_mbclen(OnigCodePoint code) 53 { 54 return (code > 0xffff ? 4 : 2); 55 } 56 57 static int 58 utf16le_mbc_enc_len(const UChar* p) 59 { 60 return EncLen_UTF16[*(p+1)]; 61 } 62 63 static int 64 utf16le_is_mbc_newline(const UChar* p, const UChar* end) 65 { 66 if (p + 1 < end) { 67 if (*p == 0x0a && *(p+1) == 0x00) 68 return 1; 69 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS 70 if (( 71 #ifndef USE_CRNL_AS_LINE_TERMINATOR 72 *p == 0x0d || 73 #endif 74 *p == 0x85) && *(p+1) == 0x00) 75 return 1; 76 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)) 77 return 1; 78 #endif 79 } 80 return 0; 81 } 82 83 static OnigCodePoint 84 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) 85 { 86 OnigCodePoint code; 87 UChar c0 = *p; 88 UChar c1 = *(p+1); 89 90 if (UTF16_IS_SURROGATE_FIRST(c1)) { 91 code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16) 92 + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8) 93 + p[2]; 94 } 95 else { 96 code = c1 * 256 + p[0]; 97 } 98 return code; 99 } 100 101 static int 102 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf) 103 { 104 UChar* p = buf; 105 106 if (code > 0xffff) { 107 unsigned int plane, high; 108 109 plane = (code >> 16) - 1; 110 high = (code & 0xff00) >> 8; 111 112 *p++ = (UChar)(((plane & 0x03) << 6) + (high >> 2)); 113 *p++ = (UChar)((plane >> 2) + 0xd8); 114 *p++ = (UChar )(code & 0xff); 115 *p = (high & 0x03) + 0xdc; 116 return 4; 117 } 118 else { 119 *p++ = (UChar )(code & 0xff); 120 *p++ = (UChar )((code & 0xff00) >> 8); 121 return 2; 122 } 123 } 124 125 static int 126 utf16le_mbc_case_fold(OnigCaseFoldType flag, 127 const UChar** pp, const UChar* end, UChar* fold) 128 { 129 const UChar* p = *pp; 130 131 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) { 132 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 133 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 134 if (*p == 0x49) { 135 *fold++ = 0x31; 136 *fold = 0x01; 137 (*pp) += 2; 138 return 2; 139 } 140 } 141 #endif 142 143 *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 144 *fold = 0; 145 *pp += 2; 146 return 2; 147 } 148 else 149 return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE, flag, pp, end, 150 fold); 151 } 152 153 #if 0 154 static int 155 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, 156 const UChar* end) 157 { 158 const UChar* p = *pp; 159 160 (*pp) += EncLen_UTF16[*(p+1)]; 161 162 if (*(p+1) == 0) { 163 int c, v; 164 165 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 166 return TRUE; 167 } 168 169 c = *p; 170 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 171 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 172 if ((v | BIT_CTYPE_LOWER) != 0) { 173 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 174 if (c >= 0xaa && c <= 0xba) 175 return FALSE; 176 else 177 return TRUE; 178 } 179 return (v != 0 ? TRUE : FALSE); 180 } 181 182 return FALSE; 183 } 184 #endif 185 186 static UChar* 187 utf16le_left_adjust_char_head(const UChar* start, const UChar* s) 188 { 189 if (s <= start) return (UChar* )s; 190 191 if ((s - start) % 2 == 1) { 192 s--; 193 } 194 195 if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) 196 s -= 2; 197 198 return (UChar* )s; 199 } 200 201 static int 202 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, 203 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) 204 { 205 return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE, 206 flag, p, end, items); 207 } 208 209 OnigEncodingType OnigEncodingUTF16_LE = { 210 utf16le_mbc_enc_len, 211 "UTF-16LE", /* name */ 212 4, /* max byte length */ 213 2, /* min byte length */ 214 utf16le_is_mbc_newline, 215 utf16le_mbc_to_code, 216 utf16le_code_to_mbclen, 217 utf16le_code_to_mbc, 218 utf16le_mbc_case_fold, 219 onigenc_unicode_apply_all_case_fold, 220 utf16le_get_case_fold_codes_by_str, 221 onigenc_unicode_property_name_to_ctype, 222 onigenc_unicode_is_code_ctype, 223 onigenc_utf16_32_get_ctype_code_range, 224 utf16le_left_adjust_char_head, 225 onigenc_always_false_is_allowed_reverse_match 226 }; 227