Home | History | Annotate | Download | only in icu
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utf_impl.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999sep13
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file provides implementation functions for macros in the utfXX.h
     19 *   that would otherwise be too long as macros.
     20 */
     21 
     22 #include "base/third_party/icu/icu_utf.h"
     23 
     24 namespace base_icu {
     25 
     26 // source/common/utf_impl.cpp
     27 
     28 static const UChar32
     29 utf8_errorValue[6]={
     30     // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
     31     // but without relying on the obsolete unicode/utf_old.h.
     32     0x15, 0x9f, 0xffff,
     33     0x10ffff
     34 };
     35 
     36 static UChar32
     37 errorValue(int32_t count, int8_t strict) {
     38     if(strict>=0) {
     39         return utf8_errorValue[count];
     40     } else if(strict==-3) {
     41         return 0xfffd;
     42     } else {
     43         return CBU_SENTINEL;
     44     }
     45 }
     46 
     47 /*
     48  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
     49  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
     50  *
     51  * U8_NEXT() supports NUL-terminated strings indicated via length<0.
     52  *
     53  * The "strict" parameter controls the error behavior:
     54  * <0  "Safe" behavior of U8_NEXT():
     55  *     -1: All illegal byte sequences yield U_SENTINEL=-1.
     56  *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
     57  *         Some implementations use this for roundtripping of
     58  *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
     59  *         contain unpaired surrogates.
     60  *     -3: All illegal byte sequences yield U+FFFD.
     61  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
     62  *     All illegal byte sequences yield a positive code point such that this
     63  *     result code point would be encoded with the same number of bytes as
     64  *     the illegal sequence.
     65  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
     66  *     Same as the obsolete "safe" behavior, but non-characters are also treated
     67  *     like illegal sequences.
     68  *
     69  * Note that a UBool is the same as an int8_t.
     70  */
     71 UChar32
     72 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
     73     // *pi is one after byte c.
     74     int32_t i=*pi;
     75     // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
     76     if(i==length || c>0xf4) {
     77         // end of string, or not a lead byte
     78     } else if(c>=0xf0) {
     79         // Test for 4-byte sequences first because
     80         // U8_NEXT() handles shorter valid sequences inline.
     81         uint8_t t1=s[i], t2, t3;
     82         c&=7;
     83         if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
     84                 ++i!=length && (t2=s[i]-0x80)<=0x3f &&
     85                 ++i!=length && (t3=s[i]-0x80)<=0x3f) {
     86             ++i;
     87             c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
     88             // strict: forbid non-characters like U+fffe
     89             if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
     90                 *pi=i;
     91                 return c;
     92             }
     93         }
     94     } else if(c>=0xe0) {
     95         c&=0xf;
     96         if(strict!=-2) {
     97             uint8_t t1=s[i], t2;
     98             if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
     99                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
    100                 ++i;
    101                 c=(c<<12)|((t1&0x3f)<<6)|t2;
    102                 // strict: forbid non-characters like U+fffe
    103                 if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
    104                     *pi=i;
    105                     return c;
    106                 }
    107             }
    108         } else {
    109             // strict=-2 -> lenient: allow surrogates
    110             uint8_t t1=s[i]-0x80, t2;
    111             if(t1<=0x3f && (c>0 || t1>=0x20) &&
    112                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
    113                 *pi=i+1;
    114                 return (c<<12)|(t1<<6)|t2;
    115             }
    116         }
    117     } else if(c>=0xc2) {
    118         uint8_t t1=s[i]-0x80;
    119         if(t1<=0x3f) {
    120             *pi=i+1;
    121             return ((c-0xc0)<<6)|t1;
    122         }
    123     }  // else 0x80<=c<0xc2 is not a lead byte
    124 
    125     /* error handling */
    126     c=errorValue(i-*pi, strict);
    127     *pi=i;
    128     return c;
    129 }
    130 
    131 }  // namespace base_icu
    132