Home | History | Annotate | Download | only in i18n
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2014, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "csrutf8.h"
     15 #include "csmatch.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 CharsetRecog_UTF8::~CharsetRecog_UTF8()
     20 {
     21     // nothing to do
     22 }
     23 
     24 const char *CharsetRecog_UTF8::getName() const
     25 {
     26     return "UTF-8";
     27 }
     28 
     29 UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
     30     bool hasBOM = FALSE;
     31     int32_t numValid = 0;
     32     int32_t numInvalid = 0;
     33     const uint8_t *inputBytes = input->fRawInput;
     34     int32_t i;
     35     int32_t trailBytes = 0;
     36     int32_t confidence;
     37 
     38     if (input->fRawLength >= 3 &&
     39         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
     40             hasBOM = TRUE;
     41     }
     42 
     43     // Scan for multi-byte sequences
     44     for (i=0; i < input->fRawLength; i += 1) {
     45         int32_t b = inputBytes[i];
     46 
     47         if ((b & 0x80) == 0) {
     48             continue;   // ASCII
     49         }
     50 
     51         // Hi bit on char found.  Figure out how long the sequence should be
     52         if ((b & 0x0E0) == 0x0C0) {
     53             trailBytes = 1;
     54         } else if ((b & 0x0F0) == 0x0E0) {
     55             trailBytes = 2;
     56         } else if ((b & 0x0F8) == 0xF0) {
     57             trailBytes = 3;
     58         } else {
     59             numInvalid += 1;
     60             continue;
     61         }
     62 
     63         // Verify that we've got the right number of trail bytes in the sequence
     64         for (;;) {
     65             i += 1;
     66 
     67             if (i >= input->fRawLength) {
     68                 break;
     69             }
     70 
     71             b = inputBytes[i];
     72 
     73             if ((b & 0xC0) != 0x080) {
     74                 numInvalid += 1;
     75                 break;
     76             }
     77 
     78             if (--trailBytes == 0) {
     79                 numValid += 1;
     80                 break;
     81             }
     82         }
     83 
     84     }
     85 
     86     // Cook up some sort of confidence score, based on presence of a BOM
     87     //    and the existence of valid and/or invalid multi-byte sequences.
     88     confidence = 0;
     89     if (hasBOM && numInvalid == 0) {
     90         confidence = 100;
     91     } else if (hasBOM && numValid > numInvalid*10) {
     92         confidence = 80;
     93     } else if (numValid > 3 && numInvalid == 0) {
     94         confidence = 100;
     95     } else if (numValid > 0 && numInvalid == 0) {
     96         confidence = 80;
     97     } else if (numValid == 0 && numInvalid == 0) {
     98         // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
     99         //              accepts ASCII with confidence = 10.
    100         confidence = 15;
    101     } else if (numValid > numInvalid*10) {
    102         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
    103         confidence = 25;
    104     }
    105 
    106     results->set(input, this, confidence);
    107     return (confidence > 0);
    108 }
    109 
    110 U_NAMESPACE_END
    111 #endif
    112