Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2014, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "csrutf8.h"
     13 #include "csmatch.h"
     14 
     15 U_NAMESPACE_BEGIN
     16 
     17 CharsetRecog_UTF8::~CharsetRecog_UTF8()
     18 {
     19     // nothing to do
     20 }
     21 
     22 const char *CharsetRecog_UTF8::getName() const
     23 {
     24     return "UTF-8";
     25 }
     26 
     27 UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
     28     bool hasBOM = FALSE;
     29     int32_t numValid = 0;
     30     int32_t numInvalid = 0;
     31     const uint8_t *inputBytes = input->fRawInput;
     32     int32_t i;
     33     int32_t trailBytes = 0;
     34     int32_t confidence;
     35 
     36     if (input->fRawLength >= 3 &&
     37         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
     38             hasBOM = TRUE;
     39     }
     40 
     41     // Scan for multi-byte sequences
     42     for (i=0; i < input->fRawLength; i += 1) {
     43         int32_t b = inputBytes[i];
     44 
     45         if ((b & 0x80) == 0) {
     46             continue;   // ASCII
     47         }
     48 
     49         // Hi bit on char found.  Figure out how long the sequence should be
     50         if ((b & 0x0E0) == 0x0C0) {
     51             trailBytes = 1;
     52         } else if ((b & 0x0F0) == 0x0E0) {
     53             trailBytes = 2;
     54         } else if ((b & 0x0F8) == 0xF0) {
     55             trailBytes = 3;
     56         } else {
     57             numInvalid += 1;
     58             continue;
     59         }
     60 
     61         // Verify that we've got the right number of trail bytes in the sequence
     62         for (;;) {
     63             i += 1;
     64 
     65             if (i >= input->fRawLength) {
     66                 break;
     67             }
     68 
     69             b = inputBytes[i];
     70 
     71             if ((b & 0xC0) != 0x080) {
     72                 numInvalid += 1;
     73                 break;
     74             }
     75 
     76             if (--trailBytes == 0) {
     77                 numValid += 1;
     78                 break;
     79             }
     80         }
     81 
     82     }
     83 
     84     // Cook up some sort of confidence score, based on presence of a BOM
     85     //    and the existence of valid and/or invalid multi-byte sequences.
     86     confidence = 0;
     87     if (hasBOM && numInvalid == 0) {
     88         confidence = 100;
     89     } else if (hasBOM && numValid > numInvalid*10) {
     90         confidence = 80;
     91     } else if (numValid > 3 && numInvalid == 0) {
     92         confidence = 100;
     93     } else if (numValid > 0 && numInvalid == 0) {
     94         confidence = 80;
     95     } else if (numValid == 0 && numInvalid == 0) {
     96         // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
     97         //              accepts ASCII with confidence = 10.
     98         confidence = 15;
     99     } else if (numValid > numInvalid*10) {
    100         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
    101         confidence = 25;
    102     }
    103 
    104     results->set(input, this, confidence);
    105     return (confidence > 0);
    106 }
    107 
    108 U_NAMESPACE_END
    109 #endif
    110