Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2012, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "csrutf8.h"
     13 #include "csmatch.h"
     14 
     15 U_NAMESPACE_BEGIN
     16 
     17 CharsetRecog_UTF8::~CharsetRecog_UTF8()
     18 {
     19     // nothing to do
     20 }
     21 
     22 const char *CharsetRecog_UTF8::getName() const
     23 {
     24     return "UTF-8";
     25 }
     26 
     27 UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
     28     bool hasBOM = FALSE;
     29     int32_t numValid = 0;
     30     int32_t numInvalid = 0;
     31     const uint8_t *inputBytes = input->fRawInput;
     32     int32_t i;
     33     int32_t trailBytes = 0;
     34     int32_t confidence;
     35 
     36     if (input->fRawLength >= 3 &&
     37         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
     38             hasBOM = TRUE;
     39     }
     40 
     41     // Scan for multi-byte sequences
     42     for (i=0; i < input->fRawLength; i += 1) {
     43         int32_t b = inputBytes[i];
     44 
     45         if ((b & 0x80) == 0) {
     46             continue;   // ASCII
     47         }
     48 
     49         // Hi bit on char found.  Figure out how long the sequence should be
     50         if ((b & 0x0E0) == 0x0C0) {
     51             trailBytes = 1;
     52         } else if ((b & 0x0F0) == 0x0E0) {
     53             trailBytes = 2;
     54         } else if ((b & 0x0F8) == 0xF0) {
     55             trailBytes = 3;
     56         } else {
     57             numInvalid += 1;
     58 
     59             if (numInvalid > 5) {
     60                 break;
     61             }
     62 
     63             trailBytes = 0;
     64         }
     65 
     66         // Verify that we've got the right number of trail bytes in the sequence
     67         for (;;) {
     68             i += 1;
     69 
     70             if (i >= input->fRawLength) {
     71                 break;
     72             }
     73 
     74             b = inputBytes[i];
     75 
     76             if ((b & 0xC0) != 0x080) {
     77                 numInvalid += 1;
     78                 break;
     79             }
     80 
     81             if (--trailBytes == 0) {
     82                 numValid += 1;
     83                 break;
     84             }
     85         }
     86 
     87     }
     88 
     89     // Cook up some sort of confidence score, based on presense of a BOM
     90     //    and the existence of valid and/or invalid multi-byte sequences.
     91     confidence = 0;
     92     if (hasBOM && numInvalid == 0) {
     93         confidence = 100;
     94     } else if (hasBOM && numValid > numInvalid*10) {
     95         confidence = 80;
     96     } else if (numValid > 3 && numInvalid == 0) {
     97         confidence = 100;
     98     } else if (numValid > 0 && numInvalid == 0) {
     99         confidence = 80;
    100     } else if (numValid == 0 && numInvalid == 0) {
    101         // Plain ASCII.
    102         confidence = 10;
    103     } else if (numValid > numInvalid*10) {
    104         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
    105         confidence = 25;
    106     }
    107 
    108     results->set(input, this, confidence);
    109     return (confidence > 0);
    110 }
    111 
    112 U_NAMESPACE_END
    113 #endif
    114