Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2008, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "csrutf8.h"
     13 
     14 U_NAMESPACE_BEGIN
     15 
     16 CharsetRecog_UTF8::~CharsetRecog_UTF8()
     17 {
     18     // nothing to do
     19 }
     20 
     21 const char *CharsetRecog_UTF8::getName() const
     22 {
     23     return "UTF-8";
     24 }
     25 
     26 int32_t CharsetRecog_UTF8::match(InputText* det) {
     27     bool hasBOM = FALSE;
     28     int32_t numValid = 0;
     29     int32_t numInvalid = 0;
     30     const uint8_t *input = det->fRawInput;
     31     int32_t i;
     32     int32_t trailBytes = 0;
     33     int32_t confidence;
     34 
     35     if (det->fRawLength >= 3 &&
     36         input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
     37             hasBOM = TRUE;
     38     }
     39 
     40     // Scan for multi-byte sequences
     41     for (i=0; i < det->fRawLength; i += 1) {
     42         int32_t b = input[i];
     43 
     44         if ((b & 0x80) == 0) {
     45             continue;   // ASCII
     46         }
     47 
     48         // Hi bit on char found.  Figure out how long the sequence should be
     49         if ((b & 0x0E0) == 0x0C0) {
     50             trailBytes = 1;
     51         } else if ((b & 0x0F0) == 0x0E0) {
     52             trailBytes = 2;
     53         } else if ((b & 0x0F8) == 0xF0) {
     54             trailBytes = 3;
     55         } else {
     56             numInvalid += 1;
     57 
     58             if (numInvalid > 5) {
     59                 break;
     60             }
     61 
     62             trailBytes = 0;
     63         }
     64 
     65         // Verify that we've got the right number of trail bytes in the sequence
     66         for (;;) {
     67             i += 1;
     68 
     69             if (i >= det->fRawLength) {
     70                 break;
     71             }
     72 
     73             b = input[i];
     74 
     75             if ((b & 0xC0) != 0x080) {
     76                 numInvalid += 1;
     77                 break;
     78             }
     79 
     80             if (--trailBytes == 0) {
     81                 numValid += 1;
     82                 break;
     83             }
     84         }
     85 
     86     }
     87 
     88     // Cook up some sort of confidence score, based on presense of a BOM
     89     //    and the existence of valid and/or invalid multi-byte sequences.
     90     confidence = 0;
     91     if (hasBOM && numInvalid == 0) {
     92         confidence = 100;
     93     } else if (hasBOM && numValid > numInvalid*10) {
     94         confidence = 80;
     95     } else if (numValid > 3 && numInvalid == 0) {
     96         confidence = 100;
     97     } else if (numValid > 0 && numInvalid == 0) {
     98         confidence = 80;
     99     } else if (numValid == 0 && numInvalid == 0) {
    100         // Plain ASCII.
    101         confidence = 10;
    102     } else if (numValid > numInvalid*10) {
    103         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
    104         confidence = 25;
    105     }
    106 
    107     return confidence;
    108 }
    109 
    110 U_NAMESPACE_END
    111 #endif
    112