Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /**
      5 *******************************************************************************
      6 * Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
      7 * others. All Rights Reserved.                                                *
      8 *******************************************************************************
      9 */
     10 package android.icu.text;
     11 
     12 /**
     13  * Charset recognizer for UTF-8
     14  */
     15 class CharsetRecog_UTF8 extends CharsetRecognizer {
     16 
     17     @Override
     18     String getName() {
     19         return "UTF-8";
     20     }
     21 
     22     /* (non-Javadoc)
     23      * @see android.icu.text.CharsetRecognizer#match(android.icu.text.CharsetDetector)
     24      */
     25     @Override
     26     CharsetMatch match(CharsetDetector det) {
     27         boolean     hasBOM = false;
     28         int         numValid = 0;
     29         int         numInvalid = 0;
     30         byte        input[] = det.fRawInput;
     31         int         i;
     32         int         trailBytes = 0;
     33         int         confidence;
     34 
     35         if (det.fRawLength >= 3 &&
     36                 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
     37             hasBOM = true;
     38         }
     39 
     40         // Scan for multi-byte sequences
     41         for (i=0; i<det.fRawLength; i++) {
     42             int b = input[i];
     43             if ((b & 0x80) == 0) {
     44                 continue;   // ASCII
     45             }
     46 
     47             // Hi bit on char found.  Figure out how long the sequence should be
     48             if ((b & 0x0e0) == 0x0c0) {
     49                 trailBytes = 1;
     50             } else if ((b & 0x0f0) == 0x0e0) {
     51                 trailBytes = 2;
     52             } else if ((b & 0x0f8) == 0xf0) {
     53                 trailBytes = 3;
     54             } else {
     55                 numInvalid++;
     56                 continue;
     57             }
     58 
     59             // Verify that we've got the right number of trail bytes in the sequence
     60             for (;;) {
     61                 i++;
     62                 if (i>=det.fRawLength) {
     63                     break;
     64                 }
     65                 b = input[i];
     66                 if ((b & 0xc0) != 0x080) {
     67                     numInvalid++;
     68                     break;
     69                 }
     70                 if (--trailBytes == 0) {
     71                     numValid++;
     72                     break;
     73                 }
     74             }
     75         }
     76 
     77         // Cook up some sort of confidence score, based on presense of a BOM
     78         //    and the existence of valid and/or invalid multi-byte sequences.
     79         confidence = 0;
     80         if (hasBOM && numInvalid==0) {
     81             confidence = 100;
     82         } else if (hasBOM && numValid > numInvalid*10) {
     83             confidence = 80;
     84         } else if (numValid > 3 && numInvalid == 0) {
     85             confidence = 100;
     86         } else if (numValid > 0 && numInvalid == 0) {
     87             confidence = 80;
     88         } else if (numValid == 0 && numInvalid == 0) {
     89             // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
     90             //              accepts ASCII with confidence = 10.
     91             // TODO: add plain ASCII as an explicitly detected type.
     92             confidence = 15;
     93         } else if (numValid > numInvalid*10) {
     94             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
     95             confidence = 25;
     96         }
     97         return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
     98     }
     99 
    100 }
    101