1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /** 5 ******************************************************************************* 6 * Copyright (C) 2005 - 2014, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 /** 13 * Charset recognizer for UTF-8 14 */ 15 class CharsetRecog_UTF8 extends CharsetRecognizer { 16 17 @Override 18 String getName() { 19 return "UTF-8"; 20 } 21 22 /* (non-Javadoc) 23 * @see android.icu.text.CharsetRecognizer#match(android.icu.text.CharsetDetector) 24 */ 25 @Override 26 CharsetMatch match(CharsetDetector det) { 27 boolean hasBOM = false; 28 int numValid = 0; 29 int numInvalid = 0; 30 byte input[] = det.fRawInput; 31 int i; 32 int trailBytes = 0; 33 int confidence; 34 35 if (det.fRawLength >= 3 && 36 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { 37 hasBOM = true; 38 } 39 40 // Scan for multi-byte sequences 41 for (i=0; i<det.fRawLength; i++) { 42 int b = input[i]; 43 if ((b & 0x80) == 0) { 44 continue; // ASCII 45 } 46 47 // Hi bit on char found. Figure out how long the sequence should be 48 if ((b & 0x0e0) == 0x0c0) { 49 trailBytes = 1; 50 } else if ((b & 0x0f0) == 0x0e0) { 51 trailBytes = 2; 52 } else if ((b & 0x0f8) == 0xf0) { 53 trailBytes = 3; 54 } else { 55 numInvalid++; 56 continue; 57 } 58 59 // Verify that we've got the right number of trail bytes in the sequence 60 for (;;) { 61 i++; 62 if (i>=det.fRawLength) { 63 break; 64 } 65 b = input[i]; 66 if ((b & 0xc0) != 0x080) { 67 numInvalid++; 68 break; 69 } 70 if (--trailBytes == 0) { 71 numValid++; 72 break; 73 } 74 } 75 } 76 77 // Cook up some sort of confidence score, based on presense of a BOM 78 // and the existence of valid and/or invalid multi-byte sequences. 79 confidence = 0; 80 if (hasBOM && numInvalid==0) { 81 confidence = 100; 82 } else if (hasBOM && numValid > numInvalid*10) { 83 confidence = 80; 84 } else if (numValid > 3 && numInvalid == 0) { 85 confidence = 100; 86 } else if (numValid > 0 && numInvalid == 0) { 87 confidence = 80; 88 } else if (numValid == 0 && numInvalid == 0) { 89 // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which 90 // accepts ASCII with confidence = 10. 91 // TODO: add plain ASCII as an explicitly detected type. 92 confidence = 15; 93 } else if (numValid > numInvalid*10) { 94 // Probably corruput utf-8 data. Valid sequences aren't likely by chance. 95 confidence = 25; 96 } 97 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 98 } 99 100 } 101