Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 // © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 1996-2013, International Business Machines Corporation and    *
      7  * others. All Rights Reserved.                                                *
      8  *******************************************************************************
      9  *
     10  */
     11 
     12 package android.icu.text;
     13 
     14 /**
     15  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
     16  * BOM will be used if it is present.
     17  */
     18 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
     19 
    /**
     * Returns the name of the charset handled by this recognizer. The concrete
     * recognizers nested in this class return "UTF-16BE", "UTF-16LE",
     * "UTF-32BE" and "UTF-32LE".
     *
     * @see android.icu.text.CharsetRecognizer#getName()
     */
    @Override
    abstract String getName();
     25 
    /**
     * Tests the detector's raw input against this recognizer's charset.
     * The implementations nested in this class return a {@code CharsetMatch}
     * carrying a confidence value, or {@code null} when the computed
     * confidence is zero.
     *
     * @param det the detector holding the input bytes to examine
     * @see android.icu.text.CharsetRecognizer#match(android.icu.text.CharsetDetector)
     */
    @Override
    abstract CharsetMatch match(CharsetDetector det);
     31 
     32     static int codeUnit16FromBytes(byte hi, byte lo) {
     33         return ((hi & 0xff) << 8) | (lo & 0xff);
     34     }
     35 
     36     // UTF-16 confidence calculation. Very simple minded, but better than nothing.
     37     //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
     38     //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
     39     //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
     40     //   NULs should be rare in actual text.
     41     static int adjustConfidence(int codeUnit, int confidence) {
     42         if (codeUnit == 0) {
     43             confidence -= 10;
     44         } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
     45             confidence += 10;
     46         }
     47         if (confidence < 0) {
     48             confidence = 0;
     49         } else if (confidence > 100) {
     50             confidence = 100;
     51         }
     52         return confidence;
     53     }
     54 
     55     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
     56     {
     57         @Override
     58         String getName()
     59         {
     60             return "UTF-16BE";
     61         }
     62 
     63         @Override
     64         CharsetMatch match(CharsetDetector det)
     65         {
     66             byte[] input = det.fRawInput;
     67             int confidence = 10;
     68 
     69             int bytesToCheck = Math.min(input.length, 30);
     70             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
     71                 int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
     72                 if (charIndex == 0 && codeUnit == 0xFEFF) {
     73                     confidence = 100;
     74                     break;
     75                 }
     76                 confidence = adjustConfidence(codeUnit, confidence);
     77                 if (confidence == 0 || confidence == 100) {
     78                     break;
     79                 }
     80             }
     81             if (bytesToCheck < 4 && confidence < 100) {
     82                 confidence = 0;
     83             }
     84             if (confidence > 0) {
     85                 return new CharsetMatch(det, this, confidence);
     86             }
     87             return null;
     88         }
     89     }
     90 
     91     static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
     92     {
     93         @Override
     94         String getName()
     95         {
     96             return "UTF-16LE";
     97         }
     98 
     99         @Override
    100         CharsetMatch match(CharsetDetector det)
    101         {
    102             byte[] input = det.fRawInput;
    103             int confidence = 10;
    104 
    105             int bytesToCheck = Math.min(input.length, 30);
    106             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
    107                 int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
    108                 if (charIndex == 0 && codeUnit == 0xFEFF) {
    109                     confidence = 100;
    110                     break;
    111                 }
    112                 confidence = adjustConfidence(codeUnit, confidence);
    113                 if (confidence == 0 || confidence == 100) {
    114                     break;
    115                 }
    116             }
    117             if (bytesToCheck < 4 && confidence < 100) {
    118                 confidence = 0;
    119             }
    120             if (confidence > 0) {
    121                 return new CharsetMatch(det, this, confidence);
    122             }
    123             return null;
    124         }
    125     }
    126 
    127     static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
    128     {
    129         abstract int getChar(byte[] input, int index);
    130 
    131         @Override
    132         abstract String getName();
    133 
    134         @Override
    135         CharsetMatch match(CharsetDetector det)
    136         {
    137             byte[] input   = det.fRawInput;
    138             int limit      = (det.fRawLength / 4) * 4;
    139             int numValid   = 0;
    140             int numInvalid = 0;
    141             boolean hasBOM = false;
    142             int confidence = 0;
    143 
    144             if (limit==0) {
    145                 return null;
    146             }
    147             if (getChar(input, 0) == 0x0000FEFF) {
    148                 hasBOM = true;
    149             }
    150 
    151             for(int i = 0; i < limit; i += 4) {
    152                 int ch = getChar(input, i);
    153 
    154                 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
    155                     numInvalid += 1;
    156                 } else {
    157                     numValid += 1;
    158                 }
    159             }
    160 
    161 
    162             // Cook up some sort of confidence score, based on presence of a BOM
    163             //    and the existence of valid and/or invalid multi-byte sequences.
    164             if (hasBOM && numInvalid==0) {
    165                 confidence = 100;
    166             } else if (hasBOM && numValid > numInvalid*10) {
    167                 confidence = 80;
    168             } else if (numValid > 3 && numInvalid == 0) {
    169                 confidence = 100;
    170             } else if (numValid > 0 && numInvalid == 0) {
    171                 confidence = 80;
    172             } else if (numValid > numInvalid*10) {
    173                 // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
    174                 confidence = 25;
    175             }
    176 
    177             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    178         }
    179     }
    180 
    181     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
    182     {
    183         @Override
    184         int getChar(byte[] input, int index)
    185         {
    186             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
    187                    (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
    188         }
    189 
    190         @Override
    191         String getName()
    192         {
    193             return "UTF-32BE";
    194         }
    195     }
    196 
    197 
    198     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
    199     {
    200         @Override
    201         int getChar(byte[] input, int index)
    202         {
    203             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
    204                    (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
    205         }
    206 
    207         @Override
    208         String getName()
    209         {
    210             return "UTF-32LE";
    211         }
    212     }
    213 }
    214