Home | History | Annotate | Download | only in libmedia
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 //#define LOG_NDEBUG 0
     18 #define LOG_TAG "CharacterEncodingDector"
     19 #include <utils/Log.h>
     20 
     21 #include <media/CharacterEncodingDetector.h>
     22 #include "CharacterEncodingDetectorTables.h"
     23 
     24 #include <utils/Vector.h>
     25 #include <media/StringArray.h>
     26 
     27 #include <unicode/ucnv.h>
     28 #include <unicode/ucsdet.h>
     29 #include <unicode/ustring.h>
     30 
     31 namespace android {
     32 
     33 CharacterEncodingDetector::CharacterEncodingDetector() {
     34 
     35     UErrorCode status = U_ZERO_ERROR;
     36     mUtf8Conv = ucnv_open("UTF-8", &status);
     37     if (U_FAILURE(status)) {
     38         ALOGE("could not create UConverter for UTF-8");
     39         mUtf8Conv = NULL;
     40     }
     41 }
     42 
     43 CharacterEncodingDetector::~CharacterEncodingDetector() {
     44     ucnv_close(mUtf8Conv);
     45 }
     46 
/*
 * Appends a tag to the detector: 'name' is pushed onto mNames and 'value' onto
 * mValues, so entry i of one array pairs with entry i of the other.
 */
void CharacterEncodingDetector::addTag(const char *name, const char *value) {
    mNames.push_back(name);
    mValues.push_back(value);
}
     51 
/*
 * Returns the number of entries currently held in mNames.
 */
size_t CharacterEncodingDetector::size() {
    return mNames.size();
}
     55 
     56 status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
     57     if (index >= mNames.size()) {
     58         return BAD_VALUE;
     59     }
     60 
     61     *name = mNames.getEntry(index);
     62     *value = mValues.getEntry(index);
     63     return OK;
     64 }
     65 
     66 static bool isPrintableAscii(const char *value, size_t len) {
     67     for (size_t i = 0; i < len; i++) {
     68         if ((value[i] & 0x80) || value[i] < 0x20 || value[i] == 0x7f) {
     69             return false;
     70         }
     71     }
     72     return true;
     73 }
     74 
     75 void CharacterEncodingDetector::detectAndConvert() {
     76 
     77     int size = mNames.size();
     78     ALOGV("%d tags before conversion", size);
     79     for (int i = 0; i < size; i++) {
     80         ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
     81     }
     82 
     83     if (size && mUtf8Conv) {
     84 
     85         UErrorCode status = U_ZERO_ERROR;
     86         UCharsetDetector *csd = ucsdet_open(&status);
     87         const UCharsetMatch *ucm;
     88         bool goodmatch = true;
     89         int highest = 0;
     90 
     91         // try combined detection of artist/album/title etc.
     92         char buf[1024];
     93         buf[0] = 0;
     94         bool allprintable = true;
     95         for (int i = 0; i < size; i++) {
     96             const char *name = mNames.getEntry(i);
     97             const char *value = mValues.getEntry(i);
     98             if (!isPrintableAscii(value, strlen(value)) && (
     99                         !strcmp(name, "artist") ||
    100                         !strcmp(name, "albumartist") ||
    101                         !strcmp(name, "composer") ||
    102                         !strcmp(name, "genre") ||
    103                         !strcmp(name, "album") ||
    104                         !strcmp(name, "title"))) {
    105                 strlcat(buf, value, sizeof(buf));
    106                 // separate tags by space so ICU's ngram detector can do its job
    107                 strlcat(buf, " ", sizeof(buf));
    108                 allprintable = false;
    109             }
    110         }
    111 
    112         const char *combinedenc = "UTF-8";
    113         if (allprintable) {
    114             // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
    115             // no need to even call it
    116             ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
    117         } else {
    118             ucsdet_setText(csd, buf, strlen(buf), &status);
    119             int32_t matches;
    120             const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
    121             const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
    122                     ucma, matches, &goodmatch, &highest);
    123 
    124             ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
    125             if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
    126                 ALOGV("not a good match, trying with more data");
    127                 // This string might be too short for ICU to do anything useful with.
    128                 // (real world example: "Bjrk" in ISO-8859-1 might be detected as GB18030, because
    129                 //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
    130                 //  a confidence of 10 with no invalid characters)
    131                 // Append artist, album and title if they were previously omitted because they
    132                 // were printable ascii.
    133                 bool added = false;
    134                 for (int i = 0; i < size; i++) {
    135                     const char *name = mNames.getEntry(i);
    136                     const char *value = mValues.getEntry(i);
    137                     if (isPrintableAscii(value, strlen(value)) && (
    138                                 !strcmp(name, "artist") ||
    139                                 !strcmp(name, "album") ||
    140                                 !strcmp(name, "title"))) {
    141                         strlcat(buf, value, sizeof(buf));
    142                         strlcat(buf, " ", sizeof(buf));
    143                         added = true;
    144                     }
    145                 }
    146                 if (added) {
    147                     ucsdet_setText(csd, buf, strlen(buf), &status);
    148                     ucma = ucsdet_detectAll(csd, &matches, &status);
    149                     bestCombinedMatch = getPreferred(buf, strlen(buf),
    150                             ucma, matches, &goodmatch, &highest);
    151                     if (!goodmatch && highest <= 15) {
    152                         ALOGV("still not a good match after adding printable tags");
    153                         bestCombinedMatch = NULL;
    154                     }
    155                 } else {
    156                     ALOGV("no printable tags to add");
    157                 }
    158             }
    159 
    160             if (bestCombinedMatch != NULL) {
    161                 combinedenc = ucsdet_getName(bestCombinedMatch, &status);
    162             } else {
    163                 combinedenc = "ISO-8859-1";
    164             }
    165         }
    166 
    167         for (int i = 0; i < size; i++) {
    168             const char *name = mNames.getEntry(i);
    169             uint8_t* src = (uint8_t *)mValues.getEntry(i);
    170             int len = strlen((char *)src);
    171 
    172             ALOGV("@@@ checking %s", name);
    173             const char *s = mValues.getEntry(i);
    174             int32_t inputLength = strlen(s);
    175             const char *enc;
    176 
    177             if (!allprintable && (!strcmp(name, "artist") ||
    178                     !strcmp(name, "albumartist") ||
    179                     !strcmp(name, "composer") ||
    180                     !strcmp(name, "genre") ||
    181                     !strcmp(name, "album") ||
    182                     !strcmp(name, "title"))) {
    183                 if (!goodmatch && highest < 0) {
    184                     // Give it one more chance if there is no good match.
    185                     ALOGV("Trying to detect %s separately", name);
    186                     int32_t matches;
    187                     bool goodmatchSingle = true;
    188                     int highestSingle = 0;
    189                     ucsdet_setText(csd, s, inputLength, &status);
    190                     const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
    191                     const UCharsetMatch* bestSingleMatch = getPreferred(s, inputLength,
    192                             ucma, matches, &goodmatchSingle, &highestSingle);
    193                     if (goodmatchSingle || highestSingle > highest)
    194                         enc = ucsdet_getName(bestSingleMatch, &status);
    195                     else
    196                         enc = combinedenc;
    197                 } else {
    198                     // use encoding determined from the combination of artist/album/title etc.
    199                     enc = combinedenc;
    200                 }
    201             } else {
    202                 if (isPrintableAscii(s, inputLength)) {
    203                     enc = "UTF-8";
    204                     ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
    205                 } else {
    206                     ucsdet_setText(csd, s, inputLength, &status);
    207                     ucm = ucsdet_detect(csd, &status);
    208                     if (!ucm) {
    209                         mValues.setEntry(i, "???");
    210                         continue;
    211                     }
    212                     enc = ucsdet_getName(ucm, &status);
    213                     ALOGV("@@@@ recognized charset: %s for %s confidence %d",
    214                             enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
    215                 }
    216             }
    217 
    218             if (strcmp(enc,"UTF-8") != 0) {
    219                 // only convert if the source encoding isn't already UTF-8
    220                 ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
    221                 status = U_ZERO_ERROR;
    222                 UConverter *conv = ucnv_open(enc, &status);
    223                 if (U_FAILURE(status)) {
    224                     ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
    225                             enc, status);
    226                     status = U_ZERO_ERROR;
    227                     conv = ucnv_open("ISO-8859-1", &status);
    228                     if (U_FAILURE(status)) {
    229                         ALOGW("could not create UConverter for ISO-8859-1 either");
    230                         continue;
    231                     }
    232                 }
    233 
    234                 // convert from native encoding to UTF-8
    235                 const char* source = mValues.getEntry(i);
    236                 int targetLength = len * 3 + 1;
    237                 char* buffer = new char[targetLength];
    238                 // don't normally check for NULL, but in this case targetLength may be large
    239                 if (!buffer)
    240                     break;
    241                 char* target = buffer;
    242 
    243                 ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
    244                         &source, source + strlen(source),
    245                         NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
    246 
    247                 if (U_FAILURE(status)) {
    248                     ALOGE("ucnv_convertEx failed: %d", status);
    249                     mValues.setEntry(i, "???");
    250                 } else {
    251                     // zero terminate
    252                     *target = 0;
    253                     // strip trailing spaces
    254                     while (--target > buffer && *target == ' ') {
    255                         *target = 0;
    256                     }
    257                     // skip leading spaces
    258                     char *start = buffer;
    259                     while (*start == ' ') {
    260                         start++;
    261                     }
    262                     mValues.setEntry(i, start);
    263                 }
    264 
    265                 delete[] buffer;
    266 
    267                 ucnv_close(conv);
    268             }
    269         }
    270 
    271         for (int i = size - 1; i >= 0; --i) {
    272             if (strlen(mValues.getEntry(i)) == 0) {
    273                 ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
    274                 mNames.erase(i);
    275                 mValues.erase(i);
    276             }
    277         }
    278 
    279         ucsdet_close(csd);
    280     }
    281 }
    282 
    283 /*
    284  * When ICU detects multiple encoding matches, apply additional heuristics to determine
    285  * which one is the best match, since ICU can't always be trusted to make the right choice.
    286  *
    287  * What this method does is:
    288  * - decode the input using each of the matches found
    289  * - recalculate the starting confidence level for multibyte encodings using a different
    290  *   algorithm and larger frequent character lists than ICU
    291  * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
    292  * - pick the highest match
    293  * - signal to the caller whether this match is considered good: confidence > 15, and confidence
    294  *   delta with the next runner up > 15
    295  */
    296 const UCharsetMatch *CharacterEncodingDetector::getPreferred(
    297         const char *input, size_t len,
    298         const UCharsetMatch** ucma, size_t nummatches,
    299         bool *goodmatch, int *highestmatch) {
    300 
    301     *goodmatch = false;
    302     Vector<const UCharsetMatch*> matches;
    303     UErrorCode status = U_ZERO_ERROR;
    304 
    305     ALOGV("%zu matches", nummatches);
    306     for (size_t i = 0; i < nummatches; i++) {
    307         const char *encname = ucsdet_getName(ucma[i], &status);
    308         int confidence = ucsdet_getConfidence(ucma[i], &status);
    309         ALOGV("%zu: %s %d", i, encname, confidence);
    310         matches.push_back(ucma[i]);
    311     }
    312 
    313     size_t num = matches.size();
    314     if (num == 0) {
    315         return NULL;
    316     }
    317     if (num == 1) {
    318         int confidence = ucsdet_getConfidence(matches[0], &status);
    319         if (confidence > 15) {
    320             *goodmatch = true;
    321         }
    322         return matches[0];
    323     }
    324 
    325     ALOGV("considering %zu matches", num);
    326 
    327     // keep track of how many "special" characters result when converting the input using each
    328     // encoding
    329     Vector<int> newconfidence;
    330     for (size_t i = 0; i < num; i++) {
    331         const uint16_t *freqdata = NULL;
    332         float freqcoverage = 0;
    333         status = U_ZERO_ERROR;
    334         const char *encname = ucsdet_getName(matches[i], &status);
    335         int confidence = ucsdet_getConfidence(matches[i], &status);
    336         if (!strcmp("GB18030", encname)) {
    337             freqdata = frequent_zhCN;
    338             freqcoverage = frequent_zhCN_coverage;
    339         } else if (!strcmp("Big5", encname)) {
    340             freqdata = frequent_zhTW;
    341             freqcoverage = frequent_zhTW_coverage;
    342         } else if (!strcmp("EUC-KR", encname)) {
    343             freqdata = frequent_ko;
    344             freqcoverage = frequent_ko_coverage;
    345         } else if (!strcmp("EUC-JP", encname)) {
    346             freqdata = frequent_ja;
    347             freqcoverage = frequent_ja_coverage;
    348         } else if (!strcmp("Shift_JIS", encname)) {
    349             freqdata = frequent_ja;
    350             freqcoverage = frequent_ja_coverage;
    351         }
    352 
    353         ALOGV("%zu: %s %d", i, encname, confidence);
    354         status = U_ZERO_ERROR;
    355         UConverter *conv = ucnv_open(encname, &status);
    356         int demerit = 0;
    357         if (U_FAILURE(status)) {
    358             ALOGV("failed to open %s: %d", encname, status);
    359             confidence = 0;
    360             demerit += 1000;
    361         }
    362         const char *source = input;
    363         const char *sourceLimit = input + len;
    364         status = U_ZERO_ERROR;
    365         int frequentchars = 0;
    366         int totalchars = 0;
    367         while (true) {
    368             // demerit the current encoding for each "special" character found after conversion.
    369             // The amount of demerit is somewhat arbitrarily chosen.
    370             int inchar;
    371             if (source != sourceLimit) {
    372                 inchar = (source[0] << 8) + source[1];
    373             }
    374             UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
    375             if (!U_SUCCESS(status)) {
    376                 break;
    377             }
    378             if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
    379                 ALOGV("control character %x", c);
    380                 demerit += 100;
    381             } else if ((c == 0xa0)                      // no-break space
    382                     || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
    383                     || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
    384                     || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
    385                 ALOGV("unlikely character %x", c);
    386                 demerit += 10;
    387             } else if (c >= 0xe000 && c <= 0xf8ff) {
    388                 ALOGV("private use character %x", c);
    389                 demerit += 30;
    390             } else if (c >= 0x2190 && c <= 0x2bff) {
    391                 // this range comprises various symbol ranges that are unlikely to appear in
    392                 // music file metadata.
    393                 ALOGV("symbol %x", c);
    394                 demerit += 10;
    395             } else if (c == 0xfffd) {
    396                 ALOGV("replacement character");
    397                 demerit += 50;
    398             } else if (c >= 0xfff0 && c <= 0xfffc) {
    399                 ALOGV("unicode special %x", c);
    400                 demerit += 50;
    401             } else if (freqdata != NULL) {
    402                 totalchars++;
    403                 if (isFrequent(freqdata, c)) {
    404                     frequentchars++;
    405                 }
    406             }
    407         }
    408         if (freqdata != NULL && totalchars != 0) {
    409             int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
    410             ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
    411                     totalchars, frequentchars);
    412             if (myconfidence > 100) myconfidence = 100;
    413             if (myconfidence < 0) myconfidence = 0;
    414             confidence = myconfidence;
    415         }
    416         ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
    417         newconfidence.push_back(confidence - demerit);
    418         ucnv_close(conv);
    419         if (i == 0 && (confidence - demerit) == 100) {
    420             // no need to check any further, we'll end up using this match anyway
    421             break;
    422         }
    423     }
    424 
    425     // find match with highest confidence after adjusting for unlikely characters
    426     int highest = newconfidence[0];
    427     size_t highestidx = 0;
    428     int runnerup = -10000;
    429     int runnerupidx = -10000;
    430     num = newconfidence.size();
    431     for (size_t i = 1; i < num; i++) {
    432         if (newconfidence[i] > highest) {
    433             runnerup = highest;
    434             runnerupidx = highestidx;
    435             highest = newconfidence[i];
    436             highestidx = i;
    437         } else if (newconfidence[i] > runnerup){
    438             runnerup = newconfidence[i];
    439             runnerupidx = i;
    440         }
    441     }
    442     status = U_ZERO_ERROR;
    443     ALOGV("selecting: '%s' w/ %d confidence",
    444             ucsdet_getName(matches[highestidx], &status), highest);
    445     if (runnerupidx < 0) {
    446         ALOGV("no runner up");
    447         if (highest > 15) {
    448             *goodmatch = true;
    449         }
    450     } else {
    451         ALOGV("runner up: '%s' w/ %d confidence",
    452                 ucsdet_getName(matches[runnerupidx], &status), runnerup);
    453         if (runnerup < 0) {
    454             runnerup = 0;
    455         }
    456         if ((highest - runnerup) > 15) {
    457             *goodmatch = true;
    458         }
    459     }
    460     *highestmatch = highest;
    461     return matches[highestidx];
    462 }
    463 
    464 
    465 bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
    466 
    467     int start = 0;
    468     int end = 511; // All the tables have 512 entries
    469     int mid = (start+end)/2;
    470 
    471     while(start <= end) {
    472         if(c == values[mid]) {
    473             return true;
    474         } else if (c > values[mid]) {
    475             start = mid + 1;
    476         } else {
    477             end = mid - 1;
    478         }
    479 
    480         mid = (start + end) / 2;
    481     }
    482 
    483     return false;
    484 }
    485 
    486 
    487 }  // namespace android
    488