Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "encodings/compact_lang_det/compact_lang_det.h"
      6 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
      7 #include "encodings/compact_lang_det/win/cld_basictypes.h"
      8 
      9 // String is "code_version - data_scrape_date"
     10 static const char* kDetectLanguageVersion = "V1.6 - 20081121";
     11 
     12 // Large-table version for all ~160 languages (all Tiers)
     13 
     14 // Scan interchange-valid UTF-8 bytes and detect most likely language
     15 Language CompactLangDet::DetectLanguage(
     16                           const DetectionTables* tables,
     17                           const char* buffer,
     18                           int buffer_length,
     19                           bool is_plain_text,
     20                           bool* is_reliable) {
     21   bool allow_extended_lang = false;
     22   Language language3[3];
     23   int percent3[3];
     24   double normalized_score3[3];
     25   int text_bytes;
     26   int flags = 0;
     27   Language plus_one = UNKNOWN_LANGUAGE;
     28   const char* tld_hint = "";
     29   int encoding_hint = UNKNOWN_ENCODING;
     30   Language language_hint = UNKNOWN_LANGUAGE;
     31 
     32   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
     33                           tables,
     34                           buffer,
     35                           buffer_length,
     36                           is_plain_text,
     37                           tld_hint,               // "id" boosts Indonesian
     38                           encoding_hint,          // SJS boosts Japanese
     39                           language_hint,          // ITALIAN boosts it
     40                           allow_extended_lang,
     41                           flags,
     42                           plus_one,
     43                           language3,
     44                           percent3,
     45                           normalized_score3,
     46                           &text_bytes,
     47                           is_reliable);
     48   // Default to English.
     49   if (lang == UNKNOWN_LANGUAGE) {
     50     lang = ENGLISH;
     51   }
     52   return lang;
     53 }
     54 
     55 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
     56 Language CompactLangDet::DetectLanguageSummary(
     57                           const DetectionTables* tables,
     58                           const char* buffer,
     59                           int buffer_length,
     60                           bool is_plain_text,
     61                           Language* language3,
     62                           int* percent3,
     63                           int* text_bytes,
     64                           bool* is_reliable) {
     65   double normalized_score3[3];
     66   bool allow_extended_lang = false;
     67   int flags = 0;
     68   Language plus_one = UNKNOWN_LANGUAGE;
     69   const char* tld_hint = "";
     70   int encoding_hint = UNKNOWN_ENCODING;
     71   Language language_hint = UNKNOWN_LANGUAGE;
     72 
     73   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
     74                           tables,
     75                           buffer,
     76                           buffer_length,
     77                           is_plain_text,
     78                           tld_hint,               // "id" boosts Indonesian
     79                           encoding_hint,          // SJS boosts Japanese
     80                           language_hint,          // ITALIAN boosts it
     81                           allow_extended_lang,
     82                           flags,
     83                           plus_one,
     84                           language3,
     85                           percent3,
     86                           normalized_score3,
     87                           text_bytes,
     88                           is_reliable);
     89   // Default to English
     90   if (lang == UNKNOWN_LANGUAGE) {
     91     lang = ENGLISH;
     92   }
     93   return lang;
     94 }
     95 
     96 // Same as above, with hints supplied
     97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
     98 Language CompactLangDet::DetectLanguageSummary(
     99                           const DetectionTables* tables,
    100                           const char* buffer,
    101                           int buffer_length,
    102                           bool is_plain_text,
    103                           const char* tld_hint,       // "id" boosts Indonesian
    104                           int encoding_hint,          // SJS boosts Japanese
    105                           Language language_hint,     // ITALIAN boosts it
    106                           Language* language3,
    107                           int* percent3,
    108                           int* text_bytes,
    109                           bool* is_reliable) {
    110   double normalized_score3[3];
    111   bool allow_extended_lang = false;
    112   int flags = 0;
    113   Language plus_one = UNKNOWN_LANGUAGE;
    114 
    115   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
    116                           tables,
    117                           buffer,
    118                           buffer_length,
    119                           is_plain_text,
    120                           tld_hint,               // "id" boosts Indonesian
    121                           encoding_hint,          // SJS boosts Japanese
    122                           language_hint,          // ITALIAN boosts it
    123                           allow_extended_lang,
    124                           flags,
    125                           plus_one,
    126                           language3,
    127                           percent3,
    128                           normalized_score3,
    129                           text_bytes,
    130                           is_reliable);
    131   // Default to English
    132   if (lang == UNKNOWN_LANGUAGE) {
    133     lang = ENGLISH;
    134   }
    135   return lang;
    136 }
    137 
    138 
    139 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
    140 // languages.
    141 // Extended languages are additional Google interface languages and Unicode
    142 // single-language scripts, from ext_lang_enc.h
    143 Language CompactLangDet::ExtDetectLanguageSummary(
    144                           const DetectionTables* tables,
    145                           const char* buffer,
    146                           int buffer_length,
    147                           bool is_plain_text,
    148                           Language* language3,
    149                           int* percent3,
    150                           int* text_bytes,
    151                           bool* is_reliable) {
    152   double normalized_score3[3];
    153   bool allow_extended_lang = true;
    154   int flags = 0;
    155   Language plus_one = UNKNOWN_LANGUAGE;
    156   const char* tld_hint = "";
    157   int encoding_hint = UNKNOWN_ENCODING;
    158   Language language_hint = UNKNOWN_LANGUAGE;
    159 
    160   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
    161                           tables,
    162                           buffer,
    163                           buffer_length,
    164                           is_plain_text,
    165                           tld_hint,               // "id" boosts Indonesian
    166                           encoding_hint,          // SJS boosts Japanese
    167                           language_hint,          // ITALIAN boosts it
    168                           allow_extended_lang,
    169                           flags,
    170                           plus_one,
    171                           language3,
    172                           percent3,
    173                           normalized_score3,
    174                           text_bytes,
    175                           is_reliable);
    176   // Do not default to English
    177   return lang;
    178 }
    179 
    180 // Same as above, with hints supplied
    181 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
    182 // languages.
    183 // Extended languages are additional Google interface languages and Unicode
    184 // single-language scripts, from ext_lang_enc.h
    185 Language CompactLangDet::ExtDetectLanguageSummary(
    186                           const DetectionTables* tables,
    187                           const char* buffer,
    188                           int buffer_length,
    189                           bool is_plain_text,
    190                           const char* tld_hint,       // "id" boosts Indonesian
    191                           int encoding_hint,          // SJS boosts Japanese
    192                           Language language_hint,     // ITALIAN boosts it
    193                           Language* language3,
    194                           int* percent3,
    195                           int* text_bytes,
    196                           bool* is_reliable) {
    197   double normalized_score3[3];
    198   bool allow_extended_lang = true;
    199   int flags = 0;
    200   Language plus_one = UNKNOWN_LANGUAGE;
    201 
    202   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
    203                           tables,
    204                           buffer,
    205                           buffer_length,
    206                           is_plain_text,
    207                           tld_hint,               // "id" boosts Indonesian
    208                           encoding_hint,          // SJS boosts Japanese
    209                           language_hint,          // ITALIAN boosts it
    210                           allow_extended_lang,
    211                           flags,
    212                           plus_one,
    213                           language3,
    214                           percent3,
    215                           normalized_score3,
    216                           text_bytes,
    217                           is_reliable);
    218   // Do not default to English
    219   return lang;
    220 }
    221 
    222 // Same as above, and also returns internal language scores as a ratio to
    223 // normal score for real text in that language. Scores close to 1.0 indicate
    224 // normal text, while scores far away from 1.0 indicate badly-skewed text or
    225 // gibberish
    226 //
    227 Language CompactLangDet::ExtDetectLanguageSummary(
    228                         const DetectionTables* tables,
    229                         const char* buffer,
    230                         int buffer_length,
    231                         bool is_plain_text,
    232                         const char* tld_hint,       // "id" boosts Indonesian
    233                         int encoding_hint,          // SJS boosts Japanese
    234                         Language language_hint,     // ITALIAN boosts it
    235                         Language* language3,
    236                         int* percent3,
    237                         double* normalized_score3,
    238                         int* text_bytes,
    239                         bool* is_reliable) {
    240   bool allow_extended_lang = true;
    241   int flags = 0;
    242   Language plus_one = UNKNOWN_LANGUAGE;
    243 
    244   Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
    245                           tables,
    246                           buffer,
    247                           buffer_length,
    248                           is_plain_text,
    249                           tld_hint,               // "id" boosts Indonesian
    250                           encoding_hint,          // SJS boosts Japanese
    251                           language_hint,          // ITALIAN boosts it
    252                           allow_extended_lang,
    253                           flags,
    254                           plus_one,
    255                           language3,
    256                           percent3,
    257                           normalized_score3,
    258                           text_bytes,
    259                           is_reliable);
    260   // Do not default to English
    261   return lang;
    262   }
    263 
    264 
    265 
    266 // Return version text string
    267 // String is "code_version - data_scrape_date"
    268 const char* CompactLangDet::DetectLanguageVersion() {
    269   return kDetectLanguageVersion;
    270 }
    271 
    272