1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "encodings/compact_lang_det/compact_lang_det.h" 6 #include "encodings/compact_lang_det/compact_lang_det_impl.h" 7 #include "encodings/compact_lang_det/win/cld_basictypes.h" 8 9 // String is "code_version - data_scrape_date" 10 static const char* kDetectLanguageVersion = "V1.6 - 20081121"; 11 12 // Large-table version for all ~160 languages (all Tiers) 13 14 // Scan interchange-valid UTF-8 bytes and detect most likely language 15 Language CompactLangDet::DetectLanguage( 16 const DetectionTables* tables, 17 const char* buffer, 18 int buffer_length, 19 bool is_plain_text, 20 bool* is_reliable) { 21 bool allow_extended_lang = false; 22 Language language3[3]; 23 int percent3[3]; 24 double normalized_score3[3]; 25 int text_bytes; 26 int flags = 0; 27 Language plus_one = UNKNOWN_LANGUAGE; 28 const char* tld_hint = ""; 29 int encoding_hint = UNKNOWN_ENCODING; 30 Language language_hint = UNKNOWN_LANGUAGE; 31 32 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 33 tables, 34 buffer, 35 buffer_length, 36 is_plain_text, 37 tld_hint, // "id" boosts Indonesian 38 encoding_hint, // SJS boosts Japanese 39 language_hint, // ITALIAN boosts it 40 allow_extended_lang, 41 flags, 42 plus_one, 43 language3, 44 percent3, 45 normalized_score3, 46 &text_bytes, 47 is_reliable); 48 // Default to English. 49 if (lang == UNKNOWN_LANGUAGE) { 50 lang = ENGLISH; 51 } 52 return lang; 53 } 54 55 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 56 Language CompactLangDet::DetectLanguageSummary( 57 const DetectionTables* tables, 58 const char* buffer, 59 int buffer_length, 60 bool is_plain_text, 61 Language* language3, 62 int* percent3, 63 int* text_bytes, 64 bool* is_reliable) { 65 double normalized_score3[3]; 66 bool allow_extended_lang = false; 67 int flags = 0; 68 Language plus_one = UNKNOWN_LANGUAGE; 69 const char* tld_hint = ""; 70 int encoding_hint = UNKNOWN_ENCODING; 71 Language language_hint = UNKNOWN_LANGUAGE; 72 73 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 74 tables, 75 buffer, 76 buffer_length, 77 is_plain_text, 78 tld_hint, // "id" boosts Indonesian 79 encoding_hint, // SJS boosts Japanese 80 language_hint, // ITALIAN boosts it 81 allow_extended_lang, 82 flags, 83 plus_one, 84 language3, 85 percent3, 86 normalized_score3, 87 text_bytes, 88 is_reliable); 89 // Default to English 90 if (lang == UNKNOWN_LANGUAGE) { 91 lang = ENGLISH; 92 } 93 return lang; 94 } 95 96 // Same as above, with hints supplied 97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 98 Language CompactLangDet::DetectLanguageSummary( 99 const DetectionTables* tables, 100 const char* buffer, 101 int buffer_length, 102 bool is_plain_text, 103 const char* tld_hint, // "id" boosts Indonesian 104 int encoding_hint, // SJS boosts Japanese 105 Language language_hint, // ITALIAN boosts it 106 Language* language3, 107 int* percent3, 108 int* text_bytes, 109 bool* is_reliable) { 110 double normalized_score3[3]; 111 bool allow_extended_lang = false; 112 int flags = 0; 113 Language plus_one = UNKNOWN_LANGUAGE; 114 115 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 116 tables, 117 buffer, 118 buffer_length, 119 is_plain_text, 120 tld_hint, // "id" boosts Indonesian 121 encoding_hint, // SJS boosts Japanese 122 language_hint, // ITALIAN boosts it 123 allow_extended_lang, 124 flags, 125 plus_one, 126 language3, 127 percent3, 128 normalized_score3, 129 text_bytes, 130 is_reliable); 131 // Default to English 132 if (lang == UNKNOWN_LANGUAGE) { 133 lang = ENGLISH; 134 } 135 return lang; 136 } 137 138 139 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 140 // languages. 141 // Extended languages are additional Google interface languages and Unicode 142 // single-language scripts, from ext_lang_enc.h 143 Language CompactLangDet::ExtDetectLanguageSummary( 144 const DetectionTables* tables, 145 const char* buffer, 146 int buffer_length, 147 bool is_plain_text, 148 Language* language3, 149 int* percent3, 150 int* text_bytes, 151 bool* is_reliable) { 152 double normalized_score3[3]; 153 bool allow_extended_lang = true; 154 int flags = 0; 155 Language plus_one = UNKNOWN_LANGUAGE; 156 const char* tld_hint = ""; 157 int encoding_hint = UNKNOWN_ENCODING; 158 Language language_hint = UNKNOWN_LANGUAGE; 159 160 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 161 tables, 162 buffer, 163 buffer_length, 164 is_plain_text, 165 tld_hint, // "id" boosts Indonesian 166 encoding_hint, // SJS boosts Japanese 167 language_hint, // ITALIAN boosts it 168 allow_extended_lang, 169 flags, 170 plus_one, 171 language3, 172 percent3, 173 normalized_score3, 174 text_bytes, 175 is_reliable); 176 // Do not default to English 177 return lang; 178 } 179 180 // Same as above, with hints supplied 181 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 182 // languages. 183 // Extended languages are additional Google interface languages and Unicode 184 // single-language scripts, from ext_lang_enc.h 185 Language CompactLangDet::ExtDetectLanguageSummary( 186 const DetectionTables* tables, 187 const char* buffer, 188 int buffer_length, 189 bool is_plain_text, 190 const char* tld_hint, // "id" boosts Indonesian 191 int encoding_hint, // SJS boosts Japanese 192 Language language_hint, // ITALIAN boosts it 193 Language* language3, 194 int* percent3, 195 int* text_bytes, 196 bool* is_reliable) { 197 double normalized_score3[3]; 198 bool allow_extended_lang = true; 199 int flags = 0; 200 Language plus_one = UNKNOWN_LANGUAGE; 201 202 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 203 tables, 204 buffer, 205 buffer_length, 206 is_plain_text, 207 tld_hint, // "id" boosts Indonesian 208 encoding_hint, // SJS boosts Japanese 209 language_hint, // ITALIAN boosts it 210 allow_extended_lang, 211 flags, 212 plus_one, 213 language3, 214 percent3, 215 normalized_score3, 216 text_bytes, 217 is_reliable); 218 // Do not default to English 219 return lang; 220 } 221 222 // Same as above, and also returns internal language scores as a ratio to 223 // normal score for real text in that language. Scores close to 1.0 indicate 224 // normal text, while scores far away from 1.0 indicate badly-skewed text or 225 // gibberish 226 // 227 Language CompactLangDet::ExtDetectLanguageSummary( 228 const DetectionTables* tables, 229 const char* buffer, 230 int buffer_length, 231 bool is_plain_text, 232 const char* tld_hint, // "id" boosts Indonesian 233 int encoding_hint, // SJS boosts Japanese 234 Language language_hint, // ITALIAN boosts it 235 Language* language3, 236 int* percent3, 237 double* normalized_score3, 238 int* text_bytes, 239 bool* is_reliable) { 240 bool allow_extended_lang = true; 241 int flags = 0; 242 Language plus_one = UNKNOWN_LANGUAGE; 243 244 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25( 245 tables, 246 buffer, 247 buffer_length, 248 is_plain_text, 249 tld_hint, // "id" boosts Indonesian 250 encoding_hint, // SJS boosts Japanese 251 language_hint, // ITALIAN boosts it 252 allow_extended_lang, 253 flags, 254 plus_one, 255 language3, 256 percent3, 257 normalized_score3, 258 text_bytes, 259 is_reliable); 260 // Do not default to English 261 return lang; 262 } 263 264 265 266 // Return version text string 267 // String is "code_version - data_scrape_date" 268 const char* CompactLangDet::DetectLanguageVersion() { 269 return kDetectLanguageVersion; 270 } 271 272