// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as MOLDAVIAN.
// SERBO_CROATIAN, BOSNIAN, CROATIAN, SERBIAN, MONTENEGRIN in the Latin script
// are all detected as CROATIAN; in the Cyrillic script as SERBIAN.
// Zhuang is detected in the Latin script only.
//
// The Google interface languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary(). BorkBorkBork, ElmerFudd, and
// Hacker are not detected (too little training data).
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with some text in languages outside the Google Language
// enum, such as Ilonggo.
23 // 24 // The following languages are detected in multiple scripts: 25 // AZERBAIJANI (Latin, Cyrillic*, Arabic*) 26 // BURMESE (Latin, Myanmar) 27 // HAUSA (Latin, Arabic) 28 // KASHMIRI (Arabic, Devanagari) 29 // KAZAKH (Latin, Cyrillic, Arabic) 30 // KURDISH (Latin*, Arabic) 31 // KYRGYZ (Cyrillic, Arabic) 32 // LIMBU (Devanagari, Limbu) 33 // MONGOLIAN (Cyrillic, Mongolian) 34 // SANSKRIT (Latin, Devanagari) 35 // SINDHI (Arabic, Devanagari) 36 // TAGALOG (Latin, Tagalog) 37 // TAJIK (Cyrillic, Arabic*) 38 // TATAR (Latin, Cyrillic, Arabic) 39 // TURKMEN (Latin, Cyrillic, Arabic) 40 // UIGHUR (Latin, Cyrillic, Arabic) 41 // UZBEK (Latin, Cyrillic, Arabic) 42 // 43 // * Due to a shortage of training text, AZERBAIJANI is not currently detected 44 // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in 45 // Arabic script. 46 // 47 48 #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_ 49 #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_ 50 51 #include "languages/public/languages.h" 52 #include "encodings/compact_lang_det/win/cld_utf8statetable.h" 53 54 namespace cld { 55 struct CLDTableSummary; 56 } // namespace cld 57 58 namespace CompactLangDet { 59 // Scan interchange-valid UTF-8 bytes and detect most likely language, 60 // or set of languages. 61 // 62 // Design goals: 63 // Skip over big stretches of HTML tags 64 // Able to return ranges of different languages 65 // Relatively small tables and relatively fast processing 66 // Thread safe 67 // 68 // For HTML documents, tags are skipped, along with <script> ... </script> 69 // and <style> ... </style> sequences, and entities are expanded. 70 // 71 // We distinguish between bytes of the raw input buffer and bytes of non-tag 72 // text letters. Since tags can be over 50% of the bytes of an HTML Page, 73 // and are nearly all seven-bit ASCII English, we prefer to distinguish 74 // language mixture fractions based on just the non-tag text. 
75 // 76 // Inputs: text and text_length 77 // Code skips HTML tags and expands HTML entities, unless 78 // is_plain_text is true 79 // Outputs: 80 // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE 81 // percent3 is an array of the text percentages 0..100 of the top 3 languages 82 // text_bytes is the amount of non-tag/letters-only text found 83 // is_reliable set true if the returned Language is some amount more 84 // probable then the second-best Language. Calculation is a complex function 85 // of the length of the text and the different-script runs of text. 86 // Return value: the most likely Language for the majority of the input text 87 // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text 88 // defaults to ENGLISH. 89 // 90 // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for 91 // backwards compatibility with LLD. 92 // 93 // The third version may return UNKNOWN_LANGUAGE, and also returns extended 94 // language codes from ext_lang_enc.h 95 // 96 // Subsetting: For fast detection over large documents, these routines will 97 // scan non-tag text of the initial part of a document, then will 98 // skip 4-16 bytes and subsample text in the rest of the document, up to a 99 // fixed limit (currently 160KB of non-tag letters). 100 // 101 102 struct DetectionTables { 103 const cld::CLDTableSummary* quadgram_obj; 104 const UTF8PropObj* unigram_obj; 105 }; 106 107 // Scan interchange-valid UTF-8 bytes and detect most likely language 108 Language DetectLanguage(const DetectionTables* tables, 109 const char* buffer, 110 int buffer_length, 111 bool is_plain_text, 112 bool* is_reliable); 113 114 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 
115 // language3[0] is also the return value 116 Language DetectLanguageSummary( 117 const DetectionTables* tables, 118 const char* buffer, 119 int buffer_length, 120 bool is_plain_text, 121 Language* language3, 122 int* percent3, 123 int* text_bytes, 124 bool* is_reliable); 125 126 // Same as above, with hints supplied 127 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. 128 // language3[0] is also the return value 129 Language DetectLanguageSummary( 130 const DetectionTables* tables, 131 const char* buffer, 132 int buffer_length, 133 bool is_plain_text, 134 const char* tld_hint, // "id" boosts Indonesian 135 int encoding_hint, // SJS boosts Japanese 136 Language language_hint, // ITALIAN boosts it 137 Language* language3, 138 int* percent3, 139 int* text_bytes, 140 bool* is_reliable); 141 142 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 143 // languages. 144 // 145 // Extended languages are additional Google interface languages and Unicode 146 // single-language scripts, from ext_lang_enc.h. They are experimental and 147 // this call may be removed. 148 // 149 // language3[0] is also the return value 150 Language ExtDetectLanguageSummary( 151 const DetectionTables* tables, 152 const char* buffer, 153 int buffer_length, 154 bool is_plain_text, 155 Language* language3, 156 int* percent3, 157 int* text_bytes, 158 bool* is_reliable); 159 160 // Same as above, with hints supplied 161 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended 162 // languages. 163 // 164 // Extended languages are additional Google interface languages and Unicode 165 // single-language scripts, from ext_lang_enc.h. They are experimental and 166 // this call may be removed. 
167 // 168 // language3[0] is also the return value 169 Language ExtDetectLanguageSummary( 170 const DetectionTables* tables, 171 const char* buffer, 172 int buffer_length, 173 bool is_plain_text, 174 const char* tld_hint, // "id" boosts Indonesian 175 int encoding_hint, // SJS boosts Japanese 176 Language language_hint, // ITALIAN boosts it 177 Language* language3, 178 int* percent3, 179 int* text_bytes, 180 bool* is_reliable); 181 182 // Same as above, and also returns internal language scores as a ratio to 183 // normal score for real text in that language. Scores close to 1.0 indicate 184 // normal text, while scores far away from 1.0 indicate badly-skewed text or 185 // gibberish 186 // 187 Language ExtDetectLanguageSummary( 188 const DetectionTables* tables, 189 const char* buffer, 190 int buffer_length, 191 bool is_plain_text, 192 const char* tld_hint, // "id" boosts Indonesian 193 int encoding_hint, // SJS boosts Japanese 194 Language language_hint, // ITALIAN boosts it 195 Language* language3, 196 int* percent3, 197 double* normalized_score3, 198 int* text_bytes, 199 bool* is_reliable); 200 201 // Return version text string 202 // String is "code_version - data_scrape_date" 203 const char* DetectLanguageVersion(); 204 }; // End namespace CompactLangDet 205 206 #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_ 207