1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "encodings/compact_lang_det/win/cld_unicodetext.h" 6 7 #include <string> 8 #include <vector> // to compile bar/common/component.h 9 10 #include "encodings/compact_lang_det/compact_lang_det.h" 11 #include "encodings/compact_lang_det/string_byte_sink.h" 12 #include "base/string_util.h" 13 #include "unicode/normlzr.h" 14 #include "unicode/unistr.h" 15 #include "unicode/ustring.h" 16 17 std::string NormalizeText(const UChar* text) { 18 // To avoid a copy, use the read-only aliasing ctor. 19 icu::UnicodeString source(1, text, -1); 20 icu::UnicodeString normalized; 21 UErrorCode status = U_ZERO_ERROR; 22 icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status); 23 if (U_FAILURE(status)) 24 return std::string(); 25 normalized.toLower(); 26 std::string utf8; 27 // Internally, toUTF8 uses a 1kB stack buffer (which is not large enough 28 // for most web pages) and does pre-flighting followed by malloc for larger 29 // strings. We have to switch to obtaining the buffer with the maximum size 30 // (UTF-16 length * 3) without pre-flighting if necessary. 31 StringByteSink sink(&utf8); 32 normalized.toUTF8(sink); 33 return utf8; 34 } 35 36 37 // Detects a language of the UTF-16 encoded zero-terminated text. 38 // Returns: Language enum. 39 Language DetectLanguageOfUnicodeText( 40 const CompactLangDet::DetectionTables* detection_tables, 41 const UChar* text, bool is_plain_text, 42 bool* is_reliable, int* num_languages, 43 int* error_code, int* text_bytes) { 44 if (!text || !num_languages) 45 return NUM_LANGUAGES; 46 // Normalize text to NFC, lowercase and convert to UTF-8. 47 std::string utf8_encoded = NormalizeText(text); 48 if (utf8_encoded.empty()) 49 return NUM_LANGUAGES; 50 51 // Engage core CLD library language detection. 52 Language language3[3] = { 53 UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE 54 }; 55 int percent3[3] = { 0, 0, 0 }; 56 int text_bytes_tmp = 0; 57 // We ignore return value here due to the problem described in bug 1800161. 58 // For example, translate.google.com was detected as Indonesian. It happened 59 // due to the heuristic in CLD, which ignores English as a top language 60 // in the presence of another reliably detected language. 61 // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. 62 // language3 array is always set according to the detection results and 63 // is not affected by this heuristic. 64 CompactLangDet::DetectLanguageSummary(detection_tables, 65 utf8_encoded.c_str(), 66 utf8_encoded.length(), 67 is_plain_text, language3, percent3, 68 &text_bytes_tmp, is_reliable); 69 70 // Calcualte a number of languages detected in more than 20% of the text. 71 const int kMinTextPercentToCountLanguage = 20; 72 *num_languages = 0; 73 if (text_bytes) 74 *text_bytes = text_bytes_tmp; 75 COMPILE_ASSERT(arraysize(language3) == arraysize(percent3), 76 language3_and_percent3_should_be_of_the_same_size); 77 for (int i = 0; i < arraysize(language3); ++i) { 78 if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) && 79 percent3[i] >= kMinTextPercentToCountLanguage) { 80 ++*num_languages; 81 } 82 } 83 84 return language3[0]; 85 } 86