1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "lang_id/custom-tokenizer.h" 18 19 #include <ctype.h> 20 21 #include <string> 22 23 #include "util/strings/utf8.h" 24 25 namespace libtextclassifier { 26 namespace nlp_core { 27 namespace lang_id { 28 29 namespace { 30 inline bool IsTokenSeparator(int num_bytes, const char *curr) { 31 if (num_bytes != 1) { 32 return false; 33 } 34 return !isalpha(*curr); 35 } 36 } // namespace 37 38 const char *GetSafeEndOfString(const char *data, size_t size) { 39 const char *const hard_end = data + size; 40 const char *curr = data; 41 while (curr < hard_end) { 42 int num_bytes = GetNumBytesForUTF8Char(curr); 43 if (num_bytes == 0) { 44 break; 45 } 46 const char *new_curr = curr + num_bytes; 47 if (new_curr > hard_end) { 48 return curr; 49 } 50 curr = new_curr; 51 } 52 return curr; 53 } 54 55 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) { 56 const char *const start = text.data(); 57 const char *curr = start; 58 const char *end = GetSafeEndOfString(start, text.size()); 59 60 // Corner case: empty safe part of the text. 61 if (curr >= end) { 62 return; 63 } 64 65 // Number of bytes for UTF8 character starting at *curr. Note: the loop below 66 // is guaranteed to terminate because in each iteration, we move curr by at 67 // least num_bytes, and num_bytes is guaranteed to be > 0. 68 int num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 69 while (curr < end) { 70 // Jump over consecutive token separators. 71 while (IsTokenSeparator(num_bytes, curr)) { 72 curr += num_bytes; 73 if (curr >= end) { 74 return; 75 } 76 num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 77 } 78 79 // If control reaches this point, we are at beginning of a non-empty token. 80 std::string *word = sentence->add_word(); 81 82 // Add special token-start character. 83 word->push_back('^'); 84 85 // Add UTF8 characters to word, until we hit the end of the safe text or a 86 // token separator. 87 while (true) { 88 word->append(curr, num_bytes); 89 curr += num_bytes; 90 if (curr >= end) { 91 break; 92 } 93 num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 94 if (IsTokenSeparator(num_bytes, curr)) { 95 curr += num_bytes; 96 num_bytes = GetNumBytesForNonZeroUTF8Char(curr); 97 break; 98 } 99 } 100 word->push_back('$'); 101 102 // Note: we intentionally do not token.set_start()/end(), as those fields 103 // are not used by the langid model. 104 } 105 } 106 107 } // namespace lang_id 108 } // namespace nlp_core 109 } // namespace libtextclassifier 110