Home | History | Annotate | Download | only in lang_id
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "lang_id/custom-tokenizer.h"
     18 
     19 #include <ctype.h>
     20 
     21 #include <string>
     22 
     23 #include "util/strings/utf8.h"
     24 
     25 namespace libtextclassifier {
     26 namespace nlp_core {
     27 namespace lang_id {
     28 
     29 namespace {
     30 inline bool IsTokenSeparator(int num_bytes, const char *curr) {
     31   if (num_bytes != 1) {
     32     return false;
     33   }
     34   return !isalpha(*curr);
     35 }
     36 }  // namespace
     37 
     38 const char *GetSafeEndOfString(const char *data, size_t size) {
     39   const char *const hard_end = data + size;
     40   const char *curr = data;
     41   while (curr < hard_end) {
     42     int num_bytes = GetNumBytesForUTF8Char(curr);
     43     if (num_bytes == 0) {
     44       break;
     45     }
     46     const char *new_curr = curr + num_bytes;
     47     if (new_curr > hard_end) {
     48       return curr;
     49     }
     50     curr = new_curr;
     51   }
     52   return curr;
     53 }
     54 
     55 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
     56   const char *const start = text.data();
     57   const char *curr = start;
     58   const char *end = GetSafeEndOfString(start, text.size());
     59 
     60   // Corner case: empty safe part of the text.
     61   if (curr >= end) {
     62     return;
     63   }
     64 
     65   // Number of bytes for UTF8 character starting at *curr.  Note: the loop below
     66   // is guaranteed to terminate because in each iteration, we move curr by at
     67   // least num_bytes, and num_bytes is guaranteed to be > 0.
     68   int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
     69   while (curr < end) {
     70     // Jump over consecutive token separators.
     71     while (IsTokenSeparator(num_bytes, curr)) {
     72       curr += num_bytes;
     73       if (curr >= end) {
     74         return;
     75       }
     76       num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
     77     }
     78 
     79     // If control reaches this point, we are at beginning of a non-empty token.
     80     std::string *word = sentence->add_word();
     81 
     82     // Add special token-start character.
     83     word->push_back('^');
     84 
     85     // Add UTF8 characters to word, until we hit the end of the safe text or a
     86     // token separator.
     87     while (true) {
     88       word->append(curr, num_bytes);
     89       curr += num_bytes;
     90       if (curr >= end) {
     91         break;
     92       }
     93       num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
     94       if (IsTokenSeparator(num_bytes, curr)) {
     95         curr += num_bytes;
     96         num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
     97         break;
     98       }
     99     }
    100     word->push_back('$');
    101 
    102     // Note: we intentionally do not token.set_start()/end(), as those fields
    103     // are not used by the langid model.
    104   }
    105 }
    106 
    107 }  // namespace lang_id
    108 }  // namespace nlp_core
    109 }  // namespace libtextclassifier
    110