/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Makes std::string / std::basic_string available under the
// libtextclassifier3 namespace, so that code inside that namespace can refer
// to them unqualified (e.g. "string").
#ifndef TC3_STD_STRING_IMPORT
#define TC3_STD_STRING_IMPORT
#include <string>

namespace libtextclassifier3 {
using string = std::string;
template <class CharT, class Traits = std::char_traits<CharT>,
          class Allocator = std::allocator<CharT> >
using basic_string = std::basic_string<CharT, Traits, Allocator>;
}  // namespace libtextclassifier3
#endif
#ifndef NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_
#define NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_

#include <stddef.h>

#include <string>

namespace libtextclassifier3 {
namespace mobile {
namespace utils {

// Returns the length (number of bytes) of the UTF8 code point starting at src,
// by reading only the byte from address src.
//
// The result is a number from the set {1, 2, 3, 4}; it is selected by the top
// 4 bits of the byte at src:
//   0x0 - 0xB -> 1
//   0xC - 0xD -> 2
//   0xE       -> 3
//   0xF       -> 4
//
// Preconditions: src != nullptr (the byte at src is read unconditionally).
static inline int OneCharLen(const char *src) {
  // Whether plain char is signed or unsigned is implementation-defined.
  // Converting to unsigned char makes sure the byte is always interpreted as a
  // non-negative value before it is used to index the table below.
  const unsigned char lead_byte = static_cast<unsigned char>(*src);

  // 16-entry lookup table indexed by the top 4 bits of the lead byte.  The
  // "& 0xFF" mask keeps the index inside the table regardless of the width of
  // unsigned char.
  return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(lead_byte & 0xFF) >> 4];
}

// Returns a pointer "end" inside [data, data + size) such that the prefix from
// [data, end) is the largest one that does not contain '\0' and offers the
// following guarantee: if one starts with
//
//   curr = data
//
// and keeps executing
//
//   curr += OneCharLen(curr)
//
// one would eventually reach curr == end (the pointer returned by this
// function) without accessing data outside the string.  This guards against
// scenarios like a broken UTF8 string which has only e.g., the first 2 bytes
// from a 3-byte UTF8 sequence.
//
// Preconditions: data != nullptr.
const char *GetSafeEndOfUtf8String(const char *data, size_t size);

// Convenience overload: forwards text.data() and text.size() to the
// (data, size) version above and returns its result.
static inline const char *GetSafeEndOfUtf8String(const string &text) {
  return GetSafeEndOfUtf8String(text.data(), text.size());
}

}  // namespace utils
}  // namespace mobile
}  // namespace libtextclassifier3

#endif  // NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_