1 // Copyright 2007-2010 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_UNICODE_INL_H_ 6 #define V8_UNICODE_INL_H_ 7 8 #include "src/unicode.h" 9 #include "src/base/logging.h" 10 #include "src/utils.h" 11 12 namespace unibrow { 13 14 template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { 15 CacheEntry entry = entries_[code_point & kMask]; 16 if (entry.code_point() == code_point) return entry.value(); 17 return CalculateValue(code_point); 18 } 19 20 template <class T, int s> bool Predicate<T, s>::CalculateValue( 21 uchar code_point) { 22 bool result = T::Is(code_point); 23 entries_[code_point & kMask] = CacheEntry(code_point, result); 24 return result; 25 } 26 27 template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, 28 uchar* result) { 29 CacheEntry entry = entries_[c & kMask]; 30 if (entry.code_point_ == c) { 31 if (entry.offset_ == 0) { 32 return 0; 33 } else { 34 result[0] = c + entry.offset_; 35 return 1; 36 } 37 } else { 38 return CalculateValue(c, n, result); 39 } 40 } 41 42 template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, 43 uchar* result) { 44 bool allow_caching = true; 45 int length = T::Convert(c, n, result, &allow_caching); 46 if (allow_caching) { 47 if (length == 1) { 48 entries_[c & kMask] = CacheEntry(c, result[0] - c); 49 return 1; 50 } else { 51 entries_[c & kMask] = CacheEntry(c, 0); 52 return 0; 53 } 54 } else { 55 return length; 56 } 57 } 58 59 60 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 61 static const int kMask = ~(1 << 6); 62 if (c <= kMaxOneByteChar) { 63 str[0] = c; 64 return 1; 65 } 66 str[0] = 0xC0 | (c >> 6); 67 str[1] = 0x80 | (c & kMask); 68 return 2; 69 } 70 71 // Encode encodes the UTF-16 code units c and previous into the given str 72 // buffer, and combines surrogate code units into single code points. If 73 // replace_invalid is set to true, orphan surrogate code units will be replaced 74 // with kBadChar. 75 unsigned Utf8::Encode(char* str, 76 uchar c, 77 int previous, 78 bool replace_invalid) { 79 static const int kMask = ~(1 << 6); 80 if (c <= kMaxOneByteChar) { 81 str[0] = c; 82 return 1; 83 } else if (c <= kMaxTwoByteChar) { 84 str[0] = 0xC0 | (c >> 6); 85 str[1] = 0x80 | (c & kMask); 86 return 2; 87 } else if (c <= kMaxThreeByteChar) { 88 if (Utf16::IsSurrogatePair(previous, c)) { 89 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 90 return Encode(str - kUnmatchedSize, 91 Utf16::CombineSurrogatePair(previous, c), 92 Utf16::kNoPreviousCharacter, 93 replace_invalid) - kUnmatchedSize; 94 } else if (replace_invalid && 95 (Utf16::IsLeadSurrogate(c) || 96 Utf16::IsTrailSurrogate(c))) { 97 c = kBadChar; 98 } 99 str[0] = 0xE0 | (c >> 12); 100 str[1] = 0x80 | ((c >> 6) & kMask); 101 str[2] = 0x80 | (c & kMask); 102 return 3; 103 } else { 104 str[0] = 0xF0 | (c >> 18); 105 str[1] = 0x80 | ((c >> 12) & kMask); 106 str[2] = 0x80 | ((c >> 6) & kMask); 107 str[3] = 0x80 | (c & kMask); 108 return 4; 109 } 110 } 111 112 113 uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { 114 if (length <= 0) return kBadChar; 115 byte first = bytes[0]; 116 // Characters between 0000 and 0007F are encoded as a single character 117 if (first <= kMaxOneByteChar) { 118 *cursor += 1; 119 return first; 120 } 121 return CalculateValue(bytes, length, cursor); 122 } 123 124 unsigned Utf8::Length(uchar c, int previous) { 125 if (c <= kMaxOneByteChar) { 126 return 1; 127 } else if (c <= kMaxTwoByteChar) { 128 return 2; 129 } else if (c <= kMaxThreeByteChar) { 130 if (Utf16::IsTrailSurrogate(c) && 131 Utf16::IsLeadSurrogate(previous)) { 132 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 133 } 134 return 3; 135 } else { 136 return 4; 137 } 138 } 139 140 bool Utf8::IsValidCharacter(uchar c) { 141 return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || 142 (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && 143 c != kBadChar); 144 } 145 146 } // namespace unibrow 147 148 #endif // V8_UNICODE_INL_H_ 149