1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_BYTE_ARRAY_UTILS_H 18 #define LATINIME_BYTE_ARRAY_UTILS_H 19 20 #include <cstdint> 21 22 #include "defines.h" 23 24 namespace latinime { 25 26 /** 27 * Utility methods for reading byte arrays. 28 */ 29 class ByteArrayUtils { 30 public: 31 /** 32 * Integer writing 33 * 34 * Each method write a corresponding size integer in a big endian manner. 35 */ 36 static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, 37 const uint32_t data, const int size, int *const pos) { 38 // size must be in 1 to 4. 39 ASSERT(size >= 1 && size <= 4); 40 switch (size) { 41 case 1: 42 ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); 43 return; 44 case 2: 45 ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); 46 return; 47 case 3: 48 ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); 49 return; 50 case 4: 51 ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); 52 return; 53 default: 54 break; 55 } 56 } 57 58 /** 59 * Integer reading 60 * 61 * Each method read a corresponding size integer in a big endian manner. 62 */ 63 static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { 64 return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) 65 ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; 66 } 67 68 static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { 69 return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; 70 } 71 72 static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { 73 return (buffer[pos] << 8) ^ buffer[pos + 1]; 74 } 75 76 static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { 77 return buffer[pos]; 78 } 79 80 static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( 81 const uint8_t *const buffer, int *const pos) { 82 const uint32_t value = readUint32(buffer, *pos); 83 *pos += 4; 84 return value; 85 } 86 87 static AK_FORCE_INLINE int readSint24AndAdvancePosition( 88 const uint8_t *const buffer, int *const pos) { 89 const uint8_t value = readUint8(buffer, *pos); 90 if (value < 0x80) { 91 return readUint24AndAdvancePosition(buffer, pos); 92 } else { 93 (*pos)++; 94 return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); 95 } 96 } 97 98 static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( 99 const uint8_t *const buffer, int *const pos) { 100 const uint32_t value = readUint24(buffer, *pos); 101 *pos += 3; 102 return value; 103 } 104 105 static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( 106 const uint8_t *const buffer, int *const pos) { 107 const uint16_t value = readUint16(buffer, *pos); 108 *pos += 2; 109 return value; 110 } 111 112 static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( 113 const uint8_t *const buffer, int *const pos) { 114 return buffer[(*pos)++]; 115 } 116 117 static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, 118 const int size, const int pos) { 119 // size must be in 1 to 4. 120 ASSERT(size >= 1 && size <= 4); 121 switch (size) { 122 case 1: 123 return ByteArrayUtils::readUint8(buffer, pos); 124 case 2: 125 return ByteArrayUtils::readUint16(buffer, pos); 126 case 3: 127 return ByteArrayUtils::readUint24(buffer, pos); 128 case 4: 129 return ByteArrayUtils::readUint32(buffer, pos); 130 default: 131 return 0; 132 } 133 } 134 135 /** 136 * Code Point Reading 137 * 138 * 1 byte = bbbbbbbb match 139 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 140 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 141 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 142 * 00011111 would be outside unicode. 143 * else: iso-latin-1 code 144 * This allows for the whole unicode range to be encoded, including chars outside of 145 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 146 * characters which should never happen anyway (and still work, but take 3 bytes). 147 */ 148 static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { 149 int p = pos; 150 return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); 151 } 152 153 static AK_FORCE_INLINE int readCodePointAndAdvancePosition( 154 const uint8_t *const buffer, const int *const codePointTable, int *const pos) { 155 /* 156 * codePointTable is an array to convert the most frequent characters in this dictionary to 157 * 1 byte code points. It is only made of the original code points of the most frequent 158 * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. 159 * The original code points are restored by picking the code points at the indices of the 160 * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. 161 */ 162 const uint8_t firstByte = readUint8(buffer, *pos); 163 if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { 164 if (firstByte == CHARACTER_ARRAY_TERMINATOR) { 165 *pos += 1; 166 return NOT_A_CODE_POINT; 167 } else { 168 return readUint24AndAdvancePosition(buffer, pos); 169 } 170 } else { 171 *pos += 1; 172 if (codePointTable) { 173 return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; 174 } 175 return firstByte; 176 } 177 } 178 179 /** 180 * String (array of code points) Reading 181 * 182 * Reads code points until the terminator is found. 183 */ 184 // Returns the length of the string. 185 static int readStringAndAdvancePosition(const uint8_t *const buffer, 186 const int maxLength, const int *const codePointTable, int *const outBuffer, 187 int *const pos) { 188 int length = 0; 189 int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); 190 while (NOT_A_CODE_POINT != codePoint && length < maxLength) { 191 outBuffer[length++] = codePoint; 192 codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); 193 } 194 return length; 195 } 196 197 // Advances the position and returns the length of the string. 198 static int advancePositionToBehindString( 199 const uint8_t *const buffer, const int maxLength, int *const pos) { 200 int length = 0; 201 int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); 202 while (NOT_A_CODE_POINT != codePoint && length < maxLength) { 203 codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); 204 length++; 205 } 206 return length; 207 } 208 209 /** 210 * String (array of code points) Writing 211 */ 212 static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, 213 const int *const codePoints, const int codePointCount, const bool writesTerminator, 214 int *const pos) { 215 for (int i = 0; i < codePointCount; ++i) { 216 const int codePoint = codePoints[i]; 217 if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { 218 break; 219 } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE 220 || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { 221 // three bytes character. 222 writeUint24AndAdvancePosition(buffer, codePoint, pos); 223 } else { 224 // one byte character. 225 writeUint8AndAdvancePosition(buffer, codePoint, pos); 226 } 227 } 228 if (writesTerminator) { 229 writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); 230 } 231 } 232 233 static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, 234 const int codePointCount, const bool writesTerminator) { 235 int byteCount = 0; 236 for (int i = 0; i < codePointCount; ++i) { 237 const int codePoint = codePoints[i]; 238 if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { 239 break; 240 } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE 241 || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { 242 // three bytes character. 243 byteCount += 3; 244 } else { 245 // one byte character. 246 byteCount += 1; 247 } 248 } 249 if (writesTerminator) { 250 // The terminator is one byte. 251 byteCount += 1; 252 } 253 return byteCount; 254 } 255 256 private: 257 DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); 258 259 static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; 260 static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; 261 static const uint8_t CHARACTER_ARRAY_TERMINATOR; 262 263 static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, 264 const uint32_t data, int *const pos) { 265 buffer[(*pos)++] = (data >> 24) & 0xFF; 266 buffer[(*pos)++] = (data >> 16) & 0xFF; 267 buffer[(*pos)++] = (data >> 8) & 0xFF; 268 buffer[(*pos)++] = data & 0xFF; 269 } 270 271 static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, 272 const uint32_t data, int *const pos) { 273 buffer[(*pos)++] = (data >> 16) & 0xFF; 274 buffer[(*pos)++] = (data >> 8) & 0xFF; 275 buffer[(*pos)++] = data & 0xFF; 276 } 277 278 static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, 279 const uint16_t data, int *const pos) { 280 buffer[(*pos)++] = (data >> 8) & 0xFF; 281 buffer[(*pos)++] = data & 0xFF; 282 } 283 284 static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, 285 const uint8_t data, int *const pos) { 286 buffer[(*pos)++] = data & 0xFF; 287 } 288 }; 289 } // namespace latinime 290 #endif /* LATINIME_BYTE_ARRAY_UTILS_H */ 291