Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2013, The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_BYTE_ARRAY_UTILS_H
     18 #define LATINIME_BYTE_ARRAY_UTILS_H
     19 
     20 #include <cstdint>
     21 
     22 #include "defines.h"
     23 
     24 namespace latinime {
     25 
     26 /**
     27  * Utility methods for reading and writing byte arrays.
     28  */
     29 class ByteArrayUtils {
     30  public:
     31     /**
     32      * Integer writing
     33      *
     34      * Each method write a corresponding size integer in a big endian manner.
     35      */
     36     static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
     37             const uint32_t data, const int size, int *const pos) {
     38         // size must be in 1 to 4.
     39         ASSERT(size >= 1 && size <= 4);
     40         switch (size) {
     41             case 1:
     42                 ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
     43                 return;
     44             case 2:
     45                 ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
     46                 return;
     47             case 3:
     48                 ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
     49                 return;
     50             case 4:
     51                 ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
     52                 return;
     53             default:
     54                 break;
     55         }
     56     }
     57 
     58     /**
     59      * Integer reading
     60      *
     61      * Each method read a corresponding size integer in a big endian manner.
     62      */
     63     static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
     64         return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16)
     65                 ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
     66     }
     67 
     68     static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
     69         return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
     70     }
     71 
     72     static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
     73         return (buffer[pos] << 8) ^ buffer[pos + 1];
     74     }
     75 
     76     static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
     77         return buffer[pos];
     78     }
     79 
     80     static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
     81             const uint8_t *const buffer, int *const pos) {
     82         const uint32_t value = readUint32(buffer, *pos);
     83         *pos += 4;
     84         return value;
     85     }
     86 
     87     static AK_FORCE_INLINE int readSint24AndAdvancePosition(
     88             const uint8_t *const buffer, int *const pos) {
     89         const uint8_t value = readUint8(buffer, *pos);
     90         if (value < 0x80) {
     91             return readUint24AndAdvancePosition(buffer, pos);
     92         } else {
     93             (*pos)++;
     94             return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
     95         }
     96     }
     97 
     98     static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
     99             const uint8_t *const buffer, int *const pos) {
    100         const uint32_t value = readUint24(buffer, *pos);
    101         *pos += 3;
    102         return value;
    103     }
    104 
    105     static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
    106             const uint8_t *const buffer, int *const pos) {
    107         const uint16_t value = readUint16(buffer, *pos);
    108         *pos += 2;
    109         return value;
    110     }
    111 
    112     static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
    113             const uint8_t *const buffer, int *const pos) {
    114         return buffer[(*pos)++];
    115     }
    116 
    117     static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
    118             const int size, const int pos) {
    119         // size must be in 1 to 4.
    120         ASSERT(size >= 1 && size <= 4);
    121         switch (size) {
    122             case 1:
    123                 return ByteArrayUtils::readUint8(buffer, pos);
    124             case 2:
    125                 return ByteArrayUtils::readUint16(buffer, pos);
    126             case 3:
    127                 return ByteArrayUtils::readUint24(buffer, pos);
    128             case 4:
    129                 return ByteArrayUtils::readUint32(buffer, pos);
    130             default:
    131                 return 0;
    132         }
    133     }
    134 
    135     /**
    136      * Code Point Reading
    137      *
    138      * 1 byte = bbbbbbbb match
    139      * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
    140      * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
    141      *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
    142      *       00011111 would be outside unicode.
    143      * else: iso-latin-1 code
    144      * This allows for the whole unicode range to be encoded, including chars outside of
    145      * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
    146      * characters which should never happen anyway (and still work, but take 3 bytes).
    147      */
    148     static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
    149         int p = pos;
    150         return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
    151     }
    152 
    153     static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
    154             const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
    155         /*
    156          * codePointTable is an array to convert the most frequent characters in this dictionary to
    157          * 1 byte code points. It is only made of the original code points of the most frequent
    158          * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
    159          * The original code points are restored by picking the code points at the indices of the
    160          * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
    161          */
    162         const uint8_t firstByte = readUint8(buffer, *pos);
    163         if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
    164             if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
    165                 *pos += 1;
    166                 return NOT_A_CODE_POINT;
    167             } else {
    168                 return readUint24AndAdvancePosition(buffer, pos);
    169             }
    170         } else {
    171             *pos += 1;
    172             if (codePointTable) {
    173                 return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
    174             }
    175             return firstByte;
    176         }
    177     }
    178 
    179     /**
    180      * String (array of code points) Reading
    181      *
    182      * Reads code points until the terminator is found.
    183      */
    184     // Returns the length of the string.
    185     static int readStringAndAdvancePosition(const uint8_t *const buffer,
    186             const int maxLength, const int *const codePointTable, int *const outBuffer,
    187             int *const pos) {
    188         int length = 0;
    189         int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
    190         while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
    191             outBuffer[length++] = codePoint;
    192             codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
    193         }
    194         return length;
    195     }
    196 
    197     // Advances the position and returns the length of the string.
    198     static int advancePositionToBehindString(
    199             const uint8_t *const buffer, const int maxLength, int *const pos) {
    200         int length = 0;
    201         int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
    202         while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
    203             codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
    204             length++;
    205         }
    206         return length;
    207     }
    208 
    209     /**
    210      * String (array of code points) Writing
    211      */
    212     static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
    213             const int *const codePoints, const int codePointCount, const bool writesTerminator,
    214             int *const pos) {
    215         for (int i = 0; i < codePointCount; ++i) {
    216             const int codePoint = codePoints[i];
    217             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
    218                 break;
    219             } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
    220                     || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
    221                 // three bytes character.
    222                 writeUint24AndAdvancePosition(buffer, codePoint, pos);
    223             } else {
    224                 // one byte character.
    225                 writeUint8AndAdvancePosition(buffer, codePoint, pos);
    226             }
    227         }
    228         if (writesTerminator) {
    229             writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
    230         }
    231     }
    232 
    233     static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
    234             const int codePointCount, const bool writesTerminator) {
    235         int byteCount = 0;
    236         for (int i = 0; i < codePointCount; ++i) {
    237             const int codePoint = codePoints[i];
    238             if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
    239                 break;
    240             } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
    241                     || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
    242                 // three bytes character.
    243                 byteCount += 3;
    244             } else {
    245                 // one byte character.
    246                 byteCount += 1;
    247             }
    248         }
    249         if (writesTerminator) {
    250             // The terminator is one byte.
    251             byteCount += 1;
    252         }
    253         return byteCount;
    254     }
    255 
    256  private:
    257     DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
    258 
    259     static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
    260     static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
    261     static const uint8_t CHARACTER_ARRAY_TERMINATOR;
    262 
    263     static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
    264             const uint32_t data, int *const pos) {
    265         buffer[(*pos)++] = (data >> 24) & 0xFF;
    266         buffer[(*pos)++] = (data >> 16) & 0xFF;
    267         buffer[(*pos)++] = (data >> 8) & 0xFF;
    268         buffer[(*pos)++] = data & 0xFF;
    269     }
    270 
    271     static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
    272             const uint32_t data, int *const pos) {
    273         buffer[(*pos)++] = (data >> 16) & 0xFF;
    274         buffer[(*pos)++] = (data >> 8) & 0xFF;
    275         buffer[(*pos)++] = data & 0xFF;
    276     }
    277 
    278     static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
    279             const uint16_t data, int *const pos) {
    280         buffer[(*pos)++] = (data >> 8) & 0xFF;
    281         buffer[(*pos)++] = data & 0xFF;
    282     }
    283 
    284     static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
    285             const uint8_t data, int *const pos) {
    286         buffer[(*pos)++] = data & 0xFF;
    287     }
    288 };
    289 } // namespace latinime
    290 #endif /* LATINIME_BYTE_ARRAY_UTILS_H */
    291