/* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_BYTE_ARRAY_UTILS_H #define LATINIME_BYTE_ARRAY_UTILS_H #include #include "defines.h" namespace latinime { /** * Utility methods for reading byte arrays. */ class ByteArrayUtils { public: /** * Integer writing * * Each method write a corresponding size integer in a big endian manner. */ static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, const uint32_t data, const int size, int *const pos) { // size must be in 1 to 4. ASSERT(size >= 1 && size <= 4); switch (size) { case 1: ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); return; case 2: ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); return; case 3: ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); return; case 4: ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); return; default: break; } } /** * Integer reading * * Each method read a corresponding size integer in a big endian manner. */ static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; } static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; } static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { return (buffer[pos] << 8) ^ buffer[pos + 1]; } static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { return buffer[pos]; } static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint32_t value = readUint32(buffer, *pos); *pos += 4; return value; } static AK_FORCE_INLINE int readSint24AndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint8_t value = readUint8(buffer, *pos); if (value < 0x80) { return readUint24AndAdvancePosition(buffer, pos); } else { (*pos)++; return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); } } static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint32_t value = readUint24(buffer, *pos); *pos += 3; return value; } static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint16_t value = readUint16(buffer, *pos); *pos += 2; return value; } static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( const uint8_t *const buffer, int *const pos) { return buffer[(*pos)++]; } static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, const int size, const int pos) { // size must be in 1 to 4. ASSERT(size >= 1 && size <= 4); switch (size) { case 1: return ByteArrayUtils::readUint8(buffer, pos); case 2: return ByteArrayUtils::readUint16(buffer, pos); case 3: return ByteArrayUtils::readUint24(buffer, pos); case 4: return ByteArrayUtils::readUint32(buffer, pos); default: return 0; } } /** * Code Point Reading * * 1 byte = bbbbbbbb match * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with * 00011111 would be outside unicode. * else: iso-latin-1 code * This allows for the whole unicode range to be encoded, including chars outside of * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control * characters which should never happen anyway (and still work, but take 3 bytes). */ static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { int p = pos; return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); } static AK_FORCE_INLINE int readCodePointAndAdvancePosition( const uint8_t *const buffer, const int *const codePointTable, int *const pos) { /* * codePointTable is an array to convert the most frequent characters in this dictionary to * 1 byte code points. It is only made of the original code points of the most frequent * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. * The original code points are restored by picking the code points at the indices of the * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. */ const uint8_t firstByte = readUint8(buffer, *pos); if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { *pos += 1; return NOT_A_CODE_POINT; } else { return readUint24AndAdvancePosition(buffer, pos); } } else { *pos += 1; if (codePointTable) { return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; } return firstByte; } } /** * String (array of code points) Reading * * Reads code points until the terminator is found. */ // Returns the length of the string. static int readStringAndAdvancePosition(const uint8_t *const buffer, const int maxLength, const int *const codePointTable, int *const outBuffer, int *const pos) { int length = 0; int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { outBuffer[length++] = codePoint; codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); } return length; } // Advances the position and returns the length of the string. static int advancePositionToBehindString( const uint8_t *const buffer, const int maxLength, int *const pos) { int length = 0; int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); length++; } return length; } /** * String (array of code points) Writing */ static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, const int *const codePoints, const int codePointCount, const bool writesTerminator, int *const pos) { for (int i = 0; i < codePointCount; ++i) { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. writeUint24AndAdvancePosition(buffer, codePoint, pos); } else { // one byte character. writeUint8AndAdvancePosition(buffer, codePoint, pos); } } if (writesTerminator) { writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); } } static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, const int codePointCount, const bool writesTerminator) { int byteCount = 0; for (int i = 0; i < codePointCount; ++i) { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. byteCount += 3; } else { // one byte character. byteCount += 1; } } if (writesTerminator) { // The terminator is one byte. byteCount += 1; } return byteCount; } private: DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, const uint32_t data, int *const pos) { buffer[(*pos)++] = (data >> 24) & 0xFF; buffer[(*pos)++] = (data >> 16) & 0xFF; buffer[(*pos)++] = (data >> 8) & 0xFF; buffer[(*pos)++] = data & 0xFF; } static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, const uint32_t data, int *const pos) { buffer[(*pos)++] = (data >> 16) & 0xFF; buffer[(*pos)++] = (data >> 8) & 0xFF; buffer[(*pos)++] = data & 0xFF; } static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, const uint16_t data, int *const pos) { buffer[(*pos)++] = (data >> 8) & 0xFF; buffer[(*pos)++] = data & 0xFF; } static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, const uint8_t data, int *const pos) { buffer[(*pos)++] = data & 0xFF; } }; } // namespace latinime #endif /* LATINIME_BYTE_ARRAY_UTILS_H */