120 lines
4.7 KiB
C++
120 lines
4.7 KiB
C++
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "utils/utf8_utils.h"
|
|
|
|
#include "utils/char_utils.h"
|
|
|
|
namespace latinime {
|
|
namespace dicttoolkit {
|
|
|
|
// Upper bound, in bytes, of the sequence sizes handled by the functions in this file.
const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;

// The four tables below are indexed by the sequence size in bytes (1 to 4).

// Bits of the first byte that identify the sequence size.
const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
// Expected value of the bits selected by FIRST_BYTE_MARKER_MASKS for each sequence size:
// 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx.
const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
// Bits of the first byte that carry code point data. Each entry is the complement of the
// matching FIRST_BYTE_MARKER_MASKS entry. A 4-byte sequence starts with 11110xxx, so its first
// byte carries three payload bits (0x07); a narrower mask would drop bit 20 of the code point
// and break every code point whose first byte is 0xF4, i.e. U+100000 to U+10FFFF.
const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
// Largest code point value accepted for each sequence size. Entry 0 is -1 so that it compares
// below every non-negative code point when it is used as the "one byte shorter" limit for
// 1-byte sequences.
const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

// Second and later bytes have the form 10xxxxxx: the low six bits carry code point data.
const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
|
|
|
|
/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
|
|
std::vector<int> codePoints;
|
|
int remainingByteCountForCurrentCodePoint = 0;
|
|
int currentCodePointSequenceSize = 0;
|
|
int codePoint = 0;
|
|
for (const char c : utf8Str) {
|
|
if (remainingByteCountForCurrentCodePoint == 0) {
|
|
currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
|
|
if (currentCodePointSequenceSize <= 0) {
|
|
AKLOGE("%x is an invalid utf8 first byte value.", c);
|
|
return std::vector<int>();
|
|
}
|
|
remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
|
|
codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
|
|
} else {
|
|
codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
|
|
codePoint += maskTrailingByte(c);
|
|
}
|
|
remainingByteCountForCurrentCodePoint--;
|
|
if (remainingByteCountForCurrentCodePoint == 0) {
|
|
if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
|
|
AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
|
|
currentCodePointSequenceSize, codePoint);
|
|
return std::vector<int>();
|
|
}
|
|
codePoints.push_back(codePoint);
|
|
}
|
|
}
|
|
return codePoints;
|
|
}
|
|
|
|
/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
|
|
for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
|
|
if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
|
|
return i;
|
|
}
|
|
}
|
|
// Not a valid utf8 char first byte.
|
|
return -1;
|
|
}
|
|
|
|
/**
 * Extracts the code point bits stored in the first byte of a sequence.
 *
 * @param firstByte the first byte of the sequence.
 * @param sequenceSize the size of the sequence in bytes. It is used directly as an index into
 *        FIRST_BYTE_CODE_POINT_BITS_MASKS and is not range checked here.
 * @return the first byte with its size marker bits cleared.
 */
/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
        const int sequenceSize) {
    const uint8_t codePointBitsMask = FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
    return firstByte & codePointBitsMask;
}
|
|
|
|
/**
 * Extracts the code point bits stored in a second or later byte of a sequence.
 *
 * @param secondOrLaterByte a byte that follows the first byte of a sequence. Its marker bits are
 *        not checked here.
 * @return the bits of the byte selected by TRAILING_BYTE_CODE_POINT_BITS_MASK.
 */
/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
    const int codePointBits = secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
    return codePointBits;
}
|
|
|
|
/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
|
|
std::string utf8String;
|
|
for (const int codePoint : codePoints) {
|
|
const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
|
|
if (sequenceSize <= 0) {
|
|
AKLOGE("Cannot encode code point (%d).", codePoint);
|
|
return std::string();
|
|
}
|
|
const int trailingByteCount = sequenceSize - 1;
|
|
// Output first byte.
|
|
const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
|
|
utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize]));
|
|
// Output second and later bytes.
|
|
for (int i = 1; i < sequenceSize; ++i) {
|
|
const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
|
|
const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
|
|
utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER));
|
|
}
|
|
}
|
|
return utf8String;
|
|
}
|
|
|
|
/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
|
|
if (codePoint < 0) {
|
|
return -1;
|
|
}
|
|
for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
|
|
if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
} // namespace dicttoolkit
|
|
} // namespace latinime
|