From f0c303dd02a5df8ad544b3971e7738cb34a1d6be Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi
Date: Mon, 10 Nov 2014 12:10:36 +0900
Subject: [PATCH] Utf8Utils for dicttoolkit.

Bug: 10059681
Change-Id: Ie484ba8096823792f0ac663524d1c02d1be070e9
---
Note: the blob ids in the "index" lines of the three new files are the ones
from the original revision of this patch and were not recomputed.

 native/dicttoolkit/NativeFileList.mk          |   7 +-
 native/dicttoolkit/src/utils/utf8_utils.cpp   | 158 ++++++++++++++++++
 native/dicttoolkit/src/utils/utf8_utils.h     |  61 +++++++
 .../tests/utils/utf8_utils_test.cpp           | 113 +++++++++++++
 4 files changed, 337 insertions(+), 2 deletions(-)
 create mode 100644 native/dicttoolkit/src/utils/utf8_utils.cpp
 create mode 100644 native/dicttoolkit/src/utils/utf8_utils.h
 create mode 100644 native/dicttoolkit/tests/utils/utf8_utils_test.cpp

diff --git a/native/dicttoolkit/NativeFileList.mk b/native/dicttoolkit/NativeFileList.mk
index b39a24890..1c004f73a 100644
--- a/native/dicttoolkit/NativeFileList.mk
+++ b/native/dicttoolkit/NativeFileList.mk
@@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
         makedict_executor.cpp) \
     $(addprefix offdevice_intermediate_dict/, \
         offdevice_intermediate_dict.cpp) \
-    utils/command_utils.cpp
+    $(addprefix utils/, \
+        command_utils.cpp \
+        utf8_utils.cpp)
 
 LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
     dict_toolkit_defines_test.cpp \
     $(addprefix offdevice_intermediate_dict/, \
         offdevice_intermediate_dict_test.cpp) \
     $(addprefix utils/, \
-        command_utils_test.cpp)
+        command_utils_test.cpp \
+        utf8_utils_test.cpp)
diff --git a/native/dicttoolkit/src/utils/utf8_utils.cpp b/native/dicttoolkit/src/utils/utf8_utils.cpp
new file mode 100644
index 000000000..0f349f512
--- /dev/null
+++ b/native/dicttoolkit/src/utils/utf8_utils.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include "utils/char_utils.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
+// The tables below are indexed by sequence size (1 to 4); 0 is not a valid sequence size.
+const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
+const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
+// A 4 byte sequence starts with 11110xxx, so its first byte carries 3 code point bits.
+const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
+const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
+const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
+const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
+const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
+
+/**
+ * Decodes a UTF-8 string into code points.
+ *
+ * Returns an empty vector when the string is empty or is not well-formed: an invalid first or
+ * trailing byte, a truncated sequence, a redundant (overlong) encoding, or a value that is
+ * larger than the maximum code point value.
+ */
+/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
+    // The bits of a trailing byte that do not carry code point bits hold its marker.
+    const uint8_t trailingByteMarkerMask =
+            static_cast<uint8_t>(~TRAILING_BYTE_CODE_POINT_BITS_MASK);
+    std::vector<int> codePoints;
+    int remainingByteCountForCurrentCodePoint = 0;
+    int currentCodePointSequenceSize = 0;
+    int codePoint = 0;
+    for (const char c : utf8Str) {
+        const uint8_t currentByte = static_cast<uint8_t>(c);
+        if (remainingByteCountForCurrentCodePoint == 0) {
+            currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(currentByte);
+            if (currentCodePointSequenceSize <= 0) {
+                AKLOGE("%x is an invalid utf8 first byte value.", currentByte);
+                return std::vector<int>();
+            }
+            remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
+            codePoint = maskFirstByte(currentByte, remainingByteCountForCurrentCodePoint);
+        } else {
+            if ((currentByte & trailingByteMarkerMask) != TRAILING_BYTE_MARKER) {
+                AKLOGE("%x is an invalid utf8 trailing byte value.", currentByte);
+                return std::vector<int>();
+            }
+            codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+            codePoint += maskTrailingByte(currentByte);
+        }
+        remainingByteCountForCurrentCodePoint--;
+        if (remainingByteCountForCurrentCodePoint == 0) {
+            if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
+                AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
+                        currentCodePointSequenceSize, codePoint);
+                return std::vector<int>();
+            }
+            if (codePoint > MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize]) {
+                AKLOGE("codePoint(%x) is larger than the maximum code point value.", codePoint);
+                return std::vector<int>();
+            }
+            codePoints.push_back(codePoint);
+        }
+    }
+    if (remainingByteCountForCurrentCodePoint != 0) {
+        AKLOGE("The utf8 string ends in the middle of a %d bytes sequence.",
+                currentCodePointSequenceSize);
+        return std::vector<int>();
+    }
+    return codePoints;
+}
+
+// Returns the size of the sequence that starts with firstByte, or -1 if firstByte cannot start
+// a sequence.
+/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
+    for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
+        if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
+            return static_cast<int>(i);
+        }
+    }
+    // Not a valid utf8 char first byte.
+    return -1;
+}
+
+// Returns the code point bits stored in the first byte of a sequenceSize bytes sequence.
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
+        const int sequenceSize) {
+    return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
+}
+
+// Returns the code point bits stored in a second or later byte of a sequence.
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
+    return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+}
+
+/**
+ * Encodes code points into a UTF-8 string.
+ *
+ * Returns an empty string when codePoints is empty or contains a value that cannot be encoded
+ * (negative or larger than 0x10FFFF).
+ */
+/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
+    std::string utf8String;
+    for (const int codePoint : codePoints) {
+        const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
+        if (sequenceSize <= 0) {
+            AKLOGE("Cannot encode code point (%d).", codePoint);
+            return std::string();
+        }
+        const int trailingByteCount = sequenceSize - 1;
+        // Output first byte.
+        const int firstByteValue =
+                codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
+        utf8String.push_back(static_cast<char>(firstByteValue | FIRST_BYTE_MARKERS[sequenceSize]));
+        // Output second and later bytes.
+        for (int i = 1; i < sequenceSize; ++i) {
+            const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+            const int trailingByteValue =
+                    (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+            utf8String.push_back(static_cast<char>(trailingByteValue | TRAILING_BYTE_MARKER));
+        }
+    }
+    return utf8String;
+}
+
+// Returns the number of bytes needed to encode codePoint, or -1 if it cannot be encoded.
+/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
+    if (codePoint < 0) {
+        return -1;
+    }
+    for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
+        if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
+            return static_cast<int>(i);
+        }
+    }
+    return -1;
+}
+
+} // namespace dicttoolkit
+} // namespace latinime
diff --git a/native/dicttoolkit/src/utils/utf8_utils.h b/native/dicttoolkit/src/utils/utf8_utils.h
new file mode 100644
index 000000000..35818e56c
--- /dev/null
+++ b/native/dicttoolkit/src/utils/utf8_utils.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "dict_toolkit_defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+// Utilities to convert between UTF-8 strings and code point sequences.
+class Utf8Utils {
+public:
+    // Returns the code points encoded in utf8Str. Returns an empty vector if utf8Str is empty or
+    // is not a well-formed UTF-8 string.
+    static std::vector<int> getCodePoints(const std::string &utf8Str);
+    // Returns the UTF-8 encoding of codePoints. Returns an empty string if codePoints is empty
+    // or contains a code point that cannot be encoded.
+    static std::string getUtf8String(const CodePointArrayView codePoints);
+
+private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils);
+
+    // Values indexed by sequence size.
+    static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;
+    static const uint8_t FIRST_BYTE_MARKER_MASKS[];
+    static const uint8_t FIRST_BYTE_MARKERS[];
+    static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[];
+    static const int MAX_ENCODED_CODE_POINT_VALUES[];
+
+    static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK;
+    static const uint8_t TRAILING_BYTE_MARKER;
+    static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+
+    static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte);
+    static int maskFirstByte(const uint8_t firstByte, const int sequenceSize);
+    static int maskTrailingByte(const uint8_t secondOrLaterByte);
+    static int getSequenceSizeToEncodeCodePoint(const int codePoint);
+};
+} // namespace dicttoolkit
+} // namespace latinime
+#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
diff --git a/native/dicttoolkit/tests/utils/utf8_utils_test.cpp b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
new file mode 100644
index 000000000..9c59a8b05
--- /dev/null
+++ b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+namespace {
+
+TEST(Utf8UtilsTests, TestGetCodePoints) {
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
+        EXPECT_EQ(0u, codePoints.size());
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints("test");
+        EXPECT_EQ(4u, codePoints.size());
+        EXPECT_EQ('t', codePoints[0]);
+        EXPECT_EQ('e', codePoints[1]);
+        EXPECT_EQ('s', codePoints[2]);
+        EXPECT_EQ('t', codePoints[3]);
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410");
+        EXPECT_EQ(4u, codePoints.size());
+        EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A
+        EXPECT_EQ('a', codePoints[1]);
+        EXPECT_EQ(0x03C2, codePoints[2]); // GREEK SMALL LETTER FINAL SIGMA
+        EXPECT_EQ(0x0410, codePoints[3]); // CYRILLIC CAPITAL LETTER A
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752");
+        EXPECT_EQ(3u, codePoints.size());
+        EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE
+        EXPECT_EQ('?', codePoints[1]);
+        EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
+    }
+    {
+        // Code points in plane 16 use all of the 3 code point bits of the first byte.
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U00100000\U0010FFFF");
+        ASSERT_EQ(2u, codePoints.size());
+        EXPECT_EQ(0x100000, codePoints[0]);
+        EXPECT_EQ(0x10FFFF, codePoints[1]);
+    }
+
+    // Redundant UTF-8 sequences must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
+
+    // Sequences with an invalid first byte must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("a\x80").empty());
+    // Sequences with an invalid trailing byte must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC3\x28").empty());
+    // Truncated sequences must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("a\xE3\x81").empty());
+    // Values larger than the maximum code point value must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF4\x90\x80\x80").empty());
+}
+
+TEST(Utf8UtilsTests, TestGetUtf8String) {
+    {
+        const std::vector<int> codePoints = {'t', 'e', 's', 't'};
+        EXPECT_EQ("test", Utf8Utils::getUtf8String(CodePointArrayView(codePoints)));
+    }
+    {
+        const std::vector<int> codePoints = {
+                0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */,
+                0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
+                0x0430 /* CYRILLIC SMALL LETTER A */,
+                0x3042 /* HIRAGANA LETTER A */,
+                0x1F36A /* COOKIE */,
+                0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
+        };
+        EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752",
+                Utf8Utils::getUtf8String(CodePointArrayView(codePoints)));
+    }
+    {
+        const std::vector<int> codePoints = {0x100000, 0x10FFFF};
+        EXPECT_EQ(u8"\U00100000\U0010FFFF",
+                Utf8Utils::getUtf8String(CodePointArrayView(codePoints)));
+    }
+    {
+        // Values that are not code points cannot be encoded.
+        const std::vector<int> negative = {-1};
+        EXPECT_TRUE(Utf8Utils::getUtf8String(CodePointArrayView(negative)).empty());
+        const std::vector<int> tooLarge = {0x110000};
+        EXPECT_TRUE(Utf8Utils::getUtf8String(CodePointArrayView(tooLarge)).empty());
+    }
+}
+
+} // namespace
+} // namespace dicttoolkit
+} // namespace latinime