Utf8Utils for dicttoolkit.

Bug: 10059681 Change-Id: Ie484ba8096823792f0ac663524d1c02d1be070e9
2014-11-10 12:10:36 +09:00 · 2014-11-10 12:10:36 +09:00 · f0c303dd02
commit f0c303dd02
parent cd10540973
4 changed files with 265 additions and 2 deletions
--- a/native/dicttoolkit/NativeFileList.mk
+++ b/native/dicttoolkit/NativeFileList.mk
@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
        makedict_executor.cpp) \
    $(addprefix offdevice_intermediate_dict/, \
        offdevice_intermediate_dict.cpp) \
-    utils/command_utils.cpp
+    $(addprefix utils/, \
+        command_utils.cpp \
+        utf8_utils.cpp)

 LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
    dict_toolkit_defines_test.cpp \
    $(addprefix offdevice_intermediate_dict/, \
        offdevice_intermediate_dict_test.cpp) \
    $(addprefix utils/, \
-        command_utils_test.cpp)
+        command_utils_test.cpp \
+        utf8_utils_test.cpp)
--- a/native/dicttoolkit/src/utils/utf8_utils.cpp
+++ b/native/dicttoolkit/src/utils/utf8_utils.cpp
@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include "utils/char_utils.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+// Longest UTF-8 sequence, in bytes, handled for a single code point.
+const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
+// The four tables below are indexed by sequence size in bytes (1 to 4). Index 0 is a placeholder.
+// Bits of the first byte that hold the sequence size marker.
+const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
+// Value the marker bits of the first byte must have for each sequence size.
+const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
+// Bits of the first byte that carry code point data. The first byte of a 4 bytes sequence is
+// 11110xxx, so it carries three data bits (0x07); masking with 0x03 would drop the top bit and
+// make U+100000..U+10FFFF undecodable.
+const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
+// Largest code point that may be encoded with each sequence size. getCodePoints() compares a
+// decoded value against the entry for the next smaller size to detect redundant sequences, so
+// index 0 is -1 to let every 1 byte value pass that check.
+const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
+// Bits of a second or later byte that carry code point data.
+const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
+// Marker bits (10xxxxxx) set on every second or later byte when encoding.
+const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
+// Number of code point bits stored in each second or later byte.
+const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
+
+// Decodes a UTF-8 encoded string into its code points.
+//
+// Returns the code points in input order. An empty vector is returned, after logging an error,
+// when the input contains an invalid first byte, a second or later byte that is not of the form
+// 10xxxxxx, a redundant (overlong) sequence, a value larger than the maximum for its sequence
+// size, or when the input ends in the middle of a sequence. An empty input yields an empty vector.
+/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
+    // Bits of a second or later byte that hold its marker rather than code point data.
+    const uint8_t trailingByteMarkerMask =
+            static_cast<uint8_t>(~TRAILING_BYTE_CODE_POINT_BITS_MASK);
+    std::vector<int> codePoints;
+    int remainingByteCountForCurrentCodePoint = 0;
+    int currentCodePointSequenceSize = 0;
+    int codePoint = 0;
+    for (const char c : utf8Str) {
+        // char may be signed; use the unsigned byte value so that logging is not affected by
+        // sign extension.
+        const uint8_t byte = static_cast<uint8_t>(c);
+        if (remainingByteCountForCurrentCodePoint == 0) {
+            // Start of a new sequence: the first byte tells how many bytes follow.
+            currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(byte);
+            if (currentCodePointSequenceSize <= 0) {
+                AKLOGE("%x is an invalid utf8 first byte value.", byte);
+                return std::vector<int>();
+            }
+            remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
+            codePoint = maskFirstByte(byte, remainingByteCountForCurrentCodePoint);
+        } else {
+            // Inside a sequence: only bytes of the form 10xxxxxx are acceptable.
+            if ((byte & trailingByteMarkerMask) != TRAILING_BYTE_MARKER) {
+                AKLOGE("%x is an invalid utf8 second or later byte value.", byte);
+                return std::vector<int>();
+            }
+            codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+            codePoint += maskTrailingByte(byte);
+        }
+        remainingByteCountForCurrentCodePoint--;
+        if (remainingByteCountForCurrentCodePoint == 0) {
+            // A value that fits in a shorter sequence must not be encoded with a longer one.
+            if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
+                AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
+                        currentCodePointSequenceSize,  codePoint);
+                return std::vector<int>();
+            }
+            // The data bits of the longest sequence can hold more than the largest code point.
+            if (codePoint > MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize]) {
+                AKLOGE("codePoint(%x) is too large for a %d bytes UTF-8 sequence.",
+                        codePoint, currentCodePointSequenceSize);
+                return std::vector<int>();
+            }
+            codePoints.push_back(codePoint);
+        }
+    }
+    if (remainingByteCountForCurrentCodePoint != 0) {
+        // The input stopped before the last sequence was complete.
+        AKLOGE("The string ends in the middle of a %d bytes UTF-8 sequence.",
+                currentCodePointSequenceSize);
+        return std::vector<int>();
+    }
+    return codePoints;
+}
+
+// Determines the size in bytes of a UTF-8 sequence from its first byte.
+// Returns a size between 1 and MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT, or -1 when firstByte matches
+// none of the first byte markers.
+/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
+    size_t sequenceSize = 1;
+    while (sequenceSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT) {
+        const uint8_t markerBits =
+                static_cast<uint8_t>(firstByte & FIRST_BYTE_MARKER_MASKS[sequenceSize]);
+        if (markerBits == FIRST_BYTE_MARKERS[sequenceSize]) {
+            return static_cast<int>(sequenceSize);
+        }
+        ++sequenceSize;
+    }
+    // No marker matched, so this byte cannot start a sequence.
+    return -1;
+}
+
+// Returns the code point bits held by the first byte of a sequence of sequenceSize bytes.
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
+        const int sequenceSize) {
+    const uint8_t codePointBitsMask = FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
+    return firstByte & codePointBitsMask;
+}
+
+// Returns the code point bits held by a second or later byte of a sequence.
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
+    const int codePointBits = secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+    return codePointBits;
+}
+
+// Encodes a sequence of code points as a UTF-8 string.
+// Returns an empty string, after logging an error, as soon as one code point cannot be encoded.
+/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
+    std::string encoded;
+    for (const int codePoint : codePoints) {
+        const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
+        if (sequenceSize <= 0) {
+            AKLOGE("Cannot encode code point (%d).", codePoint);
+            return std::string();
+        }
+        // Number of bits stored in the bytes that follow the one being written.
+        int lowerBitCount =
+                (sequenceSize - 1) * static_cast<int>(CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
+        // First byte: size marker plus the most significant bits of the code point.
+        const int firstByteBits = codePoint >> lowerBitCount;
+        encoded.push_back(static_cast<char>(firstByteBits | FIRST_BYTE_MARKERS[sequenceSize]));
+        // Remaining bytes: six bits each, from most to least significant.
+        while (lowerBitCount > 0) {
+            lowerBitCount -= static_cast<int>(CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
+            const int trailingByteBits =
+                    (codePoint >> lowerBitCount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+            encoded.push_back(static_cast<char>(trailingByteBits | TRAILING_BYTE_MARKER));
+        }
+    }
+    return encoded;
+}
+
+// Returns the number of bytes needed to encode codePoint, or -1 when codePoint is negative or
+// larger than every entry of MAX_ENCODED_CODE_POINT_VALUES.
+/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
+    if (codePoint >= 0) {
+        // Pick the smallest sequence size whose maximum value can hold the code point.
+        for (size_t sequenceSize = 1; sequenceSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;
+                ++sequenceSize) {
+            if (MAX_ENCODED_CODE_POINT_VALUES[sequenceSize] >= codePoint) {
+                return static_cast<int>(sequenceSize);
+            }
+        }
+    }
+    return -1;
+}
+
+} // namespace dicttoolkit
+} // namespace latinime
--- a/native/dicttoolkit/src/utils/utf8_utils.h
+++ b/native/dicttoolkit/src/utils/utf8_utils.h
@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "dict_toolkit_defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+// Static helpers to convert between UTF-8 encoded strings and code point sequences.
+class Utf8Utils {
+public:
+    // Decodes utf8Str into code points. Returns an empty vector when the input cannot be decoded,
+    // for example because of an invalid first byte or a redundant sequence.
+    static std::vector<int> getCodePoints(const std::string &utf8Str);
+    // Encodes codePoints as a UTF-8 string. Returns an empty string when any code point cannot be
+    // encoded, i.e. it is negative or larger than the largest supported code point.
+    static std::string getUtf8String(const CodePointArrayView codePoints);
+
+private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils);
+
+    // Longest sequence, in bytes, used for one code point.
+    static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;
+    // Values indexed by sequence size.
+    static const uint8_t FIRST_BYTE_MARKER_MASKS[];
+    static const uint8_t FIRST_BYTE_MARKERS[];
+    static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[];
+    static const int MAX_ENCODED_CODE_POINT_VALUES[];
+
+    // Values describing the second and later bytes of a sequence.
+    static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK;
+    static const uint8_t TRAILING_BYTE_MARKER;
+    static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+
+    // Returns the sequence size announced by firstByte, or -1 if it is not a valid first byte.
+    static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte);
+    // Returns the code point bits held by the first byte of a sequenceSize bytes sequence.
+    static int maskFirstByte(const uint8_t firstByte, const int sequenceSize);
+    // Returns the code point bits held by a second or later byte.
+    static int maskTrailingByte(const uint8_t secondOrLaterByte);
+    // Returns the sequence size needed to encode codePoint, or -1 if it cannot be encoded.
+    static int getSequenceSizeToEncodeCodePoint(const int codePoint);
+};
+} // namespace dicttoolkit
+} // namespace latinime
+#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
--- a/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
+++ b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+namespace {
+
+// Checks decoding of empty, ASCII, multi-byte and invalid inputs.
+TEST(Utf8UtilsTests, TestGetCodePoints) {
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
+        EXPECT_EQ(0u, codePoints.size());
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints("test");
+        // ASSERT so that the element checks below never index past the end when decoding fails.
+        ASSERT_EQ(4u, codePoints.size());
+        EXPECT_EQ('t', codePoints[0]);
+        EXPECT_EQ('e', codePoints[1]);
+        EXPECT_EQ('s', codePoints[2]);
+        EXPECT_EQ('t', codePoints[3]);
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410");
+        ASSERT_EQ(4u, codePoints.size());
+        EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A
+        EXPECT_EQ('a', codePoints[1]);
+        EXPECT_EQ(0x03C2, codePoints[2]); // GREEK SMALL LETTER FINAL SIGMA
+        EXPECT_EQ(0x0410, codePoints[3]); // CYRILLIC CAPITAL LETTER A
+    }
+    {
+        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752");
+        ASSERT_EQ(3u, codePoints.size());
+        EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE
+        EXPECT_EQ('?', codePoints[1]);
+        EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
+    }
+
+    // Redundant UTF-8 sequences must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
+
+    // Bytes that cannot start a sequence must be rejected.
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\x80").empty());
+    EXPECT_TRUE(Utf8Utils::getCodePoints("\xFF").empty());
+}
+
+// Checks encoding of ASCII text and of code points needing 2, 3 and 4 bytes.
+TEST(Utf8UtilsTests, TestGetUtf8String) {
+    {
+        const std::vector<int> asciiCodePoints = {'t', 'e', 's', 't'};
+        const std::string encoded =
+                Utf8Utils::getUtf8String(CodePointArrayView(asciiCodePoints));
+        EXPECT_EQ("test", encoded);
+    }
+    {
+        const std::vector<int> mixedCodePoints = {
+                0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */,
+                0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
+                0x0430 /* CYRILLIC SMALL LETTER A */,
+                0x3042 /* HIRAGANA LETTER A */,
+                0x1F36A /* COOKIE */,
+                0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
+        };
+        const std::string encoded =
+                Utf8Utils::getUtf8String(CodePointArrayView(mixedCodePoints));
+        EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752", encoded);
+    }
+}
+
+} // namespace
+} // namespace dicttoolkit
+} // namespace latinime