am 95de7ad1: Merge "Add CharUtilsTest."
* commit '95de7ad1a8838d2a28050db8ef952e954ab9c2cc': Add CharUtilsTest.main
commit
75611ee033
|
@ -134,5 +134,6 @@ LATIN_IME_CORE_TEST_FILES := \
|
||||||
suggest/policyimpl/dictionary/utils/trie_map_test.cpp \
|
suggest/policyimpl/dictionary/utils/trie_map_test.cpp \
|
||||||
suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \
|
suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \
|
||||||
utils/autocorrection_threshold_utils_test.cpp \
|
utils/autocorrection_threshold_utils_test.cpp \
|
||||||
|
utils/char_utils_test.cpp \
|
||||||
utils/int_array_view_test.cpp \
|
utils/int_array_view_test.cpp \
|
||||||
utils/time_keeper_test.cpp
|
utils/time_keeper_test.cpp
|
||||||
|
|
|
@ -1057,11 +1057,11 @@ static int compare_pair_capital(const void *a, const void *b) {
|
||||||
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
|
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) {
|
/* static */ int CharUtils::latin_tolower(const int c) {
|
||||||
struct LatinCapitalSmallPair *p =
|
struct LatinCapitalSmallPair *p =
|
||||||
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
|
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
|
||||||
NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital));
|
NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital));
|
||||||
return p ? p->small : c;
|
return p ? static_cast<int>(p->small) : c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -27,20 +27,14 @@ namespace latinime {
|
||||||
|
|
||||||
class CharUtils {
|
class CharUtils {
|
||||||
public:
|
public:
|
||||||
|
static const std::vector<int> EMPTY_STRING;
|
||||||
|
|
||||||
static AK_FORCE_INLINE bool isAsciiUpper(int c) {
|
static AK_FORCE_INLINE bool isAsciiUpper(int c) {
|
||||||
// Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
|
// Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
|
||||||
// be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
|
// be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
|
||||||
return (c >= 'A' && c <= 'Z');
|
return (c >= 'A' && c <= 'Z');
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE int toAsciiLower(int c) {
|
|
||||||
return c - 'A' + 'a';
|
|
||||||
}
|
|
||||||
|
|
||||||
static AK_FORCE_INLINE bool isAscii(int c) {
|
|
||||||
return isascii(c) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static AK_FORCE_INLINE int toLowerCase(const int c) {
|
static AK_FORCE_INLINE int toLowerCase(const int c) {
|
||||||
if (isAsciiUpper(c)) {
|
if (isAsciiUpper(c)) {
|
||||||
return toAsciiLower(c);
|
return toAsciiLower(c);
|
||||||
|
@ -48,7 +42,7 @@ class CharUtils {
|
||||||
if (isAscii(c)) {
|
if (isAscii(c)) {
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
|
return latin_tolower(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
|
static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
|
||||||
|
@ -59,7 +53,6 @@ class CharUtils {
|
||||||
// TODO: Do not hardcode here
|
// TODO: Do not hardcode here
|
||||||
return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
|
return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
|
static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
|
||||||
int size = 0;
|
int size = 0;
|
||||||
for (; size < arraySize; ++size) {
|
for (; size < arraySize; ++size) {
|
||||||
|
@ -91,9 +84,6 @@ class CharUtils {
|
||||||
return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
|
return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned short latin_tolower(const unsigned short c);
|
|
||||||
static const std::vector<int> EMPTY_STRING;
|
|
||||||
|
|
||||||
// Returns updated code point count. Returns 0 when the code points cannot be marked as a
|
// Returns updated code point count. Returns 0 when the code points cannot be marked as a
|
||||||
// Beginning-of-Sentence.
|
// Beginning-of-Sentence.
|
||||||
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
|
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
|
||||||
|
@ -125,6 +115,16 @@ class CharUtils {
|
||||||
*/
|
*/
|
||||||
static const int BASE_CHARS_SIZE = 0x0500;
|
static const int BASE_CHARS_SIZE = 0x0500;
|
||||||
static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
|
static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool isAscii(int c) {
|
||||||
|
return isascii(c) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int toAsciiLower(int c) {
|
||||||
|
return c - 'A' + 'a';
|
||||||
|
}
|
||||||
|
|
||||||
|
static int latin_tolower(const int c);
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_CHAR_UTILS_H
|
#endif // LATINIME_CHAR_UTILS_H
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2014 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Checks that CharUtils::isAsciiUpper() is true for the two ends of the ASCII capital range and
// false for a sample of other ASCII and non-ASCII code points.
TEST(CharUtilsTest, TestIsAsciiUpper) {
    // Non-ASCII sample code points, named after their Unicode character names.
    const int latinCapitalLetterAWithGrave = 0x00C0;
    const int latinSmallLetterAWithGrave = 0x00E0;
    const int greekSmallLetterFinalSigma = 0x03C2;
    const int cyrillicCapitalLetterA = 0x0410;
    const int cyrillicSmallLetterA = 0x0430;
    const int hiraganaLetterA = 0x3042;
    const int cookie = 0x1F36A;

    // First and last ASCII capital letters.
    EXPECT_TRUE(CharUtils::isAsciiUpper('A'));
    EXPECT_TRUE(CharUtils::isAsciiUpper('Z'));

    // ASCII characters that are not capital letters.
    EXPECT_FALSE(CharUtils::isAsciiUpper('a'));
    EXPECT_FALSE(CharUtils::isAsciiUpper('z'));
    EXPECT_FALSE(CharUtils::isAsciiUpper('@'));
    EXPECT_FALSE(CharUtils::isAsciiUpper(' '));

    // Non-ASCII code points are never ASCII upper case, capital letters included.
    EXPECT_FALSE(CharUtils::isAsciiUpper(latinCapitalLetterAWithGrave));
    EXPECT_FALSE(CharUtils::isAsciiUpper(latinSmallLetterAWithGrave));
    EXPECT_FALSE(CharUtils::isAsciiUpper(greekSmallLetterFinalSigma));
    EXPECT_FALSE(CharUtils::isAsciiUpper(cyrillicCapitalLetterA));
    EXPECT_FALSE(CharUtils::isAsciiUpper(cyrillicSmallLetterA));
    EXPECT_FALSE(CharUtils::isAsciiUpper(hiraganaLetterA));
    EXPECT_FALSE(CharUtils::isAsciiUpper(cookie));
}
|
||||||
|
|
||||||
|
// Checks CharUtils::toLowerCase(): capital letters map to their small counterparts and every
// other sampled code point is returned unchanged.
TEST(CharUtilsTest, TestToLowerCase) {
    // Non-ASCII sample code points, named after their Unicode character names.
    const int latinCapitalLetterAWithGrave = 0x00C0;
    const int latinSmallLetterAWithGrave = 0x00E0;
    const int greekSmallLetterFinalSigma = 0x03C2;
    const int cyrillicCapitalLetterA = 0x0410;
    const int cyrillicSmallLetterA = 0x0430;
    const int hiraganaLetterA = 0x3042;
    const int cookie = 0x1F36A;

    // ASCII: capitals are lower-cased, everything else is left alone.
    EXPECT_EQ('a', CharUtils::toLowerCase('A'));
    EXPECT_EQ('z', CharUtils::toLowerCase('Z'));
    EXPECT_EQ('a', CharUtils::toLowerCase('a'));
    EXPECT_EQ('z', CharUtils::toLowerCase('z'));
    EXPECT_EQ('@', CharUtils::toLowerCase('@'));
    EXPECT_EQ(' ', CharUtils::toLowerCase(' '));

    // Accented Latin: the accent is preserved, only the case changes.
    EXPECT_EQ(latinSmallLetterAWithGrave, CharUtils::toLowerCase(latinCapitalLetterAWithGrave));
    EXPECT_EQ(latinSmallLetterAWithGrave, CharUtils::toLowerCase(latinSmallLetterAWithGrave));

    // Greek and Cyrillic.
    EXPECT_EQ(greekSmallLetterFinalSigma, CharUtils::toLowerCase(greekSmallLetterFinalSigma));
    EXPECT_EQ(cyrillicSmallLetterA, CharUtils::toLowerCase(cyrillicCapitalLetterA));
    EXPECT_EQ(cyrillicSmallLetterA, CharUtils::toLowerCase(cyrillicSmallLetterA));

    // Code points expected to come back unchanged.
    EXPECT_EQ(hiraganaLetterA, CharUtils::toLowerCase(hiraganaLetterA));
    EXPECT_EQ(cookie, CharUtils::toLowerCase(cookie));
}
|
||||||
|
|
||||||
|
// Checks CharUtils::toBaseLowerCase(): the sampled accented Latin letters come back as plain
// lower-case ASCII, Cyrillic capitals are lower-cased, and the remaining samples are unchanged.
TEST(CharUtilsTest, TestToBaseLowerCase) {
    // Non-ASCII sample code points, named after their Unicode character names.
    const int latinCapitalLetterAWithGrave = 0x00C0;
    const int latinSmallLetterAWithGrave = 0x00E0;
    const int greekSmallLetterFinalSigma = 0x03C2;
    const int cyrillicCapitalLetterA = 0x0410;
    const int cyrillicSmallLetterA = 0x0430;
    const int hiraganaLetterA = 0x3042;
    const int cookie = 0x1F36A;

    // ASCII: capitals are lower-cased, everything else is left alone.
    EXPECT_EQ('a', CharUtils::toBaseLowerCase('A'));
    EXPECT_EQ('z', CharUtils::toBaseLowerCase('Z'));
    EXPECT_EQ('a', CharUtils::toBaseLowerCase('a'));
    EXPECT_EQ('z', CharUtils::toBaseLowerCase('z'));
    EXPECT_EQ('@', CharUtils::toBaseLowerCase('@'));
    EXPECT_EQ(' ', CharUtils::toBaseLowerCase(' '));

    // Accented Latin: both cases collapse to the unaccented small letter.
    EXPECT_EQ('a', CharUtils::toBaseLowerCase(latinCapitalLetterAWithGrave));
    EXPECT_EQ('a', CharUtils::toBaseLowerCase(latinSmallLetterAWithGrave));

    // Greek and Cyrillic.
    EXPECT_EQ(greekSmallLetterFinalSigma,
            CharUtils::toBaseLowerCase(greekSmallLetterFinalSigma));
    EXPECT_EQ(cyrillicSmallLetterA, CharUtils::toBaseLowerCase(cyrillicCapitalLetterA));
    EXPECT_EQ(cyrillicSmallLetterA, CharUtils::toBaseLowerCase(cyrillicSmallLetterA));

    // Code points expected to come back unchanged.
    EXPECT_EQ(hiraganaLetterA, CharUtils::toBaseLowerCase(hiraganaLetterA));
    EXPECT_EQ(cookie, CharUtils::toBaseLowerCase(cookie));
}
|
||||||
|
|
||||||
|
// Checks CharUtils::toBaseCodePoint(): the sampled accented Latin letters lose their accent but
// keep their case, and every other sampled code point is returned unchanged.
TEST(CharUtilsTest, TestToBaseCodePoint) {
    // Plain ASCII is already in base form; case must be preserved.
    EXPECT_EQ('A', CharUtils::toBaseCodePoint('A'));
    EXPECT_EQ('Z', CharUtils::toBaseCodePoint('Z'));
    EXPECT_EQ('a', CharUtils::toBaseCodePoint('a'));
    EXPECT_EQ('z', CharUtils::toBaseCodePoint('z'));
    EXPECT_EQ('@', CharUtils::toBaseCodePoint('@'));
    EXPECT_EQ(' ', CharUtils::toBaseCodePoint(' '));

    // Accented Latin: the accent is stripped, the case is kept.
    EXPECT_EQ('A', CharUtils::toBaseCodePoint(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */));
    EXPECT_EQ('a', CharUtils::toBaseCodePoint(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */));

    // Final sigma is expected to be its own base code point. This must exercise
    // toBaseCodePoint(), not toBaseLowerCase(), which has its own test.
    EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
            CharUtils::toBaseCodePoint(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */));

    // Cyrillic letters keep both their identity and their case.
    EXPECT_EQ(0x0410 /* CYRILLIC CAPITAL LETTER A */,
            CharUtils::toBaseCodePoint(0x0410 /* CYRILLIC CAPITAL LETTER A */));
    EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */,
            CharUtils::toBaseCodePoint(0x0430 /* CYRILLIC SMALL LETTER A */));

    // Code points expected to come back unchanged.
    EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */,
            CharUtils::toBaseCodePoint(0x3042 /* HIRAGANA LETTER A */));
    EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseCodePoint(0x1F36A /* COOKIE */));
}
|
||||||
|
|
||||||
|
// Checks that CharUtils::isIntentionalOmissionCodePoint() is true for the apostrophe and the
// hyphen-minus and false for the other sampled ASCII characters.
TEST(CharUtilsTest, TestIsIntentionalOmissionCodePoint) {
    const int apostrophe = '\'';
    const int hyphenMinus = '-';

    // The two code points reported as intentional omissions.
    EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint(apostrophe));
    EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint(hyphenMinus));

    // A letter and other punctuation must not be reported.
    const int letter = 'a';
    const int questionMark = '?';
    const int slash = '/';
    EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint(letter));
    EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint(questionMark));
    EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint(slash));
}
|
||||||
|
|
||||||
|
// Checks that CharUtils::isInUnicodeSpace() rejects the special marker values
// NOT_A_CODE_POINT and CODE_POINT_BEGINNING_OF_SENTENCE and accepts real characters.
TEST(CharUtilsTest, TestIsInUnicodeSpace) {
    // Special marker values are expected to be reported as outside the Unicode space.
    EXPECT_FALSE(CharUtils::isInUnicodeSpace(NOT_A_CODE_POINT));
    EXPECT_FALSE(CharUtils::isInUnicodeSpace(CODE_POINT_BEGINNING_OF_SENTENCE));

    // Real characters, including one above U+FFFF.
    const int cyrillicCapitalLetterA = 0x0410;
    const int hiraganaLetterA = 0x3042;
    const int cookie = 0x1F36A;
    EXPECT_TRUE(CharUtils::isInUnicodeSpace('a'));
    EXPECT_TRUE(CharUtils::isInUnicodeSpace(cyrillicCapitalLetterA));
    EXPECT_TRUE(CharUtils::isInUnicodeSpace(hiraganaLetterA));
    EXPECT_TRUE(CharUtils::isInUnicodeSpace(cookie));
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace latinime
|
Loading…
Reference in New Issue