From 6e3cb27cffa525d555b289111678f6fa0495447e Mon Sep 17 00:00:00 2001 From: "Tadashi G. Takaoka" Date: Fri, 11 Nov 2011 14:26:13 +0900 Subject: [PATCH] Reorganize char_utils.h and basechars.h * make BASE_CHARS[] const * add several inline methods for ASCII character handling Change-Id: I49664f219af88faf0aef43ac350cfc216570b185 --- native/Android.mk | 1 + native/src/{basechars.h => basechars.cpp} | 12 ++++--- native/src/char_utils.h | 39 +++++++++++++++++++++++ native/src/correction.cpp | 27 ++++++---------- native/src/dictionary.h | 15 --------- native/src/proximity_info.cpp | 6 ++-- native/src/unigram_dictionary.cpp | 8 ++--- 7 files changed, 64 insertions(+), 44 deletions(-) rename native/src/{basechars.h => basechars.cpp} (98%) diff --git a/native/Android.mk b/native/Android.mk index f07be6abe..d2537f055 100644 --- a/native/Android.mk +++ b/native/Android.mk @@ -12,6 +12,7 @@ LOCAL_SRC_FILES := \ jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \ jni/com_android_inputmethod_latin_BinaryDictionary.cpp \ jni/jni_common.cpp \ + src/basechars.cpp \ src/bigram_dictionary.cpp \ src/char_utils.cpp \ src/correction.cpp \ diff --git a/native/src/basechars.h b/native/src/basechars.cpp similarity index 98% rename from native/src/basechars.h rename to native/src/basechars.cpp index 3843e11c5..31f1e18a8 100644 --- a/native/src/basechars.h +++ b/native/src/basechars.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 The Android Open Source Project + * Copyright (C) 2011 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,9 @@ * limitations under the License. */ -#ifndef LATINIME_BASECHARS_H -#define LATINIME_BASECHARS_H +#include "char_utils.h" + +namespace latinime { /** * Table mapping most combined Latin, Greek, and Cyrillic characters @@ -23,7 +24,7 @@ * if c is not a combined character, or the base character if it * is combined. 
*/ -static unsigned short BASE_CHARS[] = { +const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, @@ -189,4 +190,5 @@ static unsigned short BASE_CHARS[] = { // generated with: // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' -#endif // LATINIME_BASECHARS_H + +} // namespace latinime diff --git a/native/src/char_utils.h b/native/src/char_utils.h index a69a35e7a..607dc5195 100644 --- a/native/src/char_utils.h +++ b/native/src/char_utils.h @@ -19,8 +19,47 @@ namespace latinime { +inline static int isAsciiUpper(unsigned short c) { + return c >= 'A' && c <= 'Z'; +} + +inline static unsigned short toAsciiLower(unsigned short c) { + return c - 'A' + 'a'; +} + +inline static int isAscii(unsigned short c) { + return c <= 127; +} + unsigned short latin_tolower(unsigned short c); +/** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. 
+ */ + +static const int BASE_CHARS_SIZE = 0x0500; +extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; + +inline static unsigned short toBaseChar(unsigned short c) { + if (c < BASE_CHARS_SIZE) { + return BASE_CHARS[c]; + } + return c; +} + +inline static unsigned short toBaseLowerCase(unsigned short c) { + c = toBaseChar(c); + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } else if (isAscii(c)) { + return c; + } + return latin_tolower(c); +} + } // namespace latinime #endif // LATINIME_CHAR_UTILS_H diff --git a/native/src/correction.cpp b/native/src/correction.cpp index f6b7eb6ad..8b6d3b23b 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -21,6 +21,7 @@ #define LOG_TAG "LatinIME: correction.cpp" +#include "char_utils.h" #include "correction.h" #include "dictionary.h" #include "proximity_info.h" @@ -48,13 +49,13 @@ inline static int editDistance( for (int i = 0; i < li - 1; ++i) { for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); - const uint32_t co = Dictionary::toBaseLowerCase(output[j]); + const uint32_t ci = toBaseLowerCase(input[i]); + const uint32_t co = toBaseLowerCase(output[j]); const uint16_t cost = (ci == co) ? 0 : 1; dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1]) - && co == Dictionary::toBaseLowerCase(input[i - 1])) { + if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1]) + && co == toBaseLowerCase(input[i - 1])) { dp[(i + 1) * lo + (j + 1)] = min( dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); } @@ -89,15 +90,13 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne const int *const prevprev = outputLength >= 2 ? 
editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; current[0] = outputLength; - const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]); - const uint32_t prevCO = - outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0; + const uint32_t co = toBaseLowerCase(output[outputLength - 1]); + const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; for (int i = 1; i <= inputLength; ++i) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]); + const uint32_t ci = toBaseLowerCase(input[i - 1]); const uint16_t cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); - if (i >= 2 && prevprev && ci == prevCO - && co == Dictionary::toBaseLowerCase(input[i - 2])) { + if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { current[i] = min(current[i], prevprev[i - 2] + 1); } } @@ -607,13 +606,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) { } inline static bool isUpperCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (isupper(c)) { - return true; - } - return false; + return isAsciiUpper(toBaseChar(c)); } ////////////////////// diff --git a/native/src/dictionary.h b/native/src/dictionary.h index d5de0083a..f891e7457 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -17,7 +17,6 @@ #ifndef LATINIME_DICTIONARY_H #define LATINIME_DICTIONARY_H -#include "basechars.h" #include "bigram_dictionary.h" #include "char_utils.h" #include "defines.h" @@ -63,7 +62,6 @@ public: static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, const int pos, unsigned short *c, int *childrenPosition, bool *terminal, int *freq); - static inline unsigned short toBaseLowerCase(unsigned short c); private: bool hasBigram(); @@ -156,19 +154,6 @@ inline int Dictionary::setDictionaryValues(const 
unsigned char *dict, return position; } - -inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (c >='A' && c <= 'Z') { - c |= 32; - } else if (c > 127) { - c = latin_tolower(c); - } - return c; -} - } // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index d6355883d..6857caf00 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -167,7 +167,7 @@ int ProximityInfo::getKeyIndex(const int c) const { // We do not have the coordinate data return NOT_A_INDEX; } - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); if (baseLowerC > MAX_CHAR_CODE) { return NOT_A_INDEX; } @@ -232,7 +232,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde const unsigned short c, const bool checkProximityChars, int *proximityIndex) const { const int *currentChars = getProximityCharsAt(index); const int firstChar = currentChars[0]; - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); // The first char in the array is what user typed. If it matches right away, // that means the user typed that same char for this pos. @@ -245,7 +245,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde // If the non-accented, lowercased version of that first character matches c, // then we have a non-accented version of the accented character the user // typed. Treat it as a close char. 
- if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC) + if (toBaseLowerCase(firstChar) == baseLowerC) return NEAR_PROXIMITY_CHAR; // Not an exact nor an accent-alike match: search the list of close keys diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 7ff2f4a2f..647bfde04 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -464,8 +464,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = Dictionary::toBaseLowerCase(character); - const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]); + int32_t baseChar = toBaseLowerCase(character); + const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -477,8 +477,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { - baseChar = Dictionary::toBaseLowerCase(character); - if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = toBaseLowerCase(character); + if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false;