Reorganize char_utils.h and basechars.h
* make BASE_CHARS[] const * add several inline methods for ASCII character handling Change-Id: I49664f219af88faf0aef43ac350cfc216570b185
This commit is contained in:
parent
85170a9c17
commit
6e3cb27cff
7 changed files with 64 additions and 44 deletions
|
@ -12,6 +12,7 @@ LOCAL_SRC_FILES := \
|
||||||
jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
|
jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
|
||||||
jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
|
jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
|
||||||
jni/jni_common.cpp \
|
jni/jni_common.cpp \
|
||||||
|
src/basechars.cpp \
|
||||||
src/bigram_dictionary.cpp \
|
src/bigram_dictionary.cpp \
|
||||||
src/char_utils.cpp \
|
src/char_utils.cpp \
|
||||||
src/correction.cpp \
|
src/correction.cpp \
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2009 The Android Open Source Project
|
* Copyright (C) 2011 The Android Open Source Project
|
||||||
*
|
*
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
* you may not use this file except in compliance with the License.
|
* you may not use this file except in compliance with the License.
|
||||||
|
@ -14,8 +14,9 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef LATINIME_BASECHARS_H
|
#include "char_utils.h"
|
||||||
#define LATINIME_BASECHARS_H
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
||||||
|
@ -23,7 +24,7 @@
|
||||||
* if c is not a combined character, or the base character if it
|
* if c is not a combined character, or the base character if it
|
||||||
* is combined.
|
* is combined.
|
||||||
*/
|
*/
|
||||||
static unsigned short BASE_CHARS[] = {
|
const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = {
|
||||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
|
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
|
||||||
|
@ -189,4 +190,5 @@ static unsigned short BASE_CHARS[] = {
|
||||||
|
|
||||||
// generated with:
|
// generated with:
|
||||||
// cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
|
// cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
|
||||||
#endif // LATINIME_BASECHARS_H
|
|
||||||
|
} // namespace latinime
|
|
@ -19,8 +19,47 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
inline static int isAsciiUpper(unsigned short c) {
|
||||||
|
return c >= 'A' && c <= 'Z';
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static unsigned short toAsciiLower(unsigned short c) {
|
||||||
|
return c - 'A' + 'a';
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static int isAscii(unsigned short c) {
|
||||||
|
return c <= 127;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned short latin_tolower(unsigned short c);
|
unsigned short latin_tolower(unsigned short c);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
||||||
|
* to their base characters. If c is in range, BASE_CHARS[c] == c
|
||||||
|
* if c is not a combined character, or the base character if it
|
||||||
|
* is combined.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static const int BASE_CHARS_SIZE = 0x0500;
|
||||||
|
extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
|
||||||
|
|
||||||
|
inline static unsigned short toBaseChar(unsigned short c) {
|
||||||
|
if (c < BASE_CHARS_SIZE) {
|
||||||
|
return BASE_CHARS[c];
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static unsigned short toBaseLowerCase(unsigned short c) {
|
||||||
|
c = toBaseChar(c);
|
||||||
|
if (isAsciiUpper(c)) {
|
||||||
|
return toAsciiLower(c);
|
||||||
|
} else if (isAscii(c)) {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
return latin_tolower(c);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
||||||
#endif // LATINIME_CHAR_UTILS_H
|
#endif // LATINIME_CHAR_UTILS_H
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
|
|
||||||
#define LOG_TAG "LatinIME: correction.cpp"
|
#define LOG_TAG "LatinIME: correction.cpp"
|
||||||
|
|
||||||
|
#include "char_utils.h"
|
||||||
#include "correction.h"
|
#include "correction.h"
|
||||||
#include "dictionary.h"
|
#include "dictionary.h"
|
||||||
#include "proximity_info.h"
|
#include "proximity_info.h"
|
||||||
|
@ -48,13 +49,13 @@ inline static int editDistance(
|
||||||
|
|
||||||
for (int i = 0; i < li - 1; ++i) {
|
for (int i = 0; i < li - 1; ++i) {
|
||||||
for (int j = 0; j < lo - 1; ++j) {
|
for (int j = 0; j < lo - 1; ++j) {
|
||||||
const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
|
const uint32_t ci = toBaseLowerCase(input[i]);
|
||||||
const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
|
const uint32_t co = toBaseLowerCase(output[j]);
|
||||||
const uint16_t cost = (ci == co) ? 0 : 1;
|
const uint16_t cost = (ci == co) ? 0 : 1;
|
||||||
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
|
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
|
||||||
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
|
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
|
||||||
if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
|
if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
|
||||||
&& co == Dictionary::toBaseLowerCase(input[i - 1])) {
|
&& co == toBaseLowerCase(input[i - 1])) {
|
||||||
dp[(i + 1) * lo + (j + 1)] = min(
|
dp[(i + 1) * lo + (j + 1)] = min(
|
||||||
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
|
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
|
||||||
}
|
}
|
||||||
|
@ -89,15 +90,13 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
|
||||||
const int *const prevprev =
|
const int *const prevprev =
|
||||||
outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
|
outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
|
||||||
current[0] = outputLength;
|
current[0] = outputLength;
|
||||||
const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]);
|
const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
|
||||||
const uint32_t prevCO =
|
const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
|
||||||
outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0;
|
|
||||||
for (int i = 1; i <= inputLength; ++i) {
|
for (int i = 1; i <= inputLength; ++i) {
|
||||||
const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]);
|
const uint32_t ci = toBaseLowerCase(input[i - 1]);
|
||||||
const uint16_t cost = (ci == co) ? 0 : 1;
|
const uint16_t cost = (ci == co) ? 0 : 1;
|
||||||
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
|
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
|
||||||
if (i >= 2 && prevprev && ci == prevCO
|
if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
|
||||||
&& co == Dictionary::toBaseLowerCase(input[i - 2])) {
|
|
||||||
current[i] = min(current[i], prevprev[i - 2] + 1);
|
current[i] = min(current[i], prevprev[i - 2] + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -607,13 +606,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static bool isUpperCase(unsigned short c) {
|
inline static bool isUpperCase(unsigned short c) {
|
||||||
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
|
return isAsciiUpper(toBaseChar(c));
|
||||||
c = BASE_CHARS[c];
|
|
||||||
}
|
|
||||||
if (isupper(c)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////
|
//////////////////////
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
#ifndef LATINIME_DICTIONARY_H
|
#ifndef LATINIME_DICTIONARY_H
|
||||||
#define LATINIME_DICTIONARY_H
|
#define LATINIME_DICTIONARY_H
|
||||||
|
|
||||||
#include "basechars.h"
|
|
||||||
#include "bigram_dictionary.h"
|
#include "bigram_dictionary.h"
|
||||||
#include "char_utils.h"
|
#include "char_utils.h"
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
@ -63,7 +62,6 @@ public:
|
||||||
static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
|
static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
|
||||||
const int pos, unsigned short *c, int *childrenPosition,
|
const int pos, unsigned short *c, int *childrenPosition,
|
||||||
bool *terminal, int *freq);
|
bool *terminal, int *freq);
|
||||||
static inline unsigned short toBaseLowerCase(unsigned short c);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool hasBigram();
|
bool hasBigram();
|
||||||
|
@ -156,19 +154,6 @@ inline int Dictionary::setDictionaryValues(const unsigned char *dict,
|
||||||
return position;
|
return position;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) {
|
|
||||||
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
|
|
||||||
c = BASE_CHARS[c];
|
|
||||||
}
|
|
||||||
if (c >='A' && c <= 'Z') {
|
|
||||||
c |= 32;
|
|
||||||
} else if (c > 127) {
|
|
||||||
c = latin_tolower(c);
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
||||||
#endif // LATINIME_DICTIONARY_H
|
#endif // LATINIME_DICTIONARY_H
|
||||||
|
|
|
@ -167,7 +167,7 @@ int ProximityInfo::getKeyIndex(const int c) const {
|
||||||
// We do not have the coordinate data
|
// We do not have the coordinate data
|
||||||
return NOT_A_INDEX;
|
return NOT_A_INDEX;
|
||||||
}
|
}
|
||||||
const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
|
const unsigned short baseLowerC = toBaseLowerCase(c);
|
||||||
if (baseLowerC > MAX_CHAR_CODE) {
|
if (baseLowerC > MAX_CHAR_CODE) {
|
||||||
return NOT_A_INDEX;
|
return NOT_A_INDEX;
|
||||||
}
|
}
|
||||||
|
@ -232,7 +232,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde
|
||||||
const unsigned short c, const bool checkProximityChars, int *proximityIndex) const {
|
const unsigned short c, const bool checkProximityChars, int *proximityIndex) const {
|
||||||
const int *currentChars = getProximityCharsAt(index);
|
const int *currentChars = getProximityCharsAt(index);
|
||||||
const int firstChar = currentChars[0];
|
const int firstChar = currentChars[0];
|
||||||
const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
|
const unsigned short baseLowerC = toBaseLowerCase(c);
|
||||||
|
|
||||||
// The first char in the array is what user typed. If it matches right away,
|
// The first char in the array is what user typed. If it matches right away,
|
||||||
// that means the user typed that same char for this pos.
|
// that means the user typed that same char for this pos.
|
||||||
|
@ -245,7 +245,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde
|
||||||
// If the non-accented, lowercased version of that first character matches c,
|
// If the non-accented, lowercased version of that first character matches c,
|
||||||
// then we have a non-accented version of the accented character the user
|
// then we have a non-accented version of the accented character the user
|
||||||
// typed. Treat it as a close char.
|
// typed. Treat it as a close char.
|
||||||
if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC)
|
if (toBaseLowerCase(firstChar) == baseLowerC)
|
||||||
return NEAR_PROXIMITY_CHAR;
|
return NEAR_PROXIMITY_CHAR;
|
||||||
|
|
||||||
// Not an exact nor an accent-alike match: search the list of close keys
|
// Not an exact nor an accent-alike match: search the list of close keys
|
||||||
|
|
|
@ -464,8 +464,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
|
||||||
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
int pos = startPos;
|
int pos = startPos;
|
||||||
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
int32_t baseChar = Dictionary::toBaseLowerCase(character);
|
int32_t baseChar = toBaseLowerCase(character);
|
||||||
const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
|
const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
|
||||||
|
|
||||||
if (baseChar != wChar) {
|
if (baseChar != wChar) {
|
||||||
*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
|
*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
|
||||||
|
@ -477,8 +477,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
|
||||||
if (hasMultipleChars) {
|
if (hasMultipleChars) {
|
||||||
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
while (NOT_A_CHARACTER != character) {
|
while (NOT_A_CHARACTER != character) {
|
||||||
baseChar = Dictionary::toBaseLowerCase(character);
|
baseChar = toBaseLowerCase(character);
|
||||||
if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
|
if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
|
||||||
*outPos = BinaryFormat::skipOtherCharacters(root, pos);
|
*outPos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
*outInputIndex = startInputIndex;
|
*outInputIndex = startInputIndex;
|
||||||
return false;
|
return false;
|
||||||
|
|
Loading…
Reference in a new issue