Merge "Suggest words with missing space"
This commit is contained in:
commit
71890a78b0
5 changed files with 164 additions and 84 deletions
|
@ -30,8 +30,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
|
||||||
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),
|
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),
|
||||||
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
||||||
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
|
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
|
||||||
LOGI("BigramDictionary - constructor");
|
if (DEBUG_DICT) LOGI("BigramDictionary - constructor");
|
||||||
LOGI("Has Bigram : %d \n", hasBigram);
|
if (DEBUG_DICT) LOGI("Has Bigram : %d \n", hasBigram);
|
||||||
}
|
}
|
||||||
|
|
||||||
BigramDictionary::~BigramDictionary() {
|
BigramDictionary::~BigramDictionary() {
|
||||||
|
@ -54,7 +54,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
|
||||||
}
|
}
|
||||||
insertAt++;
|
insertAt++;
|
||||||
}
|
}
|
||||||
LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
|
if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
|
||||||
if (insertAt < mMaxBigrams) {
|
if (insertAt < mMaxBigrams) {
|
||||||
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
|
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
|
||||||
(char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
|
(char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
|
||||||
|
@ -107,7 +107,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
|
||||||
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
|
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
|
||||||
int pos = mParentDictionary->isValidWordRec(
|
int pos = mParentDictionary->isValidWordRec(
|
||||||
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
|
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
|
||||||
LOGI("Pos -> %d\n", pos);
|
if (DEBUG_DICT) LOGI("Pos -> %d\n", pos);
|
||||||
if (pos < 0) {
|
if (pos < 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -151,7 +151,7 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
|
||||||
}
|
}
|
||||||
pos = followDownBranchAddress; // pos start at count
|
pos = followDownBranchAddress; // pos start at count
|
||||||
int count = DICT[pos] & 0xFF;
|
int count = DICT[pos] & 0xFF;
|
||||||
LOGI("count - %d\n",count);
|
if (DEBUG_DICT) LOGI("count - %d\n",count);
|
||||||
pos++;
|
pos++;
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
// pos at data
|
// pos at data
|
||||||
|
@ -225,7 +225,7 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
|
||||||
}
|
}
|
||||||
depth++;
|
depth++;
|
||||||
if (followDownBranchAddress == 0) {
|
if (followDownBranchAddress == 0) {
|
||||||
LOGI("ERROR!!! Cannot find bigram!!");
|
if (DEBUG_DICT) LOGI("ERROR!!! Cannot find bigram!!");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,10 @@
|
||||||
#define DEBUG_SHOW_FOUND_WORD false
|
#define DEBUG_SHOW_FOUND_WORD false
|
||||||
#endif // FLAG_DBG
|
#endif // FLAG_DBG
|
||||||
|
|
||||||
|
// Upper bound used by debug assertions before an int character code is
// narrowed to unsigned short. (1 << 16) - 1 == 65535 is the largest value that
// fits in 16 bits; the previous value (1 << 16) let 65536 through, which
// would truncate to 0 on a 16-bit unsigned short.
// The expansion is fully parenthesized so that neighbouring operators with
// higher precedence than << (such as + - * /) cannot regroup it.
#ifndef U_SHORT_MAX
#define U_SHORT_MAX ((1 << 16) - 1)
#endif
|
||||||
|
|
||||||
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
|
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
|
||||||
#define ADDRESS_MASK 0x3FFFFF
|
#define ADDRESS_MASK 0x3FFFFF
|
||||||
|
|
||||||
|
@ -49,9 +53,9 @@
|
||||||
#define DICTIONARY_HEADER_SIZE 2
|
#define DICTIONARY_HEADER_SIZE 2
|
||||||
#define NOT_VALID_WORD -99
|
#define NOT_VALID_WORD -99
|
||||||
|
|
||||||
#define SUGGEST_MISSING_CHARACTERS true
|
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
|
||||||
|
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
|
||||||
#define SUGGEST_EXCESSIVE_CHARACTERS true
|
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
|
||||||
|
|
||||||
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||||
// This is only used for the size of array. Not to be used in c functions.
|
// This is only used for the size of array. Not to be used in c functions.
|
||||||
|
@ -59,6 +63,8 @@
|
||||||
|
|
||||||
#define MAX_DEPTH_MULTIPLIER 3
|
#define MAX_DEPTH_MULTIPLIER 3
|
||||||
|
|
||||||
|
#define MIN_SUGGEST_DEPTH 2
|
||||||
|
|
||||||
#define min(a,b) ((a)<(b)?(a):(b))
|
#define min(a,b) ((a)<(b)?(a):(b))
|
||||||
|
|
||||||
#endif // LATINIME_DEFINES_H
|
#endif // LATINIME_DEFINES_H
|
||||||
|
|
|
@ -27,20 +27,21 @@ Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultip
|
||||||
int maxWordLength, int maxWords, int maxAlternatives)
|
int maxWordLength, int maxWords, int maxAlternatives)
|
||||||
: DICT((unsigned char*) dict),
|
: DICT((unsigned char*) dict),
|
||||||
// Checks whether it has the latest dictionary or the old dictionary
|
// Checks whether it has the latest dictionary or the old dictionary
|
||||||
IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN)
|
IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) {
|
||||||
{
|
if (DEBUG_DICT) {
|
||||||
if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
|
if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
|
||||||
LOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL);
|
LOGI("Max word length (%d) is greater than %d",
|
||||||
}
|
maxWordLength, MAX_WORD_LENGTH_INTERNAL);
|
||||||
LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
|
LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
|
||||||
|
}
|
||||||
|
}
|
||||||
mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
|
mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
|
||||||
maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
|
maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
|
||||||
mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives,
|
mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives,
|
||||||
IS_LATEST_DICT_VERSION, hasBigram(), this);
|
IS_LATEST_DICT_VERSION, hasBigram(), this);
|
||||||
}
|
}
|
||||||
|
|
||||||
Dictionary::~Dictionary()
|
Dictionary::~Dictionary() {
|
||||||
{
|
|
||||||
delete mUnigramDictionary;
|
delete mUnigramDictionary;
|
||||||
delete mBigramDictionary;
|
delete mBigramDictionary;
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,11 +30,12 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
|
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
|
||||||
int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
|
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
||||||
const bool isLatestDictVersion)
|
const bool isLatestDictVersion)
|
||||||
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
|
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
|
||||||
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
||||||
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
|
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
|
||||||
|
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
|
||||||
LOGI("UnigramDictionary - constructor");
|
LOGI("UnigramDictionary - constructor");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,35 +44,37 @@ UnigramDictionary::~UnigramDictionary() {}
|
||||||
int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
|
int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
|
||||||
int *frequencies, int *nextLetters, int nextLettersSize)
|
int *frequencies, int *nextLetters, int nextLettersSize)
|
||||||
{
|
{
|
||||||
|
|
||||||
initSuggestions(codes, codesSize, outWords, frequencies);
|
initSuggestions(codes, codesSize, outWords, frequencies);
|
||||||
|
getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
|
||||||
|
|
||||||
int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, -1, nextLetters,
|
// Suggestion with missing character
|
||||||
nextLettersSize);
|
if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
|
||||||
|
|
||||||
// If there aren't sufficient suggestions, search for words by allowing wild cards at
|
|
||||||
// the different character positions. This feature is not ready for prime-time as we need
|
|
||||||
// to figure out the best ranking for such words compared to proximity corrections and
|
|
||||||
// completions.
|
|
||||||
if (SUGGEST_MISSING_CHARACTERS) {
|
|
||||||
for (int i = 0; i < codesSize; ++i) {
|
for (int i = 0; i < codesSize; ++i) {
|
||||||
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
|
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
|
||||||
const int tempCount = getSuggestionCandidates(codesSize, i, -1, NULL, 0);
|
getSuggestionCandidates(codesSize, i, -1, NULL, 0);
|
||||||
if (tempCount > suggestedWordsCount) {
|
|
||||||
suggestedWordsCount = tempCount;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Suggest excessive characters
|
// Suggestion with excessive character
|
||||||
if (SUGGEST_EXCESSIVE_CHARACTERS) {
|
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
|
||||||
for (int i = 0; i < codesSize; ++i) {
|
for (int i = 0; i < codesSize; ++i) {
|
||||||
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
|
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
|
||||||
const int tempCount = getSuggestionCandidates(codesSize, -1, i, NULL, 0);
|
getSuggestionCandidates(codesSize, -1, i, NULL, 0);
|
||||||
if (tempCount > suggestedWordsCount) {
|
|
||||||
suggestedWordsCount = tempCount;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Suggestions with missing space
|
||||||
|
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) {
|
||||||
|
for (int i = 1; i < codesSize; ++i) {
|
||||||
|
if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
|
||||||
|
getMissingSpaceWords(mInputLength, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the word count
|
||||||
|
int suggestedWordsCount = 0;
|
||||||
|
while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
|
||||||
|
suggestedWordsCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
|
@ -84,6 +87,7 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
|
||||||
}
|
}
|
||||||
LOGI("\n");
|
LOGI("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
return suggestedWordsCount;
|
return suggestedWordsCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,23 +101,6 @@ void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned shor
|
||||||
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
|
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos, int excessivePos,
|
|
||||||
int *nextLetters, int nextLettersSize) {
|
|
||||||
if (DEBUG_DICT) LOGI("getSuggestionCandidates");
|
|
||||||
int initialPos = 0;
|
|
||||||
if (IS_LATEST_DICT_VERSION) {
|
|
||||||
initialPos = DICTIONARY_HEADER_SIZE;
|
|
||||||
}
|
|
||||||
getWords(initialPos, inputLength, skipPos, excessivePos, nextLetters, nextLettersSize);
|
|
||||||
|
|
||||||
// Get the word count
|
|
||||||
int suggestedWordsCount = 0;
|
|
||||||
while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
|
|
||||||
suggestedWordsCount++;
|
|
||||||
}
|
|
||||||
return suggestedWordsCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
void UnigramDictionary::registerNextLetter(
|
void UnigramDictionary::registerNextLetter(
|
||||||
unsigned short c, int *nextLetters, int nextLettersSize) {
|
unsigned short c, int *nextLetters, int nextLettersSize) {
|
||||||
if (c < nextLettersSize) {
|
if (c < nextLettersSize) {
|
||||||
|
@ -121,12 +108,13 @@ void UnigramDictionary::registerNextLetter(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: We need to optimize addWord by using STL or something
|
||||||
bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
|
bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
|
||||||
word[length] = 0;
|
word[length] = 0;
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
|
||||||
char s[length + 1];
|
char s[length + 1];
|
||||||
for (int i = 0; i <= length; i++) s[i] = word[i];
|
for (int i = 0; i <= length; i++) s[i] = word[i];
|
||||||
if (DEBUG_SHOW_FOUND_WORD) LOGI("Found word = %s, freq = %d : \n", s, frequency);
|
LOGI("Found word = %s, freq = %d", s, frequency);
|
||||||
}
|
}
|
||||||
if (length > MAX_WORD_LENGTH) {
|
if (length > MAX_WORD_LENGTH) {
|
||||||
if (DEBUG_DICT) LOGI("Exceeded max word length.");
|
if (DEBUG_DICT) LOGI("Exceeded max word length.");
|
||||||
|
@ -146,7 +134,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
char s[length + 1];
|
char s[length + 1];
|
||||||
for (int i = 0; i <= length; i++) s[i] = word[i];
|
for (int i = 0; i <= length; i++) s[i] = word[i];
|
||||||
LOGI("Added word = %s, freq = %d : \n", s, frequency);
|
LOGI("Added word = %s, freq = %d", s, frequency);
|
||||||
}
|
}
|
||||||
memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
|
memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
|
||||||
(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
|
(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
|
||||||
|
@ -160,7 +148,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
|
||||||
*dest++ = *word++;
|
*dest++ = *word++;
|
||||||
}
|
}
|
||||||
*dest = 0; // NULL terminate
|
*dest = 0; // NULL terminate
|
||||||
if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
|
if (DEBUG_DICT) LOGI("Added word at %d", insertAt);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -187,27 +175,19 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
|
||||||
if ((unsigned int) *inputCodes != (unsigned int) *word) {
|
if ((unsigned int) *inputCodes != (unsigned int) *word) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
inputCodes += MAX_ALTERNATIVES;
|
inputCodes += MAX_PROXIMITY_CHARS;
|
||||||
word++;
|
word++;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char QUOTE = '\'';
|
static const char QUOTE = '\'';
|
||||||
|
static const char SPACE = ' ';
|
||||||
|
|
||||||
// Keep this for comparing spec to new getWords
|
void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
|
||||||
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
|
||||||
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
||||||
int initialPosition = initialPos;
|
if (DEBUG_DICT) LOGI("getSuggestionCandidates");
|
||||||
const int count = Dictionary::getCount(DICT, &initialPosition);
|
int rootPosition = ROOT_POS;
|
||||||
getWordsRec(count, initialPosition, 0,
|
|
||||||
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
|
|
||||||
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
void UnigramDictionary::getWords(const int rootPos, const int inputLength, const int skipPos,
|
|
||||||
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
|
||||||
int rootPosition = rootPos;
|
|
||||||
const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
|
const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
|
||||||
// Get the number of child of root, then increment the position
|
// Get the number of child of root, then increment the position
|
||||||
int childCount = Dictionary::getCount(DICT, &rootPosition);
|
int childCount = Dictionary::getCount(DICT, &rootPosition);
|
||||||
|
@ -220,6 +200,7 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
|
||||||
mStackDiffs[0] = 0;
|
mStackDiffs[0] = 0;
|
||||||
mStackSiblingPos[0] = rootPosition;
|
mStackSiblingPos[0] = rootPosition;
|
||||||
|
|
||||||
|
// Depth first search
|
||||||
while (depth >= 0) {
|
while (depth >= 0) {
|
||||||
if (mStackChildCount[depth] > 0) {
|
if (mStackChildCount[depth] > 0) {
|
||||||
--mStackChildCount[depth];
|
--mStackChildCount[depth];
|
||||||
|
@ -235,7 +216,7 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
|
||||||
MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
|
MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
|
||||||
nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
|
nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
|
||||||
&snr, &inputIndex, &diffs, &siblingPos);
|
&snr, &inputIndex, &diffs, &siblingPos);
|
||||||
// Next sibling pos
|
// Update next sibling pos
|
||||||
mStackSiblingPos[depth] = siblingPos;
|
mStackSiblingPos[depth] = siblingPos;
|
||||||
if (needsToTraverseChildrenNodes) {
|
if (needsToTraverseChildrenNodes) {
|
||||||
// Goes to child node
|
// Goes to child node
|
||||||
|
@ -254,7 +235,48 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// snr : frequency?
|
bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
|
||||||
|
if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false;
|
||||||
|
const int firstFreq = getWordFreq(0, missingSpacePos);
|
||||||
|
const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);
|
||||||
|
if (DEBUG_DICT) LOGI("First freq: %d, Second freq: %d", firstFreq, secondFreq);
|
||||||
|
|
||||||
|
if (firstFreq <= 0 || secondFreq <= 0) return false;
|
||||||
|
int pairFreq = (firstFreq + secondFreq) / 2;
|
||||||
|
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
|
||||||
|
const int newWordLength = inputLength + 1;
|
||||||
|
// Allocating variable length array on stack
|
||||||
|
unsigned short word[newWordLength];
|
||||||
|
int j = 0;
|
||||||
|
for (int i = 0; i < missingSpacePos; ++i) {
|
||||||
|
// Down-casting
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
|
||||||
|
}
|
||||||
|
word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);
|
||||||
|
}
|
||||||
|
word[missingSpacePos] = SPACE;
|
||||||
|
for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
|
||||||
|
// Down-casting
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
|
||||||
|
}
|
||||||
|
word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);
|
||||||
|
}
|
||||||
|
addWord(word, newWordLength, pairFreq);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep this for comparing spec to new getWords
|
||||||
|
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
||||||
|
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
||||||
|
int initialPosition = initialPos;
|
||||||
|
const int count = Dictionary::getCount(DICT, &initialPosition);
|
||||||
|
getWordsRec(count, initialPosition, 0,
|
||||||
|
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
|
||||||
|
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
|
||||||
|
}
|
||||||
|
|
||||||
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
|
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
|
||||||
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
||||||
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
||||||
|
@ -287,7 +309,7 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
|
||||||
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
|
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
|
||||||
unsigned short *word, const int inputLength, const int depth, const int snr,
|
unsigned short *word, const int inputLength, const int depth, const int snr,
|
||||||
int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
|
int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
|
||||||
addWord(word, depth + 1, freq * snr);
|
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
|
||||||
if (depth >= inputLength && skipPos < 0) {
|
if (depth >= inputLength && skipPos < 0) {
|
||||||
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
|
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
|
||||||
}
|
}
|
||||||
|
@ -301,13 +323,13 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
|
||||||
// Proximity collection will promote a word of the same length as
|
// Proximity collection will promote a word of the same length as
|
||||||
// what user typed.
|
// what user typed.
|
||||||
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
|
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
|
||||||
addWord(word, depth + 1, finalFreq);
|
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
|
inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
|
||||||
const int inputIndex, const int skipPos, const int depth) {
|
const int inputIndex, const int skipPos, const int depth) {
|
||||||
const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_ALTERNATIVES))[0];
|
const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];
|
||||||
// Skip the ' or other letter and continue deeper
|
// Skip the ' or other letter and continue deeper
|
||||||
return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
|
return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
|
||||||
}
|
}
|
||||||
|
@ -361,7 +383,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
*newDiffs = diffs;
|
*newDiffs = diffs;
|
||||||
*newInputIndex = inputIndex;
|
*newInputIndex = inputIndex;
|
||||||
} else {
|
} else {
|
||||||
int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
|
int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
|
||||||
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
|
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
|
||||||
if (matchedProximityCharId < 0) return false;
|
if (matchedProximityCharId < 0) return false;
|
||||||
mWord[depth] = c;
|
mWord[depth] = c;
|
||||||
|
@ -396,4 +418,48 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
return needsToTraverseChildrenNodes;
|
return needsToTraverseChildrenNodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) {
|
||||||
|
int pos = ROOT_POS;
|
||||||
|
int count = Dictionary::getCount(DICT, &pos);
|
||||||
|
int freq = 0;
|
||||||
|
bool terminal = false;
|
||||||
|
|
||||||
|
for (int i = 0; i < inputLength; ++i) {
|
||||||
|
bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(pos, count,
|
||||||
|
startInputIndex + i, &pos, &count, &terminal, &freq);
|
||||||
|
if (!needsToTraverseChildrenNodes && (i < inputLength - 1)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (terminal) {
|
||||||
|
return freq;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
|
||||||
|
const int count, const int inputIndex, int *newChildPosition, int *newCount,
|
||||||
|
bool *newTerminal, int *newFreq) {
|
||||||
|
const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
|
||||||
|
int pos = firstChildPos;
|
||||||
|
unsigned short c;
|
||||||
|
for (int i = 0; i < count; ++i) {
|
||||||
|
pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
|
||||||
|
newChildPosition, newTerminal, newFreq);
|
||||||
|
const unsigned int inputC = currentChars[0];
|
||||||
|
const unsigned short lowerC = toLowerCase(c);
|
||||||
|
const bool matched = (inputC == lowerC || inputC == c);
|
||||||
|
const bool hasChild = *newChildPosition != 0;
|
||||||
|
if (matched) {
|
||||||
|
if (hasChild) {
|
||||||
|
*newCount = Dictionary::getCount(DICT, newChildPosition);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -24,15 +24,15 @@ namespace latinime {
|
||||||
class UnigramDictionary {
|
class UnigramDictionary {
|
||||||
public:
|
public:
|
||||||
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
|
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
|
||||||
int maxWordLength, int maxWords, int maxAlternatives, const bool isLatestDictVersion);
|
int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
|
||||||
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
|
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
|
||||||
int *nextLetters, int nextLettersSize);
|
int *nextLetters, int nextLettersSize);
|
||||||
~UnigramDictionary();
|
~UnigramDictionary();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
|
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
|
||||||
int getSuggestionCandidates(int inputLength, int skipPos, int excessivePos, int *nextLetters,
|
void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
|
||||||
int nextLettersSize);
|
int *nextLetters, const int nextLettersSize);
|
||||||
void getVersionNumber();
|
void getVersionNumber();
|
||||||
bool checkIfDictVersionIsLatest();
|
bool checkIfDictVersionIsLatest();
|
||||||
int getAddress(int *pos);
|
int getAddress(int *pos);
|
||||||
|
@ -44,8 +44,7 @@ private:
|
||||||
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
|
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
|
||||||
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
|
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
|
||||||
const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
|
const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
|
||||||
void getWords(const int rootPos, const int inputLength, const int skipPos,
|
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
|
||||||
const int excessivePos, int *nextLetters, const int nextLettersSize);
|
|
||||||
// Keep getWordsOld for comparing performance between getWords and getWordsOld
|
// Keep getWordsOld for comparing performance between getWords and getWordsOld
|
||||||
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
||||||
const int excessivePos, int *nextLetters, const int nextLettersSize);
|
const int excessivePos, int *nextLetters, const int nextLettersSize);
|
||||||
|
@ -58,17 +57,25 @@ private:
|
||||||
bool needsToSkipCurrentNode(const unsigned short c,
|
bool needsToSkipCurrentNode(const unsigned short c,
|
||||||
const int inputIndex, const int skipPos, const int depth);
|
const int inputIndex, const int skipPos, const int depth);
|
||||||
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
|
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
|
||||||
|
// Process a node by considering proximity, missing and excessive character
|
||||||
bool processCurrentNode(const int pos, const int depth,
|
bool processCurrentNode(const int pos, const int depth,
|
||||||
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
|
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
|
||||||
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
||||||
const int nextLettersSize, int *newCount, int *newChildPosition,
|
const int nextLettersSize, int *newCount, int *newChildPosition,
|
||||||
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
||||||
int *nextSiblingPosition);
|
int *nextSiblingPosition);
|
||||||
|
int getWordFreq(const int startInputIndex, const int inputLength);
|
||||||
|
// Process a node by considering missing space
|
||||||
|
bool processCurrentNodeForExactMatch(const int firstChildPos, const int count,
|
||||||
|
const int inputIndex, int *newChildPosition, int *newCount, bool *newTerminal,
|
||||||
|
int *newFreq);
|
||||||
|
|
||||||
const unsigned char *DICT;
|
const unsigned char *DICT;
|
||||||
const int MAX_WORDS;
|
const int MAX_WORDS;
|
||||||
const int MAX_WORD_LENGTH;
|
const int MAX_WORD_LENGTH;
|
||||||
const int MAX_ALTERNATIVES;
|
const int MAX_PROXIMITY_CHARS;
|
||||||
const bool IS_LATEST_DICT_VERSION;
|
const bool IS_LATEST_DICT_VERSION;
|
||||||
|
const int ROOT_POS;
|
||||||
const int TYPED_LETTER_MULTIPLIER;
|
const int TYPED_LETTER_MULTIPLIER;
|
||||||
const int FULL_WORD_MULTIPLIER;
|
const int FULL_WORD_MULTIPLIER;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue