Merge "Fix parameters of native functions and refactor Dictionary"

This commit is contained in:
satok 2010-12-01 18:30:33 -08:00 committed by Android (Google) Code Review
commit 711abe6756
9 changed files with 854 additions and 681 deletions

View file

@ -49,11 +49,11 @@ public class BinaryDictionary extends Dictionary {
private int mDicTypeId;
private int mNativeDict;
private int mDictLength;
private int[] mInputCodes = new int[MAX_WORD_LENGTH * MAX_ALTERNATIVES];
private char[] mOutputChars = new char[MAX_WORD_LENGTH * MAX_WORDS];
private char[] mOutputChars_bigrams = new char[MAX_WORD_LENGTH * MAX_BIGRAMS];
private int[] mFrequencies = new int[MAX_WORDS];
private int[] mFrequencies_bigrams = new int[MAX_BIGRAMS];
private final int[] mInputCodes = new int[MAX_WORD_LENGTH * MAX_ALTERNATIVES];
private final char[] mOutputChars = new char[MAX_WORD_LENGTH * MAX_WORDS];
private final char[] mOutputChars_bigrams = new char[MAX_WORD_LENGTH * MAX_BIGRAMS];
private final int[] mFrequencies = new int[MAX_WORDS];
private final int[] mFrequencies_bigrams = new int[MAX_BIGRAMS];
// Keep a reference to the native dict direct buffer in Java to avoid
// unexpected deallocation of the direct buffer.
private ByteBuffer mNativeDictDirectBuffer;
@ -94,18 +94,19 @@ public class BinaryDictionary extends Dictionary {
}
mDictLength = byteBuffer.capacity();
mNativeDict = openNative(mNativeDictDirectBuffer,
TYPED_LETTER_MULTIPLIER, FULL_WORD_FREQ_MULTIPLIER);
TYPED_LETTER_MULTIPLIER, FULL_WORD_FREQ_MULTIPLIER,
MAX_WORD_LENGTH, MAX_WORDS, MAX_ALTERNATIVES);
}
mDicTypeId = dicTypeId;
}
private native int openNative(ByteBuffer bb, int typedLetterMultiplier,
int fullWordMultiplier);
int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
private native void closeNative(int dict);
private native boolean isValidWordNative(int nativeData, char[] word, int wordLength);
private native int getSuggestionsNative(int dict, int[] inputCodes, int codesSize,
char[] outputChars, int[] frequencies, int maxWordLength, int maxWords,
int maxAlternatives, int skipPos, int[] nextLettersFrequencies, int nextLettersSize);
char[] outputChars, int[] frequencies,
int[] nextLettersFrequencies, int nextLettersSize);
private native int getBigramsNative(int dict, char[] prevWord, int prevWordLength,
int[] inputCodes, int inputCodesLength, char[] outputChars, int[] frequencies,
int maxWordLength, int maxBigrams, int maxAlternatives);
@ -131,7 +132,8 @@ public class BinaryDictionary extends Dictionary {
Log.e(TAG, "Read " + got + " bytes, expected " + total);
} else {
mNativeDict = openNative(mNativeDictDirectBuffer,
TYPED_LETTER_MULTIPLIER, FULL_WORD_FREQ_MULTIPLIER);
TYPED_LETTER_MULTIPLIER, FULL_WORD_FREQ_MULTIPLIER,
MAX_WORD_LENGTH, MAX_WORDS, MAX_ALTERNATIVES);
mDictLength = total;
}
} catch (IOException e) {
@ -188,7 +190,7 @@ public class BinaryDictionary extends Dictionary {
final int codesSize = codes.size();
// Won't deal with really long words.
if (codesSize > MAX_WORD_LENGTH - 1) return;
Arrays.fill(mInputCodes, -1);
for (int i = 0; i < codesSize; i++) {
int[] alternatives = codes.getCodesAt(i);
@ -199,8 +201,7 @@ public class BinaryDictionary extends Dictionary {
Arrays.fill(mFrequencies, 0);
int count = getSuggestionsNative(mNativeDict, mInputCodes, codesSize, mOutputChars,
mFrequencies, MAX_WORD_LENGTH, MAX_WORDS, MAX_ALTERNATIVES, -1,
nextLettersFrequencies,
mFrequencies, nextLettersFrequencies,
nextLettersFrequencies != null ? nextLettersFrequencies.length : 0);
for (int j = 0; j < count; j++) {

View file

@ -4,9 +4,11 @@ include $(CLEAR_VARS)
LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
LOCAL_SRC_FILES := \
jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
src/dictionary.cpp \
src/char_utils.cpp
jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
src/bigram_dictionary.cpp \
src/char_utils.cpp \
src/dictionary.cpp \
src/unigram_dictionary.cpp
#FLAG_DBG := true

View file

@ -42,21 +42,23 @@ static void throwException(JNIEnv *env, const char* ex, const char* fmt, int dat
static jint latinime_BinaryDictionary_open
(JNIEnv *env, jobject object, jobject dictDirectBuffer,
jint typedLetterMultiplier, jint fullWordMultiplier)
jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords,
jint maxAlternatives)
{
void *dict = env->GetDirectBufferAddress(dictDirectBuffer);
if (dict == NULL) {
fprintf(stderr, "DICT: Dictionary buffer is null\n");
return 0;
}
Dictionary *dictionary = new Dictionary(dict, typedLetterMultiplier, fullWordMultiplier);
Dictionary *dictionary = new Dictionary(dict, typedLetterMultiplier, fullWordMultiplier,
maxWordLength, maxWords, maxAlternatives);
return (jint) dictionary;
}
static int latinime_BinaryDictionary_getSuggestions(
JNIEnv *env, jobject object, jint dict, jintArray inputArray, jint arraySize,
jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxWords,
jint maxAlternatives, jint skipPos, jintArray nextLettersArray, jint nextLettersSize)
jcharArray outputArray, jintArray frequencyArray,
jintArray nextLettersArray, jint nextLettersSize)
{
Dictionary *dictionary = (Dictionary*) dict;
if (dictionary == NULL) return 0;
@ -68,8 +70,7 @@ static int latinime_BinaryDictionary_getSuggestions(
: NULL;
int count = dictionary->getSuggestions(inputCodes, arraySize, (unsigned short*) outputChars,
frequencies, maxWordLength, maxWords, maxAlternatives, skipPos, nextLetters,
nextLettersSize);
frequencies, nextLetters, nextLettersSize);
env->ReleaseIntArrayElements(frequencyArray, frequencies, 0);
env->ReleaseIntArrayElements(inputArray, inputCodes, JNI_ABORT);
@ -123,17 +124,16 @@ static jboolean latinime_BinaryDictionary_isValidWord
static void latinime_BinaryDictionary_close
(JNIEnv *env, jobject object, jint dict)
{
Dictionary *dictionary = (Dictionary*) dict;
delete (Dictionary*) dict;
}
// ----------------------------------------------------------------------------
static JNINativeMethod gMethods[] = {
{"openNative", "(Ljava/nio/ByteBuffer;II)I",
{"openNative", "(Ljava/nio/ByteBuffer;IIIII)I",
(void*)latinime_BinaryDictionary_open},
{"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close},
{"getSuggestionsNative", "(I[II[C[IIIII[II)I", (void*)latinime_BinaryDictionary_getSuggestions},
{"getSuggestionsNative", "(I[II[C[I[II)I", (void*)latinime_BinaryDictionary_getSuggestions},
{"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord},
{"getBigramsNative", "(I[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams}
};

View file

@ -0,0 +1,31 @@
/*
**
** Copyright 2010, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
#include "bigram_dictionary.h"
namespace latinime {
// Constructs the bigram dictionary.
// The body is currently empty: none of the arguments are stored or used here yet.
BigramDictionary::BigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
{
}
// Destructor. Intentionally empty: this class's constructor acquires nothing to release.
BigramDictionary::~BigramDictionary()
{
}
// TODO: Move functions related to bigram to here
} // namespace latinime

View file

@ -0,0 +1,32 @@
/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_BIGRAM_DICTIONARY_H
#define LATINIME_BIGRAM_DICTIONARY_H
namespace latinime {

// Forward declaration only; the constructor below just takes a pointer to the parent.
class Dictionary;

// Placeholder class for bigram lookups. It currently declares only a constructor and a
// destructor and has no data members.
class BigramDictionary {
public:
    // dict:                  pointer to the raw dictionary data
    // typedLetterMultiplier: weight multiplier for exactly-typed letters
    // fullWordMultiplier:    weight multiplier for full-word matches
    // maxWordLength/maxWords/maxAlternatives: buffer geometry limits passed by the caller
    // parentDictionary:      the Dictionary that owns this object
    BigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
            int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary);
    ~BigramDictionary();
private:
};

// ----------------------------------------------------------------------------

} // namespace latinime
#endif // LATINIME_BIGRAM_DICTIONARY_H

View file

@ -16,618 +16,23 @@
*/
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#ifdef FLAG_DBG
#define LOG_TAG "LatinIME: dictionary.cpp"
#include <cutils/log.h>
#define DEBUG_DICT 1
#else // FLAG_DBG
#define LOGI
#define DEBUG_DICT 0
#endif // FLAG_DBG
#include "dictionary.h"
#include "basechars.h"
#include "char_utils.h"
#define DICTIONARY_VERSION_MIN 200
#define DICTIONARY_HEADER_SIZE 2
#define NOT_VALID_WORD -99
#define SUGGEST_MISSING_CHARACTERS true
#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
namespace latinime {
Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxAlternatives)
{
LOGI("Dictionary - constructor");
mDict = (unsigned char*) dict;
mTypedLetterMultiplier = typedLetterMultiplier;
mFullWordMultiplier = fullWordMultiplier;
getVersionNumber();
mUnigramDictionary = new UnigramDictionary(dict, typedLetterMultiplier, fullWordMultiplier,
maxWordLength, maxWords, maxAlternatives, this);
mBigramDictionary = new BigramDictionary(dict, typedLetterMultiplier, fullWordMultiplier,
maxWordLength, maxWords, maxAlternatives, this);
}
// Destroys the dictionary and frees the unigram and bigram helper objects that the
// constructor allocated with new.
Dictionary::~Dictionary()
{
delete mUnigramDictionary;
delete mBigramDictionary;
}
// Collects suggestions for the typed codes into outWords / frequencies and returns the number
// of suggestions found.
//
// codes / codesSize:     typed character codes (maxAlternatives entries per typed position)
// outWords, frequencies: caller-owned output buffers
// skipPos:               position to treat as a wild card on the first pass (negative = none)
// nextLetters / nextLettersSize: optional histogram of letters following the typed prefix
int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
        int maxWordLength, int maxWords, int maxAlternatives, int skipPos,
        int *nextLetters, int nextLettersSize)
{
    initSuggestions(codes, codesSize, outWords, frequencies, maxWordLength, maxWords,
            maxAlternatives);

    int bestCount = getSuggestionCandidates(codesSize, maxWords, skipPos, nextLetters,
            nextLettersSize);

    // If there aren't sufficient suggestions, search for words by allowing wild cards at
    // the different character positions. This feature is not ready for prime-time as we need
    // to figure out the best ranking for such words compared to proximity corrections and
    // completions.
    const bool retryWithWildCards = SUGGEST_MISSING_CHARACTERS
            && bestCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD;
    if (retryWithWildCards) {
        int wildCardPos = 0;
        while (wildCardPos < codesSize) {
            const int wildCardCount =
                    getSuggestionCandidates(codesSize, maxWords, wildCardPos, NULL, 0);
            if (wildCardCount > bestCount) {
                // Stop at the first wild-card position that improves on the current result.
                bestCount = wildCardCount;
                break;
            }
            ++wildCardPos;
        }
    }

    if (DEBUG_DICT) {
        LOGI("Returning %d words", bestCount);
        LOGI("Next letters: ");
        for (int letter = 0; letter < nextLettersSize; letter++) {
            if (nextLetters[letter] > 0) {
                LOGI("%c = %d,", letter, nextLetters[letter]);
            }
        }
        LOGI("\n");
    }
    return bestCount;
}
// Records the caller's input and output buffers and limits for the lookup that follows, and
// derives the maximum edit distance from the input length.
void Dictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
        int *frequencies, int maxWordLength, int maxWords, int maxAlternatives) {
    // Input side.
    mInputCodes = codes;
    mInputLength = codesSize;
    mMaxAlternatives = maxAlternatives;

    // Output side.
    mOutputChars = outWords;
    mFrequencies = frequencies;
    mMaxWordLength = maxWordLength;
    mMaxWords = maxWords;

    // Inputs shorter than 5 codes allow 2 differences; longer ones allow half their length.
    if (mInputLength < 5) {
        mMaxEditDistance = 2;
    } else {
        mMaxEditDistance = mInputLength / 2;
    }
}
// Runs one recursive search over the dictionary and returns how many leading entries of
// mFrequencies (at most maxWords) hold a positive frequency afterwards.
int Dictionary::getSuggestionCandidates(int inputLength, int maxWords, int skipPos,
        int *nextLetters, int nextLettersSize) {
    // Dictionaries in the latest format start after a header; older ones start at offset 0.
    const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
    getWordsRec(rootPos, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters,
            nextLettersSize);

    // Get the word count
    int filled = 0;
    for (; filled < maxWords; ++filled) {
        if (mFrequencies[filled] <= 0) {
            break;
        }
    }
    return filled;
}
// Bumps the counter for character c in nextLetters; characters at or beyond nextLettersSize
// are ignored.
void Dictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}
// Reads the first two bytes of the dictionary data into mVersion and mBigram.
void
Dictionary::getVersionNumber()
{
    const unsigned char *header = mDict;
    mVersion = header[0] & 0xFF;
    mBigram = header[1] & 0xFF;
    LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
}
// Checks whether it has the latest dictionary or the old dictionary.
// "Latest" means the version byte is at least DICTIONARY_VERSION_MIN and the bigram byte is
// either 0 or 1.
bool
Dictionary::checkIfDictVersionIsLatest()
{
    if (mVersion < DICTIONARY_VERSION_MIN) {
        return false;
    }
    return mBigram == 0 || mBigram == 1;
}
// Decodes the character stored at *pos and advances *pos past it.
// A first byte other than 0xFF is the character itself (1 byte consumed); 0xFF is an escape
// followed by a 16-bit big-endian code (3 bytes consumed in total).
unsigned short
Dictionary::getChar(int *pos)
{
    const int first = mDict[*pos] & 0xFF;
    *pos += 1;
    if (first != 0xFF) {
        return (unsigned short) first;
    }
    const int high = mDict[*pos] & 0xFF;
    const int low = mDict[*pos + 1] & 0xFF;
    *pos += 2;
    return (unsigned short) ((high << 8) | low);
}
// Reads the optional child address stored at *pos and advances *pos past it.
// When the FLAG_ADDRESS_MASK bit is clear there is no address: one byte is consumed and 0 is
// returned. Otherwise three bytes are consumed and combined, most significant first.
int
Dictionary::getAddress(int *pos)
{
    const unsigned char *entry = mDict + *pos;
    if ((entry[0] & FLAG_ADDRESS_MASK) == 0) {
        *pos += 1;
        return 0;
    }
    const int high = (entry[0] & (ADDRESS_MASK >> 16)) << 16;
    const int middle = (entry[1] & 0xFF) << 8;
    const int low = entry[2] & 0xFF;
    *pos += 3;
    return high + middle + low;
}
// Reads the one-byte frequency at *pos and advances *pos past it. For dictionaries in the
// latest format it also skips the bigram data that follows the frequency.
int
Dictionary::getFreq(int *pos)
{
    const int freq = mDict[*pos] & 0xFF;
    ++(*pos);
    if (!checkIfDictVersionIsLatest()) {
        return freq;
    }

    // skipping bigram
    if ((mDict[*pos] & FLAG_BIGRAM_READ) > 0) {
        // Each bigram entry is 3 bytes followed by a byte carrying the "continued" flag.
        int continued;
        do {
            *pos += 3;
            continued = mDict[*pos] & FLAG_BIGRAM_CONTINUED;
            ++(*pos);
        } while (continued > 0);
    } else {
        ++(*pos);
    }
    return freq;
}
// Returns the number of 16-bit units before the terminating 0 in str, or 0 for a null pointer.
int
Dictionary::wideStrLen(unsigned short *str)
{
    if (!str) {
        return 0;
    }
    int len = 0;
    while (str[len] != 0) {
        ++len;
    }
    return len;
}
// Inserts word (length code units, frequency as score) into the output buffers, keeping them
// sorted by descending frequency and, for equal frequencies, by ascending length.
//
// Returns true when the word was stored, false when it ranks below every stored entry or
// does not fit into an output slot.
bool
Dictionary::addWord(unsigned short *word, int length, int frequency)
{
    // Each output slot is mMaxWordLength code units wide and must also hold the terminating 0.
    // A longer word would overrun the slot (and, for the last slot, the whole output buffer),
    // so reject it before anything is written.
    if (length < 0 || length >= mMaxWordLength) {
        return false;
    }

    word[length] = 0;
    if (DEBUG_DICT) {
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = word[i];
        LOGI("Found word = %s, freq = %d : \n", s, frequency);
    }

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < mMaxWords) {
        if (frequency > mFrequencies[insertAt]
                || (mFrequencies[insertAt] == frequency
                        && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
            break;
        }
        insertAt++;
    }
    if (insertAt < mMaxWords) {
        // Shift the lower-ranked frequencies down by one; the last entry falls off the end.
        memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
                (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
                (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
        mFrequencies[insertAt] = frequency;
        // Shift the matching word slots the same way.
        memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
                (char*) mOutputChars + (insertAt    ) * mMaxWordLength * sizeof(short),
                (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
        unsigned short *dest = mOutputChars + (insertAt    ) * mMaxWordLength;
        while (length--) {
            *dest++ = *word++;
        }
        *dest = 0; // NULL terminate
        if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
        return true;
    }
    return false;
}
// Inserts word (length code units, frequency as score) into the bigram output buffers,
// keeping them sorted by descending frequency and, for equal frequencies, by ascending length.
//
// Returns true when the word was stored, false when it ranks below every stored entry or
// does not fit into an output slot.
bool
Dictionary::addWordBigram(unsigned short *word, int length, int frequency)
{
    // Each output slot is mMaxWordLength code units wide and must also hold the terminating 0.
    // Reject anything longer before writing the terminator, so that neither the caller's word
    // buffer nor the bigram output buffer is overrun.
    if (length < 0 || length >= mMaxWordLength) {
        return false;
    }

    word[length] = 0;
    if (DEBUG_DICT) {
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = word[i];
        LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
    }

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < mMaxBigrams) {
        if (frequency > mBigramFreq[insertAt]
                || (mBigramFreq[insertAt] == frequency
                        && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) {
            break;
        }
        insertAt++;
    }
    LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
    if (insertAt < mMaxBigrams) {
        // Shift the lower-ranked frequencies down by one; the last entry falls off the end.
        memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
                (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
                (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
        mBigramFreq[insertAt] = frequency;
        // Shift the matching word slots the same way.
        memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
                (char*) mBigramChars + (insertAt    ) * mMaxWordLength * sizeof(short),
                (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength);
        unsigned short *dest = mBigramChars + (insertAt    ) * mMaxWordLength;
        while (length--) {
            *dest++ = *word++;
        }
        *dest = 0; // NULL terminate
        if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
        return true;
    }
    return false;
}
// Maps c through the BASE_CHARS table (when it is inside the table) and then lower-cases the
// result: ASCII 'A'..'Z' by setting bit 5, codes above 127 via latin_tolower().
unsigned short
Dictionary::toLowerCase(unsigned short c) {
    unsigned short folded = c;
    if (folded < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
        folded = BASE_CHARS[folded];
    }
    if (folded >= 'A' && folded <= 'Z') {
        return folded | 32;
    }
    if (folded > 127) {
        return latin_tolower(folded);
    }
    return folded;
}
bool
Dictionary::sameAsTyped(unsigned short *word, int length)
{
if (length != mInputLength) {
return false;
}
int *inputCodes = mInputCodes;
while (length--) {
if ((unsigned int) *inputCodes != (unsigned int) *word) {
return false;
}
inputCodes += mMaxAlternatives;
word++;
}
return true;
}
// Apostrophe: dictionary words may contain it even when the user did not type it.
static char QUOTE = '\'';
// Recursively walks the node group at pos, building candidate words in mWord and passing
// terminal ones to addWord().
//
// pos:         offset of the node group (starting with its count byte) in mDict
// depth:       index in mWord of the character being placed at this level
// maxDepth:    recursion is abandoned once depth exceeds this value
// completion:  true once all typed characters are consumed; every descendant is then accepted
// snr:         accumulated weight multiplier applied to word frequencies
// inputIndex:  index of the typed position being matched at this level
// diffs:       count of positions matched through a non-first alternative
// skipPos:     depth at which any character is accepted without consuming input (< 0 = none)
// nextLetters / nextLettersSize: histogram passed to registerNextLetter()
void
Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
int diffs, int skipPos, int *nextLetters, int nextLettersSize)
{
// Optimization: Prune out words that are too long compared to how much was typed.
if (depth > maxDepth) {
return;
}
if (diffs > mMaxEditDistance) {
return;
}
int count = getCount(&pos);
int *currentChars = NULL;
// Past the end of the input everything below is a completion; otherwise point at the
// alternatives for the current typed position.
if (mInputLength <= inputIndex) {
completion = true;
} else {
currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
}
for (int i = 0; i < count; i++) {
// -- at char
unsigned short c = getChar(&pos);
// -- at flag/add
unsigned short lowerC = toLowerCase(c);
bool terminal = getTerminal(&pos);
int childrenAddress = getAddress(&pos);
// -- after address or flag
int freq = 1;
if (terminal) freq = getFreq(&pos);
// -- after add or freq
// If we are only doing completions, no need to look at the typed characters.
if (completion) {
mWord[depth] = c;
if (terminal) {
addWord(mWord, depth + 1, freq * snr);
// Only the pass without a skipped position contributes to the next-letter counts.
if (depth >= mInputLength && skipPos < 0) {
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
}
}
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
diffs, skipPos, nextLetters, nextLettersSize);
}
} else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
// Skip the ' or other letter and continue deeper
// inputIndex is passed unchanged, so no typed character is consumed here.
mWord[depth] = c;
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
skipPos, nextLetters, nextLettersSize);
}
} else {
int j = 0;
// Try each alternative for this typed position; the list ends at the first value <= 0.
while (currentChars[j] > 0) {
if (currentChars[j] == lowerC || currentChars[j] == c) {
// The first alternative gets the typed-letter multiplier; the others get weight 1.
int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
mWord[depth] = c;
if (mInputLength == inputIndex + 1) {
// This was the last typed position.
if (terminal) {
if (//INCLUDE_TYPED_WORD_IF_VALID ||
!sameAsTyped(mWord, depth + 1)) {
int finalFreq = freq * snr * addedWeight;
if (skipPos < 0) finalFreq *= mFullWordMultiplier;
addWord(mWord, depth + 1, finalFreq);
}
}
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1,
maxDepth, true, snr * addedWeight, inputIndex + 1,
diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
}
} else if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth,
false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
skipPos, nextLetters, nextLettersSize);
}
}
j++;
// When a position is being skipped, only the first alternative is considered.
if (skipPos >= 0) break;
}
}
}
}
// Decodes the 3-byte bigram address at *pos (low 6 bits of the first byte, then two full
// bytes, most significant first). *pos is moved past the address only when advance is true.
int
Dictionary::getBigramAddress(int *pos, bool advance)
{
    const unsigned char *entry = mDict + *pos;
    const int address = ((entry[0] & 0x3F) << 16)
            + ((entry[1] & 0xFF) << 8)
            + (entry[2] & 0xFF);
    if (advance) {
        *pos += 3;
    }
    return address;
}
// Returns the FLAG_BIGRAM_FREQ bits of the byte at *pos and advances *pos by one.
int
Dictionary::getBigramFreq(int *pos)
{
    const int index = (*pos)++;
    return mDict[index] & FLAG_BIGRAM_FREQ;
}
// Looks up the bigrams stored for prevWord and adds them to bigramChars / bigramFreq.
// Returns the number of bigram entries visited (at most maxBigrams). Returns 0 when the
// dictionary has no bigram data, is not in the latest format, or does not contain prevWord.
int
Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
        unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
        int maxAlternatives)
{
    // Remember the caller's buffers and limits for the helpers called below.
    mBigramFreq = bigramFreq;
    mBigramChars = bigramChars;
    mInputCodes = codes;
    mInputLength = codesSize;
    mMaxWordLength = maxWordLength;
    mMaxBigrams = maxBigrams;
    mMaxAlternatives = maxAlternatives;

    if (mBigram != 1 || !checkIfDictVersionIsLatest()) {
        return 0;
    }

    int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
    LOGI("Pos -> %d\n", pos);
    if (pos < 0) {
        return 0;
    }
    if ((mDict[pos] & FLAG_BIGRAM_READ) <= 0) {
        return 0;
    }

    int bigramCount = 0;
    int continued = 1;
    for (; continued > 0 && bigramCount < maxBigrams; ++bigramCount) {
        const int bigramAddress = getBigramAddress(&pos, true);
        const int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
        // search for all bigrams and store them
        searchForTerminalNode(bigramAddress, frequency);
        // The byte holding the frequency also carries the "more entries follow" flag.
        continued = (mDict[pos] & FLAG_BIGRAM_CONTINUED);
        pos++;
    }
    return bigramCount;
}
// Searches the dictionary, starting at DICTIONARY_HEADER_SIZE, for the terminal node located
// at addressLookingFor, rebuilding the word's characters level by level. The rebuilt word is
// handed to addWordBigram() with the given frequency if checkFirstCharacter() accepts it.
//
// NOTE(review): if the search ends through the "Cannot find bigram" branch, word may not have
// been fully written before it is read below -- confirm.
// NOTE(review): depth is not checked against mMaxWordLength, the size of word -- confirm that
// the dictionary format bounds it.
void
Dictionary::searchForTerminalNode(int addressLookingFor, int frequency)
{
// track word with such address and store it in an array
unsigned short word[mMaxWordLength];
int pos;
int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
bool found = false;
char followingChar = ' ';
int depth = -1;
// Each iteration scans one node group and picks the child branch to follow next.
while(!found) {
bool followDownAddressSearchStop = false;
bool firstAddress = true;
bool haveToSearchAll = true;
// Record the character of the branch chosen by the previous iteration.
if (depth >= 0) {
word[depth] = (unsigned short) followingChar;
}
pos = followDownBranchAddress; // pos start at count
int count = mDict[pos] & 0xFF;
LOGI("count - %d\n",count);
pos++;
for (int i = 0; i < count; i++) {
// pos at data
pos++;
// pos now at flag
if (!getFirstBitOfByte(&pos)) { // non-terminal
if (!followDownAddressSearchStop) {
int addr = getBigramAddress(&pos, false);
if (addr > addressLookingFor) {
// This child starts beyond the target, so stop updating the branch to follow.
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
// Child address does not exceed the target: remember it as the branch to descend.
followDownBranchAddress = addr;
followingChar = (char)(0xFF & mDict[pos-1]);
if (firstAddress) {
firstAddress = false;
haveToSearchAll = false;
}
}
}
pos += 3;
} else if (getFirstBitOfByte(&pos)) { // terminal
if (addressLookingFor == (pos-1)) { // found !!
depth++;
word[depth] = (0xFF & mDict[pos-1]);
found = true;
break;
}
if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
if (!followDownAddressSearchStop) {
int addr = getBigramAddress(&pos, false);
if (addr > addressLookingFor) {
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
followDownBranchAddress = addr;
followingChar = (char)(0xFF & mDict[pos-1]);
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
}
}
}
pos += 4;
} else { // freq only (2 byte)
pos += 2;
}
// skipping bigram
int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
if (bigramExist > 0) {
int nextBigramExist = 1;
while (nextBigramExist > 0) {
pos += 3;
nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
}
} else {
pos++;
}
}
}
depth++;
// A branch address of 0 means there is nothing left to descend into: give up.
if (followDownBranchAddress == 0) {
LOGI("ERROR!!! Cannot find bigram!!");
break;
}
}
// Only keep bigrams whose first character matches one of the typed alternatives.
if (checkFirstCharacter(word)) {
addWordBigram(word, depth, frequency);
}
}
bool
Dictionary::checkFirstCharacter(unsigned short *word)
{
// Checks whether this word starts with same character or neighboring characters of
// what user typed.
int *inputCodes = mInputCodes;
int maxAlt = mMaxAlternatives;
while (maxAlt > 0) {
if ((unsigned int) *inputCodes == (unsigned int) *word) {
return true;
}
inputCodes++;
maxAlt--;
}
return false;
}
// Returns true when word (length code units) is stored in the dictionary as a complete word.
// A null or empty word is never valid.
bool
Dictionary::isValidWord(unsigned short *word, int length)
{
    // isValidWordRec() stops only when offset reaches length - 1; with length <= 0 that never
    // happens and it would keep reading word[] past its end, so reject such input up front.
    if (!word || length <= 0) {
        return false;
    }
    // Dictionaries in the latest format start after a header; older ones start at offset 0.
    const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
    return isValidWordRec(rootPos, word, 0, length) != NOT_VALID_WORD;
}
// Recursively matches word[offset..length-1] against the node group at pos.
// returns address of bigram data of that word
// return -99 (NOT_VALID_WORD) if not found
int
Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
    const int count = getCount(&pos);
    const unsigned short wanted = (unsigned short) word[offset];
    const bool atLastChar = (offset == length - 1);

    for (int j = 0; j < count; j++) {
        const unsigned short nodeChar = getChar(&pos);
        const int terminal = getTerminal(&pos);
        const int childPos = getAddress(&pos);

        if (nodeChar == wanted) {
            if (atLastChar) {
                if (terminal) {
                    return (pos + 1);
                }
            } else if (childPos != 0) {
                const int found = isValidWordRec(childPos, word, offset + 1, length);
                if (found > 0) {
                    return found;
                }
            }
        }

        // Step over this node's frequency (and bigram data) to reach the next sibling.
        if (terminal) {
            getFreq(&pos);
        }
        // There could be two instances of each alphabet - upper and lower case. So continue
        // looking ...
    }
    return NOT_VALID_WORD;
}
} // namespace latinime

View file

@ -17,6 +17,9 @@
#ifndef LATINIME_DICTIONARY_H
#define LATINIME_DICTIONARY_H
#include "bigram_dictionary.h"
#include "unigram_dictionary.h"
namespace latinime {
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
@ -35,68 +38,32 @@ namespace latinime {
class Dictionary {
public:
Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier);
Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength,
int maxWords, int maxAlternatives);
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
int maxWordLength, int maxWords, int maxAlternatives, int skipPos,
int *nextLetters, int nextLettersSize);
int *nextLetters, int nextLettersSize) {
return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies,
nextLetters, nextLettersSize);
}
// TODO: Call mBigramDictionary instead of mUnigramDictionary
int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
int maxAlternatives);
bool isValidWord(unsigned short *word, int length);
int maxAlternatives) {
return mUnigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies,
maxWordLength, maxBigrams, maxAlternatives);
}
bool isValidWord(unsigned short *word, int length) {
return mUnigramDictionary->isValidWord(word, length);
}
void setAsset(void *asset) { mAsset = asset; }
void *getAsset() { return mAsset; }
~Dictionary();
private:
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
int maxWordLength, int maxWords, int maxAlternatives);
int getSuggestionCandidates(int inputLength, int maxWords, int skipPos, int *nextLetters,
int nextLettersSize);
void getVersionNumber();
bool checkIfDictVersionIsLatest();
int getAddress(int *pos);
int getBigramAddress(int *pos, bool advance);
int getFreq(int *pos);
int getBigramFreq(int *pos);
void searchForTerminalNode(int address, int frequency);
bool getFirstBitOfByte(int *pos) { return (mDict[*pos] & 0x80) > 0; }
bool getSecondBitOfByte(int *pos) { return (mDict[*pos] & 0x40) > 0; }
bool getTerminal(int *pos) { return (mDict[*pos] & FLAG_TERMINAL_MASK) > 0; }
int getCount(int *pos) { return mDict[(*pos)++] & 0xFF; }
unsigned short getChar(int *pos);
int wideStrLen(unsigned short *str);
bool sameAsTyped(unsigned short *word, int length);
bool checkFirstCharacter(unsigned short *word);
bool addWord(unsigned short *word, int length, int frequency);
bool addWordBigram(unsigned short *word, int length, int frequency);
unsigned short toLowerCase(unsigned short c);
void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency,
int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize);
int isValidWordRec(int pos, unsigned short *word, int offset, int length);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
unsigned char *mDict;
void *mAsset;
int *mFrequencies;
int *mBigramFreq;
int mMaxWords;
int mMaxBigrams;
int mMaxWordLength;
unsigned short *mOutputChars;
unsigned short *mBigramChars;
int *mInputCodes;
int mInputLength;
int mMaxAlternatives;
unsigned short mWord[128];
int mMaxEditDistance;
int mFullWordMultiplier;
int mTypedLetterMultiplier;
int mVersion;
int mBigram;
BigramDictionary *mBigramDictionary;
UnigramDictionary *mUnigramDictionary;
};
// ----------------------------------------------------------------------------

View file

@ -0,0 +1,631 @@
/*
**
** Copyright 2010, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
// Logging setup: with FLAG_DBG the Android log macros are used and DEBUG_DICT enables the
// verbose dumps; without it LOGI expands to nothing and DEBUG_DICT is 0.
#ifdef FLAG_DBG
// Tag names this file (it was copied over from dictionary.cpp with the old tag).
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
#include <cutils/log.h>
#define DEBUG_DICT 1
#else // FLAG_DBG
#define LOGI
#define DEBUG_DICT 0
#endif // FLAG_DBG
#include "unigram_dictionary.h"
#include "basechars.h"
#include "char_utils.h"
#define DICTIONARY_VERSION_MIN 200
#define DICTIONARY_HEADER_SIZE 2
#define NOT_VALID_WORD -99
#define SUGGEST_MISSING_CHARACTERS true
#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
namespace latinime {
// Builds a unigram dictionary view over the raw dictionary data.
// The three size limits are fixed for the lifetime of the object; the multipliers, the data
// pointer and the parent are stored, and the version/bigram header bytes are read from dict.
UnigramDictionary::UnigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
        int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
    : MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_ALTERNATIVES(maxAlternatives)
{
    LOGI("UnigramDictionary - constructor");
    mParentDictionary = parentDictionary;
    mFullWordMultiplier = fullWordMultiplier;
    mTypedLetterMultiplier = typedLetterMultiplier;
    mDict = static_cast<unsigned char*>(dict);
    // Must run after mDict is set: it reads the first two bytes of the data.
    getVersionNumber();
}
// Destructor. Intentionally empty: the constructor only stores the pointers it is given and
// allocates nothing.
UnigramDictionary::~UnigramDictionary()
{
}
// Collects suggestions for the typed codes into outWords / frequencies and returns the number
// of suggestions found.
//
// codes / codesSize:     typed character codes
// outWords, frequencies: caller-owned output buffers
// nextLetters / nextLettersSize: optional histogram of letters following the typed prefix
int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
        int *nextLetters, int nextLettersSize)
{
    initSuggestions(codes, codesSize, outWords, frequencies);

    // First pass: no skipped position (-1).
    int bestCount = getSuggestionCandidates(codesSize, -1, nextLetters, nextLettersSize);

    // If there aren't sufficient suggestions, search for words by allowing wild cards at
    // the different character positions. This feature is not ready for prime-time as we need
    // to figure out the best ranking for such words compared to proximity corrections and
    // completions.
    const bool retryWithWildCards = SUGGEST_MISSING_CHARACTERS
            && bestCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD;
    if (retryWithWildCards) {
        int wildCardPos = 0;
        while (wildCardPos < codesSize) {
            const int wildCardCount = getSuggestionCandidates(codesSize, wildCardPos, NULL, 0);
            if (wildCardCount > bestCount) {
                // Stop at the first wild-card position that improves on the current result.
                bestCount = wildCardCount;
                break;
            }
            ++wildCardPos;
        }
    }

    if (DEBUG_DICT) {
        LOGI("Returning %d words", bestCount);
        LOGI("Next letters: ");
        for (int letter = 0; letter < nextLettersSize; letter++) {
            if (nextLetters[letter] > 0) {
                LOGI("%c = %d,", letter, nextLetters[letter]);
            }
        }
        LOGI("\n");
    }
    return bestCount;
}
// Records the caller's input and output buffers for the lookup that follows, and derives the
// maximum edit distance from the input length.
void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
        int *frequencies) {
    // Input side.
    mInputCodes = codes;
    mInputLength = codesSize;

    // Output side.
    mOutputChars = outWords;
    mFrequencies = frequencies;

    // Inputs shorter than 5 codes allow 2 differences; longer ones allow half their length.
    if (mInputLength < 5) {
        mMaxEditDistance = 2;
    } else {
        mMaxEditDistance = mInputLength / 2;
    }
}
// Runs one recursive search over the dictionary and returns how many leading entries of
// mFrequencies (at most MAX_WORDS) hold a positive frequency afterwards.
int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
        int *nextLetters, int nextLettersSize) {
    // Dictionaries in the latest format start after a header; older ones start at offset 0.
    const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
    getWordsRec(rootPos, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters,
            nextLettersSize);

    // Get the word count
    int filled = 0;
    for (; filled < MAX_WORDS; ++filled) {
        if (mFrequencies[filled] <= 0) {
            break;
        }
    }
    return filled;
}
// Bumps the counter for character c in nextLetters; characters at or beyond nextLettersSize
// are ignored.
void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}
// TODO: Should be const static variable calculate in the constructor
// Reads the first two bytes of the dictionary data into mVersion and mBigram.
void
UnigramDictionary::getVersionNumber()
{
    const unsigned char *header = mDict;
    mVersion = header[0] & 0xFF;
    mBigram = header[1] & 0xFF;
    LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
}
// TODO: Should be const static variable calculate in the constructor
// Checks whether it has the latest dictionary or the old dictionary.
// "Latest" means the version byte is at least DICTIONARY_VERSION_MIN and the bigram byte is
// either 0 or 1.
bool
UnigramDictionary::checkIfDictVersionIsLatest()
{
    if (mVersion < DICTIONARY_VERSION_MIN) {
        return false;
    }
    return mBigram == 0 || mBigram == 1;
}
// Decodes one character at *pos and advances *pos past it.
// A lead byte other than 0xFF is the character itself (one byte consumed). A lead byte of
// 0xFF is an escape followed by a 16-bit big-endian code (three bytes consumed).
unsigned short UnigramDictionary::getChar(int *pos) {
    const int lead = mDict[*pos] & 0xFF;
    *pos += 1;
    if (lead != 0xFF) {
        return (unsigned short) lead;
    }
    const int high = mDict[*pos] & 0xFF;
    const int low = mDict[*pos + 1] & 0xFF;
    *pos += 2;
    return (unsigned short) ((high << 8) | low);
}
// Reads the optional child address at *pos and advances *pos past it.
// When FLAG_ADDRESS_MASK is clear in the first byte, no address is stored: one byte is
// consumed and 0 is returned. Otherwise three bytes are consumed and combined, most
// significant first, with the first byte masked by the top bits of ADDRESS_MASK.
int UnigramDictionary::getAddress(int *pos) {
    const int start = *pos;
    if ((mDict[start] & FLAG_ADDRESS_MASK) == 0) {
        *pos = start + 1;
        return 0;
    }
    *pos = start + 3;
    const int high = (mDict[start] & (ADDRESS_MASK >> 16)) << 16;
    const int mid = (mDict[start + 1] & 0xFF) << 8;
    const int low = mDict[start + 2] & 0xFF;
    // The three parts occupy disjoint bit ranges, so OR-ing them equals summing them.
    return high | mid | low;
}
// Reads the one-byte frequency at *pos and advances *pos past it. For dictionaries in the
// latest format, *pos is also advanced past whatever follows the frequency: either a
// single byte (no bigram data) or a chain of 4-byte bigram entries.
int UnigramDictionary::getFreq(int *pos) {
    const int freq = mDict[*pos] & 0xFF;
    ++(*pos);
    if (!checkIfDictVersionIsLatest()) {
        return freq;
    }
    if ((mDict[*pos] & FLAG_BIGRAM_READ) == 0) {
        // No bigram data: step over the single byte.
        ++(*pos);
        return freq;
    }
    // Each bigram entry is four bytes; the fourth carries the "continued" flag that says
    // whether another entry follows.
    bool more;
    do {
        more = (mDict[*pos + 3] & FLAG_BIGRAM_CONTINUED) != 0;
        *pos += 4;
    } while (more);
    return freq;
}
// Returns the number of 16-bit units before the first zero unit in str, or 0 when str is
// a null pointer.
int UnigramDictionary::wideStrLen(unsigned short *str) {
    if (str == NULL) {
        return 0;
    }
    int len = 0;
    while (str[len] != 0) {
        ++len;
    }
    return len;
}
// Inserts a candidate word into the ranked output buffers (mOutputChars / mFrequencies).
//
// Slots are kept in descending frequency order; among equal frequencies a shorter word is
// placed ahead of a longer one. Entries from the insertion point onward move down one
// slot and the last entry is dropped.
//
// word      - buffer holding the candidate. Index `length` is overwritten with a
//             terminator, so the buffer must have room for it.
// length    - number of characters in the candidate.
// frequency - ranking score of the candidate.
// Returns true when the word was stored; false when it ranks below every existing entry
// or cannot fit in a slot.
bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
    // Every output slot is MAX_WORD_LENGTH units wide and must also hold the terminator.
    // A longer word would spill into the following slot, or past the end of mOutputChars
    // when it lands in the last slot, so reject it before touching any buffer.
    if (length < 0 || length >= MAX_WORD_LENGTH) {
        return false;
    }
    word[length] = 0;
    if (DEBUG_DICT) {
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = word[i];
        LOGI("Found word = %s, freq = %d : \n", s, frequency);
    }

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < MAX_WORDS) {
        if (frequency > mFrequencies[insertAt]
                || (mFrequencies[insertAt] == frequency
                        && length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
            break;
        }
        insertAt++;
    }
    if (insertAt >= MAX_WORDS) {
        return false;
    }

    // Shift the lower-ranked entries down by one slot; the last one falls off the end.
    const int shifted = MAX_WORDS - insertAt - 1;
    memmove(mFrequencies + insertAt + 1, mFrequencies + insertAt,
            shifted * sizeof(mFrequencies[0]));
    mFrequencies[insertAt] = frequency;

    unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
    memmove(dest + MAX_WORD_LENGTH, dest, shifted * MAX_WORD_LENGTH * sizeof(dest[0]));
    for (int i = 0; i < length; i++) {
        dest[i] = word[i];
    }
    dest[length] = 0; // NULL terminate
    if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
    return true;
}
// Inserts a bigram candidate into the ranked bigram buffers (mBigramChars / mBigramFreq).
//
// Slots are kept in descending frequency order; among equal frequencies a shorter word is
// placed ahead of a longer one. Entries from the insertion point onward move down one
// slot and the last entry is dropped.
//
// word      - buffer holding the candidate. Index `length` is overwritten with a
//             terminator, so the buffer must have room for it.
// length    - number of characters in the candidate.
// frequency - ranking score of the candidate.
// Returns true when the word was stored; false when it ranks below every existing entry
// or cannot fit in a slot.
bool UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
    // Every slot is MAX_WORD_LENGTH units wide and must also hold the terminator. The
    // caller in this file passes a MAX_WORD_LENGTH-element stack array, so writing
    // word[length] for length >= MAX_WORD_LENGTH would overrun it as well. Reject such
    // words before touching any buffer.
    if (length < 0 || length >= MAX_WORD_LENGTH) {
        return false;
    }
    word[length] = 0;
    if (DEBUG_DICT) {
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = word[i];
        LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
    }

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < mMaxBigrams) {
        if (frequency > mBigramFreq[insertAt]
                || (mBigramFreq[insertAt] == frequency
                        && length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
            break;
        }
        insertAt++;
    }
    LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
    if (insertAt >= mMaxBigrams) {
        return false;
    }

    // Shift the lower-ranked entries down by one slot; the last one falls off the end.
    const int shifted = mMaxBigrams - insertAt - 1;
    memmove(mBigramFreq + insertAt + 1, mBigramFreq + insertAt,
            shifted * sizeof(mBigramFreq[0]));
    mBigramFreq[insertAt] = frequency;

    unsigned short *dest = mBigramChars + insertAt * MAX_WORD_LENGTH;
    memmove(dest + MAX_WORD_LENGTH, dest, shifted * MAX_WORD_LENGTH * sizeof(dest[0]));
    for (int i = 0; i < length; i++) {
        dest[i] = word[i];
    }
    dest[length] = 0; // NULL terminate
    if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
    return true;
}
// Normalises a character for comparison against typed input: codes that index into
// BASE_CHARS are first replaced by their table entry, then 'A'..'Z' is mapped to
// 'a'..'z' and codes above 127 are passed through latin_tolower.
unsigned short UnigramDictionary::toLowerCase(unsigned short c) {
    unsigned short folded = c;
    if (folded < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
        folded = BASE_CHARS[folded];
    }
    if (folded > 127) {
        return latin_tolower(folded);
    }
    if (folded >= 'A' && folded <= 'Z') {
        // Setting bit 5 turns an ASCII upper-case letter into its lower-case form.
        return folded | 32;
    }
    return folded;
}
bool
UnigramDictionary::sameAsTyped(unsigned short *word, int length)
{
if (length != mInputLength) {
return false;
}
int *inputCodes = mInputCodes;
while (length--) {
if ((unsigned int) *inputCodes != (unsigned int) *word) {
return false;
}
inputCodes += MAX_ALTERNATIVES;
word++;
}
return true;
}
// Apostrophe: a dictionary character that may be skipped when the user did not type it.
static char QUOTE = '\'';
// Recursively walks the dictionary node group at `pos`, building candidates in mWord and
// handing terminal words to addWord().
//
// pos             - offset in mDict of the node group to visit (starts at its count byte).
// depth           - index in mWord that this level writes.
// maxDepth        - recursion stops once depth exceeds this value.
// completion      - when true, typed input is no longer consulted and every descendant
//                   is treated as a completion.
// snr             - multiplier applied to a terminal's frequency; grows with each matched
//                   character.
// inputIndex      - index of the input position this level is matched against.
// diffs           - number of matches so far that used a non-first alternative.
// skipPos         - depth at which any dictionary character is accepted without
//                   consuming an input position; negative when unused.
// nextLetters,
// nextLettersSize - counters updated through registerNextLetter().
void
UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
int diffs, int skipPos, int *nextLetters, int nextLettersSize)
{
// Optimization: Prune out words that are too long compared to how much was typed.
if (depth > maxDepth) {
return;
}
// Prune branches that already differ from the typed input more than allowed.
if (diffs > mMaxEditDistance) {
return;
}
int count = getCount(&pos);
int *currentChars = NULL;
// Once all input positions are consumed, everything below is a completion; otherwise
// point at the alternatives stored for the current input position.
if (mInputLength <= inputIndex) {
completion = true;
} else {
currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
}
for (int i = 0; i < count; i++) {
// -- at char
unsigned short c = getChar(&pos);
// -- at flag/add
unsigned short lowerC = toLowerCase(c);
bool terminal = getTerminal(&pos);
int childrenAddress = getAddress(&pos);
// -- after address or flag
int freq = 1;
if (terminal) freq = getFreq(&pos);
// -- after add or freq
// If we are only doing completions, no need to look at the typed characters.
if (completion) {
mWord[depth] = c;
if (terminal) {
addWord(mWord, depth + 1, freq * snr);
// Count the character that follows the typed prefix, but only when no
// position is being skipped.
if (depth >= mInputLength && skipPos < 0) {
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
}
}
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
diffs, skipPos, nextLetters, nextLettersSize);
}
} else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
// Skip the ' or other letter and continue deeper
// inputIndex is not advanced here, so the same input position is matched again
// one level down.
mWord[depth] = c;
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
skipPos, nextLetters, nextLettersSize);
}
} else {
// Try each alternative stored for this input position; the list ends at the first
// non-positive code.
int j = 0;
while (currentChars[j] > 0) {
if (currentChars[j] == lowerC || currentChars[j] == c) {
// The first alternative is weighted by mTypedLetterMultiplier; any other
// alternative gets weight 1 and counts as a difference (diffs + (j > 0)).
int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
mWord[depth] = c;
if (mInputLength == inputIndex + 1) {
// This is the last input position.
if (terminal) {
if (//INCLUDE_TYPED_WORD_IF_VALID ||
!sameAsTyped(mWord, depth + 1)) {
int finalFreq = freq * snr * addedWeight;
// Words matching the whole input without a skipped position get the
// full-word multiplier.
if (skipPos < 0) finalFreq *= mFullWordMultiplier;
addWord(mWord, depth + 1, finalFreq);
}
}
// Everything below this node is a completion of the typed input.
if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1,
maxDepth, true, snr * addedWeight, inputIndex + 1,
diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
}
} else if (childrenAddress != 0) {
getWordsRec(childrenAddress, depth + 1, maxDepth,
false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
skipPos, nextLetters, nextLettersSize);
}
}
j++;
// When a position is being skipped, only the first alternative is considered.
if (skipPos >= 0) break;
}
}
}
}
// Decodes the 3-byte address stored at *pos: the low six bits of the first byte followed
// by two full bytes, most significant first. *pos moves past the three bytes only when
// `advance` is true.
int UnigramDictionary::getBigramAddress(int *pos, bool advance) {
    const int base = *pos;
    const int high = (mDict[base] & 0x3F) << 16;
    const int mid = (mDict[base + 1] & 0xFF) << 8;
    const int low = mDict[base + 2] & 0xFF;
    if (advance) {
        *pos = base + 3;
    }
    // The three parts occupy disjoint bit ranges, so OR-ing them equals summing them.
    return high | mid | low;
}
// Returns the bits of the byte at *pos selected by FLAG_BIGRAM_FREQ and advances *pos by
// one.
int UnigramDictionary::getBigramFreq(int *pos) {
    const int index = *pos;
    *pos = index + 1;
    return mDict[index] & FLAG_BIGRAM_FREQ;
}
int
UnigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
int maxAlternatives)
{
mBigramFreq = bigramFreq;
mBigramChars = bigramChars;
mInputCodes = codes;
mInputLength = codesSize;
mMaxBigrams = maxBigrams;
if (mBigram == 1 && checkIfDictVersionIsLatest()) {
int pos = isValidWordRec(
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
LOGI("Pos -> %d\n", pos);
if (pos < 0) {
return 0;
}
int bigramCount = 0;
int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
if (bigramExist > 0) {
int nextBigramExist = 1;
while (nextBigramExist > 0 && bigramCount < maxBigrams) {
int bigramAddress = getBigramAddress(&pos, true);
int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
// search for all bigrams and store them
searchForTerminalNode(bigramAddress, frequency);
nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
bigramCount++;
}
}
return bigramCount;
}
return 0;
}
// Reconstructs the word whose terminal node is stored at `addressLookingFor` and, when its
// first character matches one of the codes for the first input position, stores it with
// `frequency` through addWordBigram().
//
// The search starts at DICTIONARY_HEADER_SIZE and descends one node group per iteration
// of the outer loop. Inside a group it remembers the last child address that is not
// greater than the target as the branch to follow, and records that child's character as
// the next letter of the word.
//
// NOTE(review): nodes are stepped over with a single `pos++` for the character, so this
// appears to assume one-byte characters (no 0xFF escape as handled by getChar) — confirm.
// NOTE(review): after the "Cannot find bigram" break, control still reaches
// checkFirstCharacter()/addWordBigram() with whatever `word` holds — confirm intended.
void
UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
{
// track word with such address and store it in an array
unsigned short word[MAX_WORD_LENGTH];
int pos;
int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
bool found = false;
char followingChar = ' ';
int depth = -1;
while(!found) {
bool followDownAddressSearchStop = false;
bool firstAddress = true;
bool haveToSearchAll = true;
// Record the character of the branch chosen in the previous iteration.
if (depth >= 0) {
word[depth] = (unsigned short) followingChar;
}
pos = followDownBranchAddress; // pos start at count
int count = mDict[pos] & 0xFF;
LOGI("count - %d\n",count);
pos++;
for (int i = 0; i < count; i++) {
// pos at data
pos++;
// pos now at flag
if (!getFirstBitOfByte(&pos)) { // non-terminal
if (!followDownAddressSearchStop) {
int addr = getBigramAddress(&pos, false);
if (addr > addressLookingFor) {
// This child lies beyond the target; stop updating the branch to follow.
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
// Candidate branch: remember its address and the character just before the flag.
followDownBranchAddress = addr;
followingChar = (char)(0xFF & mDict[pos-1]);
if (firstAddress) {
firstAddress = false;
haveToSearchAll = false;
}
}
}
// Step over the 3-byte flag/address field.
pos += 3;
} else if (getFirstBitOfByte(&pos)) { // terminal
// The target is identified by the offset of the node's character byte (pos - 1).
if (addressLookingFor == (pos-1)) { // found !!
depth++;
word[depth] = (0xFF & mDict[pos-1]);
found = true;
break;
}
if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
if (!followDownAddressSearchStop) {
int addr = getBigramAddress(&pos, false);
if (addr > addressLookingFor) {
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
followDownBranchAddress = addr;
followingChar = (char)(0xFF & mDict[pos-1]);
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
}
}
}
pos += 4;
} else { // freq only (2 byte)
pos += 2;
}
// skipping bigram
int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
if (bigramExist > 0) {
int nextBigramExist = 1;
// Each bigram entry is 3 address bytes plus one byte holding the "continued" flag.
while (nextBigramExist > 0) {
pos += 3;
nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
}
} else {
pos++;
}
}
}
// One level deeper; when the target was found this makes depth equal to the word length.
depth++;
if (followDownBranchAddress == 0) {
LOGI("ERROR!!! Cannot find bigram!!");
break;
}
}
if (checkFirstCharacter(word)) {
addWordBigram(word, depth, frequency);
}
}
bool
UnigramDictionary::checkFirstCharacter(unsigned short *word)
{
// Checks whether this word starts with same character or neighboring characters of
// what user typed.
int *inputCodes = mInputCodes;
int maxAlt = MAX_ALTERNATIVES;
while (maxAlt > 0) {
if ((unsigned int) *inputCodes == (unsigned int) *word) {
return true;
}
inputCodes++;
maxAlt--;
}
return false;
}
// TODO: Move to parent dictionary
// Returns true when word (of the given length) is found as a terminal entry in the
// dictionary. The lookup starts after the header for dictionaries in the latest format
// and at offset 0 otherwise.
bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
    const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
    return isValidWordRec(rootPos, word, 0, length) != NOT_VALID_WORD;
}
// Recursively matches word[offset .. length-1] against the node group at `pos`.
//
// pos    - offset in mDict of the node group to search (starts at its count byte).
// word   - the word being looked up.
// offset - index of the character matched at this level.
// length - total number of characters in word.
// Returns the address of the bigram data of the matching terminal node, or
// NOT_VALID_WORD when the word is not in the dictionary.
int UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
    // Nothing left to match. Without this guard an empty (or negative-length) word never
    // satisfies `offset == length - 1`, so the recursion would keep reading word[offset]
    // beyond the caller's buffer.
    if (offset >= length) {
        return NOT_VALID_WORD;
    }
    int count = getCount(&pos);
    unsigned short currentChar = (unsigned short) word[offset];
    for (int j = 0; j < count; j++) {
        unsigned short c = getChar(&pos);
        int terminal = getTerminal(&pos);
        int childPos = getAddress(&pos);
        if (c == currentChar) {
            if (offset == length - 1) {
                if (terminal) {
                    // pos is at the frequency byte; the bigram data starts right after it.
                    return (pos+1);
                }
            } else {
                if (childPos != 0) {
                    int t = isValidWordRec(childPos, word, offset + 1, length);
                    if (t > 0) {
                        return t;
                    }
                }
            }
        }
        // Step over the frequency (and any bigram data) of a terminal node so that pos
        // lands on the next sibling.
        if (terminal) {
            getFreq(&pos);
        }
        // There could be two instances of each alphabet - upper and lower case. So continue
        // looking ...
    }
    return NOT_VALID_WORD;
}
} // namespace latinime

View file

@ -0,0 +1,104 @@
/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H
namespace latinime {
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
#define ADDRESS_MASK 0x3FFFFF
// The bit that decides if an address follows in the next 22 bits
#define FLAG_ADDRESS_MASK 0x40
// The bit that decides if this is a terminal node for a word. The node could still have children,
// if the word has other endings.
#define FLAG_TERMINAL_MASK 0x80
// Set in the byte after a terminal's frequency when bigram entries follow.
#define FLAG_BIGRAM_READ 0x80
#define FLAG_BIGRAM_CHILDEXIST 0x40
// Set in the last byte of a bigram entry when another entry follows it.
#define FLAG_BIGRAM_CONTINUED 0x80
// Low seven bits of the last byte of a bigram entry: the bigram frequency.
#define FLAG_BIGRAM_FREQ 0x7F
class Dictionary;
// Reads a binary dictionary held in memory and produces unigram suggestions, bigram
// suggestions and word-validity answers from it. Output is written into buffers owned by
// the caller.
class UnigramDictionary {
public:
// NOTE(review): the constructor body is not in this header; presumably maxWordLength,
// maxWords and maxAlternatives initialise the MAX_* members below — confirm.
UnigramDictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength,
int maxWords, int maxAlternatives, Dictionary *parentDictionary);
// Fills outWords / frequencies with suggestions for the given input codes and returns
// the number of suggested words.
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
int *nextLetters, int nextLettersSize);
// Fills outWords / frequencies with bigram candidates that follow `word`; returns the
// number of bigram entries processed.
int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
int maxAlternatives);
// Returns true when `word` is stored as a terminal entry in the dictionary.
bool isValidWord(unsigned short *word, int length);
~UnigramDictionary();
private:
// Stores the input and output buffers for one suggestion pass.
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
// Runs one dictionary walk and returns the number of filled output slots.
int getSuggestionCandidates(int inputLength, int skipPos, int *nextLetters, int nextLettersSize);
// Reads the version and bigram bytes from the start of the dictionary.
void getVersionNumber();
bool checkIfDictVersionIsLatest();
// Low-level readers: each decodes a field at *pos and, unless noted otherwise, advances
// *pos past it.
int getAddress(int *pos);
int getBigramAddress(int *pos, bool advance);
int getFreq(int *pos);
int getBigramFreq(int *pos);
// Rebuilds the word whose terminal node is at `address` and stores it as a bigram.
void searchForTerminalNode(int address, int frequency);
// Flag tests on the byte at *pos; these do not advance *pos.
bool getFirstBitOfByte(int *pos) { return (mDict[*pos] & 0x80) > 0; }
bool getSecondBitOfByte(int *pos) { return (mDict[*pos] & 0x40) > 0; }
bool getTerminal(int *pos) { return (mDict[*pos] & FLAG_TERMINAL_MASK) > 0; }
// Returns the one-byte node count at *pos and advances *pos by one.
int getCount(int *pos) { return mDict[(*pos)++] & 0xFF; }
unsigned short getChar(int *pos);
// Length of a zero-terminated 16-bit string; 0 for a null pointer.
int wideStrLen(unsigned short *str);
bool sameAsTyped(unsigned short *word, int length);
bool checkFirstCharacter(unsigned short *word);
// Ranked insertion into the unigram / bigram output buffers.
bool addWord(unsigned short *word, int length, int frequency);
bool addWordBigram(unsigned short *word, int length, int frequency);
unsigned short toLowerCase(unsigned short c);
// Recursive dictionary walk that produces suggestion candidates.
void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency,
int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
int isValidWordRec(int pos, unsigned short *word, int offset, int length);
// Raw dictionary bytes.
unsigned char *mDict;
Dictionary *mParentDictionary;
// Buffer geometry: number of output slots, units per slot, and codes per input position.
// Members are initialised in declaration order, so keep this order in sync with the
// constructor's initialiser list.
const int MAX_WORDS;
const int MAX_WORD_LENGTH;
const int MAX_ALTERNATIVES;
// Caller-owned output buffers for the current unigram / bigram request.
int *mFrequencies;
int *mBigramFreq;
int mMaxBigrams;
unsigned short *mOutputChars;
unsigned short *mBigramChars;
// Caller-owned input codes, MAX_ALTERNATIVES entries per input position.
int *mInputCodes;
int mInputLength;
// Scratch buffer holding the word being built during the recursive walk.
unsigned short mWord[128];
int mMaxEditDistance;
int mFullWordMultiplier;
int mTypedLetterMultiplier;
// First and second byte of the dictionary, as read by getVersionNumber().
int mVersion;
int mBigram;
};
// ----------------------------------------------------------------------------
}; // namespace latinime
#endif // LATINIME_UNIGRAM_DICTIONARY_H