Refactor parameters by naming convention
Change-Id: I8bda8075b33f656ecbec08320afcd864b620fe77
This commit is contained in:
parent
a50673330e
commit
e0e6737373
12 changed files with 127 additions and 120 deletions
|
@ -91,7 +91,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
|
|
||||||
private static native long openNative(String sourceDir, long dictOffset, long dictSize);
|
private static native long openNative(String sourceDir, long dictOffset, long dictSize);
|
||||||
private static native void closeNative(long dict);
|
private static native void closeNative(long dict);
|
||||||
private static native int getFrequencyNative(long dict, int[] word);
|
private static native int getProbabilityNative(long dict, int[] word);
|
||||||
private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2);
|
private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2);
|
||||||
private static native int getSuggestionsNative(long dict, long proximityInfo,
|
private static native int getSuggestionsNative(long dict, long proximityInfo,
|
||||||
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
|
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
|
||||||
|
@ -186,7 +186,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
public int getFrequency(final String word) {
|
public int getFrequency(final String word) {
|
||||||
if (word == null) return -1;
|
if (word == null) return -1;
|
||||||
int[] codePoints = StringUtils.toCodePointArray(word);
|
int[] codePoints = StringUtils.toCodePointArray(word);
|
||||||
return getFrequencyNative(mNativeDict, codePoints);
|
return getProbabilityNative(mNativeDict, codePoints);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
|
// TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
|
||||||
|
|
|
@ -203,14 +203,14 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jclass clazz, jlong dict,
|
static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
jintArray wordArray) {
|
jintArray wordArray) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) return 0;
|
if (!dictionary) return 0;
|
||||||
const jsize codePointLength = env->GetArrayLength(wordArray);
|
const jsize codePointLength = env->GetArrayLength(wordArray);
|
||||||
int codePoints[codePointLength];
|
int codePoints[codePointLength];
|
||||||
env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints);
|
env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints);
|
||||||
return dictionary->getFrequency(codePoints, codePointLength);
|
return dictionary->getProbability(codePoints, codePointLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict,
|
static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -285,8 +285,8 @@ static JNINativeMethod sMethods[] = {
|
||||||
{"closeNative", "(J)V", reinterpret_cast<void *>(latinime_BinaryDictionary_close)},
|
{"closeNative", "(J)V", reinterpret_cast<void *>(latinime_BinaryDictionary_close)},
|
||||||
{"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I",
|
{"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I",
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)},
|
reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)},
|
||||||
{"getFrequencyNative", "(J[I)I",
|
{"getProbabilityNative", "(J[I)I",
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_getFrequency)},
|
reinterpret_cast<void *>(latinime_BinaryDictionary_getProbability)},
|
||||||
{"isValidBigramNative", "(J[I[I)Z",
|
{"isValidBigramNative", "(J[I[I)Z",
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_isValidBigram)},
|
reinterpret_cast<void *>(latinime_BinaryDictionary_isValidBigram)},
|
||||||
{"calcNormalizedScoreNative", "([I[II)F",
|
{"calcNormalizedScoreNative", "([I[II)F",
|
||||||
|
|
|
@ -36,21 +36,21 @@ BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT
|
||||||
BigramDictionary::~BigramDictionary() {
|
BigramDictionary::~BigramDictionary() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq,
|
void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
||||||
int *bigramCodePoints, int *outputTypes) const {
|
int *bigramCodePoints, int *outputTypes) const {
|
||||||
word[length] = 0;
|
word[length] = 0;
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
#ifdef FLAG_DBG
|
#ifdef FLAG_DBG
|
||||||
char s[length + 1];
|
char s[length + 1];
|
||||||
for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
|
for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
|
||||||
AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
|
AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the right insertion point
|
// Find the right insertion point
|
||||||
int insertAt = 0;
|
int insertAt = 0;
|
||||||
while (insertAt < MAX_RESULTS) {
|
while (insertAt < MAX_RESULTS) {
|
||||||
if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency
|
if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
|
||||||
&& length < getCodePointCount(MAX_WORD_LENGTH,
|
&& length < getCodePointCount(MAX_WORD_LENGTH,
|
||||||
bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
|
bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
|
||||||
break;
|
break;
|
||||||
|
@ -63,10 +63,10 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
|
||||||
if (insertAt >= MAX_RESULTS) {
|
if (insertAt >= MAX_RESULTS) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
memmove(bigramFreq + (insertAt + 1),
|
memmove(bigramProbability + (insertAt + 1),
|
||||||
bigramFreq + insertAt,
|
bigramProbability + insertAt,
|
||||||
(MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0]));
|
(MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
|
||||||
bigramFreq[insertAt] = frequency;
|
bigramProbability[insertAt] = probability;
|
||||||
outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
|
outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
|
||||||
memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
|
memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
|
||||||
bigramCodePoints + insertAt * MAX_WORD_LENGTH,
|
bigramCodePoints + insertAt * MAX_WORD_LENGTH,
|
||||||
|
@ -87,7 +87,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
|
||||||
* inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
|
* inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
|
||||||
* inputSize: the size of the codes array.
|
* inputSize: the size of the codes array.
|
||||||
* bigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
|
* bigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
|
||||||
* bigramFreq: an array to output frequencies.
|
* bigramProbability: an array to output probabilities.
|
||||||
* outputTypes: an array to output types.
|
* outputTypes: an array to output types.
|
||||||
* This method returns the number of bigrams this word has, for backward compatibility.
|
* This method returns the number of bigrams this word has, for backward compatibility.
|
||||||
* Note: this is not the number of bigrams output in the array, which is the number of
|
* Note: this is not the number of bigrams output in the array, which is the number of
|
||||||
|
@ -98,7 +98,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
|
||||||
* reduce their scope to the ones that match the first letter.
|
* reduce their scope to the ones that match the first letter.
|
||||||
*/
|
*/
|
||||||
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
||||||
int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const {
|
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
|
||||||
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
||||||
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
||||||
|
|
||||||
|
@ -118,23 +118,24 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
||||||
do {
|
do {
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
int bigramBuffer[MAX_WORD_LENGTH];
|
int bigramBuffer[MAX_WORD_LENGTH];
|
||||||
int unigramFreq = 0;
|
int unigramProbability = 0;
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||||
&pos);
|
&pos);
|
||||||
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
|
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
|
||||||
bigramBuffer, &unigramFreq);
|
bigramBuffer, &unigramProbability);
|
||||||
|
|
||||||
// inputSize == 0 means we are trying to find bigram predictions.
|
// inputSize == 0 means we are trying to find bigram predictions.
|
||||||
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
||||||
const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int bigramProbabilityTemp =
|
||||||
// Due to space constraints, the frequency for bigrams is approximate - the lower the
|
BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
||||||
// unigram frequency, the worse the precision. The theoritical maximum error in
|
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
||||||
// resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
|
// unigram probability, the worse the precision. The theoretical maximum error in
|
||||||
|
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
|
||||||
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
|
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
|
||||||
// here, but it can't get too bad.
|
// here, but it can't get too bad.
|
||||||
const int frequency =
|
const int probability = BinaryFormat::computeProbabilityForBigram(
|
||||||
BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp);
|
unigramProbability, bigramProbabilityTemp);
|
||||||
addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints,
|
addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints,
|
||||||
outputTypes);
|
outputTypes);
|
||||||
++bigramCount;
|
++bigramCount;
|
||||||
}
|
}
|
||||||
|
@ -159,13 +160,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
|
||||||
} else {
|
} else {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
pos = BinaryFormat::skipChildrenPosition(flags, pos);
|
pos = BinaryFormat::skipChildrenPosition(flags, pos);
|
||||||
pos = BinaryFormat::skipShortcuts(root, flags, pos);
|
pos = BinaryFormat::skipShortcuts(root, flags, pos);
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord,
|
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
|
||||||
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
|
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
|
||||||
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = DICT_ROOT;
|
||||||
|
@ -181,10 +182,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW
|
||||||
uint8_t bigramFlags;
|
uint8_t bigramFlags;
|
||||||
do {
|
do {
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||||
&pos);
|
&pos);
|
||||||
(*map)[bigramPos] = frequency;
|
(*map)[bigramPos] = probability;
|
||||||
setInFilter(filter, bigramPos);
|
setInFilter(filter, bigramPos);
|
||||||
} while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
} while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,14 +29,14 @@ class BigramDictionary {
|
||||||
BigramDictionary(const uint8_t *const streamStart);
|
BigramDictionary(const uint8_t *const streamStart);
|
||||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
||||||
int *frequencies, int *outputTypes) const;
|
int *frequencies, int *outputTypes) const;
|
||||||
void fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength,
|
void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
|
||||||
std::map<int, int> *map, uint8_t *filter) const;
|
std::map<int, int> *map, uint8_t *filter) const;
|
||||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||||
~BigramDictionary();
|
~BigramDictionary();
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
|
||||||
void addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints,
|
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
||||||
int *outputTypes) const;
|
int *bigramCodePoints, int *outputTypes) const;
|
||||||
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
|
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
|
||||||
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
||||||
const bool forceLowerCaseSearch) const;
|
const bool forceLowerCaseSearch) const;
|
||||||
|
|
|
@ -52,10 +52,10 @@ class BinaryFormat {
|
||||||
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
||||||
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||||
|
|
||||||
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
// Mask for attribute probability, stored on 4 bits inside the flags byte.
|
||||||
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F;
|
||||||
// The numeric value of the shortcut frequency that means 'whitelist'.
|
// The numeric value of the shortcut probability that means 'whitelist'.
|
||||||
static const int WHITELIST_SHORTCUT_FREQUENCY = 15;
|
static const int WHITELIST_SHORTCUT_PROBABILITY = 15;
|
||||||
|
|
||||||
// Mask and flags for attribute address type selection.
|
// Mask and flags for attribute address type selection.
|
||||||
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||||
|
@ -72,10 +72,10 @@ class BinaryFormat {
|
||||||
static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
|
static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
|
static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
|
static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
|
static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos);
|
||||||
static int skipOtherCharacters(const uint8_t *const dict, const int pos);
|
static int skipOtherCharacters(const uint8_t *const dict, const int pos);
|
||||||
static int skipChildrenPosition(const uint8_t flags, const int pos);
|
static int skipChildrenPosition(const uint8_t flags, const int pos);
|
||||||
static int skipFrequency(const uint8_t flags, const int pos);
|
static int skipProbability(const uint8_t flags, const int pos);
|
||||||
static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
|
static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
|
static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos);
|
const int pos);
|
||||||
|
@ -83,14 +83,15 @@ class BinaryFormat {
|
||||||
static bool hasChildrenInFlags(const uint8_t flags);
|
static bool hasChildrenInFlags(const uint8_t flags);
|
||||||
static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
|
static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
|
||||||
int *pos);
|
int *pos);
|
||||||
static int getAttributeFrequencyFromFlags(const int flags);
|
static int getAttributeProbabilityFromFlags(const int flags);
|
||||||
static int getTerminalPosition(const uint8_t *const root, const int *const inWord,
|
static int getTerminalPosition(const uint8_t *const root, const int *const inWord,
|
||||||
const int length, const bool forceLowerCaseSearch);
|
const int length, const bool forceLowerCaseSearch);
|
||||||
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
|
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
|
||||||
int *outWord, int *outUnigramFrequency);
|
int *outWord, int *outUnigramProbability);
|
||||||
static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
|
static int computeProbabilityForBigram(
|
||||||
|
const int unigramProbability, const int bigramProbability);
|
||||||
static int getProbability(const int position, const std::map<int, int> *bigramMap,
|
static int getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq);
|
const uint8_t *bigramFilter, const int unigramProbability);
|
||||||
|
|
||||||
// Flags for special processing
|
// Flags for special processing
|
||||||
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
|
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
|
||||||
|
@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict,
|
inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
return dict[pos];
|
return dict[pos];
|
||||||
}
|
}
|
||||||
|
@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
|
||||||
return pos + childrenAddressSize(flags);
|
return pos + childrenAddressSize(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) {
|
||||||
return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
|
inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) {
|
||||||
return flags & MASK_ATTRIBUTE_FREQUENCY;
|
return flags & MASK_ATTRIBUTE_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function gets the byte position of the last chargroup of the exact matching word in the
|
// This function gets the byte position of the last chargroup of the exact matching word in the
|
||||||
|
@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
if (wordPos == length) {
|
if (wordPos == length) {
|
||||||
return charGroupPos;
|
return charGroupPos;
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
|
pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos);
|
||||||
}
|
}
|
||||||
if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
|
if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
|
||||||
return NOT_VALID_WORD;
|
return NOT_VALID_WORD;
|
||||||
|
@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
||||||
}
|
}
|
||||||
--charGroupCount;
|
--charGroupCount;
|
||||||
|
@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
* address: the byte position of the last chargroup of the word we are searching for (this is
|
* address: the byte position of the last chargroup of the word we are searching for (this is
|
||||||
* what is stored as the "bigram address" in each bigram)
|
* what is stored as the "bigram address" in each bigram)
|
||||||
* outword: an array to write the found word, with MAX_WORD_LENGTH size.
|
* outword: an array to write the found word, with MAX_WORD_LENGTH size.
|
||||||
* outUnigramFrequency: a pointer to an int to write the frequency into.
|
* outUnigramProbability: a pointer to an int to write the probability into.
|
||||||
* Return value : the length of the word, of 0 if the word was not found.
|
* Return value : the length of the word, or 0 if the word was not found.
|
||||||
*/
|
*/
|
||||||
AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
|
AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
|
||||||
const int maxDepth, int *outWord, int *outUnigramFrequency) {
|
const int maxDepth, int *outWord, int *outUnigramProbability) {
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
int wordPos = 0;
|
int wordPos = 0;
|
||||||
|
|
||||||
|
@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
|
||||||
nextChar = getCodePointAndForwardPointer(root, &pos);
|
nextChar = getCodePointAndForwardPointer(root, &pos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
|
*outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos);
|
||||||
return ++wordPos;
|
return ++wordPos;
|
||||||
}
|
}
|
||||||
// We need to skip past this char group, so skip any remaining chars after the
|
// We need to skip past this char group, so skip any remaining chars after the
|
||||||
// first and possibly the frequency.
|
// first and possibly the probability.
|
||||||
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = skipOtherCharacters(root, pos);
|
pos = skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = skipFrequency(flags, pos);
|
pos = skipProbability(flags, pos);
|
||||||
|
|
||||||
// The fact that this group has children is very important. Since we already know
|
// The fact that this group has children is very important. Since we already know
|
||||||
// that this group does not match, if it has no children we know it is irrelevant
|
// that this group does not match, if it has no children we know it is irrelevant
|
||||||
|
@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
++wordPos;
|
++wordPos;
|
||||||
// Now we only need to branch to the children address. Skip the frequency if
|
// Now we only need to branch to the children address. Skip the probability if
|
||||||
// it's there, read pos, and break to resume the search at pos.
|
// it's there, read pos, and break to resume the search at pos.
|
||||||
lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
|
lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos);
|
||||||
pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
|
pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
|
@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int backoff(const int unigramFreq) {
|
static inline int backoff(const int unigramProbability) {
|
||||||
return unigramFreq;
|
return unigramProbability;
|
||||||
// For some reason, applying the backoff weight gives bad results in tests. To apply the
|
// For some reason, applying the backoff weight gives bad results in tests. To apply the
|
||||||
// backoff weight, we divide the probability by 2, which in our storing format means
|
// backoff weight, we divide the probability by 2, which in our storing format means
|
||||||
// decreasing the score by 8.
|
// decreasing the score by 8.
|
||||||
// TODO: figure out what's wrong with this.
|
// TODO: figure out what's wrong with this.
|
||||||
// return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
|
// return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
|
inline int BinaryFormat::computeProbabilityForBigram(
|
||||||
// We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
|
const int unigramProbability, const int bigramProbability) {
|
||||||
// unigram frequency to be the median value of the 17th step from the top. A value of
|
// We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the
|
||||||
// 0 for the bigram frequency represents the middle of the 16th step from the top,
|
// unigram probability to be the median value of the 17th step from the top. A value of
|
||||||
|
// 0 for the bigram probability represents the middle of the 16th step from the top,
|
||||||
// while a value of 15 represents the middle of the top step.
|
// while a value of 15 represents the middle of the top step.
|
||||||
// See makedict.BinaryDictInputOutput for details.
|
// See makedict.BinaryDictInputOutput for details.
|
||||||
const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
|
const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability)
|
||||||
return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize);
|
/ (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY);
|
||||||
|
return unigramProbability
|
||||||
|
+ static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This returns a probability in log space.
|
// This returns a probability in log space.
|
||||||
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq) {
|
const uint8_t *bigramFilter, const int unigramProbability) {
|
||||||
if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
|
if (!bigramMap || !bigramFilter) return backoff(unigramProbability);
|
||||||
if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
|
if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability);
|
||||||
const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
|
const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
|
||||||
if (bigramFreqIt != bigramMap->end()) {
|
if (bigramProbabilityIt != bigramMap->end()) {
|
||||||
const int bigramFreq = bigramFreqIt->second;
|
const int bigramProbability = bigramProbabilityIt->second;
|
||||||
return computeFrequencyForBigram(unigramFreq, bigramFreq);
|
return computeProbabilityForBigram(unigramProbability, bigramProbability);
|
||||||
}
|
}
|
||||||
return backoff(unigramFreq);
|
return backoff(unigramProbability);
|
||||||
}
|
}
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_BINARY_FORMAT_H
|
#endif // LATINIME_BINARY_FORMAT_H
|
||||||
|
|
|
@ -841,7 +841,7 @@ inline static bool isUpperCase(unsigned short c) {
|
||||||
const int freq = freqArray[i];
|
const int freq = freqArray[i];
|
||||||
// Demote too short weak words
|
// Demote too short weak words
|
||||||
if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) {
|
if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) {
|
||||||
multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
|
multiplyRate(100 * freq / MAX_PROBABILITY, &totalFreq);
|
||||||
}
|
}
|
||||||
if (wordLength == 1) {
|
if (wordLength == 1) {
|
||||||
++oneLengthCounter;
|
++oneLengthCounter;
|
||||||
|
|
|
@ -72,11 +72,11 @@ AK_FORCE_INLINE static int intArrayToCharArray(const int *source, const int sour
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void dumpWordInfo(const int *word, const int length, const int rank,
|
static inline void dumpWordInfo(const int *word, const int length, const int rank,
|
||||||
const int frequency) {
|
const int probability) {
|
||||||
static char charBuf[50];
|
static char charBuf[50];
|
||||||
const int N = intArrayToCharArray(word, length, charBuf);
|
const int N = intArrayToCharArray(word, length, charBuf);
|
||||||
if (N > 1) {
|
if (N > 1) {
|
||||||
AKLOGI("%2d [ %s ] (%d)", rank, charBuf, frequency);
|
AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -312,8 +312,8 @@ static inline void prof_out(void) {
|
||||||
#define ZERO_DISTANCE_PROMOTION_RATE 110.0f
|
#define ZERO_DISTANCE_PROMOTION_RATE 110.0f
|
||||||
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
|
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
|
||||||
#define HALF_SCORE_SQUARED_RADIUS 32.0f
|
#define HALF_SCORE_SQUARED_RADIUS 32.0f
|
||||||
#define MAX_FREQ 255
|
#define MAX_PROBABILITY 255
|
||||||
#define MAX_BIGRAM_FREQ 15
|
#define MAX_BIGRAM_ENCODED_PROBABILITY 15
|
||||||
|
|
||||||
// Assuming locale strings such as en_US, sr-Latn etc.
|
// Assuming locale strings such as en_US, sr-Latn etc.
|
||||||
#define MAX_LOCALE_STRING_LENGTH 10
|
#define MAX_LOCALE_STRING_LENGTH 10
|
||||||
|
@ -335,8 +335,8 @@ static inline void prof_out(void) {
|
||||||
|
|
||||||
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35f
|
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35f
|
||||||
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185f
|
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185f
|
||||||
/* heuristic... This should be changed if we change the unit of the frequency. */
|
/* heuristic... This should be changed if we change the unit of the probability. */
|
||||||
#define SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ (MAX_FREQ * 58 / 100)
|
#define SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ (MAX_PROBABILITY * 58 / 100)
|
||||||
|
|
||||||
#define MAX_DEPTH_MULTIPLIER 3
|
#define MAX_DEPTH_MULTIPLIER 3
|
||||||
#define FIRST_WORD_INDEX 0
|
#define FIRST_WORD_INDEX 0
|
||||||
|
|
|
@ -62,7 +62,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, void *traverseSessi
|
||||||
} else {
|
} else {
|
||||||
std::map<int, int> bigramMap;
|
std::map<int, int> bigramMap;
|
||||||
uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
|
uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
|
||||||
mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordCodePoints,
|
mBigramDictionary->fillBigramAddressToProbabilityMapAndFilter(prevWordCodePoints,
|
||||||
prevWordLength, &bigramMap, bigramFilter);
|
prevWordLength, &bigramMap, bigramFilter);
|
||||||
result = mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates,
|
result = mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates,
|
||||||
inputCodePoints, inputSize, &bigramMap, bigramFilter, useFullEditDistance, outWords,
|
inputCodePoints, inputSize, &bigramMap, bigramFilter, useFullEditDistance, outWords,
|
||||||
|
@ -78,8 +78,8 @@ int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, in
|
||||||
frequencies, outputTypes);
|
frequencies, outputTypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Dictionary::getFrequency(const int *word, int length) const {
|
int Dictionary::getProbability(const int *word, int length) const {
|
||||||
return mUnigramDictionary->getFrequency(word, length);
|
return mUnigramDictionary->getProbability(word, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const {
|
bool Dictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const {
|
||||||
|
|
|
@ -52,7 +52,7 @@ class Dictionary {
|
||||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
||||||
int *frequencies, int *outputTypes) const;
|
int *frequencies, int *outputTypes) const;
|
||||||
|
|
||||||
int getFrequency(const int *word, int length) const;
|
int getProbability(const int *word, int length) const;
|
||||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||||
const uint8_t *getDict() const { // required to release dictionary buffer
|
const uint8_t *getDict() const { // required to release dictionary buffer
|
||||||
return mDict;
|
return mDict;
|
||||||
|
|
|
@ -51,7 +51,7 @@ class TerminalAttributes {
|
||||||
if (NOT_A_CODE_POINT == codePoint) break;
|
if (NOT_A_CODE_POINT == codePoint) break;
|
||||||
outWord[i] = codePoint;
|
outWord[i] = codePoint;
|
||||||
}
|
}
|
||||||
*outFreq = BinaryFormat::getAttributeFrequencyFromFlags(shortcutFlags);
|
*outFreq = BinaryFormat::getAttributeProbabilityFromFlags(shortcutFlags);
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -52,8 +52,8 @@ UnigramDictionary::~UnigramDictionary() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: This needs to take a const int* and not tinker with its contents
|
// TODO: This needs to take a const int* and not tinker with its contents
|
||||||
static void addWord(int *word, int length, int frequency, WordsPriorityQueue *queue, int type) {
|
static void addWord(int *word, int length, int probability, WordsPriorityQueue *queue, int type) {
|
||||||
queue->push(frequency, word, length, type);
|
queue->push(probability, word, length, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the replacement code point for a digraph, or 0 if none.
|
// Return the replacement code point for a digraph, or 0 if none.
|
||||||
|
@ -158,7 +158,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
|
||||||
queuePool);
|
queuePool);
|
||||||
}
|
}
|
||||||
|
|
||||||
// bigramMap contains the association <bigram address> -> <bigram frequency>
|
// bigramMap contains the association <bigram address> -> <bigram probability>
|
||||||
// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
|
// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
|
||||||
// in bigram_dictionary.cpp
|
// in bigram_dictionary.cpp
|
||||||
int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
|
@ -399,7 +399,7 @@ void UnigramDictionary::onTerminal(const int probability,
|
||||||
MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency);
|
MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency);
|
||||||
int shortcutScore;
|
int shortcutScore;
|
||||||
int kind;
|
int kind;
|
||||||
if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_FREQUENCY
|
if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_PROBABILITY
|
||||||
&& correction->sameAsTyped()) {
|
&& correction->sameAsTyped()) {
|
||||||
shortcutScore = S_INT_MAX;
|
shortcutScore = S_INT_MAX;
|
||||||
kind = Dictionary::KIND_WHITELIST;
|
kind = Dictionary::KIND_WHITELIST;
|
||||||
|
@ -483,7 +483,7 @@ int UnigramDictionary::getSubStringSuggestion(
|
||||||
inputSize, correction);
|
inputSize, correction);
|
||||||
|
|
||||||
int word[MAX_WORD_LENGTH];
|
int word[MAX_WORD_LENGTH];
|
||||||
int freq = getMostFrequentWordLike(
|
int freq = getMostProbableWordLike(
|
||||||
inputWordStartPos, inputWordLength, correction, word);
|
inputWordStartPos, inputWordLength, correction, word);
|
||||||
if (freq > 0) {
|
if (freq > 0) {
|
||||||
nextWordLength = inputWordLength;
|
nextWordLength = inputWordLength;
|
||||||
|
@ -679,15 +679,15 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit
|
||||||
outputWord);
|
outputWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
|
// Wrapper for getMostProbableWordLikeInner, which matches it to the previous
|
||||||
// interface.
|
// interface.
|
||||||
int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, const int inputSize,
|
int UnigramDictionary::getMostProbableWordLike(const int startInputIndex, const int inputSize,
|
||||||
Correction *correction, int *word) const {
|
Correction *correction, int *word) const {
|
||||||
int inWord[inputSize];
|
int inWord[inputSize];
|
||||||
for (int i = 0; i < inputSize; ++i) {
|
for (int i = 0; i < inputSize; ++i) {
|
||||||
inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i);
|
inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i);
|
||||||
}
|
}
|
||||||
return getMostFrequentWordLikeInner(inWord, inputSize, word);
|
return getMostProbableWordLikeInner(inWord, inputSize, word);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function will take the position of a character array within a CharGroup,
|
// This function will take the position of a character array within a CharGroup,
|
||||||
|
@ -738,9 +738,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function is invoked when a word like the word searched for is found.
|
// This function is invoked when a word like the word searched for is found.
|
||||||
// It will compare the frequency to the max frequency, and if greater, will
|
// It will compare the probability to the max probability, and if greater, will
|
||||||
// copy the word into the output buffer. In output value maxFreq, it will
|
// copy the word into the output buffer. In output value maxFreq, it will
|
||||||
// write the new maximum frequency if it changed.
|
// write the new maximum probability if it changed.
|
||||||
static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord,
|
static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord,
|
||||||
int *maxFreq) {
|
int *maxFreq) {
|
||||||
if (freq > *maxFreq) {
|
if (freq > *maxFreq) {
|
||||||
|
@ -752,9 +752,9 @@ static inline void onTerminalWordLike(const int freq, int *newWord, const int le
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Will find the highest frequency of the words like the one passed as an argument,
|
// Will find the highest probability of the words like the one passed as an argument,
|
||||||
// that is, everything that only differs by case/accents.
|
// that is, everything that only differs by case/accents.
|
||||||
int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, const int inputSize,
|
int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, const int inputSize,
|
||||||
int *outWord) const {
|
int *outWord) const {
|
||||||
int newWord[MAX_WORD_LENGTH];
|
int newWord[MAX_WORD_LENGTH];
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
|
@ -775,17 +775,18 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, con
|
||||||
int inputIndex = stackInputIndex[depth];
|
int inputIndex = stackInputIndex[depth];
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
// Test whether all chars in this group match with the word we are searching for. If so,
|
// Test whether all chars in this group match with the word we are searching for. If so,
|
||||||
// we want to traverse its children (or if the inputSize match, evaluate its frequency).
|
// we want to traverse its children (or if the inputSize match, evaluate its
|
||||||
// Note that this function will output the position regardless, but will only write
|
// probability). Note that this function will output the position regardless, but will
|
||||||
// into inputIndex if there is a match.
|
// only write into inputIndex if there is a match.
|
||||||
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
||||||
inputIndex, inputSize, newWord, &inputIndex, &pos);
|
inputIndex, inputSize, newWord, &inputIndex, &pos);
|
||||||
if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags))
|
if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags))
|
||||||
&& (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) {
|
&& (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) {
|
||||||
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
const int probability =
|
||||||
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
|
BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
|
||||||
|
onTerminalWordLike(probability, newWord, inputIndex, outWord, &maxFreq);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
||||||
const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
|
const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
|
||||||
// If we had a match and the word has children, we want to traverse them. We don't have
|
// If we had a match and the word has children, we want to traverse them. We don't have
|
||||||
|
@ -816,7 +817,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, con
|
||||||
return maxFreq;
|
return maxFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnigramDictionary::getFrequency(const int *const inWord, const int length) const {
|
int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = DICT_ROOT;
|
||||||
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
|
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
|
@ -826,7 +827,7 @@ int UnigramDictionary::getFrequency(const int *const inWord, const int length) c
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
|
if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
|
||||||
// If this is not a word, or if it's a blacklisted entry, it should behave as
|
// If this is not a word, or if it's a blacklisted entry, it should behave as
|
||||||
// having no frequency outside of the suggestion process (where it should be used
|
// having no probability outside of the suggestion process (where it should be used
|
||||||
// for shortcuts).
|
// for shortcuts).
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
@ -836,8 +837,8 @@ int UnigramDictionary::getFrequency(const int *const inWord, const int length) c
|
||||||
} else {
|
} else {
|
||||||
BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
||||||
}
|
}
|
||||||
const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
|
||||||
return unigramFreq;
|
return unigramProbability;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: remove this function.
|
// TODO: remove this function.
|
||||||
|
@ -884,7 +885,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
|
|
||||||
// This gets only ONE character from the stream. Next there will be:
|
// This gets only ONE character from the stream. Next there will be:
|
||||||
// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
|
// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
|
||||||
// else if FLAG_IS_TERMINAL: the frequency
|
// else if FLAG_IS_TERMINAL: the probability
|
||||||
// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
|
// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
|
||||||
// Note that you can't have a node that both is not a terminal and has no children.
|
// Note that you can't have a node that both is not a terminal and has no children.
|
||||||
int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
||||||
|
@ -917,14 +918,14 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// We found that this is an unrelated character, so we should give up traversing
|
// We found that this is an unrelated character, so we should give up traversing
|
||||||
// this node and its children entirely.
|
// this node and its children entirely.
|
||||||
// However we may not be on the last virtual node yet so we skip the remaining
|
// However we may not be on the last virtual node yet so we skip the remaining
|
||||||
// characters in this node, the frequency if it's there, read the next sibling
|
// characters in this node, the probability if it's there, read the next sibling
|
||||||
// position to output it, then return false.
|
// position to output it, then return false.
|
||||||
// We don't have to output other values because we return false, as in
|
// We don't have to output other values because we return false, as in
|
||||||
// "don't traverse children".
|
// "don't traverse children".
|
||||||
if (!isLastChar) {
|
if (!isLastChar) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
|
pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition =
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
||||||
return false;
|
return false;
|
||||||
|
@ -937,16 +938,17 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
} while (NOT_A_CODE_POINT != c);
|
} while (NOT_A_CODE_POINT != c);
|
||||||
|
|
||||||
if (isTerminalNode) {
|
if (isTerminalNode) {
|
||||||
// The frequency should be here, because we come here only if this is actually
|
// The probability should be here, because we come here only if this is actually
|
||||||
// a terminal node, and we are on its last char.
|
// a terminal node, and we are on its last char.
|
||||||
const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
|
const int unigramProbability =
|
||||||
const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
|
BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos);
|
||||||
|
const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
|
||||||
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
||||||
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
||||||
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
||||||
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
||||||
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
|
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
|
||||||
unigramFreq);
|
unigramProbability);
|
||||||
onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
|
onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
|
||||||
currentWordIndex);
|
currentWordIndex);
|
||||||
|
|
||||||
|
@ -961,7 +963,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// Note that !hasChildren implies isLastChar, so we know we don't have to skip any
|
// Note that !hasChildren implies isLastChar, so we know we don't have to skip any
|
||||||
// remaining char in this group for there can't be any.
|
// remaining char in this group for there can't be any.
|
||||||
if (!hasChildren) {
|
if (!hasChildren) {
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition =
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
||||||
return false;
|
return false;
|
||||||
|
@ -969,7 +971,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
|
|
||||||
// Optimization: Prune out words that are too long compared to how much was typed.
|
// Optimization: Prune out words that are too long compared to how much was typed.
|
||||||
if (correction->needsToPrune()) {
|
if (correction->needsToPrune()) {
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition =
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
||||||
if (DEBUG_DICT_FULL) {
|
if (DEBUG_DICT_FULL) {
|
||||||
|
@ -983,13 +985,13 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// children, we can't come here.
|
// children, we can't come here.
|
||||||
ASSERT(BinaryFormat::hasChildrenInFlags(flags));
|
ASSERT(BinaryFormat::hasChildrenInFlags(flags));
|
||||||
|
|
||||||
// If this node was a terminal it still has the frequency under the pointer (it may have been
|
// If this node was a terminal it still has the probability under the pointer (it may have been
|
||||||
// read, but not skipped - see readFrequencyWithoutMovingPointer).
|
// read, but not skipped - see readProbabilityWithoutMovingPointer).
|
||||||
// Next come the children position, then possibly attributes (attributes are bigrams only for
|
// Next come the children position, then possibly attributes (attributes are bigrams only for
|
||||||
// now, maybe something related to shortcuts in the future).
|
// now, maybe something related to shortcuts in the future).
|
||||||
// Once this is read, we still need to output the number of nodes in the immediate children of
|
// Once this is read, we still need to output the number of nodes in the immediate children of
|
||||||
// this node, so we read and output it before returning true, as in "please traverse children".
|
// this node, so we read and output it before returning true, as in "please traverse children".
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
|
int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
|
||||||
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
||||||
*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
|
*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
|
||||||
|
|
|
@ -40,7 +40,7 @@ class UnigramDictionary {
|
||||||
static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
|
static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
|
||||||
static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
|
static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
|
||||||
UnigramDictionary(const uint8_t *const streamStart, const unsigned int flags);
|
UnigramDictionary(const uint8_t *const streamStart, const unsigned int flags);
|
||||||
int getFrequency(const int *const inWord, const int length) const;
|
int getProbability(const int *const inWord, const int length) const;
|
||||||
int getBigramPosition(int pos, int *word, int offset, int length) const;
|
int getBigramPosition(int pos, int *word, int offset, int length) const;
|
||||||
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *inputCodePoints, const int inputSize,
|
const int *ycoordinates, const int *inputCodePoints, const int inputSize,
|
||||||
|
@ -89,9 +89,9 @@ class UnigramDictionary {
|
||||||
const uint8_t *bigramFilter, Correction *correction, int *newCount,
|
const uint8_t *bigramFilter, Correction *correction, int *newCount,
|
||||||
int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
|
int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
|
||||||
const int currentWordIndex) const;
|
const int currentWordIndex) const;
|
||||||
int getMostFrequentWordLike(const int startInputIndex, const int inputSize,
|
int getMostProbableWordLike(const int startInputIndex, const int inputSize,
|
||||||
Correction *correction, int *word) const;
|
Correction *correction, int *word) const;
|
||||||
int getMostFrequentWordLikeInner(const int *const inWord, const int inputSize,
|
int getMostProbableWordLikeInner(const int *const inWord, const int inputSize,
|
||||||
int *outWord) const;
|
int *outWord) const;
|
||||||
int getSubStringSuggestion(ProximityInfo *proximityInfo, const int *xcoordinates,
|
int getSubStringSuggestion(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||||
|
|
Loading…
Reference in a new issue