diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index ab2a12fd0..d369e2b47 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -91,7 +91,7 @@ public final class BinaryDictionary extends Dictionary { private static native long openNative(String sourceDir, long dictOffset, long dictSize); private static native void closeNative(long dict); - private static native int getFrequencyNative(long dict, int[] word); + private static native int getProbabilityNative(long dict, int[] word); private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2); private static native int getSuggestionsNative(long dict, long proximityInfo, long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, @@ -186,7 +186,7 @@ public final class BinaryDictionary extends Dictionary { public int getFrequency(final String word) { if (word == null) return -1; int[] codePoints = StringUtils.toCodePointArray(word); - return getFrequencyNative(mNativeDict, codePoints); + return getProbabilityNative(mNativeDict, codePoints); } // TODO: Add a batch process version (isValidBigramMultiple?) 
to avoid excessive numbers of jni diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index ca38b0de5..9321c4b8c 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -203,14 +203,14 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j return count; } -static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jclass clazz, jlong dict, +static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict, jintArray wordArray) { Dictionary *dictionary = reinterpret_cast(dict); if (!dictionary) return 0; const jsize codePointLength = env->GetArrayLength(wordArray); int codePoints[codePointLength]; env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints); - return dictionary->getFrequency(codePoints, codePointLength); + return dictionary->getProbability(codePoints, codePointLength); } static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict, @@ -285,8 +285,8 @@ static JNINativeMethod sMethods[] = { {"closeNative", "(J)V", reinterpret_cast(latinime_BinaryDictionary_close)}, {"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I", reinterpret_cast(latinime_BinaryDictionary_getSuggestions)}, - {"getFrequencyNative", "(J[I)I", - reinterpret_cast(latinime_BinaryDictionary_getFrequency)}, + {"getProbabilityNative", "(J[I)I", + reinterpret_cast(latinime_BinaryDictionary_getProbability)}, {"isValidBigramNative", "(J[I[I)Z", reinterpret_cast(latinime_BinaryDictionary_isValidBigram)}, {"calcNormalizedScoreNative", "([I[II)F", diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index ef0434c49..43e59a262 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -36,21 +36,21 @@ BigramDictionary::BigramDictionary(const uint8_t 
*const streamStart) : DICT_ROOT BigramDictionary::~BigramDictionary() { } -void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq, +void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = static_cast(word[i]); - AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); + AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); #endif } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_RESULTS) { - if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency + if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability && length < getCodePointCount(MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { break; @@ -63,10 +63,10 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * if (insertAt >= MAX_RESULTS) { return; } - memmove(bigramFreq + (insertAt + 1), - bigramFreq + insertAt, - (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0])); - bigramFreq[insertAt] = frequency; + memmove(bigramProbability + (insertAt + 1), + bigramProbability + insertAt, + (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); + bigramProbability[insertAt] = probability; outputTypes[insertAt] = Dictionary::KIND_PREDICTION; memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH, @@ -87,7 +87,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. * inputSize: the size of the codes array. * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. - * bigramFreq: an array to output frequencies. 
+ * bigramProbability: an array to output probabilities. * outputTypes: an array to output types. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of @@ -98,7 +98,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * reduce their scope to the ones that match the first letter. */ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, - int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const { + int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name @@ -118,23 +118,24 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); int bigramBuffer[MAX_WORD_LENGTH]; - int unigramFreq = 0; + int unigramProbability = 0; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer, &unigramFreq); + bigramBuffer, &unigramProbability); // inputSize == 0 means we are trying to find bigram predictions. if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { - const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; - // Due to space constraints, the frequency for bigrams is approximate - the lower the - // unigram frequency, the worse the precision. 
The theoritical maximum error in - // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 + const int bigramProbabilityTemp = + BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; + // Due to space constraints, the probability for bigrams is approximate - the lower the + // unigram probability, the worse the precision. The theoretical maximum error in + // resulting probability is 8 - although in practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. - const int frequency = - BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); - addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints, + const int probability = BinaryFormat::computeProbabilityForBigram( + unigramProbability, bigramProbabilityTemp); + addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); ++bigramCount; } @@ -159,13 +160,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } -void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, +void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t *const root = DICT_ROOT; @@ -181,10 +182,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW uint8_t bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const 
int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); - (*map)[bigramPos] = frequency; + (*map)[bigramPos] = probability; setInFilter(filter, bigramPos); } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h index 2ce6c1d0d..b86e564c3 100644 --- a/native/jni/src/bigram_dictionary.h +++ b/native/jni/src/bigram_dictionary.h @@ -29,14 +29,14 @@ class BigramDictionary { BigramDictionary(const uint8_t *const streamStart); int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; - void fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength, + void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map *map, uint8_t *filter) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); - void addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints, - int *outputTypes) const; + void addWordBigram(int *word, int length, int probability, int *bigramProbability, + int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 2d7c4b492..1c4061fd8 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -52,10 +52,10 @@ class BinaryFormat { // Flag for sign of offset. If this flag is set, the offset value must be negated. 
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; - // Mask for attribute frequency, stored on 4 bits inside the flags byte. - static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; - // The numeric value of the shortcut frequency that means 'whitelist'. - static const int WHITELIST_SHORTCUT_FREQUENCY = 15; + // Mask for attribute probability, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F; + // The numeric value of the shortcut probability that means 'whitelist'. + static const int WHITELIST_SHORTCUT_PROBABILITY = 15; // Mask and flags for attribute address type selection. static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; @@ -72,10 +72,10 @@ class BinaryFormat { static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); - static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); + static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos); static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); - static int skipFrequency(const uint8_t flags, const int pos); + static int skipProbability(const uint8_t flags, const int pos); static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); @@ -83,14 +83,15 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); - static int getAttributeFrequencyFromFlags(const int flags); + static int getAttributeProbabilityFromFlags(const int flags); static int getTerminalPosition(const uint8_t *const root, const 
int *const inWord, const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, - int *outWord, int *outUnigramFrequency); - static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); + int *outWord, int *outUnigramProbability); + static int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map *bigramMap, - const uint8_t *bigramFilter, const int unigramFreq); + const uint8_t *bigramFilter, const int unigramProbability); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c } } -inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, +inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } @@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos return pos + childrenAddressSize(flags); } -inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { +inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) { return FLAG_IS_TERMINAL & flags ? 
pos + 1 : pos; } @@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin } } -inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { - return flags & MASK_ATTRIBUTE_FREQUENCY; +inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; } // This function gets the byte position of the last chargroup of the exact matching word in the @@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (wordPos == length) { return charGroupPos; } - pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); + pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); } if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; @@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); } --charGroupCount; @@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. - * outUnigramFrequency: a pointer to an int to write the frequency into. + * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the length of the word, of 0 if the word was not found. 
*/ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, - const int maxDepth, int *outWord, int *outUnigramFrequency) { + const int maxDepth, int *outWord, int *outUnigramProbability) { int pos = 0; int wordPos = 0; @@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co nextChar = getCodePointAndForwardPointer(root, &pos); } } - *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); + *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the - // first and possibly the frequency. + // first and possibly the probability. if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } - pos = skipFrequency(flags, pos); + pos = skipProbability(flags, pos); // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant @@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co } } ++wordPos; - // Now we only need to branch to the children address. Skip the frequency if + // Now we only need to branch to the children address. Skip the probability if // it's there, read pos, and break to resume the search at pos. - lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); + lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos); pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); break; } else { @@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } -static inline int backoff(const int unigramFreq) { - return unigramFreq; +static inline int backoff(const int unigramProbability) { + return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. 
To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. - // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); + // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } -inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { - // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the - // unigram frequency to be the median value of the 17th step from the top. A value of - // 0 for the bigram frequency represents the middle of the 16th step from the top, +inline int BinaryFormat::computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the + // unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. - const float stepSize = static_cast(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return unigramFreq + static_cast(static_cast(bigramFreq + 1) * stepSize); + const float stepSize = static_cast(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast(static_cast(bigramProbability + 1) * stepSize); } // This returns a probability in log space. 
inline int BinaryFormat::getProbability(const int position, const std::map *bigramMap, - const uint8_t *bigramFilter, const int unigramFreq) { - if (!bigramMap || !bigramFilter) return backoff(unigramFreq); - if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); - const std::map::const_iterator bigramFreqIt = bigramMap->find(position); - if (bigramFreqIt != bigramMap->end()) { - const int bigramFreq = bigramFreqIt->second; - return computeFrequencyForBigram(unigramFreq, bigramFreq); + const uint8_t *bigramFilter, const int unigramProbability) { + if (!bigramMap || !bigramFilter) return backoff(unigramProbability); + if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); + const std::map::const_iterator bigramProbabilityIt = bigramMap->find(position); + if (bigramProbabilityIt != bigramMap->end()) { + const int bigramProbability = bigramProbabilityIt->second; + return computeProbabilityForBigram(unigramProbability, bigramProbability); } - return backoff(unigramFreq); + return backoff(unigramProbability); } } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H diff --git a/native/jni/src/correction.cpp b/native/jni/src/correction.cpp index 0ae02d506..671507ee0 100644 --- a/native/jni/src/correction.cpp +++ b/native/jni/src/correction.cpp @@ -841,7 +841,7 @@ inline static bool isUpperCase(unsigned short c) { const int freq = freqArray[i]; // Demote too short weak words if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) { - multiplyRate(100 * freq / MAX_FREQ, &totalFreq); + multiplyRate(100 * freq / MAX_PROBABILITY, &totalFreq); } if (wordLength == 1) { ++oneLengthCounter; diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 0aedc287f..6e098157d 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -72,11 +72,11 @@ AK_FORCE_INLINE static int intArrayToCharArray(const int *source, const int sour } static inline void dumpWordInfo(const int *word, const int length, 
const int rank, - const int frequency) { + const int probability) { static char charBuf[50]; const int N = intArrayToCharArray(word, length, charBuf); if (N > 1) { - AKLOGI("%2d [ %s ] (%d)", rank, charBuf, frequency); + AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability); } } @@ -312,8 +312,8 @@ static inline void prof_out(void) { #define ZERO_DISTANCE_PROMOTION_RATE 110.0f #define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f #define HALF_SCORE_SQUARED_RADIUS 32.0f -#define MAX_FREQ 255 -#define MAX_BIGRAM_FREQ 15 +#define MAX_PROBABILITY 255 +#define MAX_BIGRAM_ENCODED_PROBABILITY 15 // Assuming locale strings such as en_US, sr-Latn etc. #define MAX_LOCALE_STRING_LENGTH 10 @@ -335,8 +335,8 @@ static inline void prof_out(void) { #define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35f #define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185f -/* heuristic... This should be changed if we change the unit of the frequency. */ -#define SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ (MAX_FREQ * 58 / 100) +/* heuristic... This should be changed if we change the unit of the probability. 
*/ +#define SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ (MAX_PROBABILITY * 58 / 100) #define MAX_DEPTH_MULTIPLIER 3 #define FIRST_WORD_INDEX 0 diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 2be1f4f39..6deab36b6 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -62,7 +62,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, void *traverseSessi } else { std::map bigramMap; uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; - mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordCodePoints, + mBigramDictionary->fillBigramAddressToProbabilityMapAndFilter(prevWordCodePoints, prevWordLength, &bigramMap, bigramFilter); result = mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize, &bigramMap, bigramFilter, useFullEditDistance, outWords, @@ -78,8 +78,8 @@ int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, in frequencies, outputTypes); } -int Dictionary::getFrequency(const int *word, int length) const { - return mUnigramDictionary->getFrequency(word, length); +int Dictionary::getProbability(const int *word, int length) const { + return mUnigramDictionary->getProbability(word, length); } bool Dictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index ecdddd771..449b95ab6 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -52,7 +52,7 @@ class Dictionary { int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; - int getFrequency(const int *word, int length) const; + int getProbability(const int *word, int length) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; const uint8_t *getDict() const { // required to release dictionary buffer return 
mDict; diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h index a8cc03b8d..144ae1452 100644 --- a/native/jni/src/terminal_attributes.h +++ b/native/jni/src/terminal_attributes.h @@ -51,7 +51,7 @@ class TerminalAttributes { if (NOT_A_CODE_POINT == codePoint) break; outWord[i] = codePoint; } - *outFreq = BinaryFormat::getAttributeFrequencyFromFlags(shortcutFlags); + *outFreq = BinaryFormat::getAttributeProbabilityFromFlags(shortcutFlags); return i; } diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 0b18e78a3..80ba412a3 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -52,8 +52,8 @@ UnigramDictionary::~UnigramDictionary() { } // TODO: This needs to take a const int* and not tinker with its contents -static void addWord(int *word, int length, int frequency, WordsPriorityQueue *queue, int type) { - queue->push(frequency, word, length, type); +static void addWord(int *word, int length, int probability, WordsPriorityQueue *queue, int type) { + queue->push(probability, word, length, type); } // Return the replacement code point for a digraph, or 0 if none. 
@@ -158,7 +158,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit queuePool); } -// bigramMap contains the association -> +// bigramMap contains the association -> // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter // in bigram_dictionary.cpp int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, @@ -399,7 +399,7 @@ void UnigramDictionary::onTerminal(const int probability, MAX_WORD_LENGTH, shortcutTarget, &shortcutFrequency); int shortcutScore; int kind; - if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_FREQUENCY + if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_PROBABILITY && correction->sameAsTyped()) { shortcutScore = S_INT_MAX; kind = Dictionary::KIND_WHITELIST; @@ -483,7 +483,7 @@ int UnigramDictionary::getSubStringSuggestion( inputSize, correction); int word[MAX_WORD_LENGTH]; - int freq = getMostFrequentWordLike( + int freq = getMostProbableWordLike( inputWordStartPos, inputWordLength, correction, word); if (freq > 0) { nextWordLength = inputWordLength; @@ -679,15 +679,15 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit outputWord); } -// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous +// Wrapper for getMostProbableWordLikeInner, which matches it to the previous // interface. 
-int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, const int inputSize, +int UnigramDictionary::getMostProbableWordLike(const int startInputIndex, const int inputSize, Correction *correction, int *word) const { int inWord[inputSize]; for (int i = 0; i < inputSize; ++i) { inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i); } - return getMostFrequentWordLikeInner(inWord, inputSize, word); + return getMostProbableWordLikeInner(inWord, inputSize, word); } // This function will take the position of a character array within a CharGroup, @@ -738,9 +738,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, } // This function is invoked when a word like the word searched for is found. -// It will compare the frequency to the max frequency, and if greater, will +// It will compare the probability to the max probability, and if greater, will // copy the word into the output buffer. In output value maxFreq, it will -// write the new maximum frequency if it changed. +// write the new maximum probability if it changed. static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord, int *maxFreq) { if (freq > *maxFreq) { @@ -752,9 +752,9 @@ static inline void onTerminalWordLike(const int freq, int *newWord, const int le } } -// Will find the highest frequency of the words like the one passed as an argument, +// Will find the highest probability of the words like the one passed as an argument, // that is, everything that only differs by case/accents. 
-int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, const int inputSize, +int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, const int inputSize, int *outWord) const { int newWord[MAX_WORD_LENGTH]; int depth = 0; @@ -775,17 +775,18 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, con int inputIndex = stackInputIndex[depth]; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); // Test whether all chars in this group match with the word we are searching for. If so, - // we want to traverse its children (or if the inputSize match, evaluate its frequency). - // Note that this function will output the position regardless, but will only write - // into inputIndex if there is a match. + // we want to traverse its children (or if the inputSize match, evaluate its + // probability). Note that this function will output the position regardless, but will + // only write into inputIndex if there is a match. const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, inputIndex, inputSize, newWord, &inputIndex, &pos); if (isAlike && (!(BinaryFormat::FLAG_IS_NOT_A_WORD & flags)) && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) { - const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); - onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); + const int probability = + BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); + onTerminalWordLike(probability, newWord, inputIndex, outWord, &maxFreq); } - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); // If we had a match and the word has children, we want to traverse them. 
We don't have @@ -816,7 +817,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, con return maxFreq; } -int UnigramDictionary::getFrequency(const int *const inWord, const int length) const { +int UnigramDictionary::getProbability(const int *const inWord, const int length) const { const uint8_t *const root = DICT_ROOT; int pos = BinaryFormat::getTerminalPosition(root, inWord, length, false /* forceLowerCaseSearch */); @@ -826,7 +827,7 @@ int UnigramDictionary::getFrequency(const int *const inWord, const int length) c const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { // If this is not a word, or if it's a blacklisted entry, it should behave as - // having no frequency outside of the suggestion process (where it should be used + // having no probability outside of the suggestion process (where it should be used // for shortcuts). return NOT_A_PROBABILITY; } @@ -836,8 +837,8 @@ int UnigramDictionary::getFrequency(const int *const inWord, const int length) c } else { BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); } - const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); - return unigramFreq; + const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); + return unigramProbability; } // TODO: remove this function. @@ -884,7 +885,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // This gets only ONE character from the stream. Next there will be: // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node - // else if FLAG_IS_TERMINAL: the frequency + // else if FLAG_IS_TERMINAL: the probability // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. 
int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); @@ -917,14 +918,14 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining - // characters in this node, the frequency if it's there, read the next sibling + // characters in this node, the probability if it's there, read the next sibling // position to output it, then return false. // We don't have to output other values because we return false, as in // "don't traverse children". if (!isLastChar) { pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); } - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; @@ -937,16 +938,17 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, } while (NOT_A_CODE_POINT != c); if (isTerminalNode) { - // The frequency should be here, because we come here only if this is actually + // The probability should be here, because we come here only if this is actually // a terminal node, and we are on its last char. - const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int unigramProbability = + BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. 
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, - unigramFreq); + unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); @@ -961,7 +963,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // Note that !hasChildren implies isLastChar, so we know we don't have to skip any // remaining char in this group for there can't be any. if (!hasChildren) { - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; @@ -969,7 +971,7 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // Optimization: Prune out words that are too long compared to how much was typed. if (correction->needsToPrune()) { - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { @@ -983,13 +985,13 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // children, we can't come here. ASSERT(BinaryFormat::hasChildrenInFlags(flags)); - // If this node was a terminal it still has the frequency under the pointer (it may have been - // read, but not skipped - see readFrequencyWithoutMovingPointer). + // If this node was a terminal it still has the probability under the pointer (it may have been + // read, but not skipped - see readProbabilityWithoutMovingPointer). // Next come the children position, then possibly attributes (attributes are bigrams only for // now, maybe something related to shortcuts in the future). // Once this is read, we still need to output the number of nodes in the immediate children of // this node, so we read and output it before returning true, as in "please traverse children". 
- pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index 502bf4790..c1955e8bb 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -40,7 +40,7 @@ class UnigramDictionary { static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1; static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2; UnigramDictionary(const uint8_t *const streamStart, const unsigned int flags); - int getFrequency(const int *const inWord, const int length) const; + int getProbability(const int *const inWord, const int length) const; int getBigramPosition(int pos, int *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *inputCodePoints, const int inputSize, @@ -89,9 +89,9 @@ class UnigramDictionary { const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) const; - int getMostFrequentWordLike(const int startInputIndex, const int inputSize, + int getMostProbableWordLike(const int startInputIndex, const int inputSize, Correction *correction, int *word) const; - int getMostFrequentWordLikeInner(const int *const inWord, const int inputSize, + int getMostProbableWordLikeInner(const int *const inWord, const int inputSize, int *outWord) const; int getSubStringSuggestion(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance,