diff --git a/java/src/com/android/inputmethod/latin/Utils.java b/java/src/com/android/inputmethod/latin/Utils.java index 7091d9b56..149c5ca9e 100644 --- a/java/src/com/android/inputmethod/latin/Utils.java +++ b/java/src/com/android/inputmethod/latin/Utils.java @@ -285,13 +285,22 @@ public class Utils { // (the number of matched characters between typed word and suggested word)) // * (individual word's score which defined in the unigram dictionary, // and this score is defined in range [0, 255].) - // * (when before.length() == after.length(), - // mFullWordMultiplier (this is defined 2)) - // So, maximum original score is pow(2, before.length()) * 255 * 2 - // So, we can normalize original score by dividing this value. + // Then, the following processing is applied. + // - If the dictionary word is matched up to the point of the user entry + // (full match up to min(before.length(), after.length()) + // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) + // - If the word is a true full match except for differences in accents or + // capitalization, then treat it as if the frequency was 255. + // - If before.length() == after.length() + // => multiply by mFullWordMultiplier (this is defined 2)) + // So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 + // For historical reasons we ignore the 1.2 modifier (because the measure for a good + // autocorrection threshold was done at a time when it didn't exist). This doesn't change + // the result. + // So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. private static final int MAX_INITIAL_SCORE = 255; private static final int TYPED_LETTER_MULTIPLIER = 2; - private static final int FULL_WORD_MULTIPLYER = 2; + private static final int FULL_WORD_MULTIPLIER = 2; public static double calcNormalizedScore(CharSequence before, CharSequence after, int score) { final int beforeLength = before.length(); final int afterLength = after.length(); @@ -301,7 +310,7 @@ public class Utils { // correction. final double maximumScore = MAX_INITIAL_SCORE * Math.pow(TYPED_LETTER_MULTIPLIER, Math.min(beforeLength, afterLength)) - * FULL_WORD_MULTIPLYER; + * FULL_WORD_MULTIPLIER; // add a weight based on edit distance. // distance <= max(afterLength, beforeLength) == afterLength, // so, 0 <= distance / afterLength <= 1 diff --git a/native/src/debug.h b/native/src/debug.h new file mode 100644 index 000000000..e5572e1a5 --- /dev/null +++ b/native/src/debug.h @@ -0,0 +1,58 @@ +/* +** +** Copyright 2011, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#ifndef LATINIME_DEBUG_H +#define LATINIME_DEBUG_H + +#include "defines.h" + +static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output, + const unsigned int length) { + int i = 0; + for (; i <= length && input[i] != 0; ++i) + output[i] = input[i] & 0xFF; + output[i] = 0; + return output; +} +static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input, + unsigned char* output, const unsigned int length, unsigned char c) { + int i = 0; + for (; i <= length && input[i] != 0; ++i) + output[i] = input[i] & 0xFF; + output[i-1] = c; + output[i] = 0; + return output; +} +static inline void LOGI_S16(unsigned short* string, const unsigned int length) { + unsigned char tmp_buffer[length]; + convertToUnibyteString(string, tmp_buffer, length); + LOGI(">> %s", tmp_buffer); + // The log facility is throwing out log that comes too fast. The following + // is a dirty way of slowing down processing so that we can see all log. + // TODO : refactor this in a blocking log or something. + // usleep(10); +} +static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length, + unsigned char c) { + unsigned char tmp_buffer[length+1]; + convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c); + LOGI(">> %s", tmp_buffer); + // Likewise + // usleep(10); +} + +#endif // LATINIME_DEBUG_H diff --git a/native/src/defines.h b/native/src/defines.h index 7fa7e35e0..918028afe 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -100,6 +100,9 @@ static void prof_out(void) { #ifndef U_SHORT_MAX #define U_SHORT_MAX 1 << 16 #endif +#ifndef S_INT_MAX +#define S_INT_MAX ((1 << 31) - 1) +#endif // Define this to use mmap() for dictionary loading. Undefine to use malloc() instead of mmap(). // We measured and compared performance of both, and found mmap() is fairly good in terms of @@ -137,9 +140,6 @@ static void prof_out(void) { #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 -// This is used as a bare multiplier (not subject to /100) -#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2 - // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. #define MAX_WORD_LENGTH_INTERNAL 48 diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 3d5683ed9..f36eabb3f 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons } } +static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); +} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) { @@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { - finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER; + finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER; @@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex return false; } + +// In the following function, c is the current character of the dictionary word +// currently examined. +// currentChars is an array containing the keys close to the character the +// user actually typed at the same position. We want to see if c is in it: if so, +// then the word contains at that position a character close to what the user +// typed. +// What the user typed is actually the first character of the array. +// Notice : accented characters do not have a proximity list, so they are alone +// in their list. The non-accented version of the character should be considered +// "close", but not the other keys close to the non-accented version. inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId( const int *currentChars, const unsigned short c, const int skipPos, const int excessivePos, const int transposedPos) { const unsigned short lowerC = toLowerCase(c); - int j = 0; + + // The first char in the array is what user typed. If it matches right away, + // that means the user typed that same char for this pos. + if (currentChars[0] == lowerC || currentChars[0] == c) + return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; + + // If one of those is true, we should not check for close characters at all. + if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) + return UNRELATED_CHAR; + + // If the non-accented, lowercased version of that first character matches c, + // then we have a non-accented version of the accented character the user + // typed. Treat it as a close char. + if (toLowerCase(currentChars[0]) == lowerC) + return NEAR_PROXIMITY_CHAR; + + // Not an exact nor an accent-alike match: search the list of close keys + int j = 1; while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) { const bool matched = (currentChars[j] == lowerC || currentChars[j] == c); - // If skipPos is defined, not to search proximity collections. - // First char is what user typed. - if (matched) { - if (j > 0) return NEAR_PROXIMITY_CHAR; - return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) { - // Not to check proximity characters - return UNRELATED_CHAR; - } + if (matched) return NEAR_PROXIMITY_CHAR; ++j; } + + // Was not included, signal this as an unrelated character. return UNRELATED_CHAR; }