From c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 25 Feb 2011 17:56:53 +0900 Subject: [PATCH] Use translation of fallback umlauts digraphs for German. For German : handle "ae", "oe" and "ue" to be alternate forms for umlaut-bearing versions of "a", "o" and "u". Issue: 3275926 Change-Id: I056c707cdacc464ceab63be56c016c7f8439196c --- java/res/xml/method.xml | 1 + .../inputmethod/latin/BinaryDictionary.java | 34 +++- .../inputmethod/latin/SubtypeSwitcher.java | 54 ++++-- ...oid_inputmethod_latin_BinaryDictionary.cpp | 7 +- native/src/debug.h | 11 ++ native/src/dictionary.h | 4 +- native/src/unigram_dictionary.cpp | 154 ++++++++++++++---- native/src/unigram_dictionary.h | 30 +++- 8 files changed, 239 insertions(+), 56 deletions(-) diff --git a/java/res/xml/method.xml b/java/res/xml/method.xml index b1f737903..8dec7abec 100644 --- a/java/res/xml/method.xml +++ b/java/res/xml/method.xml @@ -65,6 +65,7 @@ android:label="@string/subtype_mode_de_keyboard" android:imeSubtypeLocale="de" android:imeSubtypeMode="keyboard" + android:imeSubtypeExtraValue="requiresGermanUmlautProcessing" /> mAllEnabledSubtypesOfCurrentInputMethod; + private InputMethodSubtype mCurrentSubtype; private Locale mSystemLocale; private Locale mInputLocale; private String mInputLocaleStr; - private String mMode; private VoiceInput mVoiceInput; /*-----------------------------------------------------------*/ @@ -110,8 +110,7 @@ public class SubtypeSwitcher { mSystemLocale = null; mInputLocale = null; mInputLocaleStr = null; - // Mode is initialized to KEYBOARD_MODE, in case that LatinIME can't obtain currentSubtype - mMode = KEYBOARD_MODE; + mCurrentSubtype = null; mAllEnabledSubtypesOfCurrentInputMethod = null; // TODO: Voice input should be created here mVoiceInput = null; @@ -145,6 +144,7 @@ public class SubtypeSwitcher { // Reload enabledSubtypes from the framework. 
private void updateEnabledSubtypes() { + final String currentMode = getCurrentSubtypeMode(); boolean foundCurrentSubtypeBecameDisabled = true; mAllEnabledSubtypesOfCurrentInputMethod = mImm.getEnabledInputMethodSubtypeList( null, true); @@ -157,7 +157,7 @@ public class SubtypeSwitcher { if (mLocaleSplitter.hasNext()) { mEnabledLanguagesOfCurrentInputMethod.add(mLocaleSplitter.next()); } - if (locale.equals(mInputLocaleStr) && mode.equals(mMode)) { + if (locale.equals(mInputLocaleStr) && mode.equals(currentMode)) { foundCurrentSubtypeBecameDisabled = false; } if (KEYBOARD_MODE.equals(ims.getMode())) { @@ -168,7 +168,7 @@ public class SubtypeSwitcher { && mIsSystemLanguageSameAsInputLanguage); if (foundCurrentSubtypeBecameDisabled) { if (DBG) { - Log.w(TAG, "Current subtype: " + mInputLocaleStr + ", " + mMode); + Log.w(TAG, "Current subtype: " + mInputLocaleStr + ", " + currentMode); Log.w(TAG, "Last subtype was disabled. Update to the current one."); } updateSubtype(mImm.getCurrentInputMethodSubtype()); @@ -209,9 +209,10 @@ public class SubtypeSwitcher { public void updateSubtype(InputMethodSubtype newSubtype) { final String newLocale; final String newMode; + final String oldMode = getCurrentSubtypeMode(); if (newSubtype == null) { // Normally, newSubtype shouldn't be null. But just in case newSubtype was null, - // fallback to the default locale and mode. + // fallback to the default locale. 
Log.w(TAG, "Couldn't get the current subtype."); newLocale = "en_US"; newMode = KEYBOARD_MODE; @@ -220,8 +221,8 @@ public class SubtypeSwitcher { newMode = newSubtype.getMode(); } if (DBG) { - Log.w(TAG, "Update subtype to:" + newLocale + "," + newMode - + ", from: " + mInputLocaleStr + ", " + mMode); + // newSubtype may be null here (see the fallback above), so log the resolved newMode. + Log.w(TAG, "Update subtype to:" + newLocale + "," + newMode + + ", from: " + mInputLocaleStr + ", " + oldMode); } boolean languageChanged = false; if (!newLocale.equals(mInputLocaleStr)) { @@ -231,13 +232,12 @@ public class SubtypeSwitcher { updateInputLocale(newLocale); } boolean modeChanged = false; - String oldMode = mMode; - if (!newMode.equals(mMode)) { - if (mMode != null) { + if (!newMode.equals(oldMode)) { + if (oldMode != null) { modeChanged = true; } - mMode = newMode; } + mCurrentSubtype = newSubtype; // If the old mode is voice input, we need to reset or cancel its status. // We cancel its status when we change mode, while we reset otherwise. @@ -262,7 +262,7 @@ public class SubtypeSwitcher { triggerVoiceIME(); } } else { - Log.w(TAG, "Unknown subtype mode: " + mMode); + Log.w(TAG, "Unknown subtype mode: " + newMode); if (VOICE_MODE.equals(oldMode) && mVoiceInput != null) { // We need to reset the voice input to release the resources and to reset its status // as it is not the current input mode. @@ -483,7 +483,7 @@ public class SubtypeSwitcher { } public boolean isKeyboardMode() { - return KEYBOARD_MODE.equals(mMode); + return KEYBOARD_MODE.equals(getCurrentSubtypeMode()); } @@ -506,7 +506,7 @@ public class SubtypeSwitcher { } public boolean isVoiceMode() { - return VOICE_MODE.equals(mMode); + return null == mCurrentSubtype ?
false : VOICE_MODE.equals(getCurrentSubtypeMode()); } private void triggerVoiceIME() { @@ -572,6 +572,30 @@ public class SubtypeSwitcher { } } + ///////////////////////////// + // Other utility functions // + ///////////////////////////// + + public String getCurrentSubtypeExtraValue() { + // If null, return what an empty ExtraValue would return : the empty string. + return null != mCurrentSubtype ? mCurrentSubtype.getExtraValue() : ""; + } + + public boolean currentSubtypeContainsExtraValueKey(String key) { + // If null, return what an empty ExtraValue would return : false. + return null != mCurrentSubtype ? mCurrentSubtype.containsExtraValueKey(key) : false; + } + + public String getCurrentSubtypeExtraValueOf(String key) { + // If null, return what an empty ExtraValue would return : null. + return null != mCurrentSubtype ? mCurrentSubtype.getExtraValueOf(key) : null; + } + + public String getCurrentSubtypeMode() { + return null != mCurrentSubtype ? mCurrentSubtype.getMode() : KEYBOARD_MODE; + } + + // A list of locales which are supported by default for voice input, unless we get a // different list from Gservices. 
private static final String DEFAULT_VOICE_INPUT_SUPPORTED_LOCALES = diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index b10dd6d7b..555a522eb 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -126,7 +126,8 @@ static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object, static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jint dict, jint proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray, - jintArray inputArray, jint arraySize, jcharArray outputArray, jintArray frequencyArray) { + jintArray inputArray, jint arraySize, jint flags, + jcharArray outputArray, jintArray frequencyArray) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; ProximityInfo *pInfo = (ProximityInfo*)proximityInfo; @@ -140,7 +141,7 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); int count = dictionary->getSuggestions(pInfo, xCoordinates, yCoordinates, inputCodes, - arraySize, (unsigned short*) outputChars, frequencies); + arraySize, flags, (unsigned short*) outputChars, frequencies); env->ReleaseIntArrayElements(frequencyArray, frequencies, 0); env->ReleaseIntArrayElements(inputArray, inputCodes, JNI_ABORT); @@ -213,7 +214,7 @@ static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jint di static JNINativeMethod sMethods[] = { {"openNative", "(Ljava/lang/String;JJIIIII)I", (void*)latinime_BinaryDictionary_open}, {"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close}, - {"getSuggestionsNative", "(II[I[I[II[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, + {"getSuggestionsNative", "(II[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, {"isValidWordNative", "(I[CI)Z", 
(void*)latinime_BinaryDictionary_isValidWord}, {"getBigramsNative", "(I[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams} }; diff --git a/native/src/debug.h b/native/src/debug.h index e5572e1a5..ae629b222 100644 --- a/native/src/debug.h +++ b/native/src/debug.h @@ -55,4 +55,15 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng // usleep(10); } +static inline void printDebug(const char* tag, int* codes, int codesSize, int MAX_PROXIMITY_CHARS) { + unsigned char *buf = (unsigned char*)malloc((1 + codesSize) * sizeof(*buf)); + + buf[codesSize] = 0; + while (--codesSize >= 0) + buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; + LOGI("%s, WORD = %s", tag, buf); + + free(buf); +} + #endif // LATINIME_DEBUG_H diff --git a/native/src/dictionary.h b/native/src/dictionary.h index fbbb8312b..13b2a2816 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -29,9 +29,9 @@ public: Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives); int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, unsigned short *outWords, int *frequencies) { + int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) { return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - codesSize, outWords, frequencies); + codesSize, flags, outWords, frequencies); } // TODO: Call mBigramDictionary instead of mUnigramDictionary diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 72b0f361a..9aa36b064 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -29,20 +29,136 @@ namespace latinime { +const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = + { { 'a', 'e' }, + { 'o', 'e' }, + { 'u', 'e' } }; + 
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), - ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) { + ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), + BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)) { if (DEBUG_DICT) LOGI("UnigramDictionary - constructor"); } UnigramDictionary::~UnigramDictionary() {} -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, - int *ycoordinates, int *codes, int codesSize, unsigned short *outWords, int *frequencies) { +static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, + const int MAX_PROXIMITY_CHARS) { + return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize; +} + +bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const { + + // There can't be a digraph if we don't have at least 2 characters to examine + if (i + 2 > codesSize) return false; + + // Search for the first char of some digraph + int lastDigraphIndex = -1; + const int thisChar = codes[i * MAX_PROXIMITY_CHARS]; + for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1; + lastDigraphIndex >= 0; --lastDigraphIndex) { + if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break; + } + // No match: return early + if (lastDigraphIndex < 0) return false; + + // It's an interesting digraph if the second char matches too. 
+ return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS]; +} + +// Mostly the same arguments as the non-recursive version, except: +// codesBuffer is the start of the work buffer, and gets passed as is. +// codesBufferSize is the size of the user input (the caller passes codesSize). +// codesDest is the current point in the work buffer. +// codesSrc is the current point in the user-input, original, content-unmodified buffer. +// codesRemain is the remaining size in codesSrc. +void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, + const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, + int* codesDest, unsigned short* outWords, int* frequencies) { + + for (int i = 0; i < codesRemain; ++i) { + if (isDigraph(codesSrc, i, codesRemain)) { + // Found a digraph. We will try both spellings. eg. the word is "pruefen" + + // Copy the word up to the first char of the digraph, then continue processing + // on the remaining part of the word, skipping the second char of the digraph. + // In our example, copy "pru" and continue running on "fen" + // Make i the index of the second char of the digraph first. Without this, "u" is + // dropped from the copy and the second call below restarts on the same digraph, + // recursing forever. + ++i; + memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesBufferSize, flags, codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, + codesRemain - i - 1, codesDest + i * MAX_PROXIMITY_CHARS, + outWords, frequencies); + + // Copy the second char of the digraph in place, then continue processing on + // the remaining part of the word.
+ // In our example, after "pru" in the buffer copy the "e", and continue running on "fen" + memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS, + BYTES_IN_ONE_CHAR); + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS, codesRemain - i, + codesDest + i * MAX_PROXIMITY_CHARS, outWords, frequencies); + return; + } + } + + // If we come here, we hit the end of the word: let's check it against the dictionary. + // In our example, we'll come here once for "prufen" and then once for "pruefen". + // If the word contains several digraphs, we'll come here once per combination of spellings. + // eg. if the word is "ueberpruefen" we'll test, in order, against + // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". + const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; + if (0 != remainingBytes) + memcpy(codesDest, codesSrc, remainingBytes); + + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies); +} + +int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, const int flags, + unsigned short *outWords, int *frequencies) { + + if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) + { // Incrementally tune the word and try all possibilities + int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)]; + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesSize, flags, codes, codesSize, codesBuffer, outWords, frequencies); + } else { // Normal processing + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, + outWords, frequencies); + } + + PROF_START(6); + // Get the word count + int suggestedWordsCount = 0; + while (suggestedWordsCount <
MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { + suggestedWordsCount++; + } + + if (DEBUG_DICT) { + LOGI("Returning %d words", suggestedWordsCount); + LOGI("Next letters: "); + for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { + if (mNextLettersFrequency[k] > 0) { + LOGI("%c = %d,", k, mNextLettersFrequency[k]); + } + } + } + PROF_END(6); + PROF_CLOSE; + return suggestedWordsCount; +} + +void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies) { + PROF_OPEN; PROF_START(0); initSuggestions(codes, codesSize, outWords, frequencies); @@ -103,30 +219,10 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordi } } PROF_END(5); - - PROF_START(6); - // Get the word count - int suggestedWordsCount = 0; - while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { - suggestedWordsCount++; - } - - if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); - LOGI("Next letters: "); - for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { - if (mNextLettersFrequency[k] > 0) { - LOGI("%c = %d,", k, mNextLettersFrequency[k]); - } - } - } - PROF_END(6); - PROF_CLOSE; - return suggestedWordsCount; } -void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords, - int *frequencies) { +void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies) { if (DEBUG_DICT) LOGI("initSuggest"); mFrequencies = frequencies; mOutputChars = outWords; @@ -204,7 +300,7 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) { if (length != mInputLength) { return false; } - int *inputCodes = mInputCodes; + const int *inputCodes = mInputCodes; while (length--) { if ((unsigned int) *inputCodes != (unsigned int) *word) { return false; @@ -423,7 +519,7 @@ inline bool 
UnigramDictionary::existsAdjacentProximityChars(const int inputIndex const int currentChar = *getInputCharsAt(inputIndex); const int leftIndex = inputIndex - 1; if (leftIndex >= 0) { - int *leftChars = getInputCharsAt(leftIndex); + const int *leftChars = getInputCharsAt(leftIndex); int i = 0; while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { if (leftChars[i++] == currentChar) return true; @@ -431,7 +527,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex } const int rightIndex = inputIndex + 1; if (rightIndex < inputLength) { - int *rightChars = getInputCharsAt(rightIndex); + const int *rightChars = getInputCharsAt(rightIndex); int i = 0; while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { if (rightChars[i++] == currentChar) return true; @@ -523,7 +619,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth *newDiffs = diffs; *newInputIndex = inputIndex; } else { - int *currentChars = getInputCharsAt(inputIndex); + const int *currentChars = getInputCharsAt(inputIndex); if (transposedPos >= 0) { if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index e84875b59..a95984520 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -33,12 +33,22 @@ class UnigramDictionary { public: UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); - int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, unsigned short *outWords, int *frequencies); + int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, const int flags, + unsigned short *outWords, int *frequencies); ~UnigramDictionary(); private: - void 
initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies); + void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies); + bool isDigraph(const int* codes, const int i, const int codesSize) const; + void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, + const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, + int* codesDest, unsigned short* outWords, int* frequencies); + void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, + int *frequencies); void getSuggestionCandidates(const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, const int maxDepth); @@ -86,7 +96,7 @@ private: const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); bool existsAdjacentProximityChars(const int inputIndex, const int inputLength); - inline int* getInputCharsAt(const int index) { + inline const int* getInputCharsAt(const int index) { return mInputCodes + (index * MAX_PROXIMITY_CHARS); } const unsigned char *DICT; @@ -97,10 +107,20 @@ private: const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; const int ROOT_POS; + const unsigned int BYTES_IN_ONE_CHAR; + + // Flags for special processing + // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java + // or something very bad (like, the apocalypse) will happen. + // Please update both at the same time. 
+ enum { + REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1 + }; + static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[]; int *mFrequencies; unsigned short *mOutputChars; - int *mInputCodes; + const int *mInputCodes; int mInputLength; // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];