From 85a1d1ea749a70211fb25c43f3398461d7375da5 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 21 Jun 2011 22:23:21 +0900 Subject: [PATCH] New dict format, step 6 Copy the modified functions to be able to see the diff Bug: 4392433 Change-Id: Ic9b83b4b4b7b89cc922eed1825507d7d516aff24 --- native/src/unigram_dictionary.cpp | 130 ++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 36233b741..e3296f12a 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -992,6 +992,136 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth } #else // NEW_DICTIONARY_FORMAT + +bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, + const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, + const int secondWordLength, const bool isSpaceProximity) { + if (inputLength >= MAX_WORD_LENGTH) return false; + if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos + || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) + return false; + const int newWordLength = firstWordLength + secondWordLength + 1; + // Allocating variable length array on stack + unsigned short word[newWordLength]; + const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord); + if (DEBUG_DICT) { + LOGI("First freq: %d", firstFreq); + } + if (firstFreq <= 0) return false; + + for (int i = 0; i < firstWordLength; ++i) { + word[i] = mWord[i]; + } + + const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord); + if (DEBUG_DICT) { + LOGI("Second freq: %d", secondFreq); + } + if (secondFreq <= 0) return false; + + word[firstWordLength] = SPACE; + for (int i = (firstWordLength + 1); i < newWordLength; ++i) { + word[i] = mWord[i - firstWordLength - 1]; + } + + int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength, + secondWordLength, firstFreq, secondFreq, isSpaceProximity); + if (DEBUG_DICT) { + LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength, + TYPED_LETTER_MULTIPLIER); + } + addWord(word, newWordLength, pairFreq); + return true; +} + +inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth, + const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex, + const int diffs, const int skipPos, const int excessivePos, const int transposedPos, + int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, + bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, + int *nextSiblingPosition, int *nextOutputIndex) { + if (DEBUG_DICT) { + int inputCount = 0; + if (skipPos >= 0) ++inputCount; + if (excessivePos >= 0) ++inputCount; + if (transposedPos >= 0) ++inputCount; + assert(inputCount <= 1); + } + unsigned short c; + int childPosition; + bool terminal; + int freq; + bool isSameAsUserTypedLength = false; + + const uint8_t flags = 0; // No flags for now + + if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; + + *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, + &c, &childPosition, &terminal, &freq); + *nextOutputIndex = depth + 1; + + const bool needsToTraverseChildrenNodes = childPosition != 0; + + // If we are only doing traverseAllNodes, no need to look at the typed characters. + if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { + mWord[depth] = c; + if (traverseAllNodes && terminal) { + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); + } + if (!needsToTraverseChildrenNodes) return false; + *newTraverseAllNodes = traverseAllNodes; + *newMatchRate = matchWeight; + *newDiffs = diffs; + *newInputIndex = inputIndex; + } else { + const int *currentChars = getInputCharsAt(inputIndex); + + if (transposedPos >= 0) { + if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; + if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + } + + int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos, + transposedPos); + if (UNRELATED_CHAR == matchedProximityCharId) return false; + mWord[depth] = c; + // If inputIndex is greater than mInputLength, that means there is no + // proximity chars. So, we don't need to check proximity. + if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); + } + bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 + || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); + if (isSameAsUserTypedLength && terminal) { + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); + } + if (!needsToTraverseChildrenNodes) return false; + // Start traversing all nodes after the index exceeds the user typed length + *newTraverseAllNodes = isSameAsUserTypedLength; + *newMatchRate = matchWeight; + *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + *newInputIndex = inputIndex + 1; + } + // Optimization: Prune out words that are too long compared to how much was typed. + if (depth >= maxDepth || *newDiffs > mMaxEditDistance) { + return false; + } + + // If inputIndex is greater than mInputLength, that means there are no proximity chars. + // TODO: Check if this can be isSameAsUserTypedLength only. + if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) { + *newTraverseAllNodes = true; + } + // get the count of nodes and increment childAddress. + *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); + *newChildPosition = childPosition; + if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); + return needsToTraverseChildrenNodes; +} + #endif // NEW_DICTIONARY_FORMAT } // namespace latinime