From 29dc80614bc529ca2c0b96e1a731ebb7a5433090 Mon Sep 17 00:00:00 2001 From: satok Date: Tue, 17 Jan 2012 15:59:15 +0900 Subject: [PATCH] Prepair for advanced two words error correction Change-Id: I4c8a21f0f6e349ddafd9b402583321a60855cfe8 --- native/src/correction.cpp | 96 ++++++++++++++++++++++++++++++- native/src/correction.h | 2 + native/src/defines.h | 6 +- native/src/unigram_dictionary.cpp | 92 ++++++++++++++++++++++++++++- native/src/unigram_dictionary.h | 4 ++ native/src/words_priority_queue.h | 5 +- 6 files changed, 198 insertions(+), 7 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 5dc6f8737..6a129d4e3 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne inline static int getCurrentEditDistance( int *editDistanceTable, const int inputLength, const int outputLength) { - if (DEBUG_DICT) { + if (DEBUG_EDIT_DISTANCE) { AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); } return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; @@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( return totalFreq; } +/* static */ +int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld( + const int firstFreq, const int secondFreq, const Correction* correction, + const unsigned short *word) { + const int spaceProximityPos = correction->mSpaceProximityPos; + const int missingSpacePos = correction->mMissingSpacePos; + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int inputLength = correction->mInputLength; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; + + bool firstCapitalizedWordDemotion = false; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + bool secondCapitalizedWordDemotion = false; + if (secondWordLength >= 2) { + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } + + const bool capitalizedWordDemotion = + firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; + + if (DEBUG_DICT_FULL) { + AKLOGI("Two words: %c, %c, %d", + word[0], word[firstWordLength + 1], capitalizedWordDemotion); + } + + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + AKLOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); + } + + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + + if (capitalizedWordDemotion) { + multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); + } + + return totalFreq; +} + /* Damerau-Levenshtein distance */ inline static int editDistanceInternal( int* editDistanceTable, const unsigned short* before, diff --git a/native/src/correction.h b/native/src/correction.h index a0fd55fd9..22a424f5c 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -100,6 +100,8 @@ class Correction { const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); + static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq, + const Correction* correction, const unsigned short *word); static double calcNormalizedScore(const unsigned short* before, const int beforeLength, const unsigned short* after, const int afterLength, const int score); static int editDistance(const unsigned short* before, diff --git a/native/src/defines.h b/native/src/defines.h index 31175c369..d739043a4 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -117,8 +117,8 @@ static void prof_out(void) { #define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false -#define DEBUG_CORRECTION_FREQ true -#define DEBUG_WORDS_PRIORITY_QUEUE true +#define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG @@ -213,6 +213,8 @@ static void prof_out(void) { #define SUB_QUEUE_MAX_WORDS 1 #define SUB_QUEUE_MAX_COUNT 10 +#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f + #define MAX_DEPTH_MULTIPLIER 3 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 69e3200fc..8be95bc40 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); - if (DEBUG_WORDS_PRIORITY_QUEUE) { + if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = queuePool->getSubQueue1(i); + if (queue->size() > 0) { + WordsPriorityQueue::SuggestedWord* sw = queue->top(); + const int score = sw->mScore; + const unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + double ns = Correction::RankingAlgorithm::calcNormalizedScore( + proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + ns += 0; + AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, + (ns > TWO_WORDS_CORRECTION_THRESHOLD)); + DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(word, wordLength); + } + } } } @@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo return; } +void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, const int missingSpacePos, + const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int firstWordStartPos = 0; + const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity + ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + + if (inputLength >= MAX_WORD_LENGTH) return; + if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos + || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) + return; + + const int newWordLength = firstWordLength + secondWordLength + 1; + + + // Space proximity preparation + //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); + //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, + //correction); + //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, + //MAX_ERRORS_FOR_TWO_WORDS); + + // Allocating variable length array on stack + unsigned short word[newWordLength]; + const int firstFreq = getMostFrequentWordLike( + firstWordStartPos, firstWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("First freq: %d", firstFreq); + } + if (firstFreq <= 0) return; + + for (int i = 0; i < firstWordLength; ++i) { + word[i] = mWord[i]; + } + + const int secondFreq = getMostFrequentWordLike( + secondWordStartPos, secondWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("Second freq: %d", secondFreq); + } + if (secondFreq <= 0) return; + + word[firstWordLength] = SPACE; + for (int i = (firstWordLength + 1); i < newWordLength; ++i) { + word[i] = mWord[i - firstWordLength - 1]; + } + + // TODO: Remove initSuggestions and correction->setCorrectionParams + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + + correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, + -1 /* transposedPos */, spaceProximityPos, missingSpacePos, + useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); + const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); + if (DEBUG_DICT) { + AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + } + addWord(word, newWordLength, pairFreq, masterQueue); + return; +} + // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 5e7a7580f..b950971bb 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -104,6 +104,10 @@ class UnigramDictionary { const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); + void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, + const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h index 54bf27a59..6262439b5 100644 --- a/native/src/words_priority_queue.h +++ b/native/src/words_priority_queue.h @@ -81,10 +81,9 @@ class WordsPriorityQueue { mSuggestions.push(sw); } - SuggestedWord* topAndPop() { + SuggestedWord* top() { if (mSuggestions.empty()) return 0; SuggestedWord* sw = mSuggestions.top(); - mSuggestions.pop(); return sw; } @@ -112,7 +111,7 @@ class WordsPriorityQueue { return size; } - int size() { + int size() const { return mSuggestions.size(); }