am ff020671: Merge "Prepair for advanced two words error correction"

* commit 'ff020671a9790a095c135b9dbe5d22d011d3e2db':
  Prepair for advanced two words error correction
main
satok 2012-01-16 23:08:23 -08:00 committed by Android Git Automerger
commit 2af901c64c
6 changed files with 198 additions and 7 deletions

View File

@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
inline static int getCurrentEditDistance( inline static int getCurrentEditDistance(
int *editDistanceTable, const int inputLength, const int outputLength) { int *editDistanceTable, const int inputLength, const int outputLength) {
if (DEBUG_DICT) { if (DEBUG_EDIT_DISTANCE) {
AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
} }
return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq; return totalFreq;
} }
/* static */
// Scores a suggestion composed of two words separated by a space.
//
// firstFreq / secondFreq: frequencies of the first and the second word.
// correction: supplies mSpaceProximityPos, mMissingSpacePos, mInputLength and
//     TYPED_LETTER_MULTIPLIER. At most one of the two positions is expected to be >= 0.
// word: the composed output; word[0] and word[firstWordLength + 1] are read as the first
//     character of the first and of the second word respectively.
//
// Returns the combined frequency, or 0 when the split leaves either word empty (or when no
// valid split position is set).
int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
        const int firstFreq, const int secondFreq, const Correction* correction,
        const unsigned short *word) {
    const int spaceProximityPos = correction->mSpaceProximityPos;
    const int missingSpacePos = correction->mMissingSpacePos;
    if (DEBUG_DICT) {
        int inputCount = 0;
        if (spaceProximityPos >= 0) ++inputCount;
        if (missingSpacePos >= 0) ++inputCount;
        assert(inputCount <= 1);
    }
    const bool isSpaceProximity = spaceProximityPos >= 0;
    const int inputLength = correction->mInputLength;
    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
            : (inputLength - missingSpacePos);

    // Reject degenerate splits before `word` is indexed and before any length is used as a
    // divisor: word[firstWordLength + 1] does not exist when the second word is empty, and
    // (firstWordLength + 1) is zero when neither position is set.
    if (firstWordLength <= 0 || secondWordLength <= 0) {
        return 0;
    }

    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

    // A word only takes part in the capitalization check when it has at least two characters.
    bool firstCapitalizedWordDemotion = false;
    if (firstWordLength >= 2) {
        firstCapitalizedWordDemotion = isUpperCase(word[0]);
    }

    bool secondCapitalizedWordDemotion = false;
    if (secondWordLength >= 2) {
        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
    }

    // Demote only when exactly one of the two words is flagged.
    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

    if (DEBUG_DICT_FULL) {
        AKLOGI("Two words: %c, %c, %d",
                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
    }

    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
    int tempFirstFreq = firstFreq;
    multiplyRate(firstDemotionRate, &tempFirstFreq);

    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
    int tempSecondFreq = secondFreq;
    multiplyRate(secondDemotionRate, &tempSecondFreq);

    const int totalLength = firstWordLength + secondWordLength;

    // Start from the sum of the two length-demoted frequencies.
    int totalFreq = tempFirstFreq + tempSecondFreq;

    // This is a workaround to try offsetting the not-enough-demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
    // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
    // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
    const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
    multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

    // At this moment, totalFreq is calculated by the following formula:
    // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
    //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

    // This is another workaround to offset the demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
    // the same amount because we already have adjusted the synthetic freq of this "missing or
    // mistyped space" suggestion candidate above in this method.
    const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
    multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

    if (isSpaceProximity) {
        // A word pair with one space proximity correction
        if (DEBUG_DICT) {
            AKLOGI("Found a word pair with space proximity correction.");
        }
        multiplyIntCapped(typedLetterMultiplier, &totalFreq);
        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
    }

    // Applied to every two-word candidate, whichever kind of split produced it.
    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

    if (capitalizedWordDemotion) {
        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
    }

    return totalFreq;
}
/* Damerau-Levenshtein distance */ /* Damerau-Levenshtein distance */
inline static int editDistanceInternal( inline static int editDistanceInternal(
int* editDistanceTable, const unsigned short* before, int* editDistanceTable, const unsigned short* before,

View File

@ -100,6 +100,8 @@ class Correction {
const int freq, int *editDistanceTable, const Correction* correction); const int freq, int *editDistanceTable, const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction, const unsigned short *word); const Correction* correction, const unsigned short *word);
// Computes a combined frequency for a two-word suggestion from the frequencies of its two
// words, using the split position and input length held by `correction`. As defined, it
// returns 0 when either side of the split is empty.
// NOTE(review): presumably kept alongside calcFreqForSplitTwoWords as the previous
// implementation — confirm before removing either one.
static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
const Correction* correction, const unsigned short *word);
static double calcNormalizedScore(const unsigned short* before, const int beforeLength, static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
const unsigned short* after, const int afterLength, const int score); const unsigned short* after, const int afterLength, const int score);
static int editDistance(const unsigned short* before, static int editDistance(const unsigned short* before,

View File

@ -117,8 +117,8 @@ static void prof_out(void) {
#define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO false #define DEBUG_PROXIMITY_INFO false
#define DEBUG_CORRECTION false #define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ true #define DEBUG_CORRECTION_FREQ false
#define DEBUG_WORDS_PRIORITY_QUEUE true #define DEBUG_WORDS_PRIORITY_QUEUE false
#else // FLAG_DBG #else // FLAG_DBG
@ -213,6 +213,8 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1 #define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10 #define SUB_QUEUE_MAX_COUNT 10
#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f
#define MAX_DEPTH_MULTIPLIER 3 #define MAX_DEPTH_MULTIPLIER 3
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German

View File

@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
} }
} }
PROF_END(6); PROF_END(6);
if (DEBUG_WORDS_PRIORITY_QUEUE) { if (DEBUG_DICT) {
queuePool->dumpSubQueue1TopSuggestions(); queuePool->dumpSubQueue1TopSuggestions();
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
if (queue->size() > 0) {
WordsPriorityQueue::SuggestedWord* sw = queue->top();
const int score = sw->mScore;
const unsigned short* word = sw->mWord;
const int wordLength = sw->mWordLength;
double ns = Correction::RankingAlgorithm::calcNormalizedScore(
proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
ns += 0;
AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
(ns > TWO_WORDS_CORRECTION_THRESHOLD));
DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
DUMP_WORD(word, wordLength);
}
}
} }
} }
@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
return; return;
} }
// Tries to read the input as two words, split either at a missing space (missingSpacePos) or
// at a key typed instead of the space bar (spaceProximityPos). When both halves resolve to a
// word with a positive frequency, the "first SPACE second" candidate is scored and added to
// the master queue. Returns without adding anything for over-long input, an invalid split, or
// a half with no positive-frequency match.
void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
        const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
    WordsPriorityQueue *const masterQueue = queuePool->getMasterQueue();

    if (DEBUG_DICT) {
        // At most one of the two split positions may be set.
        const int positionsSet =
                (spaceProximityPos >= 0 ? 1 : 0) + (missingSpacePos >= 0 ? 1 : 0);
        assert(positionsSet <= 1);
    }

    // Work out where each half starts and how long it is. With a space-proximity split the
    // character at the split position is dropped; with a missing space nothing is dropped.
    const bool splitAtProximityKey = spaceProximityPos >= 0;
    const int headStart = 0;
    int headLength;
    int tailStart;
    int tailLength;
    if (splitAtProximityKey) {
        headLength = spaceProximityPos;
        tailStart = spaceProximityPos + 1;
        tailLength = inputLength - spaceProximityPos - 1;
    } else {
        headLength = missingSpacePos;
        tailStart = missingSpacePos;
        tailLength = inputLength - missingSpacePos;
    }

    if (inputLength >= MAX_WORD_LENGTH) return;

    const bool splitIsValid = headLength > 0 && tailLength > 0 && headStart < tailStart
            && headStart >= 0 && tailStart + tailLength <= inputLength;
    if (!splitIsValid) return;

    const int composedLength = headLength + tailLength + 1;

    // Space proximity preparation
    //WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
    //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
    //correction);
    //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
    //MAX_ERRORS_FOR_TWO_WORDS);

    // Allocating variable length array on stack
    unsigned short word[composedLength];

    // First half: the lookup is handed mWord, so its contents are copied out before the
    // second lookup is handed the same buffer.
    const int firstFreq = getMostFrequentWordLike(
            headStart, headLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("First freq: %d", firstFreq);
    }
    if (firstFreq <= 0) return;

    for (int k = 0; k < headLength; ++k) {
        word[k] = mWord[k];
    }

    // Second half, again through mWord.
    const int secondFreq = getMostFrequentWordLike(
            tailStart, tailLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("Second freq: %d", secondFreq);
    }
    if (secondFreq <= 0) return;

    word[headLength] = SPACE;
    unsigned short *const tailOut = word + headLength + 1;
    for (int k = 0; k < tailLength; ++k) {
        tailOut[k] = mWord[k];
    }

    // TODO: Remove initSuggestions and correction->setCorrectionParams
    initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);

    correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
            -1 /* transposedPos */, spaceProximityPos, missingSpacePos,
            useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
    const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
    if (DEBUG_DICT) {
        AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
    }
    addWord(word, composedLength, pairFreq, masterQueue);
}
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
// interface. // interface.
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

View File

@ -104,6 +104,10 @@ class UnigramDictionary {
const int *xcoordinates, const int *ycoordinates, const int *codes, const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
// Two-word split suggestion entry point. The two position parameters are named in the order
// the out-of-line definition uses them: missingSpacePos first, then spaceProximityPos.
void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
        const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool);
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int missingSpacePos, Correction *correction, const int inputLength, const int missingSpacePos, Correction *correction,

View File

@ -81,10 +81,9 @@ class WordsPriorityQueue {
mSuggestions.push(sw); mSuggestions.push(sw);
} }
SuggestedWord* topAndPop() { SuggestedWord* top() {
if (mSuggestions.empty()) return 0; if (mSuggestions.empty()) return 0;
SuggestedWord* sw = mSuggestions.top(); SuggestedWord* sw = mSuggestions.top();
mSuggestions.pop();
return sw; return sw;
} }
@ -112,7 +111,7 @@ class WordsPriorityQueue {
return size; return size;
} }
int size() { int size() const {
return mSuggestions.size(); return mSuggestions.size();
} }