From 8950ce6c44706467bb386570ae236a2b8b983666 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Mon, 7 May 2012 16:28:30 +0900 Subject: [PATCH] Replace the bigram list position with the map and filter Passing the position will not allow us a reasonable lookup time. Replace this with a map and bloom filter for very fast lookup. Bug: 6313806 Change-Id: I3a61c0001cbc987c1c3c7b8df635d4590a370144 --- native/jni/src/bigram_dictionary.cpp | 5 +++ native/jni/src/binary_format.h | 11 ++++-- native/jni/src/dictionary.h | 8 +--- native/jni/src/unigram_dictionary.cpp | 53 ++++++++++++++------------- native/jni/src/unigram_dictionary.h | 30 +++++++++------ 5 files changed, 60 insertions(+), 47 deletions(-) diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index d120261ae..220b340d1 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -158,6 +158,11 @@ static inline void setInFilter(uint8_t *filter, const int position) { filter[bucket >> 3] |= (1 << (bucket & 0x7)); } +static inline bool isInFilter(uint8_t *filter, const int position) { + const unsigned int bucket = position % BIGRAM_FILTER_MODULO; + return filter[bucket >> 3] & (1 << (bucket & 0x7)); +} + void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map *map, uint8_t *filter) { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index d5d67c108..71ade48a3 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,8 @@ class BinaryFormat { const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, uint16_t* outWord); - static int getProbability(const int bigramListPosition, const int unigramFreq); + static int getProbability(const std::map *bigramMap, const uint8_t *bigramFilter, + const int unigramFreq); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -519,9 +520,11 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a } // This should probably return a probability in log space. -inline int BinaryFormat::getProbability(const int bigramListPosition, const int unigramFreq) { - // TODO: use the bigram list position to get the bigram probability. If the bigram - // is not found, use the unigram frequency. +inline int BinaryFormat::getProbability(const std::map *bigramMap, + const uint8_t *bigramFilter, const int unigramFreq) { + // TODO: use the bigram filter for fast rejection, then the bigram map for lookup + // to get the bigram probability. If the bigram is not found, use the unigram frequency. + // Don't forget that they can be null. // TODO: if the unigram frequency is used, compute the actual probability return unigramFreq; } diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index 8bdd77165..bce86d1ad 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -37,17 +37,13 @@ class Dictionary { int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength, bool useFullEditDistance, unsigned short *outWords, int *frequencies) { - // bigramListPosition is, as an int, the offset of the bigram list in the file. - // If none, it's zero. - const int bigramListPosition = !prevWordChars ? 0 - : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength); std::map bigramMap; uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, prevWordLength, &bigramMap, bigramFilter); return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, - mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition, - useFullEditDistance, outWords, frequencies); + mCorrection, xcoordinates, ycoordinates, codes, codesSize, &bigramMap, + bigramFilter, useFullEditDistance, outWords, frequencies); } int getBigrams(const int32_t *word, int length, int *codes, int codesSize, diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 0c759d438..2e5468dd7 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -98,7 +98,7 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, - const int codesBufferSize, const int bigramListPosition, + const int codesBufferSize, const std::map *bigramMap, const uint8_t *bigramFilter, const bool useFullEditDistance, const int *codesSrc, const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, @@ -128,7 +128,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit replacementCodePoint; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - bigramListPosition, useFullEditDistance, codesSrc + i + 1, + bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1, codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); @@ -138,7 +138,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i, + bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); return; @@ -161,16 +161,18 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit } getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, - startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction, + startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction, queuePool); } -// bigramListPosition is the offset in the file to the list of bigrams for the previous word. +// bigramMap contains the association -> +// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter +// in bigram_dictionary.cpp int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords, - int *frequencies) { + const std::map *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { queuePool->clearAll(); Correction* masterCorrection = correction; @@ -180,7 +182,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, GERMAN_UMLAUT_DIGRAPHS, sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); @@ -189,13 +191,13 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, FRENCH_LIGATURES_DIGRAPHS, sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - bigramListPosition, useFullEditDistance, masterCorrection, queuePool); + bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool); } PROF_START(20); @@ -228,15 +230,15 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputLength, const int bigramListPosition, const bool useFullEditDistance, - Correction *correction, WordsPriorityQueuePool *queuePool) { + const int inputLength, const std::map *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); PROF_END(0); PROF_START(1); - getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition, + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter, useFullEditDistance, inputLength, correction, queuePool); PROF_END(1); @@ -308,15 +310,16 @@ static const char SPACE = ' '; void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int bigramListPosition, const bool useFullEditDistance, const int inputLength, + const std::map *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction, + getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, const int bigramListPosition, + const int inputLength, const std::map *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams @@ -337,7 +340,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos, + bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos, queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -432,8 +435,8 @@ bool UnigramDictionary::getSubStringSuggestion( queuePool->clearSubQueue(currentWordIndex); // TODO: pass the bigram list for substring suggestion getSuggestionCandidates(useFullEditDistance, inputWordLength, - 0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */, - MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool, + false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); if (DEBUG_DICT) { if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); @@ -763,9 +766,9 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, - const int bigramListPosition, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, - const int currentWordIndex) { + const std::map *bigramMap, const uint8_t *bigramFilter, Correction *correction, + int *newCount, int *newChildrenPosition, int *nextSiblingPosition, + WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } @@ -846,9 +849,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - // The bigramListPosition is the offset in the file of the bigrams for the previous word, - // or zero if we don't know of any bigrams for it. - const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq); + // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. + // bigramFilter is a bloom filter of said frequencies for even faster rejection. + const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index 0cc59bac8..b9233518f 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -17,6 +17,7 @@ #ifndef LATINIME_UNIGRAM_DICTIONARY_H #define LATINIME_UNIGRAM_DICTIONARY_H +#include #include #include "correction.h" #include "correction_state.h" @@ -75,32 +76,36 @@ class UnigramDictionary { int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, - const int *codes, const int codesSize, const int bigramListPosition, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies); + const int *codes, const int codesSize, const std::map *bigramMap, + const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords, + int *frequencies); virtual ~UnigramDictionary(); private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int inputLength, - const int bigramListPosition, const bool useFullEditDistance, Correction *correction, + const std::map *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool); int getDigraphReplacement(const int *codes, const int i, const int codesSize, const digraph_t* const digraphs, const unsigned int digraphsSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize, - const int bigramListPosition, const bool useFullEditDistance, const int* codesSrc, - const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, + const std::map *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, const int* codesSrc, const int codesRemain, + const int currentDepth, int* codesDest, Correction *correction, WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, const unsigned int digraphsSize); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int bigramListPosition, - const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueuePool* queuePool); + const int *ycoordinates, const int *codes, const std::map *bigramMap, + const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( - const bool useFullEditDistance, const int inputLength, const int bigramListPosition, + const bool useFullEditDistance, const int inputLength, + const std::map *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex); void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, @@ -114,9 +119,10 @@ class UnigramDictionary { bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int initialPos, const int bigramListPosition, - Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, - WordsPriorityQueuePool *queuePool, const int currentWordIndex); + bool processCurrentNode(const int initialPos, const std::map *bigramMap, + const uint8_t *bigramFilter, Correction *correction, int *newCount, + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,