From 49ba135fdedb3c6b33ec915e91ecad682b7655b8 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Mon, 7 May 2012 20:14:00 +0900 Subject: [PATCH] Perform the actual bigram frequency lookup. This still returns the unigram frequency, because the values stored for bigrams in the dictionary are not ready to be returned in-place instead of unigram values. Aside from this, the code is complete. Bug: 6313806 Change-Id: If7bb7b644730782277f0f6663334c170b7fe13fb --- native/jni/src/bigram_dictionary.cpp | 13 ++------- native/jni/src/binary_format.h | 20 +++++++++----- native/jni/src/bloom_filter.h | 38 +++++++++++++++++++++++++++ native/jni/src/unigram_dictionary.cpp | 3 ++- 4 files changed, 55 insertions(+), 19 deletions(-) create mode 100644 native/jni/src/bloom_filter.h diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 220b340d1..07031086c 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -20,8 +20,9 @@ #define LOG_TAG "LatinIME: bigram_dictionary.cpp" #include "bigram_dictionary.h" -#include "dictionary.h" #include "binary_format.h" +#include "bloom_filter.h" +#include "dictionary.h" namespace latinime { @@ -153,16 +154,6 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, return pos; } -static inline void setInFilter(uint8_t *filter, const int position) { - const unsigned int bucket = position % BIGRAM_FILTER_MODULO; - filter[bucket >> 3] |= (1 << (bucket & 0x7)); -} - -static inline bool isInFilter(uint8_t *filter, const int position) { - const unsigned int bucket = position % BIGRAM_FILTER_MODULO; - return filter[bucket >> 3] & (1 << (bucket & 0x7)); -} - void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map *map, uint8_t *filter) { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 71ade48a3..b87593ca9 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -18,6 +18,7 @@ #define LATINIME_BINARY_FORMAT_H #include +#include "bloom_filter.h" #include "unigram_dictionary.h" namespace latinime { @@ -66,8 +67,8 @@ class BinaryFormat { const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, uint16_t* outWord); - static int getProbability(const std::map *bigramMap, const uint8_t *bigramFilter, - const int unigramFreq); + static int getProbability(const int position, const std::map *bigramMap, + const uint8_t *bigramFilter, const int unigramFreq); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -520,13 +521,18 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a } // This should probably return a probability in log space. -inline int BinaryFormat::getProbability(const std::map *bigramMap, +inline int BinaryFormat::getProbability(const int position, const std::map *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { - // TODO: use the bigram filter for fast rejection, then the bigram map for lookup - // to get the bigram probability. If the bigram is not found, use the unigram frequency. - // Don't forget that they can be null. + if (!bigramMap || !bigramFilter) return unigramFreq; + if (!isInFilter(bigramFilter, position)) return unigramFreq; + const std::map::const_iterator bigramFreq = bigramMap->find(position); + if (bigramFreq != bigramMap->end()) { + // TODO: return the frequency in bigramFreq->second + return unigramFreq; + } else { + return unigramFreq; + } // TODO: if the unigram frequency is used, compute the actual probability - return unigramFreq; } } // namespace latinime diff --git a/native/jni/src/bloom_filter.h b/native/jni/src/bloom_filter.h new file mode 100644 index 000000000..7ae6a1fa4 --- /dev/null +++ b/native/jni/src/bloom_filter.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BLOOM_FILTER_H +#define LATINIME_BLOOM_FILTER_H + +#include + +#include "defines.h" + +namespace latinime { + +static inline void setInFilter(uint8_t *filter, const int position) { + const unsigned int bucket = position % BIGRAM_FILTER_MODULO; + filter[bucket >> 3] |= (1 << (bucket & 0x7)); +} + +static inline bool isInFilter(const uint8_t *filter, const int position) { + const unsigned int bucket = position % BIGRAM_FILTER_MODULO; + return filter[bucket >> 3] & (1 << (bucket & 0x7)); +} + +} // namespace latinime + +#endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 2e5468dd7..9234b1b52 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -851,7 +851,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. - const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq); + const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, + unigramFreq); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex);