Merge "Perform the actual bigram frequency lookup." into jb-dev
This commit is contained in:
commit
6cb23a49bf
4 changed files with 55 additions and 19 deletions
|
@ -20,8 +20,9 @@
|
||||||
#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
|
#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
|
||||||
|
|
||||||
#include "bigram_dictionary.h"
|
#include "bigram_dictionary.h"
|
||||||
#include "dictionary.h"
|
|
||||||
#include "binary_format.h"
|
#include "binary_format.h"
|
||||||
|
#include "bloom_filter.h"
|
||||||
|
#include "dictionary.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -153,16 +154,6 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void setInFilter(uint8_t *filter, const int position) {
|
|
||||||
const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
|
|
||||||
filter[bucket >> 3] |= (1 << (bucket & 0x7));
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool isInFilter(uint8_t *filter, const int position) {
|
|
||||||
const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
|
|
||||||
return filter[bucket >> 3] & (1 << (bucket & 0x7));
|
|
||||||
}
|
|
||||||
|
|
||||||
void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
|
void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
|
||||||
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
|
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
|
||||||
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#define LATINIME_BINARY_FORMAT_H
|
#define LATINIME_BINARY_FORMAT_H
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#include "bloom_filter.h"
|
||||||
#include "unigram_dictionary.h"
|
#include "unigram_dictionary.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -66,8 +67,8 @@ class BinaryFormat {
|
||||||
const int length);
|
const int length);
|
||||||
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
|
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
|
||||||
uint16_t* outWord);
|
uint16_t* outWord);
|
||||||
static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
|
static int getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const int unigramFreq);
|
const uint8_t *bigramFilter, const int unigramFreq);
|
||||||
|
|
||||||
// Flags for special processing
|
// Flags for special processing
|
||||||
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
|
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
|
||||||
|
@ -520,13 +521,18 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
|
||||||
}
|
}
|
||||||
|
|
||||||
// This should probably return a probability in log space.
|
// This should probably return a probability in log space.
|
||||||
inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap,
|
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq) {
|
const uint8_t *bigramFilter, const int unigramFreq) {
|
||||||
// TODO: use the bigram filter for fast rejection, then the bigram map for lookup
|
if (!bigramMap || !bigramFilter) return unigramFreq;
|
||||||
// to get the bigram probability. If the bigram is not found, use the unigram frequency.
|
if (!isInFilter(bigramFilter, position)) return unigramFreq;
|
||||||
// Don't forget that they can be null.
|
const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
|
||||||
// TODO: if the unigram frequency is used, compute the actual probability
|
if (bigramFreq != bigramMap->end()) {
|
||||||
|
// TODO: return the frequency in bigramFreq->second
|
||||||
return unigramFreq;
|
return unigramFreq;
|
||||||
|
} else {
|
||||||
|
return unigramFreq;
|
||||||
|
}
|
||||||
|
// TODO: if the unigram frequency is used, compute the actual probability
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
38
native/jni/src/bloom_filter.h
Normal file
38
native/jni/src/bloom_filter.h
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2012 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BLOOM_FILTER_H
|
||||||
|
#define LATINIME_BLOOM_FILTER_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
static inline void setInFilter(uint8_t *filter, const int position) {
|
||||||
|
const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
|
||||||
|
filter[bucket >> 3] |= (1 << (bucket & 0x7));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool isInFilter(const uint8_t *filter, const int position) {
|
||||||
|
const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
|
||||||
|
return filter[bucket >> 3] & (1 << (bucket & 0x7));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace latinime
|
||||||
|
|
||||||
|
#endif // LATINIME_BLOOM_FILTER_H
|
|
@ -851,7 +851,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
||||||
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
||||||
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
||||||
const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq);
|
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
|
||||||
|
unigramFreq);
|
||||||
onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
|
onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
|
||||||
currentWordIndex);
|
currentWordIndex);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue