Use BinaryDictionaryInfo instead of raw pointers.
Bug: 6669677
Change-Id: I9792a872f1609de7c1ba0fc08d916047d6724c0b
main
parent
d3ccd4bf22
commit
0ecfb94247
|
@ -58,6 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
dic_nodes_cache.cpp) \
|
dic_nodes_cache.cpp) \
|
||||||
$(addprefix suggest/core/dictionary/, \
|
$(addprefix suggest/core/dictionary/, \
|
||||||
char_utils.cpp \
|
char_utils.cpp \
|
||||||
|
binary_dictionary_format.cpp \
|
||||||
|
byte_array_utils.cpp \
|
||||||
dictionary.cpp \
|
dictionary.cpp \
|
||||||
digraph_utils.cpp) \
|
digraph_utils.cpp) \
|
||||||
$(addprefix suggest/core/layout/, \
|
$(addprefix suggest/core/layout/, \
|
||||||
|
|
|
@ -35,7 +35,8 @@
|
||||||
#include "jni.h"
|
#include "jni.h"
|
||||||
#include "jni_common.h"
|
#include "jni_common.h"
|
||||||
#include "suggest_options.h"
|
#include "suggest_options.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_dictionary_format.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
Dictionary *dictionary = 0;
|
Dictionary *dictionary = 0;
|
||||||
if (BinaryFormat::UNKNOWN_FORMAT
|
if (BinaryDictionaryFormat::UNKNOWN_VERSION
|
||||||
== BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf),
|
== BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf),
|
||||||
static_cast<int>(dictSize))) {
|
static_cast<int>(dictSize))) {
|
||||||
AKLOGE("DICT: dictionary format is unknown, bad magic number");
|
AKLOGE("DICT: dictionary format is unknown, bad magic number");
|
||||||
#ifdef USE_MMAP_FOR_DICTIONARY
|
#ifdef USE_MMAP_FOR_DICTIONARY
|
||||||
|
@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji
|
||||||
static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) {
|
static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) return;
|
if (!dictionary) return;
|
||||||
const void *dictBuf = dictionary->getDict();
|
const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf();
|
||||||
if (!dictBuf) return;
|
if (!dictBuf) return;
|
||||||
#ifdef USE_MMAP_FOR_DICTIONARY
|
#ifdef USE_MMAP_FOR_DICTIONARY
|
||||||
releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(),
|
releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(),
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include "bigram_dictionary.h"
|
#include "bigram_dictionary.h"
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "suggest/core/dictionary/bloom_filter.h"
|
#include "suggest/core/dictionary/bloom_filter.h"
|
||||||
#include "suggest/core/dictionary/char_utils.h"
|
#include "suggest/core/dictionary/char_utils.h"
|
||||||
|
@ -28,7 +29,8 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
|
BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo)
|
||||||
|
: mBinaryDictionaryInfo(binaryDictionaryInfo) {
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
AKLOGI("BigramDictionary - constructor");
|
AKLOGI("BigramDictionary - constructor");
|
||||||
}
|
}
|
||||||
|
@ -103,7 +105,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
||||||
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
||||||
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
||||||
|
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||||
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||||
|
@ -149,7 +151,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
||||||
int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
||||||
const bool forceLowerCaseSearch) const {
|
const bool forceLowerCaseSearch) const {
|
||||||
if (0 >= prevWordLength) return 0;
|
if (0 >= prevWordLength) return 0;
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||||
int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
|
int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
|
||||||
forceLowerCaseSearch);
|
forceLowerCaseSearch);
|
||||||
|
|
||||||
|
@ -170,7 +172,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
|
||||||
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
|
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
|
||||||
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
|
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
|
||||||
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||||
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
if (0 == pos) {
|
if (0 == pos) {
|
||||||
|
@ -209,7 +211,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
|
||||||
|
|
||||||
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
|
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
|
||||||
int length2) const {
|
int length2) const {
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||||
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
|
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
|
||||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||||
if (0 == pos) return false;
|
if (0 == pos) return false;
|
||||||
|
|
|
@ -24,9 +24,12 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryInfo;
|
||||||
|
|
||||||
class BigramDictionary {
|
class BigramDictionary {
|
||||||
public:
|
public:
|
||||||
BigramDictionary(const uint8_t *const streamStart);
|
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
||||||
|
|
||||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
||||||
int *frequencies, int *outputTypes) const;
|
int *frequencies, int *outputTypes) const;
|
||||||
void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
|
void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
|
||||||
|
@ -35,13 +38,14 @@ class BigramDictionary {
|
||||||
~BigramDictionary();
|
~BigramDictionary();
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
|
||||||
|
|
||||||
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
||||||
int *bigramCodePoints, int *outputTypes) const;
|
int *bigramCodePoints, int *outputTypes) const;
|
||||||
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
|
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
|
||||||
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
||||||
const bool forceLowerCaseSearch) const;
|
const bool forceLowerCaseSearch) const;
|
||||||
|
|
||||||
const uint8_t *const DICT_ROOT;
|
const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
|
||||||
// TODO: Re-implement proximity correction for bigram correction
|
// TODO: Re-implement proximity correction for bigram correction
|
||||||
static const int MAX_ALTERNATIVES = 1;
|
static const int MAX_ALTERNATIVES = 1;
|
||||||
};
|
};
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_utils.h"
|
#include "suggest/core/dicnode/dic_node_utils.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "suggest/core/dictionary/char_utils.h"
|
#include "suggest/core/dictionary/char_utils.h"
|
||||||
#include "suggest/core/dictionary/multi_bigram_map.h"
|
#include "suggest/core/dictionary/multi_bigram_map.h"
|
||||||
|
@ -32,20 +33,23 @@ namespace latinime {
|
||||||
// Node initialization utils //
|
// Node initialization utils //
|
||||||
///////////////////////////////
|
///////////////////////////////
|
||||||
|
|
||||||
/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot,
|
/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const int prevWordNodePos, DicNode *newRootNode) {
|
const int prevWordNodePos, DicNode *const newRootNode) {
|
||||||
int curPos = rootPos;
|
int curPos = binaryDictionaryInfo->getRootPosition();
|
||||||
const int pos = curPos;
|
const int pos = curPos;
|
||||||
const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
|
const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), &curPos);
|
||||||
const int childrenPos = curPos;
|
const int childrenPos = curPos;
|
||||||
newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos);
|
newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos,
|
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
|
||||||
const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
int curPos = rootPos;
|
DicNode *const prevWordLastNode, DicNode *const newRootNode) {
|
||||||
|
int curPos = binaryDictionaryInfo->getRootPosition();
|
||||||
const int pos = curPos;
|
const int pos = curPos;
|
||||||
const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
|
const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), &curPos);
|
||||||
const int childrenPos = curPos;
|
const int childrenPos = curPos;
|
||||||
newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount);
|
newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount);
|
||||||
}
|
}
|
||||||
|
@ -71,16 +75,19 @@ namespace latinime {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
|
/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
|
||||||
const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
|
||||||
const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter,
|
const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
|
||||||
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
|
const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
|
||||||
|
DicNodeVector *childDicNodes) {
|
||||||
int nextPos = pos;
|
int nextPos = pos;
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), &pos);
|
||||||
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
||||||
const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
|
const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
|
||||||
|
|
||||||
int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
|
int codePoint = BinaryFormat::getCodePointAndForwardPointer(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), &pos);
|
||||||
ASSERT(NOT_A_CODE_POINT != codePoint);
|
ASSERT(NOT_A_CODE_POINT != codePoint);
|
||||||
const int nodeCodePoint = codePoint;
|
const int nodeCodePoint = codePoint;
|
||||||
// TODO: optimize this
|
// TODO: optimize this
|
||||||
|
@ -90,7 +97,8 @@ namespace latinime {
|
||||||
|
|
||||||
do {
|
do {
|
||||||
const int nextCodePoint = hasMultipleChars
|
const int nextCodePoint = hasMultipleChars
|
||||||
? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT;
|
? BinaryFormat::getCodePointAndForwardPointer(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
|
||||||
const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
|
const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
|
||||||
if (!isLastChar) {
|
if (!isLastChar) {
|
||||||
additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
|
additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
|
||||||
|
@ -98,12 +106,14 @@ namespace latinime {
|
||||||
codePoint = nextCodePoint;
|
codePoint = nextCodePoint;
|
||||||
} while (NOT_A_CODE_POINT != codePoint);
|
} while (NOT_A_CODE_POINT != codePoint);
|
||||||
|
|
||||||
const int probability =
|
const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
|
||||||
isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1;
|
binaryDictionaryInfo->getDictRoot(), pos) : -1;
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0;
|
int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), flags, pos) : 0;
|
||||||
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
|
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
|
||||||
const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos);
|
const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), flags, pos);
|
||||||
|
|
||||||
if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
|
if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
|
||||||
return siblingPos;
|
return siblingPos;
|
||||||
|
@ -111,8 +121,8 @@ namespace latinime {
|
||||||
if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
|
if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
|
||||||
return siblingPos;
|
return siblingPos;
|
||||||
}
|
}
|
||||||
const int childrenCount = hasChildren
|
const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(
|
||||||
? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0;
|
binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0;
|
||||||
childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
|
childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
|
||||||
nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
|
nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
|
||||||
hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
|
hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
|
||||||
|
@ -148,16 +158,18 @@ namespace latinime {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
|
/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
|
||||||
const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const bool exactOnly, const std::vector<int> *const codePointsFilter,
|
const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
|
||||||
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
|
const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
|
||||||
|
DicNodeVector *childDicNodes) {
|
||||||
const int terminalDepth = dicNode->getLeavingDepth();
|
const int terminalDepth = dicNode->getLeavingDepth();
|
||||||
const int childCount = dicNode->getChildrenCount();
|
const int childCount = dicNode->getChildrenCount();
|
||||||
int nextPos = dicNode->getChildrenPos();
|
int nextPos = dicNode->getChildrenPos();
|
||||||
for (int i = 0; i < childCount; i++) {
|
for (int i = 0; i < childCount; i++) {
|
||||||
const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
|
const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
|
||||||
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState,
|
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
|
||||||
pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes);
|
terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo,
|
||||||
|
childDicNodes);
|
||||||
if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) {
|
if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) {
|
||||||
// All code points have been found.
|
// All code points have been found.
|
||||||
break;
|
break;
|
||||||
|
@ -165,14 +177,15 @@ namespace latinime {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
|
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode,
|
||||||
DicNodeVector *childDicNodes) {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) {
|
||||||
getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes);
|
getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
|
/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
|
||||||
const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
bool exactOnly, DicNodeVector *childDicNodes) {
|
const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
|
||||||
|
DicNodeVector *childDicNodes) {
|
||||||
if (dicNode->isTotalInputSizeExceedingLimit()) {
|
if (dicNode->isTotalInputSizeExceedingLimit()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -180,9 +193,9 @@ namespace latinime {
|
||||||
DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
|
DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
|
||||||
childDicNodes);
|
childDicNodes);
|
||||||
} else {
|
} else {
|
||||||
DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex,
|
DicNodeUtils::createAndGetAllLeavingChildNodes(
|
||||||
exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */,
|
dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly,
|
||||||
childDicNodes);
|
0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -192,19 +205,21 @@ namespace latinime {
|
||||||
/**
|
/**
|
||||||
* Computes the combined bigram / unigram cost for the given dicNode.
|
* Computes the combined bigram / unigram cost for the given dicNode.
|
||||||
*/
|
*/
|
||||||
/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot,
|
/* static */ float DicNodeUtils::getBigramNodeImprobability(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const DicNode *const node, MultiBigramMap *multiBigramMap) {
|
const DicNode *const node, MultiBigramMap *multiBigramMap) {
|
||||||
if (node->isImpossibleBigramWord()) {
|
if (node->isImpossibleBigramWord()) {
|
||||||
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
|
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
|
||||||
}
|
}
|
||||||
const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap);
|
const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap);
|
||||||
// TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
|
// TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
|
||||||
const float cost = static_cast<float>(MAX_PROBABILITY - probability)
|
const float cost = static_cast<float>(MAX_PROBABILITY - probability)
|
||||||
/ static_cast<float>(MAX_PROBABILITY);
|
/ static_cast<float>(MAX_PROBABILITY);
|
||||||
return cost;
|
return cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot,
|
/* static */ int DicNodeUtils::getBigramNodeProbability(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const DicNode *const node, MultiBigramMap *multiBigramMap) {
|
const DicNode *const node, MultiBigramMap *multiBigramMap) {
|
||||||
const int unigramProbability = node->getProbability();
|
const int unigramProbability = node->getProbability();
|
||||||
const int wordPos = node->getPos();
|
const int wordPos = node->getPos();
|
||||||
|
@ -215,9 +230,10 @@ namespace latinime {
|
||||||
}
|
}
|
||||||
if (multiBigramMap) {
|
if (multiBigramMap) {
|
||||||
return multiBigramMap->getBigramProbability(
|
return multiBigramMap->getBigramProbability(
|
||||||
dicRoot, prevWordPos, wordPos, unigramProbability);
|
binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
|
||||||
}
|
}
|
||||||
return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability);
|
return BinaryFormat::getBigramProbability(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryInfo;
|
||||||
class DicNode;
|
class DicNode;
|
||||||
class DicNodeVector;
|
class DicNodeVector;
|
||||||
class ProximityInfo;
|
class ProximityInfo;
|
||||||
|
@ -34,19 +35,20 @@ class DicNodeUtils {
|
||||||
public:
|
public:
|
||||||
static int appendTwoWords(const int *src0, const int16_t length0, const int *src1,
|
static int appendTwoWords(const int *src0, const int16_t length0, const int *src1,
|
||||||
const int16_t length1, int *dest);
|
const int16_t length1, int *dest);
|
||||||
static void initAsRoot(const int rootPos, const uint8_t *const dicRoot,
|
static void initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const int prevWordNodePos, DicNode *newRootNode);
|
const int prevWordNodePos, DicNode *newRootNode);
|
||||||
static void initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot,
|
static void initAsRootWithPreviousWord(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
DicNode *prevWordLastNode, DicNode *newRootNode);
|
DicNode *prevWordLastNode, DicNode *newRootNode);
|
||||||
static void initByCopy(DicNode *srcNode, DicNode *destNode);
|
static void initByCopy(DicNode *srcNode, DicNode *destNode);
|
||||||
static void getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
|
static void getAllChildDicNodes(DicNode *dicNode,
|
||||||
DicNodeVector *childDicNodes);
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes);
|
||||||
static float getBigramNodeImprobability(const uint8_t *const dicRoot,
|
static float getBigramNodeImprobability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const DicNode *const node, MultiBigramMap *const multiBigramMap);
|
const DicNode *const node, MultiBigramMap *const multiBigramMap);
|
||||||
static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo,
|
static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo,
|
||||||
const std::vector<int> *const codePointsFilter);
|
const std::vector<int> *const codePointsFilter);
|
||||||
// TODO: Move to private
|
// TODO: Move to private
|
||||||
static void getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
|
static void getProximityChildDicNodes(DicNode *dicNode,
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
|
const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
|
||||||
DicNodeVector *childDicNodes);
|
DicNodeVector *childDicNodes);
|
||||||
|
|
||||||
|
@ -60,16 +62,18 @@ class DicNodeUtils {
|
||||||
// Max number of bigrams to look up
|
// Max number of bigrams to look up
|
||||||
static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;
|
static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;
|
||||||
|
|
||||||
static int getBigramNodeProbability(const uint8_t *const dicRoot, const DicNode *const node,
|
static int getBigramNodeProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
MultiBigramMap *multiBigramMap);
|
const DicNode *const node, MultiBigramMap *multiBigramMap);
|
||||||
static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState,
|
static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState,
|
||||||
const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes);
|
const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes);
|
||||||
static void createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot,
|
static void createAndGetAllLeavingChildNodes(DicNode *dicNode,
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
|
const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
|
||||||
const std::vector<int> *const codePointsFilter,
|
const std::vector<int> *const codePointsFilter,
|
||||||
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
|
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
|
||||||
static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot,
|
static int createAndGetLeavingChildNode(DicNode *dicNode, int pos,
|
||||||
const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
|
||||||
|
const ProximityInfoState *pInfoState, const int pointIndex,
|
||||||
const bool exactOnly, const std::vector<int> *const codePointsFilter,
|
const bool exactOnly, const std::vector<int> *const codePointsFilter,
|
||||||
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
|
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_format.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dictionary size
|
||||||
|
*/
|
||||||
|
// Any file smaller than this is not a dictionary.
|
||||||
|
const int BinaryDictionaryFormat::DICTIONARY_MINIMUM_SIZE = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Format versions
|
||||||
|
*/
|
||||||
|
// Originally, format version 1 had a 16-bit magic number, then the version number `01'
|
||||||
|
// then options that must be 0. Hence the first 32 bits of the format are always as follows
|
||||||
|
// and it's okay to consider them a magic number as a whole.
|
||||||
|
const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
|
||||||
|
const int BinaryDictionaryFormat::FORMAT_VERSION_1_HEADER_SIZE = 5;
|
||||||
|
|
||||||
|
// The versions of Latin IME that only handle format version 1 only test for the magic
|
||||||
|
// number, so we had to change it so that version 2 files would be rejected by older
|
||||||
|
// implementations. On this occasion, we made the magic number 32 bits long.
|
||||||
|
const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
|
||||||
|
// Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12
|
||||||
|
const int BinaryDictionaryFormat::FORMAT_VERSION_2_MINIMUM_SIZE = 12;
|
||||||
|
const int BinaryDictionaryFormat::VERSION_2_MAGIC_NUMBER_SIZE = 4;
|
||||||
|
const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_VERSION_SIZE = 2;
|
||||||
|
const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_FLAG_SIZE = 2;
|
||||||
|
|
||||||
|
/**
 * Determines the format version of a binary dictionary from its first bytes.
 *
 * @param dict     start of the dictionary buffer
 * @param dictSize number of readable bytes in dict
 * @return VERSION_1 or VERSION_2 when the header matches that format, UNKNOWN_VERSION otherwise
 */
/* static */ BinaryDictionaryFormat::FORMAT_VERSION BinaryDictionaryFormat::detectFormatVersion(
        const uint8_t *const dict, const int dictSize) {
    // With fewer than 4 bytes there is no magic number to read, so the format cannot be
    // recognized.
    if (dictSize < DICTIONARY_MINIMUM_SIZE) {
        return UNKNOWN_VERSION;
    }

    // The magic number is stored big-endian.
    const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0);

    if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) {
        // Format 1 header is exactly 5 bytes long and looks like:
        // Magic number (2 bytes) 0x78 0xB1
        // Version number (1 byte) 0x01
        // Options (2 bytes) must be 0x00 0x00
        return VERSION_1;
    }

    if (FORMAT_VERSION_2_MAGIC_NUMBER != magicNumber) {
        return UNKNOWN_VERSION;
    }

    // A version 2 dictionary is at least 12 bytes long. A file that carries the version 2
    // magic number but is shorter than that is treated as unknown, so that the bytes after
    // the magic number are never read past the end of the buffer.
    if (dictSize < FORMAT_VERSION_2_MINIMUM_SIZE) {
        return UNKNOWN_VERSION;
    }

    // Format 2 header is as follows:
    // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
    // Version number (2 bytes) 0x00 0x02
    // Options (2 bytes)
    // Header size (4 bytes) : integer, big endian
    const int versionFieldPos = 4;
    const bool declaresVersion2 = (2 == ByteArrayUtils::readUint16(dict, versionFieldPos));
    return declaresVersion2 ? VERSION_2 : UNKNOWN_VERSION;
}
|
||||||
|
|
||||||
|
} // namespace latinime
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013, The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BINARY_DICTIONARY_FORMAT_H
|
||||||
|
#define LATINIME_BINARY_DICTIONARY_FORMAT_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Methods to handle binary dictionary format version.
|
||||||
|
*
|
||||||
|
* Currently, we have a file with a similar name, binary_format.h. binary_format.h contains binary
|
||||||
|
* reading methods and utility methods for various purposes.
|
||||||
|
* This file, on the other hand, deals only with the dictionary format version.
|
||||||
|
*/
|
||||||
|
class BinaryDictionaryFormat {
|
||||||
|
public:
|
||||||
|
// TODO: Remove obsolete version logic
|
||||||
|
enum FORMAT_VERSION {
|
||||||
|
VERSION_1,
|
||||||
|
VERSION_2,
|
||||||
|
UNKNOWN_VERSION
|
||||||
|
};
|
||||||
|
|
||||||
|
static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int getHeaderSize(
|
||||||
|
const uint8_t *const dict, const FORMAT_VERSION format) {
|
||||||
|
switch (format) {
|
||||||
|
case VERSION_1:
|
||||||
|
return FORMAT_VERSION_1_HEADER_SIZE;
|
||||||
|
case VERSION_2:
|
||||||
|
// See the format of the header in the comment in detectFormat() above
|
||||||
|
return ByteArrayUtils::readUint32(dict, 8);
|
||||||
|
default:
|
||||||
|
return S_INT_MAX;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormat);
|
||||||
|
|
||||||
|
static const int DICTIONARY_MINIMUM_SIZE;
|
||||||
|
static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER;
|
||||||
|
static const int FORMAT_VERSION_1_HEADER_SIZE;
|
||||||
|
static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER;
|
||||||
|
static const int FORMAT_VERSION_2_MINIMUM_SIZE;
|
||||||
|
static const int VERSION_2_MAGIC_NUMBER_SIZE;
|
||||||
|
static const int VERSION_2_DICTIONARY_VERSION_SIZE ;
|
||||||
|
static const int VERSION_2_DICTIONARY_FLAG_SIZE;
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif /* LATINIME_BINARY_DICTIONARY_FORMAT_H */
|
|
@ -0,0 +1,58 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013, The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BINARY_DICTIONARY_INFO_H
|
||||||
|
#define LATINIME_BINARY_DICTIONARY_INFO_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_format.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryInfo {
|
||||||
|
public:
|
||||||
|
BinaryDictionaryInfo(const uint8_t *const dictBuf, const int dictSize)
|
||||||
|
: mDictBuf(dictBuf),
|
||||||
|
mFormat(BinaryDictionaryFormat::detectFormatVersion(mDictBuf, dictSize)),
|
||||||
|
mDictRoot(mDictBuf + BinaryDictionaryFormat::getHeaderSize(mDictBuf, mFormat)) {}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE const uint8_t *getDictBuf() const {
|
||||||
|
return mDictBuf;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE const uint8_t *getDictRoot() const {
|
||||||
|
return mDictRoot;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE BinaryDictionaryFormat::FORMAT_VERSION getFormat() const {
|
||||||
|
return mFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getRootPosition() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryInfo);
|
||||||
|
|
||||||
|
const uint8_t *const mDictBuf;
|
||||||
|
const BinaryDictionaryFormat::FORMAT_VERSION mFormat;
|
||||||
|
const uint8_t *const mDictRoot;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif /* LATINIME_BINARY_DICTIONARY_INFO_H */
|
|
@ -0,0 +1,24 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
// First byte value that marks the end of a code point array.
const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
// A first byte below this value is not a one-byte character: it is either the terminator or
// the start of a three-byte code point.
const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
|
||||||
|
} // namespace latinime
|
|
@ -0,0 +1,148 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013, The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BYTE_ARRAY_UTILS_H
|
||||||
|
#define LATINIME_BYTE_ARRAY_UTILS_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility methods for reading byte arrays.
|
||||||
|
*/
|
||||||
|
class ByteArrayUtils {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* Integer
|
||||||
|
*
|
||||||
|
* Each method read a corresponding size integer in a big endian manner.
|
||||||
|
*/
|
||||||
|
static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
|
||||||
|
return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16)
|
||||||
|
^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
|
||||||
|
return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
|
||||||
|
return (buffer[pos] << 8) ^ buffer[pos + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
|
||||||
|
return buffer[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint32_t readUint32andAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
const uint32_t value = readUint32(buffer, *pos);
|
||||||
|
*pos += 4;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint32_t readUint24andAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
const uint32_t value = readUint24(buffer, *pos);
|
||||||
|
*pos += 3;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint16_t readUint16andAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
const uint16_t value = readUint16(buffer, *pos);
|
||||||
|
*pos += 2;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE uint8_t readUint8andAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
return buffer[(*pos)++];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Code Point
|
||||||
|
*
|
||||||
|
* 1 byte = bbbbbbbb match
|
||||||
|
* case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
|
||||||
|
* else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
|
||||||
|
* unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
|
||||||
|
* 00011111 would be outside unicode.
|
||||||
|
* else: iso-latin-1 code
|
||||||
|
* This allows for the whole unicode range to be encoded, including chars outside of
|
||||||
|
* the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
|
||||||
|
* characters which should never happen anyway (and still work, but take 3 bytes).
|
||||||
|
*/
|
||||||
|
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
|
||||||
|
int p = pos;
|
||||||
|
return readCodePointAndAdvancePosition(buffer, &p);
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
const uint8_t firstByte = readUint8(buffer, *pos);
|
||||||
|
if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
|
||||||
|
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
||||||
|
*pos += 1;
|
||||||
|
return NOT_A_CODE_POINT;
|
||||||
|
} else {
|
||||||
|
return readUint24andAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*pos += 1;
|
||||||
|
return firstByte;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* String (array of code points)
|
||||||
|
*
|
||||||
|
* Reads code points until the terminator is found.
|
||||||
|
*/
|
||||||
|
// Returns the length of the string.
|
||||||
|
static int readStringAndAdvancePosition(const uint8_t *const buffer, int *const pos,
|
||||||
|
int *const outBuffer, const int maxLength) {
|
||||||
|
int length = 0;
|
||||||
|
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||||
|
outBuffer[length++] = codePoint;
|
||||||
|
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advances the position and returns the length of the string.
|
||||||
|
static int advancePositionToBehindString(
|
||||||
|
const uint8_t *const buffer, int *const pos, const int maxLength) {
|
||||||
|
int length = 0;
|
||||||
|
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||||
|
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
|
||||||
|
|
||||||
|
static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE;
|
||||||
|
static const uint8_t CHARACTER_ARRAY_TERMINATOR;
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
|
|
@ -34,13 +34,11 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust)
|
Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust)
|
||||||
: mDict(static_cast<unsigned char *>(dict)),
|
: mBinaryDicitonaryInfo(static_cast<const uint8_t *>(dict), dictSize),
|
||||||
mOffsetDict((static_cast<unsigned char *>(dict))
|
|
||||||
+ BinaryFormat::getHeaderSize(mDict, dictSize)),
|
|
||||||
mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
|
mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
|
||||||
mUnigramDictionary(new UnigramDictionary(mOffsetDict,
|
mUnigramDictionary(new UnigramDictionary(&mBinaryDicitonaryInfo,
|
||||||
BinaryFormat::getFlags(mDict, dictSize))),
|
BinaryFormat::getFlags(mBinaryDicitonaryInfo.getDictBuf(), dictSize))),
|
||||||
mBigramDictionary(new BigramDictionary(mOffsetDict)),
|
mBigramDictionary(new BigramDictionary(&mBinaryDicitonaryInfo)),
|
||||||
mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())),
|
mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())),
|
||||||
mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) {
|
mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) {
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -64,11 +65,8 @@ class Dictionary {
|
||||||
|
|
||||||
int getProbability(const int *word, int length) const;
|
int getProbability(const int *word, int length) const;
|
||||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||||
const uint8_t *getDict() const { // required to release dictionary buffer
|
const BinaryDictionaryInfo *getBinaryDictionaryInfo() const {
|
||||||
return mDict;
|
return &mBinaryDicitonaryInfo;
|
||||||
}
|
|
||||||
const uint8_t *getOffsetDict() const {
|
|
||||||
return mOffsetDict;
|
|
||||||
}
|
}
|
||||||
int getDictSize() const { return mDictSize; }
|
int getDictSize() const { return mDictSize; }
|
||||||
int getMmapFd() const { return mMmapFd; }
|
int getMmapFd() const { return mMmapFd; }
|
||||||
|
@ -78,9 +76,8 @@ class Dictionary {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary);
|
||||||
const uint8_t *mDict;
|
|
||||||
const uint8_t *mOffsetDict;
|
|
||||||
|
|
||||||
|
const BinaryDictionaryInfo mBinaryDicitonaryInfo;
|
||||||
// Used only for the mmap version of dictionary loading, but we use these as dummy variables
|
// Used only for the mmap version of dictionary loading, but we use these as dummy variables
|
||||||
// also for the malloc version.
|
// also for the malloc version.
|
||||||
const int mDictSize;
|
const int mDictSize;
|
||||||
|
|
|
@ -17,10 +17,9 @@
|
||||||
#ifndef LATINIME_MULTI_BIGRAM_MAP_H
|
#ifndef LATINIME_MULTI_BIGRAM_MAP_H
|
||||||
#define LATINIME_MULTI_BIGRAM_MAP_H
|
#define LATINIME_MULTI_BIGRAM_MAP_H
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "hash_map_compat.h"
|
#include "hash_map_compat.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -35,20 +34,20 @@ class MultiBigramMap {
|
||||||
|
|
||||||
// Look up the bigram probability for the given word pair from the cached bigram maps.
|
// Look up the bigram probability for the given word pair from the cached bigram maps.
|
||||||
// Also caches the bigrams if there is space remaining and they have not been cached already.
|
// Also caches the bigrams if there is space remaining and they have not been cached already.
|
||||||
int getBigramProbability(const uint8_t *const dicRoot, const int wordPosition,
|
int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||||
const int nextWordPosition, const int unigramProbability) {
|
const int wordPosition, const int nextWordPosition, const int unigramProbability) {
|
||||||
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
||||||
mBigramMaps.find(wordPosition);
|
mBigramMaps.find(wordPosition);
|
||||||
if (mapPosition != mBigramMaps.end()) {
|
if (mapPosition != mBigramMaps.end()) {
|
||||||
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
||||||
addBigramsForWordPosition(dicRoot, wordPosition);
|
addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition);
|
||||||
return mBigramMaps[wordPosition].getBigramProbability(
|
return mBigramMaps[wordPosition].getBigramProbability(
|
||||||
nextWordPosition, unigramProbability);
|
nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
return BinaryFormat::getBigramProbability(
|
return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(),
|
||||||
dicRoot, wordPosition, nextWordPosition, unigramProbability);
|
wordPosition, nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
|
@ -63,8 +62,9 @@ class MultiBigramMap {
|
||||||
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
|
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
|
||||||
~BigramMap() {}
|
~BigramMap() {}
|
||||||
|
|
||||||
void init(const uint8_t *const dicRoot, int position) {
|
void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) {
|
||||||
BinaryFormat::fillBigramProbabilityToHashMap(dicRoot, position, &mBigramMap);
|
BinaryFormat::fillBigramProbabilityToHashMap(
|
||||||
|
binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability)
|
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability)
|
||||||
|
@ -78,8 +78,9 @@ class MultiBigramMap {
|
||||||
hash_map_compat<int, int> mBigramMap;
|
hash_map_compat<int, int> mBigramMap;
|
||||||
};
|
};
|
||||||
|
|
||||||
void addBigramsForWordPosition(const uint8_t *const dicRoot, const int position) {
|
void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||||
mBigramMaps[position].init(dicRoot, position);
|
const int position) {
|
||||||
|
mBigramMaps[position].init(binaryDicitonaryInfo, position);
|
||||||
}
|
}
|
||||||
|
|
||||||
hash_map_compat<int, BigramMap> mBigramMaps;
|
hash_map_compat<int, BigramMap> mBigramMaps;
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -32,8 +33,9 @@ class TerminalAttributes {
|
||||||
public:
|
public:
|
||||||
class ShortcutIterator {
|
class ShortcutIterator {
|
||||||
public:
|
public:
|
||||||
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags)
|
ShortcutIterator(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos,
|
||||||
: mDict(dict), mPos(pos),
|
const uint8_t flags)
|
||||||
|
: mBinaryDicitionaryInfo(binaryDictionaryInfo), mPos(pos),
|
||||||
mHasNextShortcutTarget(0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS)) {
|
mHasNextShortcutTarget(0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,11 +46,13 @@ class TerminalAttributes {
|
||||||
// Gets the shortcut target itself as an int string. For parameters and return value
|
// Gets the shortcut target itself as an int string. For parameters and return value
|
||||||
// see BinaryFormat::getWordAtAddress.
|
// see BinaryFormat::getWordAtAddress.
|
||||||
inline int getNextShortcutTarget(const int maxDepth, int *outWord, int *outFreq) {
|
inline int getNextShortcutTarget(const int maxDepth, int *outWord, int *outFreq) {
|
||||||
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(
|
||||||
|
mBinaryDicitionaryInfo->getDictRoot(), &mPos);
|
||||||
mHasNextShortcutTarget = 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
|
mHasNextShortcutTarget = 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for (i = 0; i < MAX_WORD_LENGTH; ++i) {
|
for (i = 0; i < MAX_WORD_LENGTH; ++i) {
|
||||||
const int codePoint = BinaryFormat::getCodePointAndForwardPointer(mDict, &mPos);
|
const int codePoint = BinaryFormat::getCodePointAndForwardPointer(
|
||||||
|
mBinaryDicitionaryInfo->getDictRoot(), &mPos);
|
||||||
if (NOT_A_CODE_POINT == codePoint) break;
|
if (NOT_A_CODE_POINT == codePoint) break;
|
||||||
outWord[i] = codePoint;
|
outWord[i] = codePoint;
|
||||||
}
|
}
|
||||||
|
@ -57,19 +61,21 @@ class TerminalAttributes {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const uint8_t *const mDict;
|
const BinaryDictionaryInfo *const mBinaryDicitionaryInfo;
|
||||||
int mPos;
|
int mPos;
|
||||||
bool mHasNextShortcutTarget;
|
bool mHasNextShortcutTarget;
|
||||||
};
|
};
|
||||||
|
|
||||||
TerminalAttributes(const uint8_t *const dict, const uint8_t flags, const int pos)
|
TerminalAttributes(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||||
: mDict(dict), mFlags(flags), mStartPos(pos) {
|
const uint8_t flags, const int pos)
|
||||||
|
: mBinaryDicitionaryInfo(binaryDicitonaryInfo), mFlags(flags), mStartPos(pos) {
|
||||||
}
|
}
|
||||||
|
|
||||||
inline ShortcutIterator getShortcutIterator() const {
|
inline ShortcutIterator getShortcutIterator() const {
|
||||||
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
|
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
|
||||||
// skipped quickly, so we ignore it.
|
// skipped quickly, so we ignore it.
|
||||||
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
return ShortcutIterator(
|
||||||
|
mBinaryDicitionaryInfo, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isBlacklistedOrNotAWord() const {
|
bool isBlacklistedOrNotAWord() const {
|
||||||
|
@ -78,7 +84,7 @@ class TerminalAttributes {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
|
||||||
const uint8_t *const mDict;
|
const BinaryDictionaryInfo *const mBinaryDicitionaryInfo;
|
||||||
const uint8_t mFlags;
|
const uint8_t mFlags;
|
||||||
const int mStartPos;
|
const int mStartPos;
|
||||||
};
|
};
|
||||||
|
|
|
@ -142,7 +142,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
|
||||||
case CT_TERMINAL: {
|
case CT_TERMINAL: {
|
||||||
const float languageImprobability =
|
const float languageImprobability =
|
||||||
DicNodeUtils::getBigramNodeImprobability(
|
DicNodeUtils::getBigramNodeImprobability(
|
||||||
traverseSession->getOffsetDict(), dicNode, multiBigramMap);
|
traverseSession->getBinaryDictionaryInfo(), dicNode, multiBigramMap);
|
||||||
return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability);
|
return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability);
|
||||||
}
|
}
|
||||||
case CT_NEW_WORD_SPACE_SUBSTITUTION:
|
case CT_NEW_WORD_SPACE_SUBSTITUTION:
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include "dic_traverse_wrapper.h"
|
#include "dic_traverse_wrapper.h"
|
||||||
#include "jni.h"
|
#include "jni.h"
|
||||||
#include "suggest/core/dicnode/dic_node_utils.h"
|
#include "suggest/core/dicnode/dic_node_utils.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
|
|
||||||
|
@ -65,7 +66,8 @@ static TraverseSessionFactoryRegisterer traverseSessionFactoryRegisterer;
|
||||||
void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord,
|
void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord,
|
||||||
int prevWordLength, const SuggestOptions *const suggestOptions) {
|
int prevWordLength, const SuggestOptions *const suggestOptions) {
|
||||||
mDictionary = dictionary;
|
mDictionary = dictionary;
|
||||||
mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(mDictionary->getDict(),
|
mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(
|
||||||
|
mDictionary->getBinaryDictionaryInfo()->getDictBuf(),
|
||||||
mDictionary->getDictSize());
|
mDictionary->getDictSize());
|
||||||
mSuggestOptions = suggestOptions;
|
mSuggestOptions = suggestOptions;
|
||||||
if (!prevWord) {
|
if (!prevWord) {
|
||||||
|
@ -73,12 +75,14 @@ void DicTraverseSession::init(const Dictionary *const dictionary, const int *pre
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// TODO: merge following similar calls to getTerminalPosition into one case-insensitive call.
|
// TODO: merge following similar calls to getTerminalPosition into one case-insensitive call.
|
||||||
mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord,
|
mPrevWordPos = BinaryFormat::getTerminalPosition(
|
||||||
|
dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord,
|
||||||
prevWordLength, false /* forceLowerCaseSearch */);
|
prevWordLength, false /* forceLowerCaseSearch */);
|
||||||
if (mPrevWordPos == NOT_VALID_WORD) {
|
if (mPrevWordPos == NOT_VALID_WORD) {
|
||||||
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
||||||
// auto-capitalized words like "The [current_word]".
|
// auto-capitalized words like "The [current_word]".
|
||||||
mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord,
|
mPrevWordPos = BinaryFormat::getTerminalPosition(
|
||||||
|
dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord,
|
||||||
prevWordLength, true /* forceLowerCaseSearch */);
|
prevWordLength, true /* forceLowerCaseSearch */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -93,8 +97,8 @@ void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
|
||||||
maxSpatialDistance, maxPointerCount);
|
maxSpatialDistance, maxPointerCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint8_t *DicTraverseSession::getOffsetDict() const {
|
const BinaryDictionaryInfo *DicTraverseSession::getBinaryDictionaryInfo() const {
|
||||||
return mDictionary->getOffsetDict();
|
return mDictionary->getBinaryDictionaryInfo();
|
||||||
}
|
}
|
||||||
|
|
||||||
int DicTraverseSession::getDictFlags() const {
|
int DicTraverseSession::getDictFlags() const {
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryInfo;
|
||||||
class Dictionary;
|
class Dictionary;
|
||||||
class ProximityInfo;
|
class ProximityInfo;
|
||||||
class SuggestOptions;
|
class SuggestOptions;
|
||||||
|
@ -56,7 +57,7 @@ class DicTraverseSession {
|
||||||
void resetCache(const int nextActiveCacheSize, const int maxWords);
|
void resetCache(const int nextActiveCacheSize, const int maxWords);
|
||||||
|
|
||||||
// TODO: Remove
|
// TODO: Remove
|
||||||
const uint8_t *getOffsetDict() const;
|
const BinaryDictionaryInfo *getBinaryDictionaryInfo() const;
|
||||||
int getDictFlags() const;
|
int getDictFlags() const;
|
||||||
|
|
||||||
//--------------------
|
//--------------------
|
||||||
|
|
|
@ -105,8 +105,8 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo
|
||||||
traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(), MAX_RESULTS);
|
traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(), MAX_RESULTS);
|
||||||
// Create a new dic node here
|
// Create a new dic node here
|
||||||
DicNode rootNode;
|
DicNode rootNode;
|
||||||
DicNodeUtils::initAsRoot(traverseSession->getDicRootPos(),
|
DicNodeUtils::initAsRoot(traverseSession->getBinaryDictionaryInfo(),
|
||||||
traverseSession->getOffsetDict(), traverseSession->getPrevWordPos(), &rootNode);
|
traverseSession->getPrevWordPos(), &rootNode);
|
||||||
traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
|
traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -158,7 +158,7 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
|
||||||
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
||||||
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
||||||
+ doubleLetterCost;
|
+ doubleLetterCost;
|
||||||
const TerminalAttributes terminalAttributes(traverseSession->getOffsetDict(),
|
const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(),
|
||||||
terminalDicNode->getFlags(), terminalDicNode->getAttributesPos());
|
terminalDicNode->getFlags(), terminalDicNode->getAttributesPos());
|
||||||
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
|
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
|
||||||
const bool isExactMatch = terminalDicNode->isExactMatch();
|
const bool isExactMatch = terminalDicNode->isExactMatch();
|
||||||
|
@ -284,7 +284,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
DicNodeUtils::getAllChildDicNodes(
|
DicNodeUtils::getAllChildDicNodes(
|
||||||
&dicNode, traverseSession->getOffsetDict(), &childDicNodes);
|
&dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes);
|
||||||
|
|
||||||
const int childDicNodesSize = childDicNodes.getSizeAndLock();
|
const int childDicNodesSize = childDicNodes.getSizeAndLock();
|
||||||
for (int i = 0; i < childDicNodesSize; ++i) {
|
for (int i = 0; i < childDicNodesSize; ++i) {
|
||||||
|
@ -431,7 +431,8 @@ void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession,
|
||||||
void Suggest::processDicNodeAsOmission(
|
void Suggest::processDicNodeAsOmission(
|
||||||
DicTraverseSession *traverseSession, DicNode *dicNode) const {
|
DicTraverseSession *traverseSession, DicNode *dicNode) const {
|
||||||
DicNodeVector childDicNodes;
|
DicNodeVector childDicNodes;
|
||||||
DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getOffsetDict(), &childDicNodes);
|
DicNodeUtils::getAllChildDicNodes(
|
||||||
|
dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes);
|
||||||
|
|
||||||
const int size = childDicNodes.getSizeAndLock();
|
const int size = childDicNodes.getSizeAndLock();
|
||||||
for (int i = 0; i < size; i++) {
|
for (int i = 0; i < size; i++) {
|
||||||
|
@ -456,7 +457,7 @@ void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession,
|
||||||
DicNode *dicNode) const {
|
DicNode *dicNode) const {
|
||||||
const int16_t pointIndex = dicNode->getInputIndex(0);
|
const int16_t pointIndex = dicNode->getInputIndex(0);
|
||||||
DicNodeVector childDicNodes;
|
DicNodeVector childDicNodes;
|
||||||
DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(),
|
DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(),
|
||||||
traverseSession->getProximityInfoState(0), pointIndex + 1, true, &childDicNodes);
|
traverseSession->getProximityInfoState(0), pointIndex + 1, true, &childDicNodes);
|
||||||
const int size = childDicNodes.getSizeAndLock();
|
const int size = childDicNodes.getSizeAndLock();
|
||||||
for (int i = 0; i < size; i++) {
|
for (int i = 0; i < size; i++) {
|
||||||
|
@ -474,14 +475,14 @@ void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession,
|
||||||
DicNode *dicNode) const {
|
DicNode *dicNode) const {
|
||||||
const int16_t pointIndex = dicNode->getInputIndex(0);
|
const int16_t pointIndex = dicNode->getInputIndex(0);
|
||||||
DicNodeVector childDicNodes1;
|
DicNodeVector childDicNodes1;
|
||||||
DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(),
|
DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(),
|
||||||
traverseSession->getProximityInfoState(0), pointIndex + 1, false, &childDicNodes1);
|
traverseSession->getProximityInfoState(0), pointIndex + 1, false, &childDicNodes1);
|
||||||
const int childSize1 = childDicNodes1.getSizeAndLock();
|
const int childSize1 = childDicNodes1.getSizeAndLock();
|
||||||
for (int i = 0; i < childSize1; i++) {
|
for (int i = 0; i < childSize1; i++) {
|
||||||
if (childDicNodes1[i]->hasChildren()) {
|
if (childDicNodes1[i]->hasChildren()) {
|
||||||
DicNodeVector childDicNodes2;
|
DicNodeVector childDicNodes2;
|
||||||
DicNodeUtils::getProximityChildDicNodes(
|
DicNodeUtils::getProximityChildDicNodes(
|
||||||
childDicNodes1[i], traverseSession->getOffsetDict(),
|
childDicNodes1[i], traverseSession->getBinaryDictionaryInfo(),
|
||||||
traverseSession->getProximityInfoState(0), pointIndex, false, &childDicNodes2);
|
traverseSession->getProximityInfoState(0), pointIndex, false, &childDicNodes2);
|
||||||
const int childSize2 = childDicNodes2.getSizeAndLock();
|
const int childSize2 = childDicNodes2.getSizeAndLock();
|
||||||
for (int j = 0; j < childSize2; j++) {
|
for (int j = 0; j < childSize2; j++) {
|
||||||
|
@ -521,8 +522,8 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
|
||||||
|
|
||||||
// Create a non-cached node here.
|
// Create a non-cached node here.
|
||||||
DicNode newDicNode;
|
DicNode newDicNode;
|
||||||
DicNodeUtils::initAsRootWithPreviousWord(traverseSession->getDicRootPos(),
|
DicNodeUtils::initAsRootWithPreviousWord(
|
||||||
traverseSession->getOffsetDict(), dicNode, &newDicNode);
|
traverseSession->getBinaryDictionaryInfo(), dicNode, &newDicNode);
|
||||||
const CorrectionType correctionType = spaceSubstitution ?
|
const CorrectionType correctionType = spaceSubstitution ?
|
||||||
CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION;
|
CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION;
|
||||||
Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode,
|
Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode,
|
||||||
|
|
|
@ -146,7 +146,7 @@ class TypingWeighting : public Weighting {
|
||||||
float getNewWordBigramCost(const DicTraverseSession *const traverseSession,
|
float getNewWordBigramCost(const DicTraverseSession *const traverseSession,
|
||||||
const DicNode *const dicNode,
|
const DicNode *const dicNode,
|
||||||
MultiBigramMap *const multiBigramMap) const {
|
MultiBigramMap *const multiBigramMap) const {
|
||||||
return DicNodeUtils::getBigramNodeImprobability(traverseSession->getOffsetDict(),
|
return DicNodeUtils::getBigramNodeImprobability(traverseSession->getBinaryDictionaryInfo(),
|
||||||
dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
|
dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "suggest/core/dictionary/char_utils.h"
|
#include "suggest/core/dictionary/char_utils.h"
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
|
@ -32,8 +33,9 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
// TODO: check the header
|
// TODO: check the header
|
||||||
UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags)
|
UnigramDictionary::UnigramDictionary(
|
||||||
: DICT_ROOT(streamStart), ROOT_POS(0),
|
const BinaryDictionaryInfo *const binaryDicitonaryInfo, const uint8_t dictFlags)
|
||||||
|
: mBinaryDicitonaryInfo(binaryDicitonaryInfo),
|
||||||
MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) {
|
MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) {
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
AKLOGI("UnigramDictionary - constructor");
|
AKLOGI("UnigramDictionary - constructor");
|
||||||
|
@ -315,9 +317,10 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
||||||
correction->setCorrectionParams(0, 0, 0,
|
correction->setCorrectionParams(0, 0, 0,
|
||||||
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
||||||
doAutoCompletion, maxErrors);
|
doAutoCompletion, maxErrors);
|
||||||
int rootPosition = ROOT_POS;
|
int rootPosition = mBinaryDicitonaryInfo->getRootPosition();
|
||||||
// Get the number of children of root, then increment the position
|
// Get the number of children of root, then increment the position
|
||||||
int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition);
|
int childCount = BinaryFormat::getGroupCountAndForwardPointer(
|
||||||
|
mBinaryDicitonaryInfo->getDictRoot(), &rootPosition);
|
||||||
int outputIndex = 0;
|
int outputIndex = 0;
|
||||||
|
|
||||||
correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0));
|
correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0));
|
||||||
|
@ -747,7 +750,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con
|
||||||
int newWord[MAX_WORD_LENGTH];
|
int newWord[MAX_WORD_LENGTH];
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
int maxFreq = -1;
|
int maxFreq = -1;
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot();
|
||||||
int stackChildCount[MAX_WORD_LENGTH];
|
int stackChildCount[MAX_WORD_LENGTH];
|
||||||
int stackInputIndex[MAX_WORD_LENGTH];
|
int stackInputIndex[MAX_WORD_LENGTH];
|
||||||
int stackSiblingPos[MAX_WORD_LENGTH];
|
int stackSiblingPos[MAX_WORD_LENGTH];
|
||||||
|
@ -806,7 +809,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
|
int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
|
||||||
const uint8_t *const root = DICT_ROOT;
|
const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot();
|
||||||
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
|
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
if (NOT_VALID_WORD == pos) {
|
if (NOT_VALID_WORD == pos) {
|
||||||
|
@ -823,7 +826,7 @@ int UnigramDictionary::getProbability(const int *const inWord, const int length)
|
||||||
if (hasMultipleChars) {
|
if (hasMultipleChars) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
} else {
|
} else {
|
||||||
BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
BinaryFormat::getCodePointAndForwardPointer(root, &pos);
|
||||||
}
|
}
|
||||||
const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
|
const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
|
||||||
return unigramProbability;
|
return unigramProbability;
|
||||||
|
@ -865,7 +868,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
|
// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
|
||||||
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
||||||
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
|
||||||
|
mBinaryDicitonaryInfo->getDictRoot(), &pos);
|
||||||
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
||||||
|
|
||||||
|
@ -876,7 +880,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// else if FLAG_IS_TERMINAL: the probability
|
// else if FLAG_IS_TERMINAL: the probability
|
||||||
// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
|
// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
|
||||||
// Note that you can't have a node that both is not a terminal and has no children.
|
// Note that you can't have a node that both is not a terminal and has no children.
|
||||||
int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
|
int c = BinaryFormat::getCodePointAndForwardPointer(
|
||||||
|
mBinaryDicitonaryInfo->getDictRoot(), &pos);
|
||||||
ASSERT(NOT_A_CODE_POINT != c);
|
ASSERT(NOT_A_CODE_POINT != c);
|
||||||
|
|
||||||
// We are going to loop through each character and make it look like it's a different
|
// We are going to loop through each character and make it look like it's a different
|
||||||
|
@ -890,8 +895,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// We prefetch the next char. If 'c' is the last char of this node, we will have
|
// We prefetch the next char. If 'c' is the last char of this node, we will have
|
||||||
// NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node
|
// NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node
|
||||||
// should behave as a terminal or not and whether we have children.
|
// should behave as a terminal or not and whether we have children.
|
||||||
const int nextc = hasMultipleChars
|
const int nextc = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer(
|
||||||
? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT;
|
mBinaryDicitonaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
|
||||||
const bool isLastChar = (NOT_A_CODE_POINT == nextc);
|
const bool isLastChar = (NOT_A_CODE_POINT == nextc);
|
||||||
// If there are more chars in this nodes, then this virtual node is not a terminal.
|
// If there are more chars in this nodes, then this virtual node is not a terminal.
|
||||||
// If we are on the last char, this virtual node is a terminal if this node is.
|
// If we are on the last char, this virtual node is a terminal if this node is.
|
||||||
|
@ -911,11 +916,11 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// We don't have to output other values because we return false, as in
|
// We don't have to output other values because we return false, as in
|
||||||
// "don't traverse children".
|
// "don't traverse children".
|
||||||
if (!isLastChar) {
|
if (!isLastChar) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
|
pos = BinaryFormat::skipOtherCharacters(mBinaryDicitonaryInfo->getDictRoot(), pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -928,11 +933,11 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
if (isTerminalNode) {
|
if (isTerminalNode) {
|
||||||
// The probability should be here, because we come here only if this is actually
|
// The probability should be here, because we come here only if this is actually
|
||||||
// a terminal node, and we are on its last char.
|
// a terminal node, and we are on its last char.
|
||||||
const int unigramProbability =
|
const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(
|
||||||
BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos);
|
mBinaryDicitonaryInfo->getDictRoot(), pos);
|
||||||
const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
|
const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
|
||||||
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
||||||
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
TerminalAttributes terminalAttributes(mBinaryDicitonaryInfo, flags, attributesPos);
|
||||||
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
|
||||||
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
|
||||||
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
|
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
|
||||||
|
@ -952,16 +957,16 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// remaining char in this group for there can't be any.
|
// remaining char in this group for there can't be any.
|
||||||
if (!hasChildren) {
|
if (!hasChildren) {
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Optimization: Prune out words that are too long compared to how much was typed.
|
// Optimization: Prune out words that are too long compared to how much was typed.
|
||||||
if (correction->needsToPrune()) {
|
if (correction->needsToPrune()) {
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
*nextSiblingPosition =
|
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
|
||||||
BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
|
||||||
if (DEBUG_DICT_FULL) {
|
if (DEBUG_DICT_FULL) {
|
||||||
AKLOGI("Traversing was pruned.");
|
AKLOGI("Traversing was pruned.");
|
||||||
}
|
}
|
||||||
|
@ -980,9 +985,12 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// Once this is read, we still need to output the number of nodes in the immediate children of
|
// Once this is read, we still need to output the number of nodes in the immediate children of
|
||||||
// this node, so we read and output it before returning true, as in "please traverse children".
|
// this node, so we read and output it before returning true, as in "please traverse children".
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
pos = BinaryFormat::skipProbability(flags, pos);
|
||||||
int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
|
int childrenPos = BinaryFormat::readChildrenPosition(
|
||||||
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
|
mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
|
||||||
*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
|
*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
|
||||||
|
mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
|
||||||
|
*newCount = BinaryFormat::getGroupCountAndForwardPointer(
|
||||||
|
mBinaryDicitonaryInfo->getDictRoot(), &childrenPos);
|
||||||
*newChildrenPosition = childrenPos;
|
*newChildrenPosition = childrenPos;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryInfo;
|
||||||
class Correction;
|
class Correction;
|
||||||
class ProximityInfo;
|
class ProximityInfo;
|
||||||
class TerminalAttributes;
|
class TerminalAttributes;
|
||||||
|
@ -39,7 +40,10 @@ class UnigramDictionary {
|
||||||
static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0;
|
static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0;
|
||||||
static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
|
static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
|
||||||
static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
|
static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
|
||||||
UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags);
|
|
||||||
|
UnigramDictionary(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||||
|
const uint8_t dictFlags);
|
||||||
|
virtual ~UnigramDictionary();
|
||||||
int getProbability(const int *const inWord, const int length) const;
|
int getProbability(const int *const inWord, const int length) const;
|
||||||
int getBigramPosition(int pos, int *word, int offset, int length) const;
|
int getBigramPosition(int pos, int *word, int offset, int length) const;
|
||||||
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
|
@ -48,7 +52,6 @@ class UnigramDictionary {
|
||||||
const bool useFullEditDistance, int *outWords, int *frequencies,
|
const bool useFullEditDistance, int *outWords, int *frequencies,
|
||||||
int *outputTypes) const;
|
int *outputTypes) const;
|
||||||
int getDictFlags() const { return DICT_FLAGS; }
|
int getDictFlags() const { return DICT_FLAGS; }
|
||||||
virtual ~UnigramDictionary();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(UnigramDictionary);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(UnigramDictionary);
|
||||||
|
@ -108,8 +111,7 @@ class UnigramDictionary {
|
||||||
const int outputWordLength, int *freqArray, int *wordLengthArray,
|
const int outputWordLength, int *freqArray, int *wordLengthArray,
|
||||||
int *outputWord) const;
|
int *outputWord) const;
|
||||||
|
|
||||||
const uint8_t *const DICT_ROOT;
|
const BinaryDictionaryInfo *const mBinaryDicitonaryInfo;
|
||||||
const int ROOT_POS;
|
|
||||||
const int MAX_DIGRAPH_SEARCH_DEPTH;
|
const int MAX_DIGRAPH_SEARCH_DEPTH;
|
||||||
const int DICT_FLAGS;
|
const int DICT_FLAGS;
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue