am bde25a4a: am 38c26dd0: Move dicnode to AOSP

* commit 'bde25a4a6334f4120ec696c693ab9bf12ee979e4':
  Move dicnode to AOSP
This commit is contained in:
Satoshi Kataoka 2013-04-03 19:27:18 -07:00 committed by Android Git Automerger
commit fdf3c49c8b
15 changed files with 2308 additions and 1 deletions

View file

@ -26,7 +26,10 @@ include $(CLEAR_VARS)
LATIN_IME_SRC_DIR := src
LATIN_IME_SRC_FULLPATH_DIR := $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR)
LOCAL_C_INCLUDES += $(LATIN_IME_SRC_FULLPATH_DIR) $(LATIN_IME_SRC_FULLPATH_DIR)/suggest
LOCAL_C_INCLUDES += \
$(LATIN_IME_SRC_FULLPATH_DIR) \
$(LATIN_IME_SRC_FULLPATH_DIR)/suggest \
$(LATIN_IME_SRC_FULLPATH_DIR)/suggest/core/dicnode
LOCAL_CFLAGS += -Werror -Wall -Wextra -Weffc++ -Wformat=2 -Wcast-qual -Wcast-align \
-Wwrite-strings -Wfloat-equal -Wpointer-arith -Winit-self -Wredundant-decls -Wno-system-headers
@ -59,6 +62,8 @@ LATIN_IME_CORE_SRC_FILES := \
proximity_info_state_utils.cpp \
unigram_dictionary.cpp \
words_priority_queue.cpp \
suggest/core/dicnode/dic_node.cpp \
suggest/core/dicnode/dic_node_utils.cpp \
suggest/gesture_suggest.cpp \
suggest/typing_suggest.cpp

View file

@ -0,0 +1,44 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dic_node.h"
namespace latinime {
// Copy constructor.
// Copies the properties, the state and the two bookkeeping flags from dicNode. The release
// listener is not copied: the new node starts with a null listener, whereas operator= also
// copies the listener pointer.
DicNode::DicNode(const DicNode &dicNode)
:
#if DEBUG_DICT
// The profiler is only copied in DEBUG_DICT builds.
mProfiler(dicNode.mProfiler),
#endif
mDicNodeProperties(dicNode.mDicNodeProperties), mDicNodeState(dicNode.mDicNodeState),
mIsCachedForNextSuggestion(dicNode.mIsCachedForNextSuggestion), mIsUsed(dicNode.mIsUsed),
mReleaseListener(0) {
/* empty */
}
// Assignment: takes over every member of dicNode, including its release listener pointer.
DicNode &DicNode::operator=(const DicNode &dicNode) {
    // Bookkeeping: listener pointer and usage flags.
    mReleaseListener = dicNode.mReleaseListener;
    mIsUsed = dicNode.mIsUsed;
    mIsCachedForNextSuggestion = dicNode.mIsCachedForNextSuggestion;
    // Traversal state and trie properties.
    mDicNodeState = dicNode.mDicNodeState;
    mDicNodeProperties = dicNode.mDicNodeProperties;
#if DEBUG_DICT
    // Profiling counters are only copied in debug builds.
    mProfiler = dicNode.mProfiler;
#endif
    return *this;
}
} // namespace latinime

View file

@ -0,0 +1,572 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_H
#define LATINIME_DIC_NODE_H
#include "char_utils.h"
#include "defines.h"
#include "dic_node_state.h"
#include "dic_node_profiler.h"
#include "dic_node_properties.h"
#include "dic_node_release_listener.h"
// Debug-only logging helpers. Both macros call DicNode member functions without an object
// (and DUMP_WORD_AND_SCORE reads mDicNodeState directly), so they can only be expanded inside
// a DicNode member function. When DEBUG_DICT is off they expand to nothing.
#if DEBUG_DICT
// Logs the calling function, the node code point, inputSize, the summed input index, input
// index 0, the normalized compound distance and the output word. The macro takes no arguments:
// the expansion site must have a variable named inputSize in scope.
#define LOGI_SHOW_ADD_COST_PROP \
do { char charBuf[50]; \
INTS_TO_CHARS(getOutputWordBuf(), getDepth(), charBuf); \
AKLOGI("%20s, \"%c\", size = %03d, total = %03d, index(0) = %02d, dist = %.4f, %s,,", \
__FUNCTION__, getNodeCodePoint(), inputSize, getTotalInputIndex(), \
getInputIndex(0), getNormalizedCompoundDistance(), charBuf); } while (0)
// Logs header followed by the spatial distance, language distance, normalized compound
// distance, raw length, the previous word buffer, the current output word and input index 0.
#define DUMP_WORD_AND_SCORE(header) \
do { char charBuf[50]; char prevWordCharBuf[50]; \
INTS_TO_CHARS(getOutputWordBuf(), getDepth(), charBuf); \
INTS_TO_CHARS(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, \
mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), prevWordCharBuf); \
AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %s, %d,,", header, \
getSpatialDistanceForScoring(), getLanguageDistanceForScoring(), \
getNormalizedCompoundDistance(), getRawLength(), prevWordCharBuf, charBuf, \
getInputIndex(0)); \
} while (0)
#else
// Non-debug builds: both helpers compile away.
#define LOGI_SHOW_ADD_COST_PROP
#define DUMP_WORD_AND_SCORE(header)
#endif
namespace latinime {
// Naming convention
// - Distance: "Weighted" edit distance -- used both for spatial and language.
// - Compound Distance: Spatial Distance + Language Distance -- used for pruning and scoring
// - Cost: delta/diff for Distance -- used both for spatial and language
// - Length: "Non-weighted" -- used only for spatial
// - Probability: "Non-weighted" -- used only for language
// This struct is purely a bucket to return values. No instances of this struct should be kept.
// Value bucket consumed by DicNode::updateInputIndexG().
struct DicNode_InputStateG {
// Not read by the DicNode code in this header; presumably checked by the caller before the
// values below are applied -- TODO confirm.
bool mNeedsToUpdateInputStateG;
// Pointer id passed to DicNodeStateInput::updateInputIndexG().
int mPointerId;
// Input index passed to DicNodeStateInput::updateInputIndexG().
int16_t mInputIndex;
// Previous code point passed to DicNodeStateInput::updateInputIndexG().
int mPrevCodePoint;
// Terminal diff cost passed to DicNodeStateInput::updateInputIndexG().
float mTerminalDiffCost;
// Raw length; passed to updateInputIndexG() and also added to the scoring state's raw length.
float mRawLength;
// Double letter level written into the scoring state.
DoubleLetterLevel mDoubleLetterLevel;
};
class DicNode {
// Caveat: We define Weighting as a friend class of DicNode to let Weighting change
// the distance of DicNode.
// Caution!!! In general, we avoid using the "friend" access modifier.
// This is an exception to explicitly hide DicNode::addCost() from all classes but Weighting.
friend class Weighting;
public:
#if DEBUG_DICT
DicNodeProfiler mProfiler;
#endif
//////////////////
// Memory utils //
//////////////////
// Releases a node without freeing any memory: this only calls node->remove(), which marks the
// node as unused and notifies its release listener, if one is set.
// node is dereferenced unconditionally and therefore must not be null.
AK_FORCE_INLINE static void managedDelete(DicNode *node) {
node->remove();
}
// end
/////////////////
AK_FORCE_INLINE DicNode()
:
#if DEBUG_DICT
mProfiler(),
#endif
mDicNodeProperties(), mDicNodeState(), mIsCachedForNextSuggestion(false),
mIsUsed(false), mReleaseListener(0) {}
DicNode(const DicNode &dicNode);
DicNode &operator=(const DicNode &dicNode);
virtual ~DicNode() {}
// TODO: minimize arguments by looking binary_format
// Init for copy
void initByCopy(const DicNode *dicNode) {
mIsUsed = true;
mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
mDicNodeProperties.init(&dicNode->mDicNodeProperties);
mDicNodeState.init(&dicNode->mDicNodeState);
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
}
// TODO: minimize arguments by looking binary_format
// Init for root with prevWordNodePos which is used for bigram
void initAsRoot(const int pos, const int childrenPos, const int childrenCount,
const int prevWordNodePos) {
mIsUsed = true;
mIsCachedForNextSuggestion = false;
mDicNodeProperties.init(
pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0);
mDicNodeState.init(prevWordNodePos);
PROF_NODE_RESET(mProfiler);
}
void initAsPassingChild(DicNode *parentNode) {
mIsUsed = true;
mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion;
const int c = parentNode->getNodeTypedCodePoint();
mDicNodeProperties.init(&parentNode->mDicNodeProperties, c);
mDicNodeState.init(&parentNode->mDicNodeState);
PROF_NODE_COPY(&parentNode->mProfiler, mProfiler);
}
// TODO: minimize arguments by looking binary_format
// Init for root with previous word
// dicNode is the node of the word that has just been completed; this node becomes the root
// for the next word and inherits dicNode's input and scoring state.
void initAsRootWithPreviousWord(DicNode *dicNode, const int pos, const int childrenPos,
const int childrenCount) {
mIsUsed = true;
mIsCachedForNextSuggestion = false;
// Same property values as initAsRoot(): only pos, childrenPos and childrenCount are set,
// hasChildren is true, depth and leaving depth are 0, everything else is 0/false.
mDicNodeProperties.init(
pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0);
// TODO: Move to dicNodeState?
mDicNodeState.mDicNodeStateOutput.init(); // reset for next word
mDicNodeState.mDicNodeStateInput.init(
&dicNode->mDicNodeState.mDicNodeStateInput, true /* resetTerminalDiffCost */);
mDicNodeState.mDicNodeStateScoring.init(
&dicNode->mDicNodeState.mDicNodeStateScoring);
// The previous-word state is built from dicNode: its previous-word count plus one, its
// probability and position, its previous-word buffer, and its own output word and depth.
// The last argument reads THIS node's input index, so this call must stay after the
// mDicNodeStateInput.init() call above.
mDicNodeState.mDicNodeStatePrevWord.init(
dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1,
dicNode->mDicNodeProperties.getProbability(),
dicNode->mDicNodeProperties.getPos(),
dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(),
dicNode->getOutputWordBuf(),
dicNode->mDicNodeProperties.getDepth(),
dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevSpacePositions,
mDicNodeState.mDicNodeStateInput.getInputIndex(0) /* lastInputIndex */);
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
}
// TODO: minimize arguments by looking binary_format
void initAsChild(DicNode *dicNode, const int pos, const uint8_t flags, const int childrenPos,
const int attributesPos, const int siblingPos, const int nodeCodePoint,
const int childrenCount, const int probability, const int bigramProbability,
const bool isTerminal, const bool hasMultipleChars, const bool hasChildren,
const uint16_t additionalSubwordLength, const int *additionalSubword) {
mIsUsed = true;
uint16_t newDepth = static_cast<uint16_t>(dicNode->getDepth() + 1);
mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
const uint16_t newLeavingDepth = static_cast<uint16_t>(
dicNode->mDicNodeProperties.getLeavingDepth() + additionalSubwordLength);
mDicNodeProperties.init(pos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint,
childrenCount, probability, bigramProbability, isTerminal, hasMultipleChars,
hasChildren, newDepth, newLeavingDepth);
mDicNodeState.init(&dicNode->mDicNodeState, additionalSubwordLength, additionalSubword);
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
}
// Marks this node as unused and, when a release listener is registered, reports the release.
AK_FORCE_INLINE void remove() {
    // The flag is cleared first, so the listener already sees isUsed() == false.
    mIsUsed = false;
    if (!mReleaseListener) {
        return;
    }
    mReleaseListener->onReleased(this);
}
bool isUsed() const {
return mIsUsed;
}
bool isRoot() const {
return getDepth() == 0;
}
bool hasChildren() const {
return mDicNodeProperties.hasChildren();
}
bool isLeavingNode() const {
ASSERT(getDepth() <= getLeavingDepth());
return getDepth() == getLeavingDepth();
}
AK_FORCE_INLINE bool isFirstLetter() const {
return getDepth() == 1;
}
bool isCached() const {
return mIsCachedForNextSuggestion;
}
void setCached() {
mIsCachedForNextSuggestion = true;
}
// Used to expand the node in DicNodeUtils
int getNodeTypedCodePoint() const {
return mDicNodeState.mDicNodeStateOutput.getCodePointAt(getDepth());
}
// Returns true when this node cannot be used as the second word of a bigram: either its
// probability is 0, or both the previous word length computed below and the current depth
// are exactly 1.
bool isImpossibleBigramWord() const {
    if (mDicNodeProperties.getProbability() == 0) {
        return true;
    }
    // Previous-word buffer length minus the previous word's start offset minus one.
    const int lastPrevWordLength = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()
            - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1;
    if (lastPrevWordLength != 1) {
        return false;
    }
    return getDepth() == 1;
}
bool isCapitalized() const {
const int c = getOutputWordBuf()[0];
return isAsciiUpper(c);
}
bool isFirstWord() const {
return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_VALID_WORD;
}
bool isCompletion(const int inputSize) const {
return mDicNodeState.mDicNodeStateInput.getInputIndex(0) >= inputSize;
}
bool canDoLookAheadCorrection(const int inputSize) const {
return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
}
// Used to get bigram probability in DicNodeUtils
int getPos() const {
return mDicNodeProperties.getPos();
}
// Used to get bigram probability in DicNodeUtils
int getPrevWordPos() const {
return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos();
}
// Used in DicNodeUtils
int getChildrenPos() const {
return mDicNodeProperties.getChildrenPos();
}
// Used in DicNodeUtils
int getChildrenCount() const {
return mDicNodeProperties.getChildrenCount();
}
// Used in DicNodeUtils
int getProbability() const {
return mDicNodeProperties.getProbability();
}
// True when the node is flagged terminal, is not the root (depth > 0) and its depth equals
// its leaving depth.
AK_FORCE_INLINE bool isTerminalWordNode() const {
    if (!mDicNodeProperties.isTerminal()) {
        return false;
    }
    const int depth = getDepth();
    if (depth <= 0) {
        return false;
    }
    const int leavingDepth = mDicNodeProperties.getLeavingDepth();
    return depth == leavingDepth;
}
bool shouldBeFilterdBySafetyNetForBigram() const {
const uint16_t currentDepth = getDepth();
const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()
- mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1;
return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1));
}
uint16_t getLeavingDepth() const {
return mDicNodeProperties.getLeavingDepth();
}
// True when the previous-word length plus the current depth is greater than
// MAX_WORD_LENGTH - 3.
bool isTotalInputSizeExceedingLimit() const {
    const int depthOfCurrentWord = getDepth();
    const int lengthOfPrevWords = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
    const int combinedLength = lengthOfPrevWords + depthOfCurrentWord;
    // TODO: 3 can be 2? Needs to be investigated.
    // TODO: Have a const variable for 3 (or 2)
    return combinedLength > MAX_WORD_LENGTH - 3;
}
// TODO: This may be defective. Needs to be revised.
// Drops the first inputCommitPoint input positions from this node, together with the matching
// prefix of its previous-word buffer. Returns false, leaving the node untouched, when this
// node's previous-word prefix does not match topNode's; returns true after truncating.
bool truncateNode(const DicNode *const topNode, const int inputCommitPoint) {
// NOTE(review): despite the name, this is the previous-word length of THIS node, not of
// topNode.
const int prevWordLenOfTop = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
// Falls back to inputCommitPoint when the loop below does not find a start index.
int newPrevWordStartIndex = inputCommitPoint;
int charCount = 0;
// Find new word start index
// Spaces and single quotes are not counted: the new start is the buffer index of the
// character that has exactly inputCommitPoint counted characters before it.
for (int i = 0; i < prevWordLenOfTop; ++i) {
const int c = mDicNodeState.mDicNodeStatePrevWord.getPrevWordCodePointAt(i);
// TODO: Check other separators.
if (c != KEYCODE_SPACE && c != KEYCODE_SINGLE_QUOTE) {
if (charCount == inputCommitPoint) {
newPrevWordStartIndex = i;
break;
}
++charCount;
}
}
// NOTE(review): newPrevWordStartIndex - 1 is -1 when the start index is 0; how startsWith()
// treats that value is not visible here -- verify.
if (!mDicNodeState.mDicNodeStatePrevWord.startsWith(
&topNode->mDicNodeState.mDicNodeStatePrevWord, newPrevWordStartIndex - 1)) {
// Node mismatch.
return false;
}
mDicNodeState.mDicNodeStateInput.truncate(inputCommitPoint);
mDicNodeState.mDicNodeStatePrevWord.truncate(newPrevWordStartIndex);
return true;
}
void outputResult(int *dest) const {
const uint16_t prevWordLength = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength();
const uint16_t currentDepth = getDepth();
DicNodeUtils::appendTwoWords(mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
prevWordLength, getOutputWordBuf(), currentDepth, dest);
DUMP_WORD_AND_SCORE("OUTPUT");
}
void outputSpacePositionsResult(int *spaceIndices) const {
mDicNodeState.mDicNodeStatePrevWord.outputSpacePositions(spaceIndices);
}
bool hasMultipleWords() const {
return mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() > 0;
}
float getProximityCorrectionCount() const {
return static_cast<float>(mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount());
}
float getEditCorrectionCount() const {
return static_cast<float>(mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount());
}
// Used to prune nodes
float getNormalizedCompoundDistance() const {
return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistance();
}
// Used to prune nodes
float getNormalizedSpatialDistance() const {
return mDicNodeState.mDicNodeStateScoring.getSpatialDistance()
/ static_cast<float>(getInputIndex(0) + 1);
}
// Used to prune nodes
float getCompoundDistance() const {
return mDicNodeState.mDicNodeStateScoring.getCompoundDistance();
}
// Used to prune nodes
float getCompoundDistance(const float languageWeight) const {
return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight);
}
// Note that "cost" means delta for "distance" that is weighted.
float getTotalPrevWordsLanguageCost() const {
return mDicNodeState.mDicNodeStateScoring.getTotalPrevWordsLanguageCost();
}
// Used to commit input partially
int getPrevWordNodePos() const {
return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos();
}
AK_FORCE_INLINE const int *getOutputWordBuf() const {
return mDicNodeState.mDicNodeStateOutput.mWordBuf;
}
int getPrevCodePointG(int pointerId) const {
return mDicNodeState.mDicNodeStateInput.getPrevCodePoint(pointerId);
}
// Whether the current codepoint can be an intentional omission, in which case the traversal
// algorithm will always check for a possible omission here.
bool canBeIntentionalOmission() const {
return isIntentionalOmissionCodePoint(getNodeCodePoint());
}
// Whether the omission is so frequent that it should incur zero cost.
bool isZeroCostOmission() const {
// TODO: do not hardcode and read from header
return (getNodeCodePoint() == KEYCODE_SINGLE_QUOTE);
}
// TODO: remove
float getTerminalDiffCostG(int path) const {
return mDicNodeState.mDicNodeStateInput.getTerminalDiffCost(path);
}
//////////////////////
// Temporary getter //
// TODO: Remove //
//////////////////////
// TODO: Remove once touch path is merged into ProximityInfoState
int getNodeCodePoint() const {
return mDicNodeProperties.getNodeCodePoint();
}
////////////////////////////////
// Utils for cost calculation //
////////////////////////////////
AK_FORCE_INLINE bool isSameNodeCodePoint(const DicNode *const dicNode) const {
return mDicNodeProperties.getNodeCodePoint()
== dicNode->mDicNodeProperties.getNodeCodePoint();
}
// TODO: remove
// TODO: rename getNextInputIndex
int16_t getInputIndex(int pointerId) const {
return mDicNodeState.mDicNodeStateInput.getInputIndex(pointerId);
}
////////////////////////////////////
// Getter of features for scoring //
////////////////////////////////////
float getSpatialDistanceForScoring() const {
return mDicNodeState.mDicNodeStateScoring.getSpatialDistance();
}
float getLanguageDistanceForScoring() const {
return mDicNodeState.mDicNodeStateScoring.getLanguageDistance();
}
float getLanguageDistanceRatePerWordForScoring() const {
const float langDist = getLanguageDistanceForScoring();
const float totalWordCount =
static_cast<float>(mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1);
return langDist / totalWordCount;
}
float getRawLength() const {
return mDicNodeState.mDicNodeStateScoring.getRawLength();
}
bool isLessThanOneErrorForScoring() const {
return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount()
+ mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount() <= 1;
}
DoubleLetterLevel getDoubleLetterLevel() const {
return mDicNodeState.mDicNodeStateScoring.getDoubleLetterLevel();
}
void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) {
mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
}
uint8_t getFlags() const {
return mDicNodeProperties.getFlags();
}
int getAttributesPos() const {
return mDicNodeProperties.getAttributesPos();
}
inline uint16_t getDepth() const {
return mDicNodeProperties.getDepth();
}
AK_FORCE_INLINE void dump(const char *tag) const {
#if DEBUG_DICT
DUMP_WORD_AND_SCORE(tag);
#if DEBUG_DUMP_ERROR
mProfiler.dump();
#endif
#endif
}
void setReleaseListener(DicNodeReleaseListener *releaseListener) {
mReleaseListener = releaseListener;
}
// Ordering predicate used by DicNodePriorityQueue through compareDicNode().
// The result is decided by the first rule that applies:
//   1. unused nodes: two unused nodes are ordered by address; otherwise an unused "this"
//      yields true and an unused "right" yields false;
//   2. normalized compound distance: true when this node's distance is smaller than right's
//      by more than MIN_DIFF, false when it is larger by more than MIN_DIFF;
//   3. depth: true when this node is shallower than right;
//   4. output word: at the first differing code point, true when this node's is smaller;
//   5. address, so that the result is still deterministic for otherwise equal nodes.
// right must not be null.
AK_FORCE_INLINE bool compare(const DicNode *right) {
if (!isUsed() && !right->isUsed()) {
// Compare pointer values here for stable comparison
return this > right;
}
if (!isUsed()) {
return true;
}
if (!right->isUsed()) {
return false;
}
// Positive diff means right is farther away than this node.
const float diff =
right->getNormalizedCompoundDistance() - getNormalizedCompoundDistance();
// Distances closer than this are treated as equal and fall through to the tie-breakers.
static const float MIN_DIFF = 0.000001f;
if (diff > MIN_DIFF) {
return true;
} else if (diff < -MIN_DIFF) {
return false;
}
const int depth = getDepth();
const int depthDiff = right->getDepth() - depth;
if (depthDiff != 0) {
return depthDiff > 0;
}
// Depths are equal here, so both words have exactly depth code points to compare.
for (int i = 0; i < depth; ++i) {
const int codePoint = mDicNodeState.mDicNodeStateOutput.getCodePointAt(i);
const int rightCodePoint = right->mDicNodeState.mDicNodeStateOutput.getCodePointAt(i);
if (codePoint != rightCodePoint) {
return rightCodePoint > codePoint;
}
}
// Compare pointer values here for stable comparison
return this > right;
}
private:
DicNodeProperties mDicNodeProperties;
DicNodeState mDicNodeState;
// TODO: Remove
bool mIsCachedForNextSuggestion;
bool mIsUsed;
DicNodeReleaseListener *mReleaseListener;
// Sum of the input indices of all MAX_POINTER_COUNT_G pointers.
AK_FORCE_INLINE int getTotalInputIndex() const {
    int total = 0;
    int pointerId = 0;
    while (pointerId < MAX_POINTER_COUNT_G) {
        total += mDicNodeState.mDicNodeStateInput.getInputIndex(pointerId);
        ++pointerId;
    }
    return total;
}
// Caveat: Must not be called outside Weighting
// This restriction is guaranteed by "friend"
AK_FORCE_INLINE void addCost(const float spatialCost, const float languageCost,
const bool doNormalization, const int inputSize, const bool isEditCorrection,
const bool isProximityCorrection) {
if (DEBUG_GEO_FULL) {
LOGI_SHOW_ADD_COST_PROP;
}
mDicNodeState.mDicNodeStateScoring.addCost(spatialCost, languageCost, doNormalization,
inputSize, getTotalInputIndex(), isEditCorrection, isProximityCorrection);
}
// Caveat: Must not be called outside Weighting
// This restriction is guaranteed by "friend"
// Passes (pointerId, count) on to DicNodeStateInput::forwardInputIndex(); a count of 0 is a
// no-op. When overwritesPrevCodePointByNodeCodePoint is set, the stored previous code point
// is replaced by this node's code point.
AK_FORCE_INLINE void forwardInputIndex(const int pointerId, const int count,
const bool overwritesPrevCodePointByNodeCodePoint) {
if (count == 0) {
return;
}
mDicNodeState.mDicNodeStateInput.forwardInputIndex(pointerId, count);
if (overwritesPrevCodePointByNodeCodePoint) {
// NOTE(review): the previous code point is always written for pointer 0, not for
// pointerId -- confirm that this is intended when more than one pointer is in use.
mDicNodeState.mDicNodeStateInput.setPrevCodePoint(0, getNodeCodePoint());
}
}
AK_FORCE_INLINE void updateInputIndexG(DicNode_InputStateG *inputStateG) {
mDicNodeState.mDicNodeStateInput.updateInputIndexG(inputStateG->mPointerId,
inputStateG->mInputIndex, inputStateG->mPrevCodePoint,
inputStateG->mTerminalDiffCost, inputStateG->mRawLength);
mDicNodeState.mDicNodeStateScoring.addRawLength(inputStateG->mRawLength);
mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(inputStateG->mDoubleLetterLevel);
}
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_H

View file

@ -0,0 +1,213 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_PRIORITY_QUEUE_H
#define LATINIME_DIC_NODE_PRIORITY_QUEUE_H
#include <queue>
#include <vector>
#include "defines.h"
#include "dic_node.h"
#include "dic_node_release_listener.h"
#define MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY 200
namespace latinime {
class DicNodePriorityQueue : public DicNodeReleaseListener {
public:
// Creates the node pool and the free-list table, both with MAX_CAPACITY + 1 entries, and
// then resets the queue to its full capacity.
// NOTE(review): the extra entry looks like the spare node that copyPush() takes from the
// pool before a full queue evicts its top node -- inferred from copyPush() and
// pushPoolNodeWithMaxSize(); confirm.
AK_FORCE_INLINE DicNodePriorityQueue()
: MAX_CAPACITY(MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY),
mMaxSize(MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY), mDicNodesBuf(), mUnusedNodeIndices(),
mNextUnusedNodeId(0), mDicNodesQueue() {
mDicNodesBuf.resize(MAX_CAPACITY + 1);
mUnusedNodeIndices.resize(MAX_CAPACITY + 1);
// reset() releases every pool node, registers this queue as its release listener and
// builds the free list.
reset();
}
// Non virtual inline destructor -- never inherit this class
AK_FORCE_INLINE ~DicNodePriorityQueue() {}
int getSize() const {
return static_cast<int>(mDicNodesQueue.size());
}
int getMaxSize() const {
return mMaxSize;
}
AK_FORCE_INLINE void setMaxSize(const int maxSize) {
mMaxSize = min(maxSize, MAX_CAPACITY);
}
AK_FORCE_INLINE void reset() {
clearAndResize(MAX_CAPACITY);
}
AK_FORCE_INLINE void clear() {
clearAndResize(mMaxSize);
}
// Empties the queue, applies the new size limit and returns every pool node to the free list.
AK_FORCE_INLINE void clearAndResize(const int maxSize) {
    // Drop everything that is still queued.
    while (!mDicNodesQueue.empty()) {
        mDicNodesQueue.pop();
    }
    setMaxSize(maxSize);
    // Rebuild the free list as 0 -> 1 -> ... -> MAX_CAPACITY -> NOT_A_NODE_ID.
    const int lastNodeId = MAX_CAPACITY;
    for (int nodeId = 0; nodeId <= lastNodeId; ++nodeId) {
        mDicNodesBuf[nodeId].remove();
        mDicNodesBuf[nodeId].setReleaseListener(this);
        if (nodeId == lastNodeId) {
            mUnusedNodeIndices[nodeId] = NOT_A_NODE_ID;
        } else {
            mUnusedNodeIndices[nodeId] = nodeId + 1;
        }
    }
    mNextUnusedNodeId = 0;
}
// Takes a free node from the pool and initialises it as a copy of dicNode.
// Returns 0 when the pool has no free node.
AK_FORCE_INLINE DicNode *newDicNode(DicNode *dicNode) {
    DicNode *const poolNode = searchEmptyDicNode();
    if (!poolNode) {
        return 0;
    }
    DicNodeUtils::initByCopy(dicNode, poolNode);
    return poolNode;
}
// Copy
AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode) {
return copyPush(dicNode, mMaxSize);
}
// Removes the top node of the queue and releases it back to the pool. When dest is non-null
// the node is copied into dest first. Popping an empty queue asserts and does nothing.
AK_FORCE_INLINE void copyPop(DicNode *dest) {
    const bool hasNode = !mDicNodesQueue.empty();
    if (!hasNode) {
        ASSERT(false);
        return;
    }
    DicNode *const topNode = mDicNodesQueue.top();
    if (dest != 0) {
        DicNodeUtils::initByCopy(topNode, dest);
    }
    // Release the pool slot before dropping the pointer from the queue.
    topNode->remove();
    mDicNodesQueue.pop();
}
// Release callback, reached through DicNode::remove() for nodes whose release listener is
// this queue. Puts the node's pool slot back at the head of the free list.
// dicNode must point into mDicNodesBuf.
void onReleased(DicNode *dicNode) {
    const int index = static_cast<int>(dicNode - &mDicNodesBuf[0]);
    // Validate the slot before it is used to index the free-list table.
    ASSERT(index >= 0 && index < (MAX_CAPACITY + 1));
    // NOTE(review): the free node at the tail of the free list also stores NOT_A_NODE_ID, so
    // this test cannot tell it apart from a node that is in use.
    if (mUnusedNodeIndices[index] != NOT_A_NODE_ID) {
        // it's already released
        return;
    }
    mUnusedNodeIndices[index] = mNextUnusedNodeId;
    mNextUnusedNodeId = index;
}
AK_FORCE_INLINE void dump() const {
AKLOGI("\n\n\n\n\n===========================");
for (int i = 0; i < MAX_CAPACITY + 1; ++i) {
if (mDicNodesBuf[i].isUsed()) {
mDicNodesBuf[i].dump("QUEUE: ");
}
}
AKLOGI("===========================\n\n\n\n\n");
}
private:
DISALLOW_COPY_AND_ASSIGN(DicNodePriorityQueue);
static const int NOT_A_NODE_ID = -1;
AK_FORCE_INLINE static bool compareDicNode(DicNode *left, DicNode *right) {
return left->compare(right);
}
// Comparator type of the underlying std::priority_queue; forwards to DicNode::compare()
// through compareDicNode(). The call operator is const because it touches no state, which
// also lets it be invoked on a const comparator object.
struct DicNodeComparator {
    bool operator ()(DicNode *left, DicNode *right) const {
        return compareDicNode(left, right);
    }
};
typedef std::priority_queue<DicNode *, std::vector<DicNode *>, DicNodeComparator> DicNodesQueue;
// Upper bound for mMaxSize; the pool and the free-list table hold MAX_CAPACITY + 1 entries.
const int MAX_CAPACITY;
// Current size limit of the queue; setMaxSize() caps it at MAX_CAPACITY.
int mMaxSize;
// Pool of nodes; the queue only stores pointers into this vector.
std::vector<DicNode> mDicNodesBuf;
// Free-list link of each element of mDicNodesBuf respectively: the index of the next unused
// node, or NOT_A_NODE_ID for a node in use and for the last node of the free list.
std::vector<int> mUnusedNodeIndices;
// Index of the first unused node (head of the free list), or NOT_A_NODE_ID when none is left.
int mNextUnusedNodeId;
// Heap of pointers to the pool nodes currently queued; top() is treated as the worst node.
DicNodesQueue mDicNodesQueue;
inline bool isFull(const int maxSize) const {
return getSize() >= maxSize;
}
AK_FORCE_INLINE void pop() {
copyPop(0);
}
// Returns true when dicNode should replace the current top ("worst") node of the queue, or
// when there is no such node to compare against.
AK_FORCE_INLINE bool betterThanWorstDicNode(DicNode *dicNode) const {
    // top() must not be called on an empty std::priority_queue (undefined behaviour). The
    // queue can be empty here when the size limit is 0, because isFull() is then true for an
    // empty queue.
    if (mDicNodesQueue.empty()) {
        return true;
    }
    DicNode *worstNode = mDicNodesQueue.top();
    if (!worstNode) {
        return true;
    }
    return compareDicNode(dicNode, worstNode);
}
// Takes the node at the head of the free list, marks it as used and returns it.
// Returns 0 when MAX_CAPACITY is 0 or when the free list is empty; the latter case also logs
// the state of every pool node and asserts.
AK_FORCE_INLINE DicNode *searchEmptyDicNode() {
// TODO: Currently O(n) but should be improved to O(1)
// NOTE(review): the TODO above looks stale -- the success path below takes the head of the
// free list without scanning; only the failure path loops, and only to log.
if (MAX_CAPACITY == 0) {
return 0;
}
if (mNextUnusedNodeId == NOT_A_NODE_ID) {
AKLOGI("No unused node found.");
for (int i = 0; i < MAX_CAPACITY + 1; ++i) {
AKLOGI("Dump node availability, %d, %d, %d",
i, mDicNodesBuf[i].isUsed(), mUnusedNodeIndices[i]);
}
ASSERT(false);
return 0;
}
DicNode *dicNode = &mDicNodesBuf[mNextUnusedNodeId];
// Unlinks the node from the free list and advances the head to the next unused node.
markNodeAsUsed(dicNode);
return dicNode;
}
// Unlinks dicNode from the free list: the head moves on to the node's successor and the
// node's own link is set to NOT_A_NODE_ID. The caller passes the current head of the list.
// dicNode must point into mDicNodesBuf.
AK_FORCE_INLINE void markNodeAsUsed(DicNode *dicNode) {
    const int index = static_cast<int>(dicNode - &mDicNodesBuf[0]);
    // Validate the slot before it is used to index the free-list table.
    ASSERT(index >= 0 && index < (MAX_CAPACITY + 1));
    mNextUnusedNodeId = mUnusedNodeIndices[index];
    mUnusedNodeIndices[index] = NOT_A_NODE_ID;
}
// Queues a node that was taken from the pool.
// - a null node is ignored and 0 is returned;
// - when the queue has room, the node is pushed;
// - when the queue is full, the node replaces the top node if it is better than it,
//   otherwise the node is released and 0 is returned.
// Returns the node when it was queued.
AK_FORCE_INLINE DicNode *pushPoolNodeWithMaxSize(DicNode *dicNode, const int maxSize) {
    if (!dicNode) {
        return 0;
    }
    const bool queueIsFull = isFull(maxSize);
    if (queueIsFull && !betterThanWorstDicNode(dicNode)) {
        // No room and not good enough: give the pool slot back.
        dicNode->remove();
        return 0;
    }
    if (queueIsFull) {
        // Make room by evicting the current top node.
        pop();
    }
    mDicNodesQueue.push(dicNode);
    return dicNode;
}
// Copy
AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode, const int maxSize) {
return pushPoolNodeWithMaxSize(newDicNode(dicNode), maxSize);
}
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_PRIORITY_QUEUE_H

View file

@ -0,0 +1,181 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_PROFILER_H
#define LATINIME_DIC_NODE_PROFILER_H
#include "defines.h"
// Profiling hooks. With DEBUG_DICT enabled each macro forwards to the DicNodeProfiler method
// of the same name; otherwise every macro expands to nothing, so call sites need no #if.
// Both branches must define exactly the same set of macro names.
#if DEBUG_DICT
#define PROF_SPACE_SUBSTITUTION(profiler) profiler.profSpaceSubstitution()
#define PROF_SPACE_OMISSION(profiler) profiler.profSpaceOmission()
#define PROF_ADDITIONAL_PROXIMITY(profiler) profiler.profAdditionalProximity()
// Misspelled alias: it used to be the only name defined in non-debug builds, so it is kept
// in both branches for call sites that may already use it.
#define PROF_ADDITONAL_PROXIMITY(profiler) PROF_ADDITIONAL_PROXIMITY(profiler)
#define PROF_SUBSTITUTION(profiler) profiler.profSubstitution()
#define PROF_OMISSION(profiler) profiler.profOmission()
#define PROF_INSERTION(profiler) profiler.profInsertion()
#define PROF_MATCH(profiler) profiler.profMatch()
#define PROF_COMPLETION(profiler) profiler.profCompletion()
#define PROF_TRANSPOSITION(profiler) profiler.profTransposition()
#define PROF_NEARESTKEY(profiler) profiler.profNearestKey()
#define PROF_TERMINAL(profiler) profiler.profTerminal()
#define PROF_NEW_WORD(profiler) profiler.profNewWord()
#define PROF_NEW_WORD_BIGRAM(profiler) profiler.profNewWordBigram()
#define PROF_NODE_RESET(profiler) profiler.reset()
// src is a pointer to a DicNodeProfiler, dest is a DicNodeProfiler object.
#define PROF_NODE_COPY(src, dest) dest.copy(src)
#else
#define PROF_SPACE_SUBSTITUTION(profiler)
#define PROF_SPACE_OMISSION(profiler)
#define PROF_ADDITIONAL_PROXIMITY(profiler)
// Misspelled alias, see the DEBUG_DICT branch.
#define PROF_ADDITONAL_PROXIMITY(profiler)
#define PROF_SUBSTITUTION(profiler)
#define PROF_OMISSION(profiler)
#define PROF_INSERTION(profiler)
#define PROF_MATCH(profiler)
#define PROF_COMPLETION(profiler)
#define PROF_TRANSPOSITION(profiler)
#define PROF_NEARESTKEY(profiler)
#define PROF_TERMINAL(profiler)
#define PROF_NEW_WORD(profiler)
#define PROF_NEW_WORD_BIGRAM(profiler)
#define PROF_NODE_RESET(profiler)
#define PROF_NODE_COPY(src, dest)
#endif
namespace latinime {
class DicNodeProfiler {
public:
#if DEBUG_DICT
AK_FORCE_INLINE DicNodeProfiler()
: mProfOmission(0), mProfInsertion(0), mProfTransposition(0),
mProfAdditionalProximity(0), mProfSubstitution(0),
mProfSpaceSubstitution(0), mProfSpaceOmission(0),
mProfMatch(0), mProfCompletion(0), mProfTerminal(0),
mProfNearestKey(0), mProfNewWord(0), mProfNewWordBigram(0) {}
int mProfOmission;
int mProfInsertion;
int mProfTransposition;
int mProfAdditionalProximity;
int mProfSubstitution;
int mProfSpaceSubstitution;
int mProfSpaceOmission;
int mProfMatch;
int mProfCompletion;
int mProfTerminal;
int mProfNearestKey;
int mProfNewWord;
int mProfNewWordBigram;
void profSpaceSubstitution() {
++mProfSpaceSubstitution;
}
void profSpaceOmission() {
++mProfSpaceOmission;
}
void profAdditionalProximity() {
++mProfAdditionalProximity;
}
void profSubstitution() {
++mProfSubstitution;
}
void profOmission() {
++mProfOmission;
}
void profInsertion() {
++mProfInsertion;
}
void profMatch() {
++mProfMatch;
}
void profCompletion() {
++mProfCompletion;
}
void profTransposition() {
++mProfTransposition;
}
void profNearestKey() {
++mProfNearestKey;
}
void profTerminal() {
++mProfTerminal;
}
void profNewWord() {
++mProfNewWord;
}
void profNewWordBigram() {
++mProfNewWordBigram;
}
// Sets all thirteen profiling counters back to zero.
void reset() {
    // Edit operations.
    mProfOmission = mProfInsertion = mProfTransposition = mProfSubstitution = 0;
    // Proximity related corrections.
    mProfAdditionalProximity = mProfNearestKey = 0;
    // Space handling.
    mProfSpaceSubstitution = mProfSpaceOmission = 0;
    // Matching progress.
    mProfMatch = mProfCompletion = mProfTerminal = 0;
    // Multi-word traversal.
    mProfNewWord = mProfNewWordBigram = 0;
}
void copy(const DicNodeProfiler *const profiler) {
mProfSpaceSubstitution = profiler->mProfSpaceSubstitution;
mProfSpaceOmission = profiler->mProfSpaceOmission;
mProfAdditionalProximity = profiler->mProfAdditionalProximity;
mProfSubstitution = profiler->mProfSubstitution;
mProfOmission = profiler->mProfOmission;
mProfInsertion = profiler->mProfInsertion;
mProfMatch = profiler->mProfMatch;
mProfCompletion = profiler->mProfCompletion;
mProfTransposition = profiler->mProfTransposition;
mProfNearestKey = profiler->mProfNearestKey;
mProfTerminal = profiler->mProfTerminal;
mProfNewWord = profiler->mProfNewWord;
mProfNewWordBigram = profiler->mProfNewWordBigram;
}
void dump() const {
AKLOGI("O %d, I %d, T %d, AP %d, S %d, SS %d, SO %d, M %d, C %d, TE %d, NW = %d, NWB = %d",
mProfOmission, mProfInsertion, mProfTransposition, mProfAdditionalProximity,
mProfSubstitution, mProfSpaceSubstitution, mProfSpaceOmission, mProfMatch,
mProfCompletion, mProfTerminal, mProfNewWord, mProfNewWordBigram);
}
#else
DicNodeProfiler() {}
#endif
private:
// Caution!!!
// Use a default copy constructor and an assign operator because shallow copies are ok
// for this class
};
}
#endif // LATINIME_DIC_NODE_PROFILER_H

View file

@ -0,0 +1,173 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_PROPERTIES_H
#define LATINIME_DIC_NODE_PROPERTIES_H
#include <stdint.h>
#include "defines.h"
namespace latinime {
/**
 * Value holder describing one node met while traversing the lexicon trie: its positions,
 * flags, probabilities, code point and depth information.
 */
class DicNodeProperties {
 public:
    // Creates a zeroed / false-initialized set of properties.
    AK_FORCE_INLINE DicNodeProperties()
            : mPos(0), mFlags(0), mChildrenPos(0), mAttributesPos(0), mSiblingPos(0),
              mChildrenCount(0), mProbability(0), mBigramProbability(0), mNodeCodePoint(0),
              mDepth(0), mLeavingDepth(0), mIsTerminal(false), mHasMultipleChars(false),
              mHasChildren(false) {
    }

    virtual ~DicNodeProperties() {}

    // Sets every field from explicit values; |terminalDepth| becomes the leaving depth.
    // Should be called only once per DicNode is initialized.
    void init(const int pos, const uint8_t flags, const int childrenPos, const int attributesPos,
            const int siblingPos, const int nodeCodePoint, const int childrenCount,
            const int probability, const int bigramProbability, const bool isTerminal,
            const bool hasMultipleChars, const bool hasChildren, const uint16_t depth,
            const uint16_t terminalDepth) {
        // Positions and flags.
        mPos = pos;
        mFlags = flags;
        mChildrenPos = childrenPos;
        mAttributesPos = attributesPos;
        mSiblingPos = siblingPos;
        // Node contents.
        mNodeCodePoint = nodeCodePoint;
        mChildrenCount = childrenCount;
        mProbability = probability;
        mBigramProbability = bigramProbability;
        // Boolean attributes.
        mIsTerminal = isTerminal;
        mHasMultipleChars = hasMultipleChars;
        mHasChildren = hasChildren;
        // Depth bookkeeping.
        mDepth = depth;
        mLeavingDepth = terminalDepth;
    }

    // Init for copy: takes over every field of |nodeProp| unchanged.
    void init(const DicNodeProperties *const nodeProp) {
        const DicNodeProperties &src = *nodeProp;
        mPos = src.mPos;
        mFlags = src.mFlags;
        mChildrenPos = src.mChildrenPos;
        mAttributesPos = src.mAttributesPos;
        mSiblingPos = src.mSiblingPos;
        mNodeCodePoint = src.mNodeCodePoint;
        mChildrenCount = src.mChildrenCount;
        mProbability = src.mProbability;
        mBigramProbability = src.mBigramProbability;
        mIsTerminal = src.mIsTerminal;
        mHasMultipleChars = src.mHasMultipleChars;
        mHasChildren = src.mHasChildren;
        mDepth = src.mDepth;
        mLeavingDepth = src.mLeavingDepth;
    }

    // Init as passing child: same as the copy above, except that the node code point is
    // replaced by |codePoint| and the depth is one more than the source's depth.
    void init(const DicNodeProperties *const nodeProp, const int codePoint) {
        init(nodeProp);
        mNodeCodePoint = codePoint;
        mDepth = nodeProp->mDepth + 1;
    }

    int getPos() const { return mPos; }

    uint8_t getFlags() const { return mFlags; }

    int getChildrenPos() const { return mChildrenPos; }

    int getAttributesPos() const { return mAttributesPos; }

    int getChildrenCount() const { return mChildrenCount; }

    int getProbability() const { return mProbability; }

    int getNodeCodePoint() const { return mNodeCodePoint; }

    uint16_t getDepth() const { return mDepth; }

    // TODO: Move to output?
    uint16_t getLeavingDepth() const { return mLeavingDepth; }

    bool isTerminal() const { return mIsTerminal; }

    bool hasMultipleChars() const { return mHasMultipleChars; }

    // Derived from the children count and the two depths; the stored mHasChildren flag is
    // not consulted here.
    bool hasChildren() const {
        if (mChildrenCount > 0) {
            return true;
        }
        return mDepth != mLeavingDepth;
    }

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class

    // Not used
    int getSiblingPos() const { return mSiblingPos; }

    int mPos;
    uint8_t mFlags;
    int mChildrenPos;
    int mAttributesPos;
    int mSiblingPos;
    int mChildrenCount;
    int mProbability;
    int mBigramProbability; // not used for now
    int mNodeCodePoint;
    uint16_t mDepth;
    uint16_t mLeavingDepth;
    bool mIsTerminal;
    bool mHasMultipleChars;
    bool mHasChildren;
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_PROPERTIES_H

View file

@ -0,0 +1,33 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_RELEASE_LISTENER_H
#define LATINIME_DIC_NODE_RELEASE_LISTENER_H
#include "defines.h"
namespace latinime {
// Forward declaration: this header only refers to DicNode through a pointer, so an incomplete
// type is enough and the header no longer depends on DicNode being declared by whoever
// includes it.
class DicNode;

/**
 * Abstract callback interface with a single hook, onReleased(), that receives a DicNode.
 * Instances cannot be copied or assigned.
 */
class DicNodeReleaseListener {
 public:
    DicNodeReleaseListener() {}
    virtual ~DicNodeReleaseListener() {}
    // Called with the affected node; presumably fired when that DicNode is released -- the
    // call site is outside this header.
    virtual void onReleased(DicNode *dicNode) = 0;

 private:
    DISALLOW_COPY_AND_ASSIGN(DicNodeReleaseListener);
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_RELEASE_LISTENER_H

View file

@ -0,0 +1,71 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_STATE_H
#define LATINIME_DIC_NODE_STATE_H
#include "defines.h"
#include "dic_node_state_input.h"
#include "dic_node_state_output.h"
#include "dic_node_state_prevword.h"
#include "dic_node_state_scoring.h"
namespace latinime {
/**
 * Aggregates the four state parts carried by a DicNode: input, output, previous-word and
 * scoring state. The parts are public members and each init() overload simply forwards to
 * the matching init() of every part.
 */
class DicNodeState {
 public:
    DicNodeStateInput mDicNodeStateInput;
    DicNodeStateOutput mDicNodeStateOutput;
    DicNodeStatePrevWord mDicNodeStatePrevWord;
    DicNodeStateScoring mDicNodeStateScoring;

    AK_FORCE_INLINE DicNodeState()
            : mDicNodeStateInput(),
              mDicNodeStateOutput(),
              mDicNodeStatePrevWord(),
              mDicNodeStateScoring() {
    }

    virtual ~DicNodeState() {}

    // Init with prevWordPos: resets input, output and scoring state and hands |prevWordPos|
    // to the previous-word state.
    void init(const int prevWordPos) {
        mDicNodeStateInput.init();
        mDicNodeStateOutput.init();
        mDicNodeStatePrevWord.init(prevWordPos);
        mDicNodeStateScoring.init();
    }

    // Init by copy: every part is initialized from the corresponding part of |src|.
    AK_FORCE_INLINE void init(const DicNodeState *const src) {
        const DicNodeState &from = *src;
        mDicNodeStateInput.init(&from.mDicNodeStateInput);
        mDicNodeStateOutput.init(&from.mDicNodeStateOutput);
        mDicNodeStatePrevWord.init(&from.mDicNodeStatePrevWord);
        mDicNodeStateScoring.init(&from.mDicNodeStateScoring);
    }

    // Init by copy and adding subword: copies |src| first, then appends the given code points
    // to the output state.
    void init(const DicNodeState *const src, const uint16_t additionalSubwordLength,
            const int *const additionalSubword) {
        init(src);
        mDicNodeStateOutput.addSubword(additionalSubwordLength, additionalSubword);
    }

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_H

View file

@ -0,0 +1,100 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_STATE_INPUT_H
#define LATINIME_DIC_NODE_STATE_INPUT_H
#include "defines.h"
namespace latinime {
// TODO: Have a .cpp for this class
/**
 * Per-pointer input state of a DicNode: for each of the MAX_POINTER_COUNT_G pointers it keeps
 * the current input index, the previous code point and the terminal diff cost.
 */
class DicNodeStateInput {
 public:
    // Establishes the same defined state as init(), so that a freshly constructed (or copied)
    // object never exposes indeterminate array contents through the getters.
    DicNodeStateInput() {
        init();
    }

    virtual ~DicNodeStateInput() {}

    // Shifts the input index of pointer 0 back by |commitPoint|.
    // TODO: Merge into DicNodeStatePrevWord::truncate
    void truncate(const int commitPoint) {
        mInputIndex[0] -= commitPoint;
    }

    // Resets every pointer slot: index 0, no previous code point, maximum terminal diff cost.
    void init() {
        for (int i = 0; i < MAX_POINTER_COUNT_G; i++) {
            // TODO: The initial value for mInputIndex should be -1?
            //mInputIndex[i] = i == 0 ? 0 : -1;
            mInputIndex[i] = 0;
            mPrevCodePoint[i] = NOT_A_CODE_POINT;
            mTerminalDiffCost[i] = static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
        }
    }

    // Copies indices and previous code points from |src|. The terminal diff cost is either
    // copied as well or, when |resetTerminalDiffCost| is true, reset to its maximum value.
    void init(const DicNodeStateInput *const src, const bool resetTerminalDiffCost) {
        for (int i = 0; i < MAX_POINTER_COUNT_G; i++) {
            mInputIndex[i] = src->mInputIndex[i];
            mPrevCodePoint[i] = src->mPrevCodePoint[i];
            mTerminalDiffCost[i] = resetTerminalDiffCost ?
                    static_cast<float>(MAX_VALUE_FOR_WEIGHTING) : src->mTerminalDiffCost[i];
        }
    }

    // Overwrites the slot of |pointerId|. |rawLength| is accepted but not used here.
    void updateInputIndexG(const int pointerId, const int inputIndex,
            const int prevCodePoint, const float terminalDiffCost, const float rawLength) {
        mInputIndex[pointerId] = inputIndex;
        mPrevCodePoint[pointerId] = prevCodePoint;
        mTerminalDiffCost[pointerId] = terminalDiffCost;
    }

    // Plain copy of |src|, keeping its terminal diff costs.
    void init(const DicNodeStateInput *const src) {
        init(src, false);
    }

    // For transposition
    void setPrevCodePoint(const int pointerId, const int c) {
        mPrevCodePoint[pointerId] = c;
    }

    // Advances the input index of |pointerId| by |val|; a negative current index is replaced
    // by |val| instead of being added to.
    void forwardInputIndex(const int pointerId, const int val) {
        if (mInputIndex[pointerId] < 0) {
            mInputIndex[pointerId] = val;
        } else {
            mInputIndex[pointerId] = mInputIndex[pointerId] + val;
        }
    }

    int getInputIndex(const int pointerId) const {
        // when "inputIndex" exceeds "inputSize", auto-completion needs to be done
        return mInputIndex[pointerId];
    }

    int getPrevCodePoint(const int pointerId) const {
        return mPrevCodePoint[pointerId];
    }

    float getTerminalDiffCost(const int pointerId) const {
        return mTerminalDiffCost[pointerId];
    }

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class
    int mInputIndex[MAX_POINTER_COUNT_G];
    int mPrevCodePoint[MAX_POINTER_COUNT_G];
    float mTerminalDiffCost[MAX_POINTER_COUNT_G];
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_INPUT_H

View file

@ -0,0 +1,75 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_STATE_OUTPUT_H
#define LATINIME_DIC_NODE_STATE_OUTPUT_H
#include <cstring> // for memcpy()
#include <stdint.h>
#include "defines.h"
namespace latinime {
/**
 * Output word buffer of a DicNode: the code points emitted so far (mWordBuf) together with
 * their count (mOutputtedLength). Whenever there is room, the slot after the last code point
 * is set to 0.
 */
class DicNodeStateOutput {
 public:
    DicNodeStateOutput() : mOutputtedLength(0) {
        init();
    }

    virtual ~DicNodeStateOutput() {}

    // Resets to an empty word.
    void init() {
        mOutputtedLength = 0;
        mWordBuf[0] = 0;
    }

    // Copies the outputted code points and their count from |stateOutput|.
    void init(const DicNodeStateOutput *const stateOutput) {
        memcpy(mWordBuf, stateOutput->mWordBuf,
                stateOutput->mOutputtedLength * sizeof(mWordBuf[0]));
        mOutputtedLength = stateOutput->mOutputtedLength;
        if (mOutputtedLength < MAX_WORD_LENGTH) {
            mWordBuf[mOutputtedLength] = 0;
        }
    }

    // Appends up to |additionalSubwordLength| code points from |additionalSubword|.
    // A null |additionalSubword| is ignored. Code points that would not fit into mWordBuf are
    // dropped, so neither the copy nor mOutputtedLength can go past MAX_WORD_LENGTH.
    void addSubword(const uint16_t additionalSubwordLength, const int *const additionalSubword) {
        if (!additionalSubword) {
            return;
        }
        const int capacityLeft = MAX_WORD_LENGTH - static_cast<int>(mOutputtedLength);
        if (capacityLeft <= 0) {
            // Buffer already full: nothing can be appended and there is no room for the 0.
            return;
        }
        const int requested = static_cast<int>(additionalSubwordLength);
        const int copyLength = requested < capacityLeft ? requested : capacityLeft;
        memcpy(&mWordBuf[mOutputtedLength], additionalSubword,
                copyLength * sizeof(mWordBuf[0]));
        mOutputtedLength = static_cast<uint16_t>(mOutputtedLength + copyLength);
        if (mOutputtedLength < MAX_WORD_LENGTH) {
            mWordBuf[mOutputtedLength] = 0;
        }
    }

    // Returns the code point stored at |id|; the index is not range-checked.
    // TODO: Remove
    int getCodePointAt(const int id) const {
        return mWordBuf[id];
    }

    // TODO: Move to private
    int mWordBuf[MAX_WORD_LENGTH];

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class
    uint16_t mOutputtedLength;
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_OUTPUT_H

View file

@ -0,0 +1,156 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_STATE_PREVWORD_H
#define LATINIME_DIC_NODE_STATE_PREVWORD_H
#include <cstring> // for memset()
#include <stdint.h>
#include "defines.h"
#include "dic_node_utils.h"
namespace latinime {
/**
 * Previous-word state of a DicNode: the code points of the word(s) already committed
 * (mPrevWord), the recorded space positions, and counters / positions describing them.
 */
class DicNodeStatePrevWord {
 public:
    AK_FORCE_INLINE DicNodeStatePrevWord()
            : mPrevWordCount(0), mPrevWordLength(0), mPrevWordStart(0), mPrevWordProbability(0),
              mPrevWordNodePos(0) {
        memset(mPrevWord, 0, sizeof(mPrevWord));
        memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions));
    }

    virtual ~DicNodeStatePrevWord() {}

    // Resets to "no previous word" with an invalid previous-word node position.
    void init() {
        init(NOT_VALID_WORD);
    }

    // Resets to "no previous word" but remembers |prevWordNodePos|. mPrevWord itself is left
    // untouched; only its length is set to 0.
    void init(const int prevWordNodePos) {
        mPrevWordLength = 0;
        mPrevWordCount = 0;
        mPrevWordStart = 0;
        mPrevWordProbability = -1;
        mPrevWordNodePos = prevWordNodePos;
        memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions));
    }

    // Init by copy
    AK_FORCE_INLINE void init(const DicNodeStatePrevWord *const prevWord) {
        mPrevWordLength = prevWord->mPrevWordLength;
        mPrevWordCount = prevWord->mPrevWordCount;
        mPrevWordStart = prevWord->mPrevWordStart;
        mPrevWordProbability = prevWord->mPrevWordProbability;
        mPrevWordNodePos = prevWord->mPrevWordNodePos;
        // Only the used part of the word buffer is copied.
        memcpy(mPrevWord, prevWord->mPrevWord, prevWord->mPrevWordLength * sizeof(mPrevWord[0]));
        memcpy(mPrevSpacePositions, prevWord->mPrevSpacePositions, sizeof(mPrevSpacePositions));
    }

    // Builds the previous word as |src0| followed by |src1| followed by a space, and records
    // |lastInputIndex| as the space position of the newest word (slot prevWordCount - 1).
    // |prevSpacePositions| must point to MAX_RESULTS ints.
    void init(const int16_t prevWordCount, const int16_t prevWordProbability,
            const int prevWordNodePos, const int *const src0, const int16_t length0,
            const int *const src1, const int16_t length1, const int *const prevSpacePositions,
            const int lastInputIndex) {
        mPrevWordCount = prevWordCount;
        mPrevWordProbability = prevWordProbability;
        mPrevWordNodePos = prevWordNodePos;
        const int twoWordsLen =
                DicNodeUtils::appendTwoWords(src0, length0, src1, length1, mPrevWord);
        // The combined length can reach MAX_WORD_LENGTH, in which case the separator would be
        // written one past the buffer; keep it in the last slot instead.
        const int spaceIndex =
                twoWordsLen < MAX_WORD_LENGTH ? twoWordsLen : MAX_WORD_LENGTH - 1;
        mPrevWord[spaceIndex] = KEYCODE_SPACE;
        mPrevWordStart = length0;
        mPrevWordLength = static_cast<int16_t>(spaceIndex + 1);
        memcpy(mPrevSpacePositions, prevSpacePositions, sizeof(mPrevSpacePositions));
        // Guard the slot index: a count of 0 or one above MAX_RESULTS would write outside
        // mPrevSpacePositions.
        if (mPrevWordCount >= 1 && mPrevWordCount <= MAX_RESULTS) {
            mPrevSpacePositions[mPrevWordCount - 1] = lastInputIndex;
        }
    }

    // Drops the first |offset| code points of the previous word. An offset larger than the
    // current length clears the word; a non-positive offset leaves it unchanged.
    void truncate(const int offset) {
        if (offset <= 0) {
            // Nothing to drop; a negative offset would otherwise read before the buffer.
            return;
        }
        if (mPrevWordLength < offset) {
            memset(mPrevWord, 0, sizeof(mPrevWord));
            mPrevWordLength = 0;
            return;
        }
        const int newPrevWordLength = mPrevWordLength - offset;
        // Source and destination overlap, hence memmove.
        memmove(mPrevWord, &mPrevWord[offset], newPrevWordLength * sizeof(mPrevWord[0]));
        mPrevWordLength = newPrevWordLength;
    }

    // Copies all MAX_RESULTS space positions into |spaceIndices|.
    void outputSpacePositions(int *spaceIndices) const {
        for (int i = 0; i < MAX_RESULTS; i++) {
            spaceIndices[i] = mPrevSpacePositions[i];
        }
    }

    // TODO: remove
    int16_t getPrevWordLength() const {
        return mPrevWordLength;
    }

    int16_t getPrevWordCount() const {
        return mPrevWordCount;
    }

    int16_t getPrevWordStart() const {
        return mPrevWordStart;
    }

    int16_t getPrevWordProbability() const {
        return mPrevWordProbability;
    }

    int getPrevWordNodePos() const {
        return mPrevWordNodePos;
    }

    // Returns the code point at |id|; the index is not range-checked.
    int getPrevWordCodePointAt(const int id) const {
        return mPrevWord[id];
    }

    // True when the first |prefixLen| code points of this previous word equal those of
    // |prefix|. A |prefixLen| longer than this word never matches.
    bool startsWith(const DicNodeStatePrevWord *const prefix, const int prefixLen) const {
        if (prefixLen > mPrevWordLength) {
            return false;
        }
        for (int i = 0; i < prefixLen; ++i) {
            if (mPrevWord[i] != prefix->mPrevWord[i]) {
                return false;
            }
        }
        return true;
    }

    // TODO: Move to private
    int mPrevWord[MAX_WORD_LENGTH];

    // TODO: Move to private
    int mPrevSpacePositions[MAX_RESULTS];

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class
    int16_t mPrevWordCount;
    int16_t mPrevWordLength;
    int16_t mPrevWordStart;
    int16_t mPrevWordProbability;
    int mPrevWordNodePos;
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_PREVWORD_H

View file

@ -0,0 +1,166 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_STATE_SCORING_H
#define LATINIME_DIC_NODE_STATE_SCORING_H
#include <stdint.h>
#include "defines.h"
namespace latinime {
/**
 * Scoring state of a DicNode: accumulated spatial and language distances, their (optionally
 * normalized) sum, correction counters, raw length and the double-letter level.
 */
class DicNodeStateScoring {
 public:
    AK_FORCE_INLINE DicNodeStateScoring()
            : mDoubleLetterLevel(NOT_A_DOUBLE_LETTER),
              mEditCorrectionCount(0), mProximityCorrectionCount(0),
              mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
              mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) {
    }

    virtual ~DicNodeStateScoring() {}

    // Resets every score and counter to its initial value.
    void init() {
        mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
        mEditCorrectionCount = 0;
        mProximityCorrectionCount = 0;
        mSpatialDistance = 0.0f;
        mLanguageDistance = 0.0f;
        mNormalizedCompoundDistance = 0.0f;
        mTotalPrevWordsLanguageCost = 0.0f;
        mRawLength = 0.0f;
    }

    // Takes over every score and counter from |scoring|.
    AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) {
        const DicNodeStateScoring &src = *scoring;
        mDoubleLetterLevel = src.mDoubleLetterLevel;
        mEditCorrectionCount = src.mEditCorrectionCount;
        mProximityCorrectionCount = src.mProximityCorrectionCount;
        mSpatialDistance = src.mSpatialDistance;
        mLanguageDistance = src.mLanguageDistance;
        mNormalizedCompoundDistance = src.mNormalizedCompoundDistance;
        mTotalPrevWordsLanguageCost = src.mTotalPrevWordsLanguageCost;
        mRawLength = src.mRawLength;
    }

    // Adds the given costs to the accumulated distances, bumps the correction counters that
    // apply, and adds a strictly positive |languageCost| to the previous-words language cost.
    void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
            const int inputSize, const int totalInputIndex, const bool isEditCorrection,
            const bool isProximityCorrection) {
        addDistance(spatialCost, languageCost, doNormalization, inputSize, totalInputIndex);
        if (isEditCorrection) {
            ++mEditCorrectionCount;
        }
        if (isProximityCorrection) {
            ++mProximityCorrectionCount;
        }
        if (languageCost > 0.0f) {
            setTotalPrevWordsLanguageCost(mTotalPrevWordsLanguageCost + languageCost);
        }
    }

    void addRawLength(const float rawLength) {
        mRawLength += rawLength;
    }

    // Compound distance with a language weight of 1.
    float getCompoundDistance() const {
        return getCompoundDistance(1.0f);
    }

    // Spatial distance plus the language distance scaled by |languageWeight|.
    float getCompoundDistance(const float languageWeight) const {
        return mSpatialDistance + mLanguageDistance * languageWeight;
    }

    float getNormalizedCompoundDistance() const { return mNormalizedCompoundDistance; }

    float getSpatialDistance() const { return mSpatialDistance; }

    float getLanguageDistance() const { return mLanguageDistance; }

    int16_t getEditCorrectionCount() const { return mEditCorrectionCount; }

    int16_t getProximityCorrectionCount() const { return mProximityCorrectionCount; }

    float getRawLength() const { return mRawLength; }

    DoubleLetterLevel getDoubleLetterLevel() const { return mDoubleLetterLevel; }

    // Raises the double-letter level: A_STRONG_DOUBLE_LETTER is always stored,
    // A_DOUBLE_LETTER is stored unless the current level is already strong, and any other
    // value leaves the level untouched.
    void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) {
        const bool isStrong = (doubleLetterLevel == A_STRONG_DOUBLE_LETTER);
        const bool isPlainUpgrade = (doubleLetterLevel == A_DOUBLE_LETTER)
                && (mDoubleLetterLevel != A_STRONG_DOUBLE_LETTER);
        if (isStrong || isPlainUpgrade) {
            mDoubleLetterLevel = doubleLetterLevel;
        }
    }

    float getTotalPrevWordsLanguageCost() const { return mTotalPrevWordsLanguageCost; }

 private:
    // Caution!!!
    // Use a default copy constructor and an assign operator because shallow copies are ok
    // for this class
    DoubleLetterLevel mDoubleLetterLevel;
    int16_t mEditCorrectionCount;
    int16_t mProximityCorrectionCount;
    float mNormalizedCompoundDistance;
    float mSpatialDistance;
    float mLanguageDistance;
    float mTotalPrevWordsLanguageCost;
    float mRawLength;

    // Accumulates both distances and refreshes the normalized compound distance: the plain
    // sum, or the sum divided by max(1, totalInputIndex) when |doNormalization| is set.
    // |inputSize| is not used.
    AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance,
            bool doNormalization, int inputSize, int totalInputIndex) {
        mSpatialDistance += spatialDistance;
        mLanguageDistance += languageDistance;
        if (doNormalization) {
            mNormalizedCompoundDistance = (mSpatialDistance + mLanguageDistance)
                    / static_cast<float>(max(1, totalInputIndex));
            return;
        }
        mNormalizedCompoundDistance = mSpatialDistance + mLanguageDistance;
    }

    //TODO: remove
    AK_FORCE_INLINE void setTotalPrevWordsLanguageCost(float totalPrevWordsLanguageCost) {
        mTotalPrevWordsLanguageCost = totalPrevWordsLanguageCost;
    }
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_SCORING_H

View file

@ -0,0 +1,335 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include <vector>
#include "binary_format.h"
#include "dic_node.h"
#include "dic_node_utils.h"
#include "dic_node_vector.h"
#include "proximity_info.h"
#include "proximity_info_state.h"
namespace latinime {
///////////////////////////////
// Node initialization utils //
///////////////////////////////
// Initializes |newRootNode| as a root located at |rootPos|. The group count is read through
// BinaryFormat::getGroupCountAndForwardPointer(), which receives the cursor by address; the
// position it leaves in the cursor is passed on as the children position.
/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot,
        const int prevWordNodePos, DicNode *newRootNode) {
    int cursor = rootPos;
    const int groupCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &cursor);
    newRootNode->initAsRoot(rootPos, cursor /* childrenPos */, groupCount, prevWordNodePos);
}
// Same as initAsRoot(), but the new root is initialized from |prevWordLastNode| instead of a
// previous-word node position.
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos,
        const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) {
    int cursor = rootPos;
    const int groupCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &cursor);
    newRootNode->initAsRootWithPreviousWord(
            prevWordLastNode, rootPos, cursor /* childrenPos */, groupCount);
}
// Initializes |destNode| from |srcNode| by forwarding to DicNode::initByCopy().
/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) {
destNode->initByCopy(srcNode);
}
///////////////////////////////////
// Traverse node expansion utils //
///////////////////////////////////
// Handles a node that is still inside a multiple-chars node, so no children are traversed.
// The node is pushed to |childDicNodes| as a passing child when its typed code point matches
// the input at |pointIndex| or when its base lower-case form is an intentional-omission code
// point.
/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode,
        const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
        DicNodeVector *childDicNodes) {
    const int typedCodePoint = dicNode->getNodeTypedCodePoint();
    const int normalizedCodePoint = toBaseLowerCase(typedCodePoint);
    if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, typedCodePoint)
            && !isIntentionalOmissionCodePoint(normalizedCodePoint)) {
        return;
    }
    childDicNodes->pushPassingChild(dicNode);
}
// Parses the node stored at |pos| and, unless it is filtered out or does not match the input,
// pushes it to |childDicNodes| as a leaving child of |dicNode|.
// Returns the value of BinaryFormat::skipChildrenPosAndAttributes() for this node, which the
// caller uses as the position of the next sibling. |terminalDepth| is not used.
/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
        const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState,
        const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
    // Despite its name this is the position of the node itself; |pos| is advanced below.
    int nextPos = pos;
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
    const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
    const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);

    const int nodeCodePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
    ASSERT(NOT_A_CODE_POINT != nodeCodePoint);
    // TODO: optimize this
    int additionalWordBuf[MAX_WORD_LENGTH];
    uint16_t additionalSubwordLength = 0;
    additionalWordBuf[additionalSubwordLength++] = nodeCodePoint;
    // Read the remaining code points of a multiple-chars node up to its terminator.
    while (hasMultipleChars) {
        const int nextCodePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
        if (NOT_A_CODE_POINT == nextCodePoint) {
            break;
        }
        // The count comes from dictionary data: never store past the local buffer. Excess
        // code points are still consumed so that |pos| stays in sync with the node layout.
        if (additionalSubwordLength < MAX_WORD_LENGTH) {
            additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
        }
    }

    const int probability =
            isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1;
    pos = BinaryFormat::skipProbability(flags, pos);
    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0;
    const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos);

    if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
        return siblingPos;
    }
    if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
        return siblingPos;
    }
    const int childrenCount = hasChildren
            ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0;
    childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
            nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
            hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
    return siblingPos;
}
// Returns true when |nodeCodePoint| must be dropped because it is not among the code points
// in |codePointsFilter|. A null or empty filter never drops anything. With a non-null |pInfo|
// code points that are not on the keyboard or that are intentional-omission code points are
// never dropped either, and the comparison uses the lower-case and base lower-case forms;
// without |pInfo| the raw |nodeCodePoint| is compared.
/* static */ bool DicNodeUtils::isDicNodeFilteredOut(const int nodeCodePoint,
        const ProximityInfo *const pInfo, const std::vector<int> *const codePointsFilter) {
    const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
    if (filterSize <= 0) {
        return false;
    }
    if (pInfo) {
        // If normalized nodeCodePoint is not on the keyboard or skippable, this child is never
        // filtered.
        if (pInfo->getKeyIndexOf(nodeCodePoint) == NOT_AN_INDEX
                || isIntentionalOmissionCodePoint(nodeCodePoint)) {
            return false;
        }
    }
    const int lowerCodePoint = toLowerCase(nodeCodePoint);
    const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint);
    // TODO: Avoid linear search
    for (int i = 0; i < filterSize; ++i) {
        const int allowedCodePoint = (*codePointsFilter)[i];
        const bool isAllowed = pInfo
                ? (allowedCodePoint == lowerCodePoint || allowedCodePoint == baseLowerCodePoint)
                : (allowedCodePoint == nodeCodePoint);
        if (isAllowed) {
            return false;
        }
    }
    return true;
}
// Walks the children of |dicNode| one after another, letting createAndGetLeavingChildNode()
// decide for each whether it is pushed to |childDicNodes|. Each call returns the position of
// the next child. Without |pInfo| and with a non-empty filter, the walk stops early once
// |childDicNodes| reports that it exceeds the filter size.
/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
    const int terminalDepth = dicNode->getLeavingDepth();
    const int childCount = dicNode->getChildrenCount();
    const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
    const bool canStopEarly = !pInfo && filterSize > 0;
    int childPos = dicNode->getChildrenPos();
    for (int childIndex = 0; childIndex < childCount; ++childIndex) {
        childPos = createAndGetLeavingChildNode(dicNode, childPos, dicRoot, terminalDepth,
                pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes);
        if (canStopEarly && childDicNodes->exceeds(filterSize)) {
            // All code points have been found.
            return;
        }
    }
}
// Collects the children of |dicNode| into |childDicNodes| without matching them against any
// input: it forwards to getProximityChildDicNodes() with a null ProximityInfoState (for which
// isMatchedNodeCodePoint() accepts every code point), point index 0 and exactOnly == false.
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
DicNodeVector *childDicNodes) {
getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes);
}
// Collects into |childDicNodes| the children of |dicNode| that match the input at
// |pointIndex|. Nothing is collected when the node reports that its total input size exceeds
// the limit. A leaving node expands all of its children (with no code point filter and no
// ProximityInfo); any other node yields at most one passing child.
/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        bool exactOnly, DicNodeVector *childDicNodes) {
    if (dicNode->isTotalInputSizeExceedingLimit()) {
        return;
    }
    if (dicNode->isLeavingNode()) {
        DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex,
                exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */,
                childDicNodes);
        return;
    }
    DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
            childDicNodes);
}
///////////////////
// Scoring utils //
///////////////////
/**
 * Computes the combined bigram / unigram cost for the given dicNode.
 *
 * A node flagged as an impossible bigram word costs MAX_VALUE_FOR_WEIGHTING. Otherwise the
 * cost is (MAX_PROBABILITY - probability) / MAX_PROBABILITY, where the probability comes from
 * getBigramNodeProbability().
 */
/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot,
        const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) {
    if (node->isImpossibleBigramWord()) {
        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
    }
    const int probability = getBigramNodeProbability(dicRoot, node, bigramCacheMap);
    // TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
    const float shortfall = static_cast<float>(MAX_PROBABILITY - probability);
    const float scale = static_cast<float>(MAX_PROBABILITY);
    return shortfall / scale;
}
// Returns the probability of |node| given its previous word: when a bigram entry exists, the
// unigram probability combined with the encoded bigram difference; otherwise the backed-off
// unigram probability.
/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot,
        const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) {
    const int unigramProbability = node->getProbability();
    const int encodedDiff = getBigramNodeEncodedDiffProbability(dicRoot, node, bigramCacheMap);
    if (NOT_A_PROBABILITY != encodedDiff) {
        return BinaryFormat::computeProbabilityForBigram(unigramProbability, encodedDiff);
    }
    return backoff(unigramProbability);
}
///////////////////////////////////////
// Bigram / Unigram dictionary utils //
///////////////////////////////////////
// Looks up the encoded bigram probability for the pair (previous word of |node|, |node|) by
// handing both positions to getBigramProbability().
/* static */ int16_t DicNodeUtils::getBigramNodeEncodedDiffProbability(const uint8_t *const dicRoot,
        const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) {
    const int currentWordPos = node->getPos();
    const int previousWordPos = node->getPrevWordPos();
    return getBigramProbability(dicRoot, previousWordPos, currentWordPos, bigramCacheMap);
}
// TODO: Move this to BigramDictionary
// Returns the encoded bigram probability of the word at |nextPos| following the word at |pos|,
// or NOT_A_PROBABILITY when either position is NOT_VALID_WORD, the first word has no bigrams,
// or no bigram entry pointing at |nextPos| is found among the first
// MAX_BIGRAMS_CONSIDERED_PER_CONTEXT entries.
// Results of the bigram list scan (hits and misses) are memoized in |bigramCacheMap| while it
// holds fewer than MAX_BIGRAM_MAP_SIZE entries. The cache key is a plain hash of the two
// positions, so two different pairs with the same hash share one entry.
/* static */ int16_t DicNodeUtils::getBigramProbability(const uint8_t *const dicRoot, int pos,
        const int nextPos, hash_map_compat<int, int16_t> *bigramCacheMap) {
    // TODO: this is painfully slow compared to the method used in the previous version of the
    // algorithm. Switch to that method.
    if (NOT_VALID_WORD == pos) return NOT_A_PROBABILITY;
    if (NOT_VALID_WORD == nextPos) return NOT_A_PROBABILITY;

    // Create a hash code for the given node pair (based on Josh Bloch's effective Java).
    // TODO: Use a real hash map data structure that deals with collisions.
    int hash = 17;
    hash = hash * 31 + pos;
    hash = hash * 31 + nextPos;

    hash_map_compat<int, int16_t>::const_iterator mapPos = bigramCacheMap->find(hash);
    if (mapPos != bigramCacheMap->end()) {
        return mapPos->second;
    }
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
    if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) {
        return NOT_A_PROBABILITY;
    }
    if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
        BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
    } else {
        pos = BinaryFormat::skipOtherCharacters(dicRoot, pos);
    }
    pos = BinaryFormat::skipChildrenPosition(flags, pos);
    pos = BinaryFormat::skipProbability(flags, pos);
    uint8_t bigramFlags;
    int count = 0;
    do {
        bigramFlags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
        const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(dicRoot,
                bigramFlags, &pos);
        if (bigramPos == nextPos) {
            const int16_t probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
            if (static_cast<int>(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) {
                (*bigramCacheMap)[hash] = probability;
            }
            return probability;
        }
        count++;
    } while ((0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags))
            && count < MAX_BIGRAMS_CONSIDERED_PER_CONTEXT);
    if (static_cast<int>(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) {
        // Cache the miss with the same sentinel that is returned below, so that a cached
        // lookup and an uncached one always compare equal to NOT_A_PROBABILITY in callers.
        (*bigramCacheMap)[hash] = NOT_A_PROBABILITY;
    }
    return NOT_A_PROBABILITY;
}
// Returns the position reported by BinaryFormat::getTerminalPosition() for the first
// |wordLength| code points of |word| in the dictionary at |dicRoot|, without forcing a
// lower-case search. Returns NOT_VALID_WORD when |word| is null.
/* static */ int DicNodeUtils::getWordPos(const uint8_t *const dicRoot, const int *word,
        const int wordLength) {
    if (word) {
        return BinaryFormat::getTerminalPosition(
                dicRoot, word, wordLength, false /* forceLowerCaseSearch */);
    }
    // No word to look up.
    return NOT_VALID_WORD;
}
/* static */ bool DicNodeUtils::isMatchedNodeCodePoint(const ProximityInfoState *pInfoState,
const int pointIndex, const bool exactOnly, const int nodeCodePoint) {
if (!pInfoState) {
return true;
}
if (exactOnly) {
return pInfoState->getPrimaryCodePointAt(pointIndex) == nodeCodePoint;
}
const ProximityType matchedId = pInfoState->getProximityType(pointIndex, nodeCodePoint,
true /* checkProximityChars */);
return isProximityChar(matchedId);
}
////////////////
// Char utils //
////////////////
// TODO: Move to char_utils?
// Writes |src0| followed by |src1| into |dest| and returns the number of code points written.
//
// Each source word ends at its first 0 code point or at its given length, whichever comes
// first. The first word is truncated to MAX_WORD_LENGTH code points. The second word is
// truncated so that the total stays at or below MAX_WORD_LENGTH - 1 (one slot is held back,
// presumably for a terminator -- TODO confirm). |src1| may be null, in which case only |src0|
// is copied. No terminating 0 is written to |dest|.
//
// |dest| must have room for the returned number of code points; this function does not know
// the capacity of |dest|.
/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0,
        const int *const src1, const int16_t length1, int *dest) {
    // Effective length of the first word: stop at the first 0 code point.
    int actualLength0 = 0;
    for (int i = 0; i < length0; ++i) {
        if (src0[i] == 0) {
            break;
        }
        actualLength0 = i + 1;
    }
    actualLength0 = min(actualLength0, MAX_WORD_LENGTH);
    memcpy(dest, src0, actualLength0 * sizeof(dest[0]));
    if (!src1 || length1 == 0) {
        return actualLength0;
    }

    // When the first word already fills the buffer, the remaining room is zero or negative.
    // A negative count must never reach memcpy(): it would be converted to a huge size_t and
    // the returned length would shrink below actualLength0.
    const int remainingLength = MAX_WORD_LENGTH - actualLength0 - 1;
    if (remainingLength <= 0) {
        return actualLength0;
    }

    // Effective length of the second word: stop at the first 0 code point.
    int actualLength1 = 0;
    for (int i = 0; i < length1; ++i) {
        if (src1[i] == 0) {
            break;
        }
        actualLength1 = i + 1;
    }
    actualLength1 = min(actualLength1, remainingLength);
    memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0]));
    return actualLength0 + actualLength1;
}
} // namespace latinime

View file

@ -0,0 +1,88 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_UTILS_H
#define LATINIME_DIC_NODE_UTILS_H
#include <stdint.h>
#include <vector>
#include "defines.h"
#include "hash_map_compat.h"
namespace latinime {
class DicNode;
class DicNodeVector;
class ProximityInfo;
class ProximityInfoState;
// Collection of static helpers operating on DicNode objects and on the binary dictionary.
// This class only has static members and is not meant to be instantiated.
class DicNodeUtils {
 public:
    // Copies src0 followed by src1 into dest and returns the number of code points written.
    // Each source ends at its first 0 code point or at its given length; src1 may be null.
    // The result length is bounded relative to MAX_WORD_LENGTH.
    static int appendTwoWords(const int *src0, const int16_t length0, const int *src1,
            const int16_t length1, int *dest);
    static void initAsRoot(const int rootPos, const uint8_t *const dicRoot,
            const int prevWordNodePos, DicNode *newRootNode);
    static void initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot,
            DicNode *prevWordLastNode, DicNode *newRootNode);
    static void initByCopy(DicNode *srcNode, DicNode *destNode);
    static void getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
            DicNodeVector *childDicNodes);
    // Returns the position found by BinaryFormat::getTerminalPosition() for the given word
    // (wordLength is the length of word), or NOT_VALID_WORD when word is null.
    static int getWordPos(const uint8_t *const dicRoot, const int *word, const int wordLength);
    static float getBigramNodeImprobability(const uint8_t *const dicRoot,
            const DicNode *const node, hash_map_compat<int, int16_t> *const bigramCacheMap);
    static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo,
            const std::vector<int> *const codePointsFilter);
    // TODO: Move to private
    static void getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
            const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
            DicNodeVector *childDicNodes);

    // TODO: Move to proximity info
    // True for MATCH_CHAR, PROXIMITY_CHAR and ADDITIONAL_PROXIMITY_CHAR; false otherwise.
    static bool isProximityChar(ProximityType type) {
        return type == MATCH_CHAR || type == PROXIMITY_CHAR || type == ADDITIONAL_PROXIMITY_CHAR;
    }

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils);
    // Max cache size for the space omission error correction bigram lookup
    static const int MAX_BIGRAM_MAP_SIZE = 20000;
    // Max number of bigrams to look up
    static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;

    static int getBigramNodeProbability(const uint8_t *const dicRoot, const DicNode *const node,
            hash_map_compat<int, int16_t> *bigramCacheMap);
    static int16_t getBigramNodeEncodedDiffProbability(const uint8_t *const dicRoot,
            const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap);
    static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState,
            const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes);
    static void createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot,
            const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
            const std::vector<int> *const codePointsFilter,
            const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
    static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot,
            const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex,
            const bool exactOnly, const std::vector<int> *const codePointsFilter,
            const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
    // Bigram probability of the word at nextPos following the word at pos. Results are cached
    // in bigramCacheMap (bounded by MAX_BIGRAM_MAP_SIZE) and the scan is bounded by
    // MAX_BIGRAMS_CONSIDERED_PER_CONTEXT. Returns NOT_A_PROBABILITY when nothing is found.
    static int16_t getBigramProbability(const uint8_t *const dicRoot, int pos, const int nextPos,
            hash_map_compat<int, int16_t> *bigramCacheMap);

    // TODO: Move to proximity info
    // Accepts every code point when pInfoState is null; otherwise requires an exact primary
    // code point match (exactOnly) or a proximity type accepted by isProximityChar().
    static bool isMatchedNodeCodePoint(const ProximityInfoState *pInfoState, const int pointIndex,
            const bool exactOnly, const int nodeCodePoint);
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_UTILS_H

View file

@ -0,0 +1,95 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DIC_NODE_VECTOR_H
#define LATINIME_DIC_NODE_VECTOR_H
#include <vector>
#include "defines.h"
#include "dic_node.h"
namespace latinime {
class DicNodeVector {
public:
#ifdef FLAG_DBG
// 0 will introduce resizing the vector.
static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 0;
#else
static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 60;
#endif
AK_FORCE_INLINE DicNodeVector() : mDicNodes(0), mLock(false), mEmptyNode() {}
// Specify the capacity of the vector
AK_FORCE_INLINE DicNodeVector(const int size) : mDicNodes(0), mLock(false), mEmptyNode() {
mDicNodes.reserve(size);
}
// Non virtual inline destructor -- never inherit this class
AK_FORCE_INLINE ~DicNodeVector() {}
AK_FORCE_INLINE void clear() {
mDicNodes.clear();
mLock = false;
}
int getSizeAndLock() {
mLock = true;
return static_cast<int>(mDicNodes.size());
}
bool exceeds(const size_t limit) const {
return mDicNodes.size() >= limit;
}
void pushPassingChild(DicNode *dicNode) {
ASSERT(!mLock);
mDicNodes.push_back(mEmptyNode);
mDicNodes.back().initAsPassingChild(dicNode);
}
void pushLeavingChild(DicNode *dicNode, const int pos, const uint8_t flags,
const int childrenPos, const int attributesPos, const int siblingPos,
const int nodeCodePoint, const int childrenCount, const int probability,
const int bigramProbability, const bool isTerminal, const bool hasMultipleChars,
const bool hasChildren, const uint16_t additionalSubwordLength,
const int *additionalSubword) {
ASSERT(!mLock);
mDicNodes.push_back(mEmptyNode);
mDicNodes.back().initAsChild(dicNode, pos, flags, childrenPos, attributesPos, siblingPos,
nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
hasMultipleChars, hasChildren, additionalSubwordLength, additionalSubword);
}
DicNode *operator[](const int id) {
ASSERT(id < static_cast<int>(mDicNodes.size()));
return &mDicNodes[id];
}
DicNode *front() {
ASSERT(1 <= static_cast<int>(mDicNodes.size()));
return &mDicNodes[0];
}
private:
DISALLOW_COPY_AND_ASSIGN(DicNodeVector);
std::vector<DicNode> mDicNodes;
bool mLock;
DicNode mEmptyNode;
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_VECTOR_H