From 38c26dd0bf8cd5c4511e4a02d5eeae4b3553f03a Mon Sep 17 00:00:00 2001 From: Satoshi Kataoka Date: Mon, 1 Apr 2013 15:15:07 +0900 Subject: [PATCH] Move dicnode to AOSP Bug: 8187060 Change-Id: I72398fa45b12683bd46d23c5ca69e6bcd5ca2b7e --- native/jni/Android.mk | 7 +- .../jni/src/suggest/core/dicnode/dic_node.cpp | 44 ++ .../jni/src/suggest/core/dicnode/dic_node.h | 572 ++++++++++++++++++ .../core/dicnode/dic_node_priority_queue.h | 213 +++++++ .../suggest/core/dicnode/dic_node_profiler.h | 181 ++++++ .../core/dicnode/dic_node_properties.h | 173 ++++++ .../core/dicnode/dic_node_release_listener.h | 33 + .../src/suggest/core/dicnode/dic_node_state.h | 71 +++ .../core/dicnode/dic_node_state_input.h | 100 +++ .../core/dicnode/dic_node_state_output.h | 75 +++ .../core/dicnode/dic_node_state_prevword.h | 156 +++++ .../core/dicnode/dic_node_state_scoring.h | 166 +++++ .../suggest/core/dicnode/dic_node_utils.cpp | 335 ++++++++++ .../src/suggest/core/dicnode/dic_node_utils.h | 88 +++ .../suggest/core/dicnode/dic_node_vector.h | 95 +++ 15 files changed, 2308 insertions(+), 1 deletion(-) create mode 100644 native/jni/src/suggest/core/dicnode/dic_node.cpp create mode 100644 native/jni/src/suggest/core/dicnode/dic_node.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_priority_queue.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_profiler.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_properties.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_release_listener.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_state.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_state_input.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_state_output.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_utils.cpp create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_utils.h create mode 100644 native/jni/src/suggest/core/dicnode/dic_node_vector.h diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 3735ec07b..12f99eb52 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -26,7 +26,10 @@ include $(CLEAR_VARS) LATIN_IME_SRC_DIR := src LATIN_IME_SRC_FULLPATH_DIR := $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) -LOCAL_C_INCLUDES += $(LATIN_IME_SRC_FULLPATH_DIR) $(LATIN_IME_SRC_FULLPATH_DIR)/suggest +LOCAL_C_INCLUDES += \ + $(LATIN_IME_SRC_FULLPATH_DIR) \ + $(LATIN_IME_SRC_FULLPATH_DIR)/suggest \ + $(LATIN_IME_SRC_FULLPATH_DIR)/suggest/core/dicnode LOCAL_CFLAGS += -Werror -Wall -Wextra -Weffc++ -Wformat=2 -Wcast-qual -Wcast-align \ -Wwrite-strings -Wfloat-equal -Wpointer-arith -Winit-self -Wredundant-decls -Wno-system-headers @@ -59,6 +62,8 @@ LATIN_IME_CORE_SRC_FILES := \ proximity_info_state_utils.cpp \ unigram_dictionary.cpp \ words_priority_queue.cpp \ + suggest/core/dicnode/dic_node.cpp \ + suggest/core/dicnode/dic_node_utils.cpp \ suggest/gesture_suggest.cpp \ suggest/typing_suggest.cpp diff --git a/native/jni/src/suggest/core/dicnode/dic_node.cpp b/native/jni/src/suggest/core/dicnode/dic_node.cpp new file mode 100644 index 000000000..8c48c587b --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dic_node.h" + +namespace latinime { + +DicNode::DicNode(const DicNode &dicNode) + : +#if DEBUG_DICT + mProfiler(dicNode.mProfiler), +#endif + mDicNodeProperties(dicNode.mDicNodeProperties), mDicNodeState(dicNode.mDicNodeState), + mIsCachedForNextSuggestion(dicNode.mIsCachedForNextSuggestion), mIsUsed(dicNode.mIsUsed), + mReleaseListener(0) { + /* empty */ +} + +DicNode &DicNode::operator=(const DicNode &dicNode) { +#if DEBUG_DICT + mProfiler = dicNode.mProfiler; +#endif + mDicNodeProperties = dicNode.mDicNodeProperties; + mDicNodeState = dicNode.mDicNodeState; + mIsCachedForNextSuggestion = dicNode.mIsCachedForNextSuggestion; + mIsUsed = dicNode.mIsUsed; + mReleaseListener = dicNode.mReleaseListener; + return *this; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h new file mode 100644 index 000000000..7bfa459a2 --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -0,0 +1,572 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_H +#define LATINIME_DIC_NODE_H + +#include "char_utils.h" +#include "defines.h" +#include "dic_node_state.h" +#include "dic_node_profiler.h" +#include "dic_node_properties.h" +#include "dic_node_release_listener.h" + +#if DEBUG_DICT +#define LOGI_SHOW_ADD_COST_PROP \ + do { char charBuf[50]; \ + INTS_TO_CHARS(getOutputWordBuf(), getDepth(), charBuf); \ + AKLOGI("%20s, \"%c\", size = %03d, total = %03d, index(0) = %02d, dist = %.4f, %s,,", \ + __FUNCTION__, getNodeCodePoint(), inputSize, getTotalInputIndex(), \ + getInputIndex(0), getNormalizedCompoundDistance(), charBuf); } while (0) +#define DUMP_WORD_AND_SCORE(header) \ + do { char charBuf[50]; char prevWordCharBuf[50]; \ + INTS_TO_CHARS(getOutputWordBuf(), getDepth(), charBuf); \ + INTS_TO_CHARS(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, \ + mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), prevWordCharBuf); \ + AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %s, %d,,", header, \ + getSpatialDistanceForScoring(), getLanguageDistanceForScoring(), \ + getNormalizedCompoundDistance(), getRawLength(), prevWordCharBuf, charBuf, \ + getInputIndex(0)); \ + } while (0) +#else +#define LOGI_SHOW_ADD_COST_PROP +#define DUMP_WORD_AND_SCORE(header) +#endif + +namespace latinime { + +// Naming convention +// - Distance: "Weighted" edit distance -- used both for spatial and language. +// - Compound Distance: Spatial Distance + Language Distance -- used for pruning and scoring +// - Cost: delta/diff for Distance -- used both for spatial and language +// - Length: "Non-weighted" -- used only for spatial +// - Probability: "Non-weighted" -- used only for language + +// This struct is purely a bucket to return values. No instances of this struct should be kept. +struct DicNode_InputStateG { + bool mNeedsToUpdateInputStateG; + int mPointerId; + int16_t mInputIndex; + int mPrevCodePoint; + float mTerminalDiffCost; + float mRawLength; + DoubleLetterLevel mDoubleLetterLevel; +}; + +class DicNode { + // Caveat: We define Weighting as a friend class of DicNode to let Weighting change + // the distance of DicNode. + // Caution!!! In general, we avoid using the "friend" access modifier. + // This is an exception to explicitly hide DicNode::addCost() from all classes but Weighting. + friend class Weighting; + + public: +#if DEBUG_DICT + DicNodeProfiler mProfiler; +#endif + ////////////////// + // Memory utils // + ////////////////// + AK_FORCE_INLINE static void managedDelete(DicNode *node) { + node->remove(); + } + // end + ///////////////// + + AK_FORCE_INLINE DicNode() + : +#if DEBUG_DICT + mProfiler(), +#endif + mDicNodeProperties(), mDicNodeState(), mIsCachedForNextSuggestion(false), + mIsUsed(false), mReleaseListener(0) {} + + DicNode(const DicNode &dicNode); + DicNode &operator=(const DicNode &dicNode); + virtual ~DicNode() {} + + // TODO: minimize arguments by looking binary_format + // Init for copy + void initByCopy(const DicNode *dicNode) { + mIsUsed = true; + mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; + mDicNodeProperties.init(&dicNode->mDicNodeProperties); + mDicNodeState.init(&dicNode->mDicNodeState); + PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); + } + + // TODO: minimize arguments by looking binary_format + // Init for root with prevWordNodePos which is used for bigram + void initAsRoot(const int pos, const int childrenPos, const int childrenCount, + const int prevWordNodePos) { + mIsUsed = true; + mIsCachedForNextSuggestion = false; + mDicNodeProperties.init( + pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0); + mDicNodeState.init(prevWordNodePos); + PROF_NODE_RESET(mProfiler); + } + + void initAsPassingChild(DicNode *parentNode) { + mIsUsed = true; + mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion; + const int c = parentNode->getNodeTypedCodePoint(); + mDicNodeProperties.init(&parentNode->mDicNodeProperties, c); + mDicNodeState.init(&parentNode->mDicNodeState); + PROF_NODE_COPY(&parentNode->mProfiler, mProfiler); + } + + // TODO: minimize arguments by looking binary_format + // Init for root with previous word + void initAsRootWithPreviousWord(DicNode *dicNode, const int pos, const int childrenPos, + const int childrenCount) { + mIsUsed = true; + mIsCachedForNextSuggestion = false; + mDicNodeProperties.init( + pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0); + // TODO: Move to dicNodeState? + mDicNodeState.mDicNodeStateOutput.init(); // reset for next word + mDicNodeState.mDicNodeStateInput.init( + &dicNode->mDicNodeState.mDicNodeStateInput, true /* resetTerminalDiffCost */); + mDicNodeState.mDicNodeStateScoring.init( + &dicNode->mDicNodeState.mDicNodeStateScoring); + mDicNodeState.mDicNodeStatePrevWord.init( + dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1, + dicNode->mDicNodeProperties.getProbability(), + dicNode->mDicNodeProperties.getPos(), + dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevWord, + dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), + dicNode->getOutputWordBuf(), + dicNode->mDicNodeProperties.getDepth(), + dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevSpacePositions, + mDicNodeState.mDicNodeStateInput.getInputIndex(0) /* lastInputIndex */); + PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); + } + + // TODO: minimize arguments by looking binary_format + void initAsChild(DicNode *dicNode, const int pos, const uint8_t flags, const int childrenPos, + const int attributesPos, const int siblingPos, const int nodeCodePoint, + const int childrenCount, const int probability, const int bigramProbability, + const bool isTerminal, const bool hasMultipleChars, const bool hasChildren, + const uint16_t additionalSubwordLength, const int *additionalSubword) { + mIsUsed = true; + uint16_t newDepth = static_cast(dicNode->getDepth() + 1); + mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; + const uint16_t newLeavingDepth = static_cast( + dicNode->mDicNodeProperties.getLeavingDepth() + additionalSubwordLength); + mDicNodeProperties.init(pos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint, + childrenCount, probability, bigramProbability, isTerminal, hasMultipleChars, + hasChildren, newDepth, newLeavingDepth); + mDicNodeState.init(&dicNode->mDicNodeState, additionalSubwordLength, additionalSubword); + PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); + } + + AK_FORCE_INLINE void remove() { + mIsUsed = false; + if (mReleaseListener) { + mReleaseListener->onReleased(this); + } + } + + bool isUsed() const { + return mIsUsed; + } + + bool isRoot() const { + return getDepth() == 0; + } + + bool hasChildren() const { + return mDicNodeProperties.hasChildren(); + } + + bool isLeavingNode() const { + ASSERT(getDepth() <= getLeavingDepth()); + return getDepth() == getLeavingDepth(); + } + + AK_FORCE_INLINE bool isFirstLetter() const { + return getDepth() == 1; + } + + bool isCached() const { + return mIsCachedForNextSuggestion; + } + + void setCached() { + mIsCachedForNextSuggestion = true; + } + + // Used to expand the node in DicNodeUtils + int getNodeTypedCodePoint() const { + return mDicNodeState.mDicNodeStateOutput.getCodePointAt(getDepth()); + } + + bool isImpossibleBigramWord() const { + const int probability = mDicNodeProperties.getProbability(); + if (probability == 0) { + return true; + } + const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength() + - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1; + const int currentWordLen = getDepth(); + return (prevWordLen == 1 && currentWordLen == 1); + } + + bool isCapitalized() const { + const int c = getOutputWordBuf()[0]; + return isAsciiUpper(c); + } + + bool isFirstWord() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_VALID_WORD; + } + + bool isCompletion(const int inputSize) const { + return mDicNodeState.mDicNodeStateInput.getInputIndex(0) >= inputSize; + } + + bool canDoLookAheadCorrection(const int inputSize) const { + return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1; + } + + // Used to get bigram probability in DicNodeUtils + int getPos() const { + return mDicNodeProperties.getPos(); + } + + // Used to get bigram probability in DicNodeUtils + int getPrevWordPos() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos(); + } + + // Used in DicNodeUtils + int getChildrenPos() const { + return mDicNodeProperties.getChildrenPos(); + } + + // Used in DicNodeUtils + int getChildrenCount() const { + return mDicNodeProperties.getChildrenCount(); + } + + // Used in DicNodeUtils + int getProbability() const { + return mDicNodeProperties.getProbability(); + } + + AK_FORCE_INLINE bool isTerminalWordNode() const { + const bool isTerminalNodes = mDicNodeProperties.isTerminal(); + const int currentNodeDepth = getDepth(); + const int terminalNodeDepth = mDicNodeProperties.getLeavingDepth(); + return isTerminalNodes && currentNodeDepth > 0 && currentNodeDepth == terminalNodeDepth; + } + + bool shouldBeFilterdBySafetyNetForBigram() const { + const uint16_t currentDepth = getDepth(); + const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength() + - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1; + return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1)); + } + + uint16_t getLeavingDepth() const { + return mDicNodeProperties.getLeavingDepth(); + } + + bool isTotalInputSizeExceedingLimit() const { + const int prevWordsLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(); + const int currentWordDepth = getDepth(); + // TODO: 3 can be 2? Needs to be investigated. + // TODO: Have a const variable for 3 (or 2) + return prevWordsLen + currentWordDepth > MAX_WORD_LENGTH - 3; + } + + // TODO: This may be defective. Needs to be revised. + bool truncateNode(const DicNode *const topNode, const int inputCommitPoint) { + const int prevWordLenOfTop = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(); + int newPrevWordStartIndex = inputCommitPoint; + int charCount = 0; + // Find new word start index + for (int i = 0; i < prevWordLenOfTop; ++i) { + const int c = mDicNodeState.mDicNodeStatePrevWord.getPrevWordCodePointAt(i); + // TODO: Check other separators. + if (c != KEYCODE_SPACE && c != KEYCODE_SINGLE_QUOTE) { + if (charCount == inputCommitPoint) { + newPrevWordStartIndex = i; + break; + } + ++charCount; + } + } + if (!mDicNodeState.mDicNodeStatePrevWord.startsWith( + &topNode->mDicNodeState.mDicNodeStatePrevWord, newPrevWordStartIndex - 1)) { + // Node mismatch. + return false; + } + mDicNodeState.mDicNodeStateInput.truncate(inputCommitPoint); + mDicNodeState.mDicNodeStatePrevWord.truncate(newPrevWordStartIndex); + return true; + } + + void outputResult(int *dest) const { + const uint16_t prevWordLength = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(); + const uint16_t currentDepth = getDepth(); + DicNodeUtils::appendTwoWords(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, + prevWordLength, getOutputWordBuf(), currentDepth, dest); + DUMP_WORD_AND_SCORE("OUTPUT"); + } + + void outputSpacePositionsResult(int *spaceIndices) const { + mDicNodeState.mDicNodeStatePrevWord.outputSpacePositions(spaceIndices); + } + + bool hasMultipleWords() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() > 0; + } + + float getProximityCorrectionCount() const { + return static_cast(mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount()); + } + + float getEditCorrectionCount() const { + return static_cast(mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount()); + } + + // Used to prune nodes + float getNormalizedCompoundDistance() const { + return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistance(); + } + + // Used to prune nodes + float getNormalizedSpatialDistance() const { + return mDicNodeState.mDicNodeStateScoring.getSpatialDistance() + / static_cast(getInputIndex(0) + 1); + } + + // Used to prune nodes + float getCompoundDistance() const { + return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(); + } + + // Used to prune nodes + float getCompoundDistance(const float languageWeight) const { + return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight); + } + + // Note that "cost" means delta for "distance" that is weighted. + float getTotalPrevWordsLanguageCost() const { + return mDicNodeState.mDicNodeStateScoring.getTotalPrevWordsLanguageCost(); + } + + // Used to commit input partially + int getPrevWordNodePos() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos(); + } + + AK_FORCE_INLINE const int *getOutputWordBuf() const { + return mDicNodeState.mDicNodeStateOutput.mWordBuf; + } + + int getPrevCodePointG(int pointerId) const { + return mDicNodeState.mDicNodeStateInput.getPrevCodePoint(pointerId); + } + + // Whether the current codepoint can be an intentional omission, in which case the traversal + // algorithm will always check for a possible omission here. + bool canBeIntentionalOmission() const { + return isIntentionalOmissionCodePoint(getNodeCodePoint()); + } + + // Whether the omission is so frequent that it should incur zero cost. + bool isZeroCostOmission() const { + // TODO: do not hardcode and read from header + return (getNodeCodePoint() == KEYCODE_SINGLE_QUOTE); + } + + // TODO: remove + float getTerminalDiffCostG(int path) const { + return mDicNodeState.mDicNodeStateInput.getTerminalDiffCost(path); + } + + ////////////////////// + // Temporary getter // + // TODO: Remove // + ////////////////////// + // TODO: Remove once touch path is merged into ProximityInfoState + int getNodeCodePoint() const { + return mDicNodeProperties.getNodeCodePoint(); + } + + //////////////////////////////// + // Utils for cost calculation // + //////////////////////////////// + AK_FORCE_INLINE bool isSameNodeCodePoint(const DicNode *const dicNode) const { + return mDicNodeProperties.getNodeCodePoint() + == dicNode->mDicNodeProperties.getNodeCodePoint(); + } + + // TODO: remove + // TODO: rename getNextInputIndex + int16_t getInputIndex(int pointerId) const { + return mDicNodeState.mDicNodeStateInput.getInputIndex(pointerId); + } + + //////////////////////////////////// + // Getter of features for scoring // + //////////////////////////////////// + float getSpatialDistanceForScoring() const { + return mDicNodeState.mDicNodeStateScoring.getSpatialDistance(); + } + + float getLanguageDistanceForScoring() const { + return mDicNodeState.mDicNodeStateScoring.getLanguageDistance(); + } + + float getLanguageDistanceRatePerWordForScoring() const { + const float langDist = getLanguageDistanceForScoring(); + const float totalWordCount = + static_cast(mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1); + return langDist / totalWordCount; + } + + float getRawLength() const { + return mDicNodeState.mDicNodeStateScoring.getRawLength(); + } + + bool isLessThanOneErrorForScoring() const { + return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount() + + mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount() <= 1; + } + + DoubleLetterLevel getDoubleLetterLevel() const { + return mDicNodeState.mDicNodeStateScoring.getDoubleLetterLevel(); + } + + void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) { + mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel); + } + + uint8_t getFlags() const { + return mDicNodeProperties.getFlags(); + } + + int getAttributesPos() const { + return mDicNodeProperties.getAttributesPos(); + } + + inline uint16_t getDepth() const { + return mDicNodeProperties.getDepth(); + } + + AK_FORCE_INLINE void dump(const char *tag) const { +#if DEBUG_DICT + DUMP_WORD_AND_SCORE(tag); +#if DEBUG_DUMP_ERROR + mProfiler.dump(); +#endif +#endif + } + + void setReleaseListener(DicNodeReleaseListener *releaseListener) { + mReleaseListener = releaseListener; + } + + AK_FORCE_INLINE bool compare(const DicNode *right) { + if (!isUsed() && !right->isUsed()) { + // Compare pointer values here for stable comparison + return this > right; + } + if (!isUsed()) { + return true; + } + if (!right->isUsed()) { + return false; + } + const float diff = + right->getNormalizedCompoundDistance() - getNormalizedCompoundDistance(); + static const float MIN_DIFF = 0.000001f; + if (diff > MIN_DIFF) { + return true; + } else if (diff < -MIN_DIFF) { + return false; + } + const int depth = getDepth(); + const int depthDiff = right->getDepth() - depth; + if (depthDiff != 0) { + return depthDiff > 0; + } + for (int i = 0; i < depth; ++i) { + const int codePoint = mDicNodeState.mDicNodeStateOutput.getCodePointAt(i); + const int rightCodePoint = right->mDicNodeState.mDicNodeStateOutput.getCodePointAt(i); + if (codePoint != rightCodePoint) { + return rightCodePoint > codePoint; + } + } + // Compare pointer values here for stable comparison + return this > right; + } + + private: + DicNodeProperties mDicNodeProperties; + DicNodeState mDicNodeState; + // TODO: Remove + bool mIsCachedForNextSuggestion; + bool mIsUsed; + DicNodeReleaseListener *mReleaseListener; + + AK_FORCE_INLINE int getTotalInputIndex() const { + int index = 0; + for (int i = 0; i < MAX_POINTER_COUNT_G; i++) { + index += mDicNodeState.mDicNodeStateInput.getInputIndex(i); + } + return index; + } + + // Caveat: Must not be called outside Weighting + // This restriction is guaranteed by "friend" + AK_FORCE_INLINE void addCost(const float spatialCost, const float languageCost, + const bool doNormalization, const int inputSize, const bool isEditCorrection, + const bool isProximityCorrection) { + if (DEBUG_GEO_FULL) { + LOGI_SHOW_ADD_COST_PROP; + } + mDicNodeState.mDicNodeStateScoring.addCost(spatialCost, languageCost, doNormalization, + inputSize, getTotalInputIndex(), isEditCorrection, isProximityCorrection); + } + + // Caveat: Must not be called outside Weighting + // This restriction is guaranteed by "friend" + AK_FORCE_INLINE void forwardInputIndex(const int pointerId, const int count, + const bool overwritesPrevCodePointByNodeCodePoint) { + if (count == 0) { + return; + } + mDicNodeState.mDicNodeStateInput.forwardInputIndex(pointerId, count); + if (overwritesPrevCodePointByNodeCodePoint) { + mDicNodeState.mDicNodeStateInput.setPrevCodePoint(0, getNodeCodePoint()); + } + } + + AK_FORCE_INLINE void updateInputIndexG(DicNode_InputStateG *inputStateG) { + mDicNodeState.mDicNodeStateInput.updateInputIndexG(inputStateG->mPointerId, + inputStateG->mInputIndex, inputStateG->mPrevCodePoint, + inputStateG->mTerminalDiffCost, inputStateG->mRawLength); + mDicNodeState.mDicNodeStateScoring.addRawLength(inputStateG->mRawLength); + mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(inputStateG->mDoubleLetterLevel); + } +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_priority_queue.h b/native/jni/src/suggest/core/dicnode/dic_node_priority_queue.h new file mode 100644 index 000000000..d3f28a8bd --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_priority_queue.h @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PRIORITY_QUEUE_H +#define LATINIME_DIC_NODE_PRIORITY_QUEUE_H + +#include +#include + +#include "defines.h" +#include "dic_node.h" +#include "dic_node_release_listener.h" + +#define MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY 200 + +namespace latinime { + +class DicNodePriorityQueue : public DicNodeReleaseListener { + public: + AK_FORCE_INLINE DicNodePriorityQueue() + : MAX_CAPACITY(MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY), + mMaxSize(MAX_DIC_NODE_PRIORITY_QUEUE_CAPACITY), mDicNodesBuf(), mUnusedNodeIndices(), + mNextUnusedNodeId(0), mDicNodesQueue() { + mDicNodesBuf.resize(MAX_CAPACITY + 1); + mUnusedNodeIndices.resize(MAX_CAPACITY + 1); + reset(); + } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicNodePriorityQueue() {} + + int getSize() const { + return static_cast(mDicNodesQueue.size()); + } + + int getMaxSize() const { + return mMaxSize; + } + + AK_FORCE_INLINE void setMaxSize(const int maxSize) { + mMaxSize = min(maxSize, MAX_CAPACITY); + } + + AK_FORCE_INLINE void reset() { + clearAndResize(MAX_CAPACITY); + } + + AK_FORCE_INLINE void clear() { + clearAndResize(mMaxSize); + } + + AK_FORCE_INLINE void clearAndResize(const int maxSize) { + while (!mDicNodesQueue.empty()) { + mDicNodesQueue.pop(); + } + setMaxSize(maxSize); + for (int i = 0; i < MAX_CAPACITY + 1; ++i) { + mDicNodesBuf[i].remove(); + mDicNodesBuf[i].setReleaseListener(this); + mUnusedNodeIndices[i] = i == MAX_CAPACITY ? NOT_A_NODE_ID : static_cast(i) + 1; + } + mNextUnusedNodeId = 0; + } + + AK_FORCE_INLINE DicNode *newDicNode(DicNode *dicNode) { + DicNode *newNode = searchEmptyDicNode(); + if (newNode) { + DicNodeUtils::initByCopy(dicNode, newNode); + return newNode; + } + return 0; + } + + // Copy + AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode) { + return copyPush(dicNode, mMaxSize); + } + + AK_FORCE_INLINE void copyPop(DicNode *dest) { + if (mDicNodesQueue.empty()) { + ASSERT(false); + return; + } + DicNode *node = mDicNodesQueue.top(); + if (dest) { + DicNodeUtils::initByCopy(node, dest); + } + node->remove(); + mDicNodesQueue.pop(); + } + + void onReleased(DicNode *dicNode) { + const int index = static_cast(dicNode - &mDicNodesBuf[0]); + if (mUnusedNodeIndices[index] != NOT_A_NODE_ID) { + // it's already released + return; + } + mUnusedNodeIndices[index] = mNextUnusedNodeId; + mNextUnusedNodeId = index; + ASSERT(index >= 0 && index < (MAX_CAPACITY + 1)); + } + + AK_FORCE_INLINE void dump() const { + AKLOGI("\n\n\n\n\n==========================="); + for (int i = 0; i < MAX_CAPACITY + 1; ++i) { + if (mDicNodesBuf[i].isUsed()) { + mDicNodesBuf[i].dump("QUEUE: "); + } + } + AKLOGI("===========================\n\n\n\n\n"); + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodePriorityQueue); + static const int NOT_A_NODE_ID = -1; + + AK_FORCE_INLINE static bool compareDicNode(DicNode *left, DicNode *right) { + return left->compare(right); + } + + struct DicNodeComparator { + bool operator ()(DicNode *left, DicNode *right) { + return compareDicNode(left, right); + } + }; + + typedef std::priority_queue, DicNodeComparator> DicNodesQueue; + const int MAX_CAPACITY; + int mMaxSize; + std::vector mDicNodesBuf; // of each element of mDicNodesBuf respectively + std::vector mUnusedNodeIndices; + int mNextUnusedNodeId; + DicNodesQueue mDicNodesQueue; + + inline bool isFull(const int maxSize) const { + return getSize() >= maxSize; + } + + AK_FORCE_INLINE void pop() { + copyPop(0); + } + + AK_FORCE_INLINE bool betterThanWorstDicNode(DicNode *dicNode) const { + DicNode *worstNode = mDicNodesQueue.top(); + if (!worstNode) { + return true; + } + return compareDicNode(dicNode, worstNode); + } + + AK_FORCE_INLINE DicNode *searchEmptyDicNode() { + // TODO: Currently O(n) but should be improved to O(1) + if (MAX_CAPACITY == 0) { + return 0; + } + if (mNextUnusedNodeId == NOT_A_NODE_ID) { + AKLOGI("No unused node found."); + for (int i = 0; i < MAX_CAPACITY + 1; ++i) { + AKLOGI("Dump node availability, %d, %d, %d", + i, mDicNodesBuf[i].isUsed(), mUnusedNodeIndices[i]); + } + ASSERT(false); + return 0; + } + DicNode *dicNode = &mDicNodesBuf[mNextUnusedNodeId]; + markNodeAsUsed(dicNode); + return dicNode; + } + + AK_FORCE_INLINE void markNodeAsUsed(DicNode *dicNode) { + const int index = static_cast(dicNode - &mDicNodesBuf[0]); + mNextUnusedNodeId = mUnusedNodeIndices[index]; + mUnusedNodeIndices[index] = NOT_A_NODE_ID; + ASSERT(index >= 0 && index < (MAX_CAPACITY + 1)); + } + + AK_FORCE_INLINE DicNode *pushPoolNodeWithMaxSize(DicNode *dicNode, const int maxSize) { + if (!dicNode) { + return 0; + } + if (!isFull(maxSize)) { + mDicNodesQueue.push(dicNode); + return dicNode; + } + if (betterThanWorstDicNode(dicNode)) { + pop(); + mDicNodesQueue.push(dicNode); + return dicNode; + } + dicNode->remove(); + return 0; + } + + // Copy + AK_FORCE_INLINE DicNode *copyPush(DicNode *dicNode, const int maxSize) { + return pushPoolNodeWithMaxSize(newDicNode(dicNode), maxSize); + } +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_PRIORITY_QUEUE_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_profiler.h b/native/jni/src/suggest/core/dicnode/dic_node_profiler.h new file mode 100644 index 000000000..90f75d0c6 --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_profiler.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PROFILER_H +#define LATINIME_DIC_NODE_PROFILER_H + +#include "defines.h" + +#if DEBUG_DICT +#define PROF_SPACE_SUBSTITUTION(profiler) profiler.profSpaceSubstitution() +#define PROF_SPACE_OMISSION(profiler) profiler.profSpaceOmission() +#define PROF_ADDITIONAL_PROXIMITY(profiler) profiler.profAdditionalProximity() +#define PROF_SUBSTITUTION(profiler) profiler.profSubstitution() +#define PROF_OMISSION(profiler) profiler.profOmission() +#define PROF_INSERTION(profiler) profiler.profInsertion() +#define PROF_MATCH(profiler) profiler.profMatch() +#define PROF_COMPLETION(profiler) profiler.profCompletion() +#define PROF_TRANSPOSITION(profiler) profiler.profTransposition() +#define PROF_NEARESTKEY(profiler) profiler.profNearestKey() +#define PROF_TERMINAL(profiler) profiler.profTerminal() +#define PROF_NEW_WORD(profiler) profiler.profNewWord() +#define PROF_NEW_WORD_BIGRAM(profiler) profiler.profNewWordBigram() +#define PROF_NODE_RESET(profiler) profiler.reset() +#define PROF_NODE_COPY(src, dest) dest.copy(src) +#else +#define PROF_SPACE_SUBSTITUTION(profiler) +#define PROF_SPACE_OMISSION(profiler) +#define PROF_ADDITONAL_PROXIMITY(profiler) +#define PROF_SUBSTITUTION(profiler) +#define PROF_OMISSION(profiler) +#define PROF_INSERTION(profiler) +#define PROF_MATCH(profiler) +#define PROF_COMPLETION(profiler) +#define PROF_TRANSPOSITION(profiler) +#define PROF_NEARESTKEY(profiler) +#define PROF_TERMINAL(profiler) +#define PROF_NEW_WORD(profiler) +#define PROF_NEW_WORD_BIGRAM(profiler) +#define PROF_NODE_RESET(profiler) +#define PROF_NODE_COPY(src, dest) +#endif + +namespace latinime { + +class DicNodeProfiler { + public: +#if DEBUG_DICT + AK_FORCE_INLINE DicNodeProfiler() + : mProfOmission(0), mProfInsertion(0), mProfTransposition(0), + mProfAdditionalProximity(0), mProfSubstitution(0), + mProfSpaceSubstitution(0), mProfSpaceOmission(0), + mProfMatch(0), mProfCompletion(0), mProfTerminal(0), + mProfNearestKey(0), mProfNewWord(0), mProfNewWordBigram(0) {} + + int mProfOmission; + int mProfInsertion; + int mProfTransposition; + int mProfAdditionalProximity; + int mProfSubstitution; + int mProfSpaceSubstitution; + int mProfSpaceOmission; + int mProfMatch; + int mProfCompletion; + int mProfTerminal; + int mProfNearestKey; + int mProfNewWord; + int mProfNewWordBigram; + + void profSpaceSubstitution() { + ++mProfSpaceSubstitution; + } + + void profSpaceOmission() { + ++mProfSpaceOmission; + } + + void profAdditionalProximity() { + ++mProfAdditionalProximity; + } + + void profSubstitution() { + ++mProfSubstitution; + } + + void profOmission() { + ++mProfOmission; + } + + void profInsertion() { + ++mProfInsertion; + } + + void profMatch() { + ++mProfMatch; + } + + void profCompletion() { + ++mProfCompletion; + } + + void profTransposition() { + ++mProfTransposition; + } + + void profNearestKey() { + ++mProfNearestKey; + } + + void profTerminal() { + ++mProfTerminal; + } + + void profNewWord() { + ++mProfNewWord; + } + + void profNewWordBigram() { + ++mProfNewWordBigram; + } + + void reset() { + mProfSpaceSubstitution = 0; + mProfSpaceOmission = 0; + mProfAdditionalProximity = 0; + mProfSubstitution = 0; + mProfOmission = 0; + mProfInsertion = 0; + mProfMatch = 0; + mProfCompletion = 0; + mProfTransposition = 0; + mProfNearestKey = 0; + mProfTerminal = 0; + mProfNewWord = 0; + mProfNewWordBigram = 0; + } + + void copy(const DicNodeProfiler *const profiler) { + mProfSpaceSubstitution = profiler->mProfSpaceSubstitution; + mProfSpaceOmission = profiler->mProfSpaceOmission; + mProfAdditionalProximity = profiler->mProfAdditionalProximity; + mProfSubstitution = profiler->mProfSubstitution; + mProfOmission = profiler->mProfOmission; + mProfInsertion = profiler->mProfInsertion; + mProfMatch = profiler->mProfMatch; + mProfCompletion = profiler->mProfCompletion; + mProfTransposition = profiler->mProfTransposition; + mProfNearestKey = profiler->mProfNearestKey; + mProfTerminal = profiler->mProfTerminal; + mProfNewWord = profiler->mProfNewWord; + mProfNewWordBigram = profiler->mProfNewWordBigram; + } + + void dump() const { + AKLOGI("O %d, I %d, T %d, AP %d, S %d, SS %d, SO %d, M %d, C %d, TE %d, NW = %d, NWB = %d", + mProfOmission, mProfInsertion, mProfTransposition, mProfAdditionalProximity, + mProfSubstitution, mProfSpaceSubstitution, mProfSpaceOmission, mProfMatch, + mProfCompletion, mProfTerminal, mProfNewWord, mProfNewWordBigram); + } +#else + DicNodeProfiler() {} +#endif + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class +}; +} +#endif // LATINIME_DIC_NODE_PROFILER_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/dic_node_properties.h new file mode 100644 index 000000000..173ef35d0 --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_properties.h @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PROPERTIES_H +#define LATINIME_DIC_NODE_PROPERTIES_H + +#include + +#include "defines.h" + +namespace latinime { + +/** + * Node for traversing the lexicon trie. + */ +class DicNodeProperties { + public: + AK_FORCE_INLINE DicNodeProperties() + : mPos(0), mFlags(0), mChildrenPos(0), mAttributesPos(0), mSiblingPos(0), + mChildrenCount(0), mProbability(0), mBigramProbability(0), mNodeCodePoint(0), + mDepth(0), mLeavingDepth(0), mIsTerminal(false), mHasMultipleChars(false), + mHasChildren(false) { + } + + virtual ~DicNodeProperties() {} + + // Should be called only once per DicNode is initialized. + void init(const int pos, const uint8_t flags, const int childrenPos, const int attributesPos, + const int siblingPos, const int nodeCodePoint, const int childrenCount, + const int probability, const int bigramProbability, const bool isTerminal, + const bool hasMultipleChars, const bool hasChildren, const uint16_t depth, + const uint16_t terminalDepth) { + mPos = pos; + mFlags = flags; + mChildrenPos = childrenPos; + mAttributesPos = attributesPos; + mSiblingPos = siblingPos; + mNodeCodePoint = nodeCodePoint; + mChildrenCount = childrenCount; + mProbability = probability; + mBigramProbability = bigramProbability; + mIsTerminal = isTerminal; + mHasMultipleChars = hasMultipleChars; + mHasChildren = hasChildren; + mDepth = depth; + mLeavingDepth = terminalDepth; + } + + // Init for copy + void init(const DicNodeProperties *const nodeProp) { + mPos = nodeProp->mPos; + mFlags = nodeProp->mFlags; + mChildrenPos = nodeProp->mChildrenPos; + mAttributesPos = nodeProp->mAttributesPos; + mSiblingPos = nodeProp->mSiblingPos; + mNodeCodePoint = nodeProp->mNodeCodePoint; + mChildrenCount = nodeProp->mChildrenCount; + mProbability = nodeProp->mProbability; + mBigramProbability = nodeProp->mBigramProbability; + mIsTerminal = nodeProp->mIsTerminal; + mHasMultipleChars = nodeProp->mHasMultipleChars; + mHasChildren = nodeProp->mHasChildren; + mDepth = nodeProp->mDepth; + mLeavingDepth = nodeProp->mLeavingDepth; + } + + // Init as passing child + void init(const DicNodeProperties *const nodeProp, const int codePoint) { + mPos = nodeProp->mPos; + mFlags = nodeProp->mFlags; + mChildrenPos = nodeProp->mChildrenPos; + mAttributesPos = nodeProp->mAttributesPos; + mSiblingPos = nodeProp->mSiblingPos; + mNodeCodePoint = codePoint; // Overwrite the node char of a passing child + mChildrenCount = nodeProp->mChildrenCount; + mProbability = nodeProp->mProbability; + mBigramProbability = nodeProp->mBigramProbability; + mIsTerminal = nodeProp->mIsTerminal; + mHasMultipleChars = nodeProp->mHasMultipleChars; + mHasChildren = nodeProp->mHasChildren; + mDepth = nodeProp->mDepth + 1; // Increment the depth of a passing child + mLeavingDepth = nodeProp->mLeavingDepth; + } + + int getPos() const { + return mPos; + } + + uint8_t getFlags() const { + return mFlags; + } + + int getChildrenPos() const { + return mChildrenPos; + } + + int getAttributesPos() const { + return mAttributesPos; + } + + int getChildrenCount() const { + return mChildrenCount; + } + + int getProbability() const { + return mProbability; + } + + int getNodeCodePoint() const { + return mNodeCodePoint; + } + + uint16_t getDepth() const { + return mDepth; + } + + // TODO: Move to output? + uint16_t getLeavingDepth() const { + return mLeavingDepth; + } + + bool isTerminal() const { + return mIsTerminal; + } + + bool hasMultipleChars() const { + return mHasMultipleChars; + } + + bool hasChildren() const { + return mChildrenCount > 0 || mDepth != mLeavingDepth; + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + + // Not used + int getSiblingPos() const { + return mSiblingPos; + } + + int mPos; + uint8_t mFlags; + int mChildrenPos; + int mAttributesPos; + int mSiblingPos; + int mChildrenCount; + int mProbability; + int mBigramProbability; // not used for now + int mNodeCodePoint; + uint16_t mDepth; + uint16_t mLeavingDepth; + bool mIsTerminal; + bool mHasMultipleChars; + bool mHasChildren; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_PROPERTIES_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_release_listener.h b/native/jni/src/suggest/core/dicnode/dic_node_release_listener.h new file mode 100644 index 000000000..2a81c3cae --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_release_listener.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_RELEASE_LISTENER_H +#define LATINIME_DIC_NODE_RELEASE_LISTENER_H + +#include "defines.h" + +namespace latinime { + +class DicNodeReleaseListener { + public: + DicNodeReleaseListener() {} + virtual ~DicNodeReleaseListener() {} + virtual void onReleased(DicNode *dicNode) = 0; + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeReleaseListener); +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_RELEASE_LISTENER_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state.h b/native/jni/src/suggest/core/dicnode/dic_node_state.h new file mode 100644 index 000000000..239b63c32 --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_state.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_H +#define LATINIME_DIC_NODE_STATE_H + +#include "defines.h" +#include "dic_node_state_input.h" +#include "dic_node_state_output.h" +#include "dic_node_state_prevword.h" +#include "dic_node_state_scoring.h" + +namespace latinime { + +class DicNodeState { + public: + DicNodeStateInput mDicNodeStateInput; + DicNodeStateOutput mDicNodeStateOutput; + DicNodeStatePrevWord mDicNodeStatePrevWord; + DicNodeStateScoring mDicNodeStateScoring; + + AK_FORCE_INLINE DicNodeState() + : mDicNodeStateInput(), mDicNodeStateOutput(), mDicNodeStatePrevWord(), + mDicNodeStateScoring() { + } + + virtual ~DicNodeState() {} + + // Init with prevWordPos + void init(const int prevWordPos) { + mDicNodeStateInput.init(); + mDicNodeStateOutput.init(); + mDicNodeStatePrevWord.init(prevWordPos); + mDicNodeStateScoring.init(); + } + + // Init by copy + AK_FORCE_INLINE void init(const DicNodeState *const src) { + mDicNodeStateInput.init(&src->mDicNodeStateInput); + mDicNodeStateOutput.init(&src->mDicNodeStateOutput); + mDicNodeStatePrevWord.init(&src->mDicNodeStatePrevWord); + mDicNodeStateScoring.init(&src->mDicNodeStateScoring); + } + + // Init by copy and adding subword + void init(const DicNodeState *const src, const uint16_t additionalSubwordLength, + const int *const additionalSubword) { + init(src); + mDicNodeStateOutput.addSubword(additionalSubwordLength, additionalSubword); + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_input.h b/native/jni/src/suggest/core/dicnode/dic_node_state_input.h new file mode 100644 index 000000000..7ad3e3e5f --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_input.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_INPUT_H +#define LATINIME_DIC_NODE_STATE_INPUT_H + +#include "defines.h" + +namespace latinime { + +// TODO: Have a .cpp for this class +class DicNodeStateInput { + public: + DicNodeStateInput() {} + virtual ~DicNodeStateInput() {} + + // TODO: Merge into DicNodeStatePrevWord::truncate + void truncate(const int commitPoint) { + mInputIndex[0] -= commitPoint; + } + + void init() { + for (int i = 0; i < MAX_POINTER_COUNT_G; i++) { + // TODO: The initial value for mInputIndex should be -1? + //mInputIndex[i] = i == 0 ? 0 : -1; + mInputIndex[i] = 0; + mPrevCodePoint[i] = NOT_A_CODE_POINT; + mTerminalDiffCost[i] = static_cast(MAX_VALUE_FOR_WEIGHTING); + } + } + + void init(const DicNodeStateInput *const src, const bool resetTerminalDiffCost) { + for (int i = 0; i < MAX_POINTER_COUNT_G; i++) { + mInputIndex[i] = src->mInputIndex[i]; + mPrevCodePoint[i] = src->mPrevCodePoint[i]; + mTerminalDiffCost[i] = resetTerminalDiffCost ? + static_cast(MAX_VALUE_FOR_WEIGHTING) : src->mTerminalDiffCost[i]; + } + } + + void updateInputIndexG(const int pointerId, const int inputIndex, + const int prevCodePoint, const float terminalDiffCost, const float rawLength) { + mInputIndex[pointerId] = inputIndex; + mPrevCodePoint[pointerId] = prevCodePoint; + mTerminalDiffCost[pointerId] = terminalDiffCost; + } + + void init(const DicNodeStateInput *const src) { + init(src, false); + } + + // For transposition + void setPrevCodePoint(const int pointerId, const int c) { + mPrevCodePoint[pointerId] = c; + } + + void forwardInputIndex(const int pointerId, const int val) { + if (mInputIndex[pointerId] < 0) { + mInputIndex[pointerId] = val; + } else { + mInputIndex[pointerId] = mInputIndex[pointerId] + val; + } + } + + int getInputIndex(const int pointerId) const { + // when "inputIndex" exceeds "inputSize", auto-completion needs to be done + return mInputIndex[pointerId]; + } + + int getPrevCodePoint(const int pointerId) const { + return mPrevCodePoint[pointerId]; + } + + float getTerminalDiffCost(const int pointerId) const { + return mTerminalDiffCost[pointerId]; + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + int mInputIndex[MAX_POINTER_COUNT_G]; + int mPrevCodePoint[MAX_POINTER_COUNT_G]; + float mTerminalDiffCost[MAX_POINTER_COUNT_G]; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_INPUT_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_output.h b/native/jni/src/suggest/core/dicnode/dic_node_state_output.h new file mode 100644 index 000000000..1d4f50a06 --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_output.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_OUTPUT_H +#define LATINIME_DIC_NODE_STATE_OUTPUT_H + +#include // for memcpy() +#include + +#include "defines.h" + +namespace latinime { + +class DicNodeStateOutput { + public: + DicNodeStateOutput() : mOutputtedLength(0) { + init(); + } + + virtual ~DicNodeStateOutput() {} + + void init() { + mOutputtedLength = 0; + mWordBuf[0] = 0; + } + + void init(const DicNodeStateOutput *const stateOutput) { + memcpy(mWordBuf, stateOutput->mWordBuf, + stateOutput->mOutputtedLength * sizeof(mWordBuf[0])); + mOutputtedLength = stateOutput->mOutputtedLength; + if (mOutputtedLength < MAX_WORD_LENGTH) { + mWordBuf[mOutputtedLength] = 0; + } + } + + void addSubword(const uint16_t additionalSubwordLength, const int *const additionalSubword) { + if (additionalSubword) { + memcpy(&mWordBuf[mOutputtedLength], additionalSubword, + additionalSubwordLength * sizeof(mWordBuf[0])); + mOutputtedLength = static_cast(mOutputtedLength + additionalSubwordLength); + if (mOutputtedLength < MAX_WORD_LENGTH) { + mWordBuf[mOutputtedLength] = 0; + } + } + } + + // TODO: Remove + int getCodePointAt(const int id) const { + return mWordBuf[id]; + } + + // TODO: Move to private + int mWordBuf[MAX_WORD_LENGTH]; + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + uint16_t mOutputtedLength; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_OUTPUT_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h b/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h new file mode 100644 index 000000000..e3b892bda --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_PREVWORD_H +#define LATINIME_DIC_NODE_STATE_PREVWORD_H + +#include // for memset() +#include + +#include "defines.h" +#include "dic_node_utils.h" + +namespace latinime { + +class DicNodeStatePrevWord { + public: + AK_FORCE_INLINE DicNodeStatePrevWord() + : mPrevWordCount(0), mPrevWordLength(0), mPrevWordStart(0), mPrevWordProbability(0), + mPrevWordNodePos(0) { + memset(mPrevWord, 0, sizeof(mPrevWord)); + memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions)); + } + + virtual ~DicNodeStatePrevWord() {} + + void init() { + mPrevWordLength = 0; + mPrevWordCount = 0; + mPrevWordStart = 0; + mPrevWordProbability = -1; + mPrevWordNodePos = NOT_VALID_WORD; + memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions)); + } + + void init(const int prevWordNodePos) { + mPrevWordLength = 0; + mPrevWordCount = 0; + mPrevWordStart = 0; + mPrevWordProbability = -1; + mPrevWordNodePos = prevWordNodePos; + memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions)); + } + + // Init by copy + AK_FORCE_INLINE void init(const DicNodeStatePrevWord *const prevWord) { + mPrevWordLength = prevWord->mPrevWordLength; + mPrevWordCount = prevWord->mPrevWordCount; + mPrevWordStart = prevWord->mPrevWordStart; + mPrevWordProbability = prevWord->mPrevWordProbability; + mPrevWordNodePos = prevWord->mPrevWordNodePos; + memcpy(mPrevWord, prevWord->mPrevWord, prevWord->mPrevWordLength * sizeof(mPrevWord[0])); + memcpy(mPrevSpacePositions, prevWord->mPrevSpacePositions, sizeof(mPrevSpacePositions)); + } + + void init(const int16_t prevWordCount, const int16_t prevWordProbability, + const int prevWordNodePos, const int *const src0, const int16_t length0, + const int *const src1, const int16_t length1, const int *const prevSpacePositions, + const int lastInputIndex) { + mPrevWordCount = prevWordCount; + mPrevWordProbability = prevWordProbability; + mPrevWordNodePos = prevWordNodePos; + const int twoWordsLen = + DicNodeUtils::appendTwoWords(src0, length0, src1, length1, mPrevWord); + mPrevWord[twoWordsLen] = KEYCODE_SPACE; + mPrevWordStart = length0; + mPrevWordLength = static_cast(twoWordsLen + 1); + memcpy(mPrevSpacePositions, prevSpacePositions, sizeof(mPrevSpacePositions)); + mPrevSpacePositions[mPrevWordCount - 1] = lastInputIndex; + } + + void truncate(const int offset) { + // TODO: memmove + if (mPrevWordLength < offset) { + memset(mPrevWord, 0, sizeof(mPrevWord)); + mPrevWordLength = 0; + return; + } + const int newPrevWordLength = mPrevWordLength - offset; + memmove(mPrevWord, &mPrevWord[offset], newPrevWordLength * sizeof(mPrevWord[0])); + mPrevWordLength = newPrevWordLength; + } + + void outputSpacePositions(int *spaceIndices) const { + // Convert uint16_t to int + for (int i = 0; i < MAX_RESULTS; i++) { + spaceIndices[i] = mPrevSpacePositions[i]; + } + } + + // TODO: remove + int16_t getPrevWordLength() const { + return mPrevWordLength; + } + + int16_t getPrevWordCount() const { + return mPrevWordCount; + } + + int16_t getPrevWordStart() const { + return mPrevWordStart; + } + + int16_t getPrevWordProbability() const { + return mPrevWordProbability; + } + + int getPrevWordNodePos() const { + return mPrevWordNodePos; + } + + int getPrevWordCodePointAt(const int id) const { + return mPrevWord[id]; + } + + bool startsWith(const DicNodeStatePrevWord *const prefix, const int prefixLen) const { + if (prefixLen > mPrevWordLength) { + return false; + } + for (int i = 0; i < prefixLen; ++i) { + if (mPrevWord[i] != prefix->mPrevWord[i]) { + return false; + } + } + return true; + } + + // TODO: Move to private + int mPrevWord[MAX_WORD_LENGTH]; + // TODO: Move to private + int mPrevSpacePositions[MAX_RESULTS]; + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + int16_t mPrevWordCount; + int16_t mPrevWordLength; + int16_t mPrevWordStart; + int16_t mPrevWordProbability; + int mPrevWordNodePos; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_PREVWORD_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h new file mode 100644 index 000000000..8e816329f --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_SCORING_H +#define LATINIME_DIC_NODE_STATE_SCORING_H + +#include + +#include "defines.h" + +namespace latinime { + +class DicNodeStateScoring { + public: + AK_FORCE_INLINE DicNodeStateScoring() + : mDoubleLetterLevel(NOT_A_DOUBLE_LETTER), + mEditCorrectionCount(0), mProximityCorrectionCount(0), + mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f), + mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) { + } + + virtual ~DicNodeStateScoring() {} + + void init() { + mEditCorrectionCount = 0; + mProximityCorrectionCount = 0; + mNormalizedCompoundDistance = 0.0f; + mSpatialDistance = 0.0f; + mLanguageDistance = 0.0f; + mTotalPrevWordsLanguageCost = 0.0f; + mRawLength = 0.0f; + mDoubleLetterLevel = NOT_A_DOUBLE_LETTER; + } + + AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) { + mEditCorrectionCount = scoring->mEditCorrectionCount; + mProximityCorrectionCount = scoring->mProximityCorrectionCount; + mNormalizedCompoundDistance = scoring->mNormalizedCompoundDistance; + mSpatialDistance = scoring->mSpatialDistance; + mLanguageDistance = scoring->mLanguageDistance; + mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost; + mRawLength = scoring->mRawLength; + mDoubleLetterLevel = scoring->mDoubleLetterLevel; + } + + void addCost(const float spatialCost, const float languageCost, const bool doNormalization, + const int inputSize, const int totalInputIndex, const bool isEditCorrection, + const bool isProximityCorrection) { + addDistance(spatialCost, languageCost, doNormalization, inputSize, totalInputIndex); + if (isEditCorrection) { + ++mEditCorrectionCount; + } + if (isProximityCorrection) { + ++mProximityCorrectionCount; + } + if (languageCost > 0.0f) { + setTotalPrevWordsLanguageCost(mTotalPrevWordsLanguageCost + languageCost); + } + } + + void addRawLength(const float rawLength) { + mRawLength += rawLength; + } + + float getCompoundDistance() const { + return getCompoundDistance(1.0f); + } + + float getCompoundDistance(const float languageWeight) const { + return mSpatialDistance + mLanguageDistance * languageWeight; + } + + float getNormalizedCompoundDistance() const { + return mNormalizedCompoundDistance; + } + + float getSpatialDistance() const { + return mSpatialDistance; + } + + float getLanguageDistance() const { + return mLanguageDistance; + } + + int16_t getEditCorrectionCount() const { + return mEditCorrectionCount; + } + + int16_t getProximityCorrectionCount() const { + return mProximityCorrectionCount; + } + + float getRawLength() const { + return mRawLength; + } + + DoubleLetterLevel getDoubleLetterLevel() const { + return mDoubleLetterLevel; + } + + void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) { + switch(doubleLetterLevel) { + case NOT_A_DOUBLE_LETTER: + break; + case A_DOUBLE_LETTER: + if (mDoubleLetterLevel != A_STRONG_DOUBLE_LETTER) { + mDoubleLetterLevel = doubleLetterLevel; + } + break; + case A_STRONG_DOUBLE_LETTER: + mDoubleLetterLevel = doubleLetterLevel; + break; + } + } + + float getTotalPrevWordsLanguageCost() const { + return mTotalPrevWordsLanguageCost; + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + DoubleLetterLevel mDoubleLetterLevel; + + int16_t mEditCorrectionCount; + int16_t mProximityCorrectionCount; + + float mNormalizedCompoundDistance; + float mSpatialDistance; + float mLanguageDistance; + float mTotalPrevWordsLanguageCost; + float mRawLength; + + AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance, + bool doNormalization, int inputSize, int totalInputIndex) { + mSpatialDistance += spatialDistance; + mLanguageDistance += languageDistance; + if (!doNormalization) { + mNormalizedCompoundDistance = mSpatialDistance + mLanguageDistance; + } else { + mNormalizedCompoundDistance = (mSpatialDistance + mLanguageDistance) + / static_cast(max(1, totalInputIndex)); + } + } + + //TODO: remove + AK_FORCE_INLINE void setTotalPrevWordsLanguageCost(float totalPrevWordsLanguageCost) { + mTotalPrevWordsLanguageCost = totalPrevWordsLanguageCost; + } +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_SCORING_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp new file mode 100644 index 000000000..031e706ae --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "binary_format.h" +#include "dic_node.h" +#include "dic_node_utils.h" +#include "dic_node_vector.h" +#include "proximity_info.h" +#include "proximity_info_state.h" + +namespace latinime { + +/////////////////////////////// +// Node initialization utils // +/////////////////////////////// + +/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot, + const int prevWordNodePos, DicNode *newRootNode) { + int curPos = rootPos; + const int pos = curPos; + const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); + const int childrenPos = curPos; + newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); +} + +/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos, + const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) { + int curPos = rootPos; + const int pos = curPos; + const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); + const int childrenPos = curPos; + newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); +} + +/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { + destNode->initByCopy(srcNode); +} + +/////////////////////////////////// +// Traverse node expansion utils // +/////////////////////////////////// + +/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode, + const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, + DicNodeVector *childDicNodes) { + // Passing multiple chars node. No need to traverse child + const int codePoint = dicNode->getNodeTypedCodePoint(); + const int baseLowerCaseCodePoint = toBaseLowerCase(codePoint); + const bool isMatch = isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, codePoint); + if (isMatch || isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { + childDicNodes->pushPassingChild(dicNode); + } +} + +/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, + const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState, + const int pointIndex, const bool exactOnly, const std::vector *const codePointsFilter, + const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { + int nextPos = pos; + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); + const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); + const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); + const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); + + int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); + ASSERT(NOT_A_CODE_POINT != codePoint); + const int nodeCodePoint = codePoint; + // TODO: optimize this + int additionalWordBuf[MAX_WORD_LENGTH]; + uint16_t additionalSubwordLength = 0; + additionalWordBuf[additionalSubwordLength++] = codePoint; + + do { + const int nextCodePoint = hasMultipleChars + ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT; + const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); + if (!isLastChar) { + additionalWordBuf[additionalSubwordLength++] = nextCodePoint; + } + codePoint = nextCodePoint; + } while (NOT_A_CODE_POINT != codePoint); + + const int probability = + isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1; + pos = BinaryFormat::skipProbability(flags, pos); + int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0; + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); + const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos); + + if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { + return siblingPos; + } + if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { + return siblingPos; + } + const int childrenCount = hasChildren + ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0; + childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, + nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, + hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); + return siblingPos; +} + +/* static */ bool DicNodeUtils::isDicNodeFilteredOut(const int nodeCodePoint, + const ProximityInfo *const pInfo, const std::vector *const codePointsFilter) { + const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; + if (filterSize <= 0) { + return false; + } + if (pInfo && (pInfo->getKeyIndexOf(nodeCodePoint) == NOT_AN_INDEX + || isIntentionalOmissionCodePoint(nodeCodePoint))) { + // If normalized nodeCodePoint is not on the keyboard or skippable, this child is never + // filtered. + return false; + } + const int lowerCodePoint = toLowerCase(nodeCodePoint); + const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint); + // TODO: Avoid linear search + for (int i = 0; i < filterSize; ++i) { + // Checking if a normalized code point is in filter characters when pInfo is not + // null. When pInfo is null, nodeCodePoint is used to check filtering without + // normalizing. + if ((pInfo && ((*codePointsFilter)[i] == lowerCodePoint + || (*codePointsFilter)[i] == baseLowerCodePoint)) + || (!pInfo && (*codePointsFilter)[i] == nodeCodePoint)) { + return false; + } + } + return true; +} + +/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, + const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, + const bool exactOnly, const std::vector *const codePointsFilter, + const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { + const int terminalDepth = dicNode->getLeavingDepth(); + const int childCount = dicNode->getChildrenCount(); + int nextPos = dicNode->getChildrenPos(); + for (int i = 0; i < childCount; i++) { + const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState, + pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); + if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { + // All code points have been found. + break; + } + } +} + +/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, + DicNodeVector *childDicNodes) { + getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes); +} + +/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, + const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, + bool exactOnly, DicNodeVector *childDicNodes) { + if (dicNode->isTotalInputSizeExceedingLimit()) { + return; + } + if (!dicNode->isLeavingNode()) { + DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly, + childDicNodes); + } else { + DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex, + exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, + childDicNodes); + } +} + +/////////////////// +// Scoring utils // +/////////////////// +/** + * Computes the combined bigram / unigram cost for the given dicNode. + */ +/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot, + const DicNode *const node, hash_map_compat *bigramCacheMap) { + if (node->isImpossibleBigramWord()) { + return static_cast(MAX_VALUE_FOR_WEIGHTING); + } + const int probability = getBigramNodeProbability(dicRoot, node, bigramCacheMap); + // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. + const float cost = static_cast(MAX_PROBABILITY - probability) + / static_cast(MAX_PROBABILITY); + return cost; +} + +/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot, + const DicNode *const node, hash_map_compat *bigramCacheMap) { + const int unigramProbability = node->getProbability(); + const int encodedDiffOfBigramProbability = + getBigramNodeEncodedDiffProbability(dicRoot, node, bigramCacheMap); + if (NOT_A_PROBABILITY == encodedDiffOfBigramProbability) { + return backoff(unigramProbability); + } + return BinaryFormat::computeProbabilityForBigram( + unigramProbability, encodedDiffOfBigramProbability); +} + +/////////////////////////////////////// +// Bigram / Unigram dictionary utils // +/////////////////////////////////////// + +/* static */ int16_t DicNodeUtils::getBigramNodeEncodedDiffProbability(const uint8_t *const dicRoot, + const DicNode *const node, hash_map_compat *bigramCacheMap) { + const int wordPos = node->getPos(); + const int prevWordPos = node->getPrevWordPos(); + return getBigramProbability(dicRoot, prevWordPos, wordPos, bigramCacheMap); +} + +// TODO: Move this to BigramDictionary +/* static */ int16_t DicNodeUtils::getBigramProbability(const uint8_t *const dicRoot, int pos, + const int nextPos, hash_map_compat *bigramCacheMap) { + // TODO: this is painfully slow compared to the method used in the previous version of the + // algorithm. Switch to that method. + if (NOT_VALID_WORD == pos) return NOT_A_PROBABILITY; + if (NOT_VALID_WORD == nextPos) return NOT_A_PROBABILITY; + + // Create a hash code for the given node pair (based on Josh Bloch's effective Java). + // TODO: Use a real hash map data structure that deals with collisions. + int hash = 17; + hash = hash * 31 + pos; + hash = hash * 31 + nextPos; + + hash_map_compat::const_iterator mapPos = bigramCacheMap->find(hash); + if (mapPos != bigramCacheMap->end()) { + return mapPos->second; + } + if (NOT_VALID_WORD == pos) { + return NOT_A_PROBABILITY; + } + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); + if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) { + return NOT_A_PROBABILITY; + } + if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { + BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); + } else { + pos = BinaryFormat::skipOtherCharacters(dicRoot, pos); + } + pos = BinaryFormat::skipChildrenPosition(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); + uint8_t bigramFlags; + int count = 0; + do { + bigramFlags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); + const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(dicRoot, + bigramFlags, &pos); + if (bigramPos == nextPos) { + const int16_t probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; + if (static_cast(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) { + (*bigramCacheMap)[hash] = probability; + } + return probability; + } + count++; + } while ((0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)) + && count < MAX_BIGRAMS_CONSIDERED_PER_CONTEXT); + if (static_cast(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) { + // TODO: does this -1 mean NOT_VALID_WORD? + (*bigramCacheMap)[hash] = -1; + } + return NOT_A_PROBABILITY; +} + +/* static */ int DicNodeUtils::getWordPos(const uint8_t *const dicRoot, const int *word, + const int wordLength) { + if (!word) { + return NOT_VALID_WORD; + } + return BinaryFormat::getTerminalPosition( + dicRoot, word, wordLength, false /* forceLowerCaseSearch */); +} + +/* static */ bool DicNodeUtils::isMatchedNodeCodePoint(const ProximityInfoState *pInfoState, + const int pointIndex, const bool exactOnly, const int nodeCodePoint) { + if (!pInfoState) { + return true; + } + if (exactOnly) { + return pInfoState->getPrimaryCodePointAt(pointIndex) == nodeCodePoint; + } + const ProximityType matchedId = pInfoState->getProximityType(pointIndex, nodeCodePoint, + true /* checkProximityChars */); + return isProximityChar(matchedId); +} + +//////////////// +// Char utils // +//////////////// + +// TODO: Move to char_utils? +/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0, + const int *const src1, const int16_t length1, int *dest) { + int actualLength0 = 0; + for (int i = 0; i < length0; ++i) { + if (src0[i] == 0) { + break; + } + actualLength0 = i + 1; + } + actualLength0 = min(actualLength0, MAX_WORD_LENGTH); + memcpy(dest, src0, actualLength0 * sizeof(dest[0])); + if (!src1 || length1 == 0) { + return actualLength0; + } + int actualLength1 = 0; + for (int i = 0; i < length1; ++i) { + if (src1[i] == 0) { + break; + } + actualLength1 = i + 1; + } + actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0 - 1); + memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); + return actualLength0 + actualLength1; +} +} // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h new file mode 100644 index 000000000..15f9730de --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_UTILS_H +#define LATINIME_DIC_NODE_UTILS_H + +#include +#include + +#include "defines.h" +#include "hash_map_compat.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; +class ProximityInfo; +class ProximityInfoState; + +class DicNodeUtils { + public: + static int appendTwoWords(const int *src0, const int16_t length0, const int *src1, + const int16_t length1, int *dest); + static void initAsRoot(const int rootPos, const uint8_t *const dicRoot, + const int prevWordNodePos, DicNode *newRootNode); + static void initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot, + DicNode *prevWordLastNode, DicNode *newRootNode); + static void initByCopy(DicNode *srcNode, DicNode *destNode); + static void getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, + DicNodeVector *childDicNodes); + static int getWordPos(const uint8_t *const dicRoot, const int *word, const int prevWordLength); + static float getBigramNodeImprobability(const uint8_t *const dicRoot, + const DicNode *const node, hash_map_compat *const bigramCacheMap); + static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo, + const std::vector *const codePointsFilter); + // TODO: Move to private + static void getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, + const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, + DicNodeVector *childDicNodes); + + // TODO: Move to proximity info + static bool isProximityChar(ProximityType type) { + return type == MATCH_CHAR || type == PROXIMITY_CHAR || type == ADDITIONAL_PROXIMITY_CHAR; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils); + // Max cache size for the space omission error correction bigram lookup + static const int MAX_BIGRAM_MAP_SIZE = 20000; + // Max number of bigrams to look up + static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500; + + static int getBigramNodeProbability(const uint8_t *const dicRoot, const DicNode *const node, + hash_map_compat *bigramCacheMap); + static int16_t getBigramNodeEncodedDiffProbability(const uint8_t *const dicRoot, + const DicNode *const node, hash_map_compat *bigramCacheMap); + static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState, + const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes); + static void createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot, + const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, + const std::vector *const codePointsFilter, + const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); + static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot, + const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, + const bool exactOnly, const std::vector *const codePointsFilter, + const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); + static int16_t getBigramProbability(const uint8_t *const dicRoot, int pos, const int nextPos, + hash_map_compat *bigramCacheMap); + + // TODO: Move to proximity info + static bool isMatchedNodeCodePoint(const ProximityInfoState *pInfoState, const int pointIndex, + const bool exactOnly, const int nodeCodePoint); +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_UTILS_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h new file mode 100644 index 000000000..ca07edaee --- /dev/null +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_VECTOR_H +#define LATINIME_DIC_NODE_VECTOR_H + +#include + +#include "defines.h" +#include "dic_node.h" + +namespace latinime { + +class DicNodeVector { + public: +#ifdef FLAG_DBG + // 0 will introduce resizing the vector. + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 0; +#else + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 60; +#endif + AK_FORCE_INLINE DicNodeVector() : mDicNodes(0), mLock(false), mEmptyNode() {} + + // Specify the capacity of the vector + AK_FORCE_INLINE DicNodeVector(const int size) : mDicNodes(0), mLock(false), mEmptyNode() { + mDicNodes.reserve(size); + } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicNodeVector() {} + + AK_FORCE_INLINE void clear() { + mDicNodes.clear(); + mLock = false; + } + + int getSizeAndLock() { + mLock = true; + return static_cast(mDicNodes.size()); + } + + bool exceeds(const size_t limit) const { + return mDicNodes.size() >= limit; + } + + void pushPassingChild(DicNode *dicNode) { + ASSERT(!mLock); + mDicNodes.push_back(mEmptyNode); + mDicNodes.back().initAsPassingChild(dicNode); + } + + void pushLeavingChild(DicNode *dicNode, const int pos, const uint8_t flags, + const int childrenPos, const int attributesPos, const int siblingPos, + const int nodeCodePoint, const int childrenCount, const int probability, + const int bigramProbability, const bool isTerminal, const bool hasMultipleChars, + const bool hasChildren, const uint16_t additionalSubwordLength, + const int *additionalSubword) { + ASSERT(!mLock); + mDicNodes.push_back(mEmptyNode); + mDicNodes.back().initAsChild(dicNode, pos, flags, childrenPos, attributesPos, siblingPos, + nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, + hasMultipleChars, hasChildren, additionalSubwordLength, additionalSubword); + } + + DicNode *operator[](const int id) { + ASSERT(id < static_cast(mDicNodes.size())); + return &mDicNodes[id]; + } + + DicNode *front() { + ASSERT(1 <= static_cast(mDicNodes.size())); + return &mDicNodes[0]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeVector); + std::vector mDicNodes; + bool mLock; + DicNode mEmptyNode; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_VECTOR_H