am d8f35f7b: Move methods for outputting from Suggest.
* commit 'd8f35f7b4c68dc8de8a8406283ad7b37902e633a': Move methods for outputting from Suggest.main
commit
3a4e865a91
|
@ -31,6 +31,7 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
digraph_utils.cpp \
|
digraph_utils.cpp \
|
||||||
error_type_utils.cpp \
|
error_type_utils.cpp \
|
||||||
multi_bigram_map.cpp \
|
multi_bigram_map.cpp \
|
||||||
|
suggestions_output_utils.cpp \
|
||||||
unigram_property.cpp) \
|
unigram_property.cpp) \
|
||||||
$(addprefix suggest/core/layout/, \
|
$(addprefix suggest/core/layout/, \
|
||||||
additional_proximity_chars.cpp \
|
additional_proximity_chars.cpp \
|
||||||
|
|
|
@ -1,64 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (C) 2012 The Android Open Source Project
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef LATINIME_SHORTCUT_UTILS
|
|
||||||
#define LATINIME_SHORTCUT_UTILS
|
|
||||||
|
|
||||||
#include "defines.h"
|
|
||||||
#include "suggest/core/dicnode/dic_node_utils.h"
|
|
||||||
#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
|
|
||||||
|
|
||||||
namespace latinime {
|
|
||||||
|
|
||||||
class ShortcutUtils {
|
|
||||||
public:
|
|
||||||
static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt,
|
|
||||||
int outputWordIndex, const int finalScore, int *const outputCodePoints,
|
|
||||||
int *const frequencies, int *const outputTypes, const bool sameAsTyped) {
|
|
||||||
int shortcutTarget[MAX_WORD_LENGTH];
|
|
||||||
while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) {
|
|
||||||
bool isWhilelist;
|
|
||||||
int shortcutTargetStringLength;
|
|
||||||
shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
|
|
||||||
&shortcutTargetStringLength, &isWhilelist);
|
|
||||||
int shortcutScore;
|
|
||||||
int kind;
|
|
||||||
if (isWhilelist && sameAsTyped) {
|
|
||||||
shortcutScore = S_INT_MAX;
|
|
||||||
kind = Dictionary::KIND_WHITELIST;
|
|
||||||
} else {
|
|
||||||
// shortcut entry's score == its base entry's score - 1
|
|
||||||
shortcutScore = finalScore;
|
|
||||||
// Protection against int underflow
|
|
||||||
shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1;
|
|
||||||
kind = Dictionary::KIND_SHORTCUT;
|
|
||||||
}
|
|
||||||
outputTypes[outputWordIndex] = kind;
|
|
||||||
frequencies[outputWordIndex] = shortcutScore;
|
|
||||||
frequencies[outputWordIndex] = max(S_INT_MIN + 1, shortcutScore) - 1;
|
|
||||||
const int startIndex2 = outputWordIndex * MAX_WORD_LENGTH;
|
|
||||||
DicNodeUtils::appendTwoWords(0, 0, shortcutTarget, shortcutTargetStringLength,
|
|
||||||
&outputCodePoints[startIndex2]);
|
|
||||||
++outputWordIndex;
|
|
||||||
}
|
|
||||||
return outputWordIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutUtils);
|
|
||||||
};
|
|
||||||
} // namespace latinime
|
|
||||||
#endif // LATINIME_SHORTCUT_UTILS
|
|
|
@ -0,0 +1,261 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/suggestions_output_utils.h"
|
||||||
|
|
||||||
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
|
#include "suggest/core/dicnode/dic_node_utils.h"
|
||||||
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
|
||||||
|
#include "suggest/core/policy/scoring.h"
|
||||||
|
#include "suggest/core/session/dic_traverse_session.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
||||||
|
|
||||||
|
// TODO: Split this method.
|
||||||
|
/* static */ int SuggestionsOutputUtils::outputSuggestions(
|
||||||
|
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
|
||||||
|
int *frequencies, int *outputCodePoints, int *outputIndicesToPartialCommit,
|
||||||
|
int *outputTypes, int *outputAutoCommitFirstWordConfidence) {
|
||||||
|
#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
|
||||||
|
const int terminalSize = 0;
|
||||||
|
#else
|
||||||
|
const int terminalSize = min(MAX_RESULTS,
|
||||||
|
static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize()));
|
||||||
|
#endif
|
||||||
|
DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array
|
||||||
|
|
||||||
|
for (int index = terminalSize - 1; index >= 0; --index) {
|
||||||
|
traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float languageWeight = scoringPolicy->getAdjustedLanguageWeight(
|
||||||
|
traverseSession, terminals, terminalSize);
|
||||||
|
|
||||||
|
int outputWordIndex = 0;
|
||||||
|
// Insert most probable word at index == 0 as long as there is one terminal at least
|
||||||
|
const bool hasMostProbableString =
|
||||||
|
scoringPolicy->getMostProbableString(traverseSession, terminalSize, languageWeight,
|
||||||
|
&outputCodePoints[0], &outputTypes[0], &frequencies[0]);
|
||||||
|
if (hasMostProbableString) {
|
||||||
|
outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX;
|
||||||
|
++outputWordIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initial value of the loop index for terminal nodes (words)
|
||||||
|
int doubleLetterTerminalIndex = -1;
|
||||||
|
DoubleLetterLevel doubleLetterLevel = NOT_A_DOUBLE_LETTER;
|
||||||
|
scoringPolicy->searchWordWithDoubleLetter(terminals, terminalSize,
|
||||||
|
&doubleLetterTerminalIndex, &doubleLetterLevel);
|
||||||
|
|
||||||
|
int maxScore = S_INT_MIN;
|
||||||
|
// Force autocorrection for obvious long multi-word suggestions when the top suggestion is
|
||||||
|
// a long multiple words suggestion.
|
||||||
|
// TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
|
||||||
|
// traverseSession->isPartiallyCommited() always returns false because we never auto partial
|
||||||
|
// commit for now.
|
||||||
|
const bool forceCommitMultiWords = (terminalSize > 0) ?
|
||||||
|
scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
|
||||||
|
&& (traverseSession->isPartiallyCommited()
|
||||||
|
|| (traverseSession->getInputSize()
|
||||||
|
>= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
|
||||||
|
&& terminals[0].hasMultipleWords())) : false;
|
||||||
|
// TODO: have partial commit work even with multiple pointers.
|
||||||
|
const bool outputSecondWordFirstLetterInputIndex =
|
||||||
|
traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
|
||||||
|
if (terminalSize > 0) {
|
||||||
|
// If we have no suggestions, don't write this
|
||||||
|
outputAutoCommitFirstWordConfidence[0] =
|
||||||
|
computeFirstWordConfidence(&terminals[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output suggestion results here
|
||||||
|
for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS;
|
||||||
|
++terminalIndex) {
|
||||||
|
DicNode *terminalDicNode = &terminals[terminalIndex];
|
||||||
|
if (DEBUG_GEO_FULL) {
|
||||||
|
terminalDicNode->dump("OUT:");
|
||||||
|
}
|
||||||
|
const float doubleLetterCost = scoringPolicy->getDoubleLetterDemotionDistanceCost(
|
||||||
|
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
||||||
|
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
||||||
|
+ doubleLetterCost;
|
||||||
|
const bool isPossiblyOffensiveWord =
|
||||||
|
traverseSession->getDictionaryStructurePolicy()->getProbability(
|
||||||
|
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
|
||||||
|
const bool isExactMatch = terminalDicNode->isExactMatch();
|
||||||
|
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
||||||
|
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.
|
||||||
|
// (e.g. "AMD" and "and")
|
||||||
|
const bool isSafeExactMatch = isExactMatch
|
||||||
|
&& !(isPossiblyOffensiveWord && isFirstCharUppercase);
|
||||||
|
const int outputTypeFlags =
|
||||||
|
(isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
|
||||||
|
| (isSafeExactMatch ? Dictionary::KIND_FLAG_EXACT_MATCH : 0);
|
||||||
|
|
||||||
|
// Entries that are blacklisted or do not represent a word should not be output.
|
||||||
|
const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
|
||||||
|
|
||||||
|
// Increase output score of top typing suggestion to ensure autocorrection.
|
||||||
|
// TODO: Better integration with java side autocorrection logic.
|
||||||
|
const int finalScore = scoringPolicy->calculateFinalScore(
|
||||||
|
compoundDistance, traverseSession->getInputSize(),
|
||||||
|
terminalDicNode->isExactMatch()
|
||||||
|
|| (forceCommitMultiWords && terminalDicNode->hasMultipleWords())
|
||||||
|
|| (isValidWord && scoringPolicy->doesAutoCorrectValidWord()));
|
||||||
|
if (maxScore < finalScore && isValidWord) {
|
||||||
|
maxScore = finalScore;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't output invalid words. However, we still need to submit their shortcuts if any.
|
||||||
|
if (isValidWord) {
|
||||||
|
outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags;
|
||||||
|
frequencies[outputWordIndex] = finalScore;
|
||||||
|
if (outputSecondWordFirstLetterInputIndex) {
|
||||||
|
outputIndicesToPartialCommit[outputWordIndex] =
|
||||||
|
terminalDicNode->getSecondWordFirstInputIndex(
|
||||||
|
traverseSession->getProximityInfoState(0));
|
||||||
|
} else {
|
||||||
|
outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX;
|
||||||
|
}
|
||||||
|
// Populate the outputChars array with the suggested word.
|
||||||
|
const int startIndex = outputWordIndex * MAX_WORD_LENGTH;
|
||||||
|
terminalDicNode->outputResult(&outputCodePoints[startIndex]);
|
||||||
|
++outputWordIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!terminalDicNode->hasMultipleWords()) {
|
||||||
|
BinaryDictionaryShortcutIterator shortcutIt(
|
||||||
|
traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
|
||||||
|
traverseSession->getDictionaryStructurePolicy()
|
||||||
|
->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
|
||||||
|
// Shortcut is not supported for multiple words suggestions.
|
||||||
|
// TODO: Check shortcuts during traversal for multiple words suggestions.
|
||||||
|
const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
|
||||||
|
const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ?
|
||||||
|
scoringPolicy->calculateFinalScore(compoundDistance,
|
||||||
|
traverseSession->getInputSize(), true /* forceCommit */) : finalScore;
|
||||||
|
const int updatedOutputWordIndex = outputShortcuts(&shortcutIt,
|
||||||
|
outputWordIndex, shortcutBaseScore, outputCodePoints, frequencies, outputTypes,
|
||||||
|
sameAsTyped);
|
||||||
|
const int secondWordFirstInputIndex = terminalDicNode->getSecondWordFirstInputIndex(
|
||||||
|
traverseSession->getProximityInfoState(0));
|
||||||
|
for (int i = outputWordIndex; i < updatedOutputWordIndex; ++i) {
|
||||||
|
if (outputSecondWordFirstLetterInputIndex) {
|
||||||
|
outputIndicesToPartialCommit[i] = secondWordFirstInputIndex;
|
||||||
|
} else {
|
||||||
|
outputIndicesToPartialCommit[i] = NOT_AN_INDEX;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outputWordIndex = updatedOutputWordIndex;
|
||||||
|
}
|
||||||
|
DicNode::managedDelete(terminalDicNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasMostProbableString) {
|
||||||
|
scoringPolicy->safetyNetForMostProbableString(terminalSize, maxScore,
|
||||||
|
&outputCodePoints[0], &frequencies[0]);
|
||||||
|
}
|
||||||
|
return outputWordIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
|
||||||
|
const DicNode *const terminalDicNode) {
|
||||||
|
// Get the number of spaces in the first suggestion
|
||||||
|
const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
|
||||||
|
// Get the number of characters in the first suggestion
|
||||||
|
const int length = terminalDicNode->getTotalNodeCodePointCount();
|
||||||
|
// Get the distance for the first word of the suggestion
|
||||||
|
const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
|
||||||
|
|
||||||
|
// Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
|
||||||
|
// 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
|
||||||
|
// above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
|
||||||
|
// we are very confident.
|
||||||
|
// Expected space count is 1 ~ 5
|
||||||
|
static const int MIN_EXPECTED_SPACE_COUNT = 1;
|
||||||
|
static const int MAX_EXPECTED_SPACE_COUNT = 5;
|
||||||
|
// Expected length is about 4 ~ 30
|
||||||
|
static const int MIN_EXPECTED_LENGTH = 4;
|
||||||
|
static const int MAX_EXPECTED_LENGTH = 30;
|
||||||
|
// Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
|
||||||
|
static const float MIN_EXPECTED_DISTANCE = 0.0;
|
||||||
|
static const float MAX_EXPECTED_DISTANCE = 2.0;
|
||||||
|
// This is not strict: it's where most stuff will be falling, but it's still fine if it's
|
||||||
|
// outside these values. We want to output a value that reflects all of these. Each factor
|
||||||
|
// contributes a bit.
|
||||||
|
|
||||||
|
// We need at least a space.
|
||||||
|
if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
|
||||||
|
|
||||||
|
// The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
|
||||||
|
// contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
|
||||||
|
// weight of the distance. Clamp to avoid overflows.
|
||||||
|
const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
|
||||||
|
: distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
|
||||||
|
const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
|
||||||
|
* (MAX_EXPECTED_DISTANCE - clampedDistance)
|
||||||
|
/ (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
|
||||||
|
// The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
|
||||||
|
// contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
|
||||||
|
// length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
|
||||||
|
const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
|
||||||
|
* (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
|
||||||
|
// The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
|
||||||
|
// contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
|
||||||
|
// weight of the space count.
|
||||||
|
const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
|
||||||
|
* (spaceCount - MIN_EXPECTED_SPACE_COUNT)
|
||||||
|
/ (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
|
||||||
|
|
||||||
|
return distanceContribution + lengthContribution + spaceContribution;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Appends the shortcut targets still available from shortcutIt to the output arrays.
 *
 * Slots are filled from outputWordIndex upwards until the iterator has no more targets or
 * MAX_RESULTS slots are in use.
 *
 * @param shortcutIt iterator providing the shortcut targets
 * @param outputWordIndex first slot to write to
 * @param finalScore score that non-whitelist shortcut scores are derived from
 * @param outputCodePoints code point buffer; slot i begins at offset i * MAX_WORD_LENGTH
 * @param frequencies receives one score per written slot
 * @param outputTypes receives one kind per written slot
 * @param sameAsTyped together with a target's whitelist flag, selects KIND_WHITELIST
 * @return the slot index following the last written shortcut
 */
/* static */ int SuggestionsOutputUtils::outputShortcuts(
        BinaryDictionaryShortcutIterator *const shortcutIt,
        int outputWordIndex, const int finalScore, int *const outputCodePoints,
        int *const frequencies, int *const outputTypes, const bool sameAsTyped) {
    int shortcutTarget[MAX_WORD_LENGTH];
    while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) {
        bool isWhilelist;
        int shortcutTargetStringLength;
        shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
                &shortcutTargetStringLength, &isWhilelist);
        int shortcutScore;
        int kind;
        if (isWhilelist && sameAsTyped) {
            shortcutScore = S_INT_MAX;
            kind = Dictionary::KIND_WHITELIST;
        } else {
            // One below the base score; clamping first protects against int underflow.
            shortcutScore = max(S_INT_MIN + 1, finalScore) - 1;
            kind = Dictionary::KIND_SHORTCUT;
        }
        outputTypes[outputWordIndex] = kind;
        // The score that gets stored is shortcutScore lowered by one more, again clamped
        // against int underflow. A plain store of shortcutScore to this same slot used to
        // precede this line and was always overwritten, so it is not performed anymore.
        // NOTE(review): stored shortcut scores therefore end up at base - 2 (unless clamped)
        // and whitelist scores at S_INT_MAX - 1, although the intent documented elsewhere
        // is "base entry's score - 1" -- confirm which is intended before changing the value.
        frequencies[outputWordIndex] = max(S_INT_MIN + 1, shortcutScore) - 1;
        const int startIndex2 = outputWordIndex * MAX_WORD_LENGTH;
        // With an empty first word, this copies the shortcut target into the slot.
        DicNodeUtils::appendTwoWords(0, 0, shortcutTarget, shortcutTargetStringLength,
                &outputCodePoints[startIndex2]);
        ++outputWordIndex;
    }
    return outputWordIndex;
}
|
||||||
|
} // namespace latinime
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_SUGGESTIONS_OUTPUT_UTILS
|
||||||
|
#define LATINIME_SUGGESTIONS_OUTPUT_UTILS
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryShortcutIterator;
|
||||||
|
class DicNode;
|
||||||
|
class DicTraverseSession;
|
||||||
|
class Scoring;
|
||||||
|
|
||||||
|
class SuggestionsOutputUtils {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* Outputs the final list of suggestions (i.e., terminal nodes).
|
||||||
|
*/
|
||||||
|
static int outputSuggestions(const Scoring *const scoringPolicy,
|
||||||
|
DicTraverseSession *traverseSession, int *frequencies, int *outputCodePoints,
|
||||||
|
int *outputIndicesToPartialCommit, int *outputTypes,
|
||||||
|
int *outputAutoCommitFirstWordConfidence);
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionsOutputUtils);
|
||||||
|
|
||||||
|
// Inputs longer than this will autocorrect if the suggestion is multi-word
|
||||||
|
static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT;
|
||||||
|
|
||||||
|
static int computeFirstWordConfidence(const DicNode *const terminalDicNode);
|
||||||
|
|
||||||
|
static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt,
|
||||||
|
int outputWordIndex, const int finalScore, int *const outputCodePoints,
|
||||||
|
int *const frequencies, int *const outputTypes, const bool sameAsTyped);
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif // LATINIME_SUGGESTIONS_OUTPUT_UTILS
|
|
@ -19,13 +19,11 @@
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_priority_queue.h"
|
#include "suggest/core/dicnode/dic_node_priority_queue.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
|
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
#include "suggest/core/dictionary/digraph_utils.h"
|
#include "suggest/core/dictionary/digraph_utils.h"
|
||||||
#include "suggest/core/dictionary/shortcut_utils.h"
|
#include "suggest/core/dictionary/suggestions_output_utils.h"
|
||||||
#include "suggest/core/layout/proximity_info.h"
|
#include "suggest/core/layout/proximity_info.h"
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
#include "suggest/core/policy/scoring.h"
|
|
||||||
#include "suggest/core/policy/traversal.h"
|
#include "suggest/core/policy/traversal.h"
|
||||||
#include "suggest/core/policy/weighting.h"
|
#include "suggest/core/policy/weighting.h"
|
||||||
#include "suggest/core/session/dic_traverse_session.h"
|
#include "suggest/core/session/dic_traverse_session.h"
|
||||||
|
@ -33,9 +31,7 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
// Initialization of class constants.
|
// Initialization of class constants.
|
||||||
const int Suggest::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
|
||||||
const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;
|
const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;
|
||||||
const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a set of suggestions for the given input touch points. The commitPoint argument indicates
|
* Returns a set of suggestions for the given input touch points. The commitPoint argument indicates
|
||||||
|
@ -70,8 +66,8 @@ int Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession,
|
||||||
}
|
}
|
||||||
PROF_END(1);
|
PROF_END(1);
|
||||||
PROF_START(2);
|
PROF_START(2);
|
||||||
const int size = outputSuggestions(tSession, frequencies, outWords, outputIndices, outputTypes,
|
const int size = SuggestionsOutputUtils::outputSuggestions(SCORING, tSession, frequencies,
|
||||||
outputAutoCommitFirstWordConfidence);
|
outWords, outputIndices, outputTypes, outputAutoCommitFirstWordConfidence);
|
||||||
PROF_END(2);
|
PROF_END(2);
|
||||||
PROF_CLOSE;
|
PROF_CLOSE;
|
||||||
return size;
|
return size;
|
||||||
|
@ -114,205 +110,6 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Outputs the final list of suggestions (i.e., terminal nodes).
|
|
||||||
*/
|
|
||||||
int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequencies,
|
|
||||||
int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes,
|
|
||||||
int *outputAutoCommitFirstWordConfidence) const {
|
|
||||||
#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
|
|
||||||
const int terminalSize = 0;
|
|
||||||
#else
|
|
||||||
const int terminalSize = min(MAX_RESULTS,
|
|
||||||
static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize()));
|
|
||||||
#endif
|
|
||||||
DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array
|
|
||||||
|
|
||||||
for (int index = terminalSize - 1; index >= 0; --index) {
|
|
||||||
traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
|
|
||||||
}
|
|
||||||
|
|
||||||
const float languageWeight = SCORING->getAdjustedLanguageWeight(
|
|
||||||
traverseSession, terminals, terminalSize);
|
|
||||||
|
|
||||||
int outputWordIndex = 0;
|
|
||||||
// Insert most probable word at index == 0 as long as there is one terminal at least
|
|
||||||
const bool hasMostProbableString =
|
|
||||||
SCORING->getMostProbableString(traverseSession, terminalSize, languageWeight,
|
|
||||||
&outputCodePoints[0], &outputTypes[0], &frequencies[0]);
|
|
||||||
if (hasMostProbableString) {
|
|
||||||
outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX;
|
|
||||||
++outputWordIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initial value of the loop index for terminal nodes (words)
|
|
||||||
int doubleLetterTerminalIndex = -1;
|
|
||||||
DoubleLetterLevel doubleLetterLevel = NOT_A_DOUBLE_LETTER;
|
|
||||||
SCORING->searchWordWithDoubleLetter(terminals, terminalSize,
|
|
||||||
&doubleLetterTerminalIndex, &doubleLetterLevel);
|
|
||||||
|
|
||||||
int maxScore = S_INT_MIN;
|
|
||||||
// Force autocorrection for obvious long multi-word suggestions when the top suggestion is
|
|
||||||
// a long multiple words suggestion.
|
|
||||||
// TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
|
|
||||||
// traverseSession->isPartiallyCommited() always returns false because we never auto partial
|
|
||||||
// commit for now.
|
|
||||||
const bool forceCommitMultiWords = (terminalSize > 0) ?
|
|
||||||
SCORING->autoCorrectsToMultiWordSuggestionIfTop()
|
|
||||||
&& (traverseSession->isPartiallyCommited()
|
|
||||||
|| (traverseSession->getInputSize()
|
|
||||||
>= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
|
|
||||||
&& terminals[0].hasMultipleWords())) : false;
|
|
||||||
// TODO: have partial commit work even with multiple pointers.
|
|
||||||
const bool outputSecondWordFirstLetterInputIndex =
|
|
||||||
traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
|
|
||||||
if (terminalSize > 0) {
|
|
||||||
// If we have no suggestions, don't write this
|
|
||||||
outputAutoCommitFirstWordConfidence[0] =
|
|
||||||
computeFirstWordConfidence(&terminals[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Output suggestion results here
|
|
||||||
for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS;
|
|
||||||
++terminalIndex) {
|
|
||||||
DicNode *terminalDicNode = &terminals[terminalIndex];
|
|
||||||
if (DEBUG_GEO_FULL) {
|
|
||||||
terminalDicNode->dump("OUT:");
|
|
||||||
}
|
|
||||||
const float doubleLetterCost = SCORING->getDoubleLetterDemotionDistanceCost(
|
|
||||||
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
|
||||||
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
|
||||||
+ doubleLetterCost;
|
|
||||||
const bool isPossiblyOffensiveWord =
|
|
||||||
traverseSession->getDictionaryStructurePolicy()->getProbability(
|
|
||||||
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
|
|
||||||
const bool isExactMatch = terminalDicNode->isExactMatch();
|
|
||||||
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
|
||||||
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.
|
|
||||||
// (e.g. "AMD" and "and")
|
|
||||||
const bool isSafeExactMatch = isExactMatch
|
|
||||||
&& !(isPossiblyOffensiveWord && isFirstCharUppercase);
|
|
||||||
const int outputTypeFlags =
|
|
||||||
(isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
|
|
||||||
| (isSafeExactMatch ? Dictionary::KIND_FLAG_EXACT_MATCH : 0);
|
|
||||||
|
|
||||||
// Entries that are blacklisted or do not represent a word should not be output.
|
|
||||||
const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
|
|
||||||
|
|
||||||
// Increase output score of top typing suggestion to ensure autocorrection.
|
|
||||||
// TODO: Better integration with java side autocorrection logic.
|
|
||||||
const int finalScore = SCORING->calculateFinalScore(
|
|
||||||
compoundDistance, traverseSession->getInputSize(),
|
|
||||||
terminalDicNode->isExactMatch()
|
|
||||||
|| (forceCommitMultiWords && terminalDicNode->hasMultipleWords())
|
|
||||||
|| (isValidWord && SCORING->doesAutoCorrectValidWord()));
|
|
||||||
if (maxScore < finalScore && isValidWord) {
|
|
||||||
maxScore = finalScore;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Don't output invalid words. However, we still need to submit their shortcuts if any.
|
|
||||||
if (isValidWord) {
|
|
||||||
outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags;
|
|
||||||
frequencies[outputWordIndex] = finalScore;
|
|
||||||
if (outputSecondWordFirstLetterInputIndex) {
|
|
||||||
outputIndicesToPartialCommit[outputWordIndex] =
|
|
||||||
terminalDicNode->getSecondWordFirstInputIndex(
|
|
||||||
traverseSession->getProximityInfoState(0));
|
|
||||||
} else {
|
|
||||||
outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX;
|
|
||||||
}
|
|
||||||
// Populate the outputChars array with the suggested word.
|
|
||||||
const int startIndex = outputWordIndex * MAX_WORD_LENGTH;
|
|
||||||
terminalDicNode->outputResult(&outputCodePoints[startIndex]);
|
|
||||||
++outputWordIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!terminalDicNode->hasMultipleWords()) {
|
|
||||||
BinaryDictionaryShortcutIterator shortcutIt(
|
|
||||||
traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
|
|
||||||
traverseSession->getDictionaryStructurePolicy()
|
|
||||||
->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
|
|
||||||
// Shortcut is not supported for multiple words suggestions.
|
|
||||||
// TODO: Check shortcuts during traversal for multiple words suggestions.
|
|
||||||
const bool sameAsTyped = SCORING->sameAsTyped(traverseSession, terminalDicNode);
|
|
||||||
const int shortcutBaseScore = SCORING->doesAutoCorrectValidWord() ?
|
|
||||||
SCORING->calculateFinalScore(compoundDistance, traverseSession->getInputSize(),
|
|
||||||
true /* forceCommit */) : finalScore;
|
|
||||||
const int updatedOutputWordIndex = ShortcutUtils::outputShortcuts(&shortcutIt,
|
|
||||||
outputWordIndex, shortcutBaseScore, outputCodePoints, frequencies, outputTypes,
|
|
||||||
sameAsTyped);
|
|
||||||
const int secondWordFirstInputIndex = terminalDicNode->getSecondWordFirstInputIndex(
|
|
||||||
traverseSession->getProximityInfoState(0));
|
|
||||||
for (int i = outputWordIndex; i < updatedOutputWordIndex; ++i) {
|
|
||||||
if (outputSecondWordFirstLetterInputIndex) {
|
|
||||||
outputIndicesToPartialCommit[i] = secondWordFirstInputIndex;
|
|
||||||
} else {
|
|
||||||
outputIndicesToPartialCommit[i] = NOT_AN_INDEX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
outputWordIndex = updatedOutputWordIndex;
|
|
||||||
}
|
|
||||||
DicNode::managedDelete(terminalDicNode);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasMostProbableString) {
|
|
||||||
SCORING->safetyNetForMostProbableString(terminalSize, maxScore,
|
|
||||||
&outputCodePoints[0], &frequencies[0]);
|
|
||||||
}
|
|
||||||
return outputWordIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const {
|
|
||||||
// Get the number of spaces in the first suggestion
|
|
||||||
const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
|
|
||||||
// Get the number of characters in the first suggestion
|
|
||||||
const int length = terminalDicNode->getTotalNodeCodePointCount();
|
|
||||||
// Get the distance for the first word of the suggestion
|
|
||||||
const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
|
|
||||||
|
|
||||||
// Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
|
|
||||||
// 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
|
|
||||||
// above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
|
|
||||||
// we are very confident.
|
|
||||||
// Expected space count is 1 ~ 5
|
|
||||||
static const int MIN_EXPECTED_SPACE_COUNT = 1;
|
|
||||||
static const int MAX_EXPECTED_SPACE_COUNT = 5;
|
|
||||||
// Expected length is about 4 ~ 30
|
|
||||||
static const int MIN_EXPECTED_LENGTH = 4;
|
|
||||||
static const int MAX_EXPECTED_LENGTH = 30;
|
|
||||||
// Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
|
|
||||||
static const float MIN_EXPECTED_DISTANCE = 0.0;
|
|
||||||
static const float MAX_EXPECTED_DISTANCE = 2.0;
|
|
||||||
// This is not strict: it's where most stuff will be falling, but it's still fine if it's
|
|
||||||
// outside these values. We want to output a value that reflects all of these. Each factor
|
|
||||||
// contributes a bit.
|
|
||||||
|
|
||||||
// We need at least a space.
|
|
||||||
if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
|
|
||||||
|
|
||||||
// The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
|
|
||||||
// contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
|
|
||||||
// weight of the distance. Clamp to avoid overflows.
|
|
||||||
const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
|
|
||||||
: distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
|
|
||||||
const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
|
|
||||||
* (MAX_EXPECTED_DISTANCE - clampedDistance)
|
|
||||||
/ (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
|
|
||||||
// The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
|
|
||||||
// contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
|
|
||||||
// length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
|
|
||||||
const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
|
|
||||||
* (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
|
|
||||||
// The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
|
|
||||||
// contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
|
|
||||||
// weight of the space count.
|
|
||||||
const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
|
|
||||||
* (spaceCount - MIN_EXPECTED_SPACE_COUNT)
|
|
||||||
/ (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
|
|
||||||
|
|
||||||
return distanceContribution + lengthContribution + spaceContribution;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expands the dicNodes in the current search priority queue by advancing to the possible child
|
* Expands the dicNodes in the current search priority queue by advancing to the possible child
|
||||||
* nodes based on the next touch point(s) (or no touch points for lookahead)
|
* nodes based on the next touch point(s) (or no touch points for lookahead)
|
||||||
|
|
|
@ -55,18 +55,11 @@ class Suggest : public SuggestInterface {
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest);
|
||||||
void createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode,
|
void createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode,
|
||||||
const bool spaceSubstitution) const;
|
const bool spaceSubstitution) const;
|
||||||
int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies,
|
|
||||||
int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes,
|
|
||||||
int *outputAutoCommitFirstWordConfidence) const;
|
|
||||||
int computeFirstWordConfidence(const DicNode *const terminalDicNode) const;
|
|
||||||
void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const;
|
void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const;
|
||||||
void expandCurrentDicNodes(DicTraverseSession *traverseSession) const;
|
void expandCurrentDicNodes(DicTraverseSession *traverseSession) const;
|
||||||
void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||||
void processExpandedDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
void processExpandedDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||||
void weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
void weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||||
float getAutocorrectScore(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
|
||||||
void generateFeatures(
|
|
||||||
DicTraverseSession *traverseSession, DicNode *dicNode, float *features) const;
|
|
||||||
void processDicNodeAsOmission(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
void processDicNodeAsOmission(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||||
void processDicNodeAsDigraph(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
void processDicNodeAsDigraph(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||||
void processDicNodeAsTransposition(DicTraverseSession *traverseSession,
|
void processDicNodeAsTransposition(DicTraverseSession *traverseSession,
|
||||||
|
@ -79,13 +72,8 @@ class Suggest : public SuggestInterface {
|
||||||
void processDicNodeAsMatch(DicTraverseSession *traverseSession,
|
void processDicNodeAsMatch(DicTraverseSession *traverseSession,
|
||||||
DicNode *childDicNode) const;
|
DicNode *childDicNode) const;
|
||||||
|
|
||||||
// Inputs longer than this will autocorrect if the suggestion is multi-word
|
|
||||||
static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT;
|
|
||||||
static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE;
|
static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE;
|
||||||
|
|
||||||
// Threshold for autocorrection classifier
|
|
||||||
static const float AUTOCORRECT_CLASSIFICATION_THRESHOLD;
|
|
||||||
|
|
||||||
const Traversal *const TRAVERSAL;
|
const Traversal *const TRAVERSAL;
|
||||||
const Scoring *const SCORING;
|
const Scoring *const SCORING;
|
||||||
const Weighting *const WEIGHTING;
|
const Weighting *const WEIGHTING;
|
||||||
|
|
Loading…
Reference in New Issue