Merge "Implement the heuristic for auto-commit."
This commit is contained in:
commit
74577bedb3
6 changed files with 90 additions and 21 deletions
|
@ -44,9 +44,9 @@ public final class BinaryDictionary extends Dictionary {
|
|||
private static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
|
||||
// Must be equal to MAX_RESULTS in native/jni/src/defines.h
|
||||
private static final int MAX_RESULTS = 18;
|
||||
// Required space count for auto commit.
|
||||
// TODO: Remove this heuristic.
|
||||
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
|
||||
// The cutoff returned by native for auto-commit confidence.
|
||||
// Must be equal to CONFIDENCE_FOR_AUTO_COMMIT in native/jni/src/defines.h
|
||||
private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000;
|
||||
|
||||
@UsedForTesting
|
||||
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
||||
|
@ -343,18 +343,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
|
||||
@Override
|
||||
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
|
||||
// TODO: actually use the confidence rather than use this completely broken heuristic
|
||||
final String word = candidate.mWord;
|
||||
final int length = word.length();
|
||||
int remainingSpaces = SPACE_COUNT_FOR_AUTO_COMMIT;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
// This is okay because no low-surrogate and no high-surrogate can ever match the
|
||||
// space character, so we don't need to take care of iterating on code points.
|
||||
if (Constants.CODE_SPACE == word.charAt(i)) {
|
||||
if (0 >= --remainingSpaces) return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -298,9 +298,19 @@ static inline void prof_out(void) {
|
|||
#define NOT_AN_INDEX (-1)
|
||||
#define NOT_A_PROBABILITY (-1)
|
||||
#define NOT_A_DICT_POS (S_INT_MIN)
|
||||
|
||||
// A special value to mean the first word confidence makes no sense in this case,
|
||||
// e.g. this is not a multi-word suggestion.
|
||||
#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MIN)
|
||||
#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MAX)
|
||||
// How high the confidence needs to be for us to auto-commit. Arbitrary.
|
||||
// This needs to be the same as CONFIDENCE_TO_AUTO_COMMIT in BinaryDictionary.java
|
||||
#define CONFIDENCE_FOR_AUTO_COMMIT (1000000)
|
||||
// 80% of the full confidence
|
||||
#define DISTANCE_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)
|
||||
// 100% of the full confidence
|
||||
#define LENGTH_WEIGHT_FOR_AUTO_COMMIT (CONFIDENCE_FOR_AUTO_COMMIT)
|
||||
// 80% of the full confidence
|
||||
#define SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)
|
||||
|
||||
#define KEYCODE_SPACE ' '
|
||||
#define KEYCODE_SINGLE_QUOTE '\''
|
||||
|
|
|
@ -321,6 +321,16 @@ class DicNode {
|
|||
DUMP_WORD_AND_SCORE("OUTPUT");
|
||||
}
|
||||
|
||||
// "Total" in this context (and other methods in this class) means the whole suggestion. When
|
||||
// this represents a multi-word suggestion, the referenced PtNode (in mDicNodeState) is only
|
||||
// the one that corresponds to the last word of the suggestion, and all the previous words
|
||||
// are concatenated together in mPrevWord - which contains a space at the end.
|
||||
int getTotalNodeSpaceCount() const {
|
||||
if (isFirstWord()) return 0;
|
||||
return CharUtils::getSpaceCount(mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
|
||||
mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength());
|
||||
}
|
||||
|
||||
int getSecondWordFirstInputIndex(const ProximityInfoState *const pInfoState) const {
|
||||
const int inputIndex = mDicNodeState.mDicNodeStatePrevWord.getSecondWordFirstInputIndex();
|
||||
if (inputIndex == NOT_AN_INDEX) {
|
||||
|
|
|
@ -166,7 +166,11 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
|
|||
// TODO: have partial commit work even with multiple pointers.
|
||||
const bool outputSecondWordFirstLetterInputIndex =
|
||||
traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
|
||||
outputAutoCommitFirstWordConfidence[0] = computeFirstWordConfidence();
|
||||
if (terminalSize > 0) {
|
||||
// If we have no suggestions, don't write this
|
||||
outputAutoCommitFirstWordConfidence[0] =
|
||||
computeFirstWordConfidence(&terminals[0]);
|
||||
}
|
||||
|
||||
// Output suggestion results here
|
||||
for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS;
|
||||
|
@ -255,9 +259,55 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
|
|||
return outputWordIndex;
|
||||
}
|
||||
|
||||
int Suggest::computeFirstWordConfidence() const {
|
||||
// TODO: implement this.
|
||||
return NOT_A_FIRST_WORD_CONFIDENCE;
|
||||
int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const {
|
||||
// Get the number of spaces in the first suggestion
|
||||
const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
|
||||
// Get the number of characters in the first suggestion
|
||||
const int length = terminalDicNode->getTotalNodeCodePointCount();
|
||||
// Get the distance for the first word of the suggestion
|
||||
const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
|
||||
|
||||
// Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
|
||||
// 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
|
||||
// above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
|
||||
// we are very confident.
|
||||
// Expected space count is 1 ~ 5
|
||||
static const int MIN_EXPECTED_SPACE_COUNT = 1;
|
||||
static const int MAX_EXPECTED_SPACE_COUNT = 5;
|
||||
// Expected length is about 4 ~ 30
|
||||
static const int MIN_EXPECTED_LENGTH = 4;
|
||||
static const int MAX_EXPECTED_LENGTH = 30;
|
||||
// Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
|
||||
static const float MIN_EXPECTED_DISTANCE = 0.0;
|
||||
static const float MAX_EXPECTED_DISTANCE = 2.0;
|
||||
// This is not strict: it's where most stuff will be falling, but it's still fine if it's
|
||||
// outside these values. We want to output a value that reflects all of these. Each factor
|
||||
// contributes a bit.
|
||||
|
||||
// We need at least a space.
|
||||
if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
|
||||
|
||||
// The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
|
||||
// contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
|
||||
// weight of the distance. Clamp to avoid overflows.
|
||||
const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
|
||||
: distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
|
||||
const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
|
||||
* (MAX_EXPECTED_DISTANCE - clampedDistance)
|
||||
/ (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
|
||||
// The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
|
||||
// contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
|
||||
// length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
|
||||
const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
|
||||
* (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
|
||||
// The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
|
||||
// contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
|
||||
// weight of the space count.
|
||||
const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
|
||||
* (spaceCount - MIN_EXPECTED_SPACE_COUNT)
|
||||
/ (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
|
||||
|
||||
return distanceContribution + lengthContribution + spaceContribution;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -58,7 +58,7 @@ class Suggest : public SuggestInterface {
|
|||
int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies,
|
||||
int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes,
|
||||
int *outputAutoCommitFirstWordConfidence) const;
|
||||
int computeFirstWordConfidence() const;
|
||||
int computeFirstWordConfidence(const DicNode *const terminalDicNode) const;
|
||||
void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const;
|
||||
void expandCurrentDicNodes(DicTraverseSession *traverseSession) const;
|
||||
void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
|
||||
|
|
|
@ -75,6 +75,16 @@ class CharUtils {
|
|||
return c;
|
||||
}
|
||||
|
||||
// Counts how many of the first `length` entries of codePointBuffer are equal to KEYCODE_SPACE.
// A length of 0 or less yields 0 without reading the buffer.
static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
    int spacesSeen = 0;
    int index = 0;
    while (index < length) {
        if (KEYCODE_SPACE == codePointBuffer[index]) {
            spacesSeen += 1;
        }
        ++index;
    }
    return spacesSeen;
}
|
||||
|
||||
static unsigned short latin_tolower(const unsigned short c);
|
||||
|
||||
private:
|
||||
|
|
Loading…
Reference in a new issue