From 60021bbdc25b7cda864fb3d1bf47d4f0e977e7f9 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Thu, 20 Nov 2014 15:55:44 +0900 Subject: [PATCH] Enable Quadgram for personalized dicts. Before: Total words: 1134659, Success Num: 944709, Success Percentage: 83.259% Bad Failures, with auto-correction (typed word == expected word, output word != expected word): 1258, Bad Failure Percentage: 0.111% Failures, with auto-correction (F-C): 28013, F-C Percentage: 2.469% Max Keystrokes: 6072844, Min Keystrokes: 3347332, Keystroke Saving Percentage:44.880% After: Total words: 1134665, Success Num: 945026, Success Percentage: 83.287% Bad Failures, with auto-correction (typed word == expected word, output word != expected word): 1271, Bad Failure Percentage: 0.112% Failures, with auto-correction (F-C): 27756, F-C Percentage: 2.446% Max Keystrokes: 6072850, Min Keystrokes: 3290996, Keystroke Saving Percentage:45.808% Change-Id: I16af52a3e9c371b95fd6f0741f45ee6b2443bea6 --- .../com/android/inputmethod/latin/common/Constants.java | 2 +- native/jni/src/defines.h | 2 +- .../suggest/policyimpl/dictionary/header/header_policy.cpp | 7 ++++--- .../content/dynamic_language_model_probability_utils.cpp | 7 ++++--- .../v4/content/dynamic_language_model_probability_utils.h | 2 +- .../suggest/policyimpl/dictionary/utils/entry_counters.h | 2 +- native/jni/src/utils/ngram_utils.h | 1 + 7 files changed, 13 insertions(+), 10 deletions(-) diff --git a/common/src/com/android/inputmethod/latin/common/Constants.java b/common/src/com/android/inputmethod/latin/common/Constants.java index abc377a84..a860d3560 100644 --- a/common/src/com/android/inputmethod/latin/common/Constants.java +++ b/common/src/com/android/inputmethod/latin/common/Constants.java @@ -179,7 +179,7 @@ public final class Constants { // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported in Java side. Needs to modify // MAX_PREV_WORD_COUNT_FOR_N_GRAM in native/jni/src/defines.h for suggestions. 
- public static final int MAX_PREV_WORD_COUNT_FOR_N_GRAM = 2; + public static final int MAX_PREV_WORD_COUNT_FOR_N_GRAM = 3; // Key events coming any faster than this are long-presses. public static final int LONG_PRESS_MILLISECONDS = 200; diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 0e67b4d5a..10b930e4f 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -275,7 +275,7 @@ static inline void showStackTrace() { #define MAX_POINTER_COUNT_G 2 // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported. -#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 2 +#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 3 #define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \ TypeName() = delete diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index a2a0f11b4..c93f31017 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -31,10 +31,11 @@ const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = - {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT"}; + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = - {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT"}; -const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000}; + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", + "MAX_QUADGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to 
support decaying such as timestamp, level and // count. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp index 29bc7f719..025ee9932 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp @@ -19,12 +19,13 @@ namespace latinime { // Used to provide stable probabilities even if the user's input count is small. -const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2}; +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1}; // Encoded backoff weights. -// Note that we give positive value for trigrams that means the weight is more than 1. +// Note that we give positive values for trigrams and quadgrams that means the weight is more than +// 1. // TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. -const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, 0, 8}; +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8}; // This value is used to remove too old entries from the dictionary. 
const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h index b38047f49..644ae2ca7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h @@ -66,7 +66,7 @@ class DynamicLanguageModelProbabilityUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); - static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 2, "Max supported Ngram is Trigram."); + static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram."); static const int ASSUMED_MIN_COUNTS[]; static const int ENCODED_BACKOFF_WEIGHTS[]; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h b/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h index 7269913e8..5e443026e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h @@ -27,7 +27,7 @@ namespace latinime { // Copyable but immutable class EntryCounts final { public: - EntryCounts() : mEntryCounts({{0, 0, 0}}) {} + EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {} explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters) : mEntryCounts(counters) {} diff --git a/native/jni/src/utils/ngram_utils.h b/native/jni/src/utils/ngram_utils.h index 6227812d4..fa85ba35f 100644 --- a/native/jni/src/utils/ngram_utils.h +++ b/native/jni/src/utils/ngram_utils.h @@ -25,6 +25,7 @@ enum class NgramType : int { Unigram = 0, Bigram = 1, Trigram = 2, + Quadgram = 3, NotANgramType = -1, };