Change n-gram entry count limits.

Default maximum entry counts: unigram 10K, bigram 30K, trigram 30K.

Change-Id: Ibd19c6a2b618499df1c70000bad7b47498187f0a
This commit is contained in:
Keisuke Kuroyanagi 2014-10-20 15:01:49 +09:00
parent 101cdca729
commit 1085fef8d0
6 changed files with 65 additions and 27 deletions

View file

@ -64,9 +64,6 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100; private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100;
private static final int DEFAULT_MAX_UNIGRAM_COUNT = 10000;
private static final int DEFAULT_MAX_BIGRAM_COUNT = 10000;
/** /**
* The maximum length of a word in this dictionary. * The maximum length of a word in this dictionary.
*/ */
@ -225,10 +222,6 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString()); attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString());
attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY, attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY,
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY,
String.valueOf(DEFAULT_MAX_UNIGRAM_COUNT));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY,
String.valueOf(DEFAULT_MAX_BIGRAM_COUNT));
return attributeMap; return attributeMap;
} }

View file

@ -40,8 +40,9 @@ public final class DictionaryHeader {
public static final String USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; public static final String USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT";
public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT";
public static final String MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT";
public static final String ATTRIBUTE_VALUE_TRUE = "1"; public static final String ATTRIBUTE_VALUE_TRUE = "1";
public static final String CODE_POINT_TABLE_KEY = "codePointTable"; public static final String CODE_POINT_TABLE_KEY = "codePointTable";

View file

@ -38,15 +38,17 @@ const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT";
const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT";
const char *const HeaderPolicy::MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000; const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 30000;
const int HeaderPolicy::DEFAULT_MAX_TRIGRAM_COUNT = 30000;
// Used for logging. Question mark is used to indicate that the key is not found. // Used for logging. Question mark is used to indicate that the key is not found.
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,

View file

@ -253,11 +253,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
static const char *const MAX_UNIGRAM_COUNT_KEY; static const char *const MAX_UNIGRAM_COUNT_KEY;
static const char *const MAX_BIGRAM_COUNT_KEY; static const char *const MAX_BIGRAM_COUNT_KEY;
static const char *const MAX_TRIGRAM_COUNT_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
static const int DEFAULT_MAX_UNIGRAM_COUNT; static const int DEFAULT_MAX_UNIGRAM_COUNT;
static const int DEFAULT_MAX_BIGRAM_COUNT; static const int DEFAULT_MAX_BIGRAM_COUNT;
static const int DEFAULT_MAX_TRIGRAM_COUNT;
const FormatUtils::FORMAT_VERSION mDictFormatVersion; const FormatUtils::FORMAT_VERSION mDictFormatVersion;
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;

View file

@ -39,7 +39,6 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -136,11 +135,18 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
private HashSet<File> mDictFilesToBeDeleted = new HashSet<>(); private HashSet<File> mDictFilesToBeDeleted = new HashSet<>();
private File createEmptyDictionaryAndGetFile(final int formatVersion) { private File createEmptyDictionaryAndGetFile(final int formatVersion) {
return createEmptyDictionaryWithAttributeMapAndGetFile(formatVersion,
new HashMap<String, String>());
}
private File createEmptyDictionaryWithAttributeMapAndGetFile(final int formatVersion,
final HashMap<String, String> attributeMap) {
if (formatVersion == FormatSpec.VERSION4 if (formatVersion == FormatSpec.VERSION4
|| formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING || formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING
|| formatVersion == FormatSpec.VERSION4_DEV) { || formatVersion == FormatSpec.VERSION4_DEV) {
try { try {
final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion); final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
attributeMap);
mDictFilesToBeDeleted.add(dictFile); mDictFilesToBeDeleted.add(dictFile);
return dictFile; return dictFile;
} catch (final IOException e) { } catch (final IOException e) {
@ -152,12 +158,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
return null; return null;
} }
private File createEmptyVer4DictionaryAndGetFile(final int formatVersion) private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
final HashMap<String, String> attributeMap)
throws IOException { throws IOException {
final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
getContext().getCacheDir()); getContext().getCacheDir());
FileUtils.deleteRecursively(file); FileUtils.deleteRecursively(file);
Map<String, String> attributeMap = new HashMap<>();
attributeMap.put(DictionaryHeader.DICTIONARY_ID_KEY, DICTIONARY_ID); attributeMap.put(DictionaryHeader.DICTIONARY_ID_KEY, DICTIONARY_ID);
attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY, attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY,
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
@ -388,7 +394,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
} }
final int maxUnigramCount = Integer.parseInt( final int maxUnigramCount = Integer.parseInt(
binaryDictionary.getPropertyForGettingStats(BinaryDictionary.MAX_UNIGRAM_COUNT_QUERY)); binaryDictionary.getPropertyForGettingStats(
BinaryDictionary.MAX_UNIGRAM_COUNT_QUERY));
for (int i = 0; i < unigramTypedCount; i++) { for (int i = 0; i < unigramTypedCount; i++) {
final String word = words.get(random.nextInt(words.size())); final String word = words.get(random.nextInt(words.size()));
onInputWord(binaryDictionary, word, true /* isValidWord */); onInputWord(binaryDictionary, word, true /* isValidWord */);
@ -476,6 +483,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
} }
private void testAddManyBigramsToDecayingDict(final int formatVersion) { private void testAddManyBigramsToDecayingDict(final int formatVersion) {
final int maxUnigramCount = 5000;
final int maxBigramCount = 10000;
final HashMap<String, String> attributeMap = new HashMap<>();
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
final int unigramCount = 5000; final int unigramCount = 5000;
final int bigramCount = 30000; final int bigramCount = 30000;
final int bigramTypedCount = 100000; final int bigramTypedCount = 100000;
@ -484,7 +497,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
final Random random = new Random(seed); final Random random = new Random(seed);
setCurrentTimeForTestMode(mCurrentTime); setCurrentTimeForTestMode(mCurrentTime);
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); final File dictFile = createEmptyDictionaryWithAttributeMapAndGetFile(formatVersion,
attributeMap);
final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
@ -507,9 +521,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
bigrams.add(bigram); bigrams.add(bigram);
} }
final int maxBigramCount = Integer.parseInt(
binaryDictionary.getPropertyForGettingStats(
BinaryDictionary.MAX_BIGRAM_COUNT_QUERY));
for (int i = 0; i < bigramTypedCount; ++i) { for (int i = 0; i < bigramTypedCount; ++i) {
final Pair<String, String> bigram = bigrams.get(random.nextInt(bigrams.size())); final Pair<String, String> bigram = bigrams.get(random.nextInt(bigrams.size()));
onInputWord(binaryDictionary, bigram.first, true /* isValidWord */); onInputWord(binaryDictionary, bigram.first, true /* isValidWord */);
@ -546,6 +557,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
} }
private void testOverflowBigrams(final int formatVersion) { private void testOverflowBigrams(final int formatVersion) {
final int maxUnigramCount = 5000;
final int maxBigramCount = 10000;
final HashMap<String, String> attributeMap = new HashMap<>();
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
final int bigramCount = 20000; final int bigramCount = 20000;
final int unigramCount = 1000; final int unigramCount = 1000;
final int unigramTypedCount = 20; final int unigramTypedCount = 20;
@ -556,7 +573,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
final long seed = System.currentTimeMillis(); final long seed = System.currentTimeMillis();
final Random random = new Random(seed); final Random random = new Random(seed);
setCurrentTimeForTestMode(mCurrentTime); setCurrentTimeForTestMode(mCurrentTime);
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); final File dictFile = createEmptyDictionaryWithAttributeMapAndGetFile(formatVersion,
attributeMap);
final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

View file

@ -23,6 +23,7 @@ import android.util.Pair;
import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.NgramContext.WordInfo;
import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.CodePointUtils;
import com.android.inputmethod.latin.makedict.DictionaryHeader;
import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.WeightedString; import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.makedict.WordProperty;
@ -78,11 +79,18 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
private File createEmptyDictionaryAndGetFile(final int formatVersion) { private File createEmptyDictionaryAndGetFile(final int formatVersion) {
return createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
new HashMap<String, String>());
}
private File createEmptyDictionaryWithAttributesAndGetFile(final int formatVersion,
final HashMap<String, String> attributeMap) {
if (formatVersion == FormatSpec.VERSION4 if (formatVersion == FormatSpec.VERSION4
|| formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING || formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING
|| formatVersion == FormatSpec.VERSION4_DEV) { || formatVersion == FormatSpec.VERSION4_DEV) {
try { try {
final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion); final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion,
attributeMap);
mDictFilesToBeDeleted.add(dictFile); mDictFilesToBeDeleted.add(dictFile);
return dictFile; return dictFile;
} catch (final IOException e) { } catch (final IOException e) {
@ -94,12 +102,12 @@ public class BinaryDictionaryTests extends AndroidTestCase {
return null; return null;
} }
private File createEmptyVer4DictionaryAndGetFile(final int formatVersion) throws IOException { private File createEmptyVer4DictionaryAndGetFile(final int formatVersion,
final HashMap<String, String> attributeMap) throws IOException {
final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION, final File file = File.createTempFile(DICTIONARY_ID, TEST_DICT_FILE_EXTENSION,
getContext().getCacheDir()); getContext().getCacheDir());
file.delete(); file.delete();
file.mkdir(); file.mkdir();
Map<String, String> attributeMap = new HashMap<>();
if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion, if (BinaryDictionaryUtils.createEmptyDictFile(file.getAbsolutePath(), formatVersion,
Locale.ENGLISH, attributeMap)) { Locale.ENGLISH, attributeMap)) {
return file; return file;
@ -669,6 +677,12 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
private void testRandomOperationsAndFlashWithGC(final int formatVersion) { private void testRandomOperationsAndFlashWithGC(final int formatVersion) {
final int maxUnigramCount = 5000;
final int maxBigramCount = 10000;
final HashMap<String, String> attributeMap = new HashMap<>();
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
final int flashWithGCIterationCount = 50; final int flashWithGCIterationCount = 50;
final int operationCountInEachIteration = 200; final int operationCountInEachIteration = 200;
final int initialUnigramCount = 100; final int initialUnigramCount = 100;
@ -679,7 +693,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
final long seed = System.currentTimeMillis(); final long seed = System.currentTimeMillis();
final Random random = new Random(seed); final Random random = new Random(seed);
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
attributeMap);
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
final ArrayList<String> words = new ArrayList<>(); final ArrayList<String> words = new ArrayList<>();
@ -815,13 +830,20 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
private void testUnigramAndBigramCount(final int formatVersion) { private void testUnigramAndBigramCount(final int formatVersion) {
final int maxUnigramCount = 5000;
final int maxBigramCount = 10000;
final HashMap<String, String> attributeMap = new HashMap<>();
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY, String.valueOf(maxUnigramCount));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY, String.valueOf(maxBigramCount));
final int flashWithGCIterationCount = 10; final int flashWithGCIterationCount = 10;
final int codePointSetSize = 50; final int codePointSetSize = 50;
final int unigramCountPerIteration = 1000; final int unigramCountPerIteration = 1000;
final int bigramCountPerIteration = 2000; final int bigramCountPerIteration = 2000;
final long seed = System.currentTimeMillis(); final long seed = System.currentTimeMillis();
final Random random = new Random(seed); final Random random = new Random(seed);
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
attributeMap);
final ArrayList<String> words = new ArrayList<>(); final ArrayList<String> words = new ArrayList<>();
final HashSet<Pair<String, String>> bigrams = new HashSet<>(); final HashSet<Pair<String, String>> bigrams = new HashSet<>();