Retire Delight2 migration code to speed up tests.
We're waiting 10 minutes for tests to run, and half of that time is spent in deprecated code related to migration of Delight2 dictionary files. LatinIME will never migrate another Delight2 dictionary file again, so we can delete this code. Change-Id: I05c7d8429e8d9a26139456763c77997340fea8c2
This commit is contained in:
parent
fe716f0f73
commit
c15bbb52a3
7 changed files with 24 additions and 1002 deletions
|
@ -121,8 +121,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
|||
private static boolean needsToMigrateDictionary(final int formatVersion) {
|
||||
// When we bump up the dictionary format version, the old version should be added to here
|
||||
// for supporting migration. Note that native code has to support reading such formats.
|
||||
return formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING
|
||||
|| formatVersion == FormatSpec.VERSION402;
|
||||
return formatVersion == FormatSpec.VERSION402;
|
||||
}
|
||||
|
||||
public boolean isValidDictionaryLocked() {
|
||||
|
|
|
@ -174,9 +174,6 @@ public final class FormatSpec {
|
|||
public static final int VERSION202 = 202;
|
||||
// format version for Fava Dictionaries.
|
||||
public static final int VERSION_DELIGHT3 = 86736212;
|
||||
public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201;
|
||||
// Dictionary version used for testing.
|
||||
public static final int VERSION4_ONLY_FOR_TESTING = 399;
|
||||
public static final int VERSION402 = 402;
|
||||
public static final int VERSION403 = 403;
|
||||
public static final int VERSION4 = VERSION403;
|
||||
|
|
|
@ -42,8 +42,6 @@ import java.util.Random;
|
|||
public class BinaryDictionaryTests extends AndroidTestCase {
|
||||
private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
|
||||
private static final String TEST_LOCALE = "test";
|
||||
private static final int[] DICT_FORMAT_VERSIONS =
|
||||
new int[] { FormatSpec.VERSION402, FormatSpec.VERSION403 };
|
||||
private static final String DICTIONARY_ID = "TestBinaryDictionary";
|
||||
|
||||
private static boolean supportsNgram(final int formatVersion) {
|
||||
|
@ -113,13 +111,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testIsValidDictionary() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testIsValidDictionary(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testIsValidDictionary(final int formatVersion) {
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
assertTrue("binaryDictionary must be valid for existing valid dictionary file.",
|
||||
binaryDictionary.isValidDictionary());
|
||||
|
@ -134,20 +126,14 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testConstructingDictionaryOnMemory() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testConstructingDictionaryOnMemory(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testConstructingDictionaryOnMemory(final int formatVersion) {
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
FileUtils.deleteRecursively(dictFile);
|
||||
assertFalse(dictFile.exists());
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, formatVersion,
|
||||
new HashMap<String, String>());
|
||||
true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
|
||||
FormatSpec.VERSION403, new HashMap<String, String>());
|
||||
assertTrue(binaryDictionary.isValidDictionary());
|
||||
assertEquals(formatVersion, binaryDictionary.getFormatVersion());
|
||||
assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
|
||||
final int probability = 100;
|
||||
addUnigramWord(binaryDictionary, "word", probability);
|
||||
assertEquals(probability, binaryDictionary.getFrequency("word"));
|
||||
|
@ -155,19 +141,13 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
binaryDictionary.flush();
|
||||
assertTrue(dictFile.exists());
|
||||
assertTrue(binaryDictionary.isValidDictionary());
|
||||
assertEquals(formatVersion, binaryDictionary.getFormatVersion());
|
||||
assertEquals(FormatSpec.VERSION403, binaryDictionary.getFormatVersion());
|
||||
assertEquals(probability, binaryDictionary.getFrequency("word"));
|
||||
binaryDictionary.close();
|
||||
}
|
||||
|
||||
public void testAddTooLongWord() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testAddTooLongWord(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testAddTooLongWord(final int formatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
final StringBuffer stringBuilder = new StringBuffer();
|
||||
for (int i = 0; i < BinaryDictionary.DICTIONARY_MAX_WORD_LENGTH; i++) {
|
||||
stringBuilder.append('a');
|
||||
|
@ -234,13 +214,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testAddUnigramWord() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testAddUnigramWord(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testAddUnigramWord(final int formatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
final int probability = 100;
|
||||
addUnigramWord(binaryDictionary, "aaa", probability);
|
||||
// Reallocate and create.
|
||||
|
@ -267,16 +241,10 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testRandomlyAddUnigramWord() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testRandomlyAddUnigramWord(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testRandomlyAddUnigramWord(final int formatVersion) {
|
||||
final int wordCount = 1000;
|
||||
final int codePointSetSize = 50;
|
||||
final long seed = System.currentTimeMillis();
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
|
||||
final HashMap<String, Integer> probabilityMap = new HashMap<>();
|
||||
// Test a word that isn't contained within the dictionary.
|
||||
|
@ -295,13 +263,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testAddBigramWords() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testAddBigramWords(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testAddBigramWords(final int formatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
|
||||
final int unigramProbability = 100;
|
||||
final int bigramProbability = 150;
|
||||
|
@ -354,18 +316,12 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testRandomlyAddBigramWords() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testRandomlyAddBigramWords(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testRandomlyAddBigramWords(final int formatVersion) {
|
||||
final int wordCount = 100;
|
||||
final int bigramCount = 1000;
|
||||
final int codePointSetSize = 50;
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<>();
|
||||
final ArrayList<Pair<String, String>> bigramWords = new ArrayList<>();
|
||||
|
@ -406,15 +362,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testAddTrigramWords() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
if (supportsNgram(formatVersion)) {
|
||||
testAddTrigramWords(formatVersion);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void testAddTrigramWords(final int formatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
final int unigramProbability = 100;
|
||||
final int trigramProbability = 150;
|
||||
final int updatedTrigramProbability = 200;
|
||||
|
@ -440,13 +388,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testFlushDictionary() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testFlushDictionary(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testFlushDictionary(final int formatVersion) {
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
|
||||
final int probability = 100;
|
||||
|
@ -480,13 +422,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testFlushWithGCDictionary() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testFlushWithGCDictionary(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testFlushWithGCDictionary(final int formatVersion) {
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
final int unigramProbability = 100;
|
||||
final int bigramProbability = 150;
|
||||
|
@ -516,20 +452,13 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testAddBigramWordsAndFlashWithGC() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testAddBigramWordsAndFlashWithGC(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Evaluate performance of GC
|
||||
private void testAddBigramWordsAndFlashWithGC(final int formatVersion) {
|
||||
final int wordCount = 100;
|
||||
final int bigramCount = 1000;
|
||||
final int codePointSetSize = 30;
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<>();
|
||||
|
@ -575,12 +504,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testRandomOperationsAndFlashWithGC() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testRandomOperationsAndFlashWithGC(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testRandomOperationsAndFlashWithGC(final int formatVersion) {
|
||||
final int maxUnigramCount = 5000;
|
||||
final int maxBigramCount = 10000;
|
||||
final HashMap<String, String> attributeMap = new HashMap<>();
|
||||
|
@ -596,7 +519,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
|
||||
final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
|
||||
attributeMap);
|
||||
BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
|
||||
|
@ -675,19 +598,13 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testAddManyUnigramsAndFlushWithGC() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testAddManyUnigramsAndFlushWithGC(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testAddManyUnigramsAndFlushWithGC(final int formatVersion) {
|
||||
final int flashWithGCIterationCount = 3;
|
||||
final int codePointSetSize = 50;
|
||||
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<>();
|
||||
final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
|
||||
|
@ -716,12 +633,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testUnigramAndBigramCount() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testUnigramAndBigramCount(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testUnigramAndBigramCount(final int formatVersion) {
|
||||
final int maxUnigramCount = 5000;
|
||||
final int maxBigramCount = 10000;
|
||||
final HashMap<String, String> attributeMap = new HashMap<>();
|
||||
|
@ -734,7 +645,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int bigramCountPerIteration = 2000;
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(formatVersion,
|
||||
final File dictFile = createEmptyDictionaryWithAttributesAndGetFile(FormatSpec.VERSION403,
|
||||
attributeMap);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<>();
|
||||
|
@ -778,19 +689,13 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testGetWordProperties() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testGetWordProperties(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testGetWordProperties(final int formatVersion) {
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final int UNIGRAM_COUNT = 1000;
|
||||
final int BIGRAM_COUNT = 1000;
|
||||
final int codePointSetSize = 20;
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||
final File dictFile = createEmptyDictionaryAndGetFile(FormatSpec.VERSION403);
|
||||
final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||
|
||||
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
|
||||
|
@ -869,19 +774,13 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
public void testIterateAllWords() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testIterateAllWords(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testIterateAllWords(final int formatVersion) {
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final int UNIGRAM_COUNT = 1000;
|
||||
final int BIGRAM_COUNT = 1000;
|
||||
final int codePointSetSize = 20;
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
|
||||
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
|
||||
false /* isBeginningOfSentence */);
|
||||
|
@ -965,123 +864,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(true, wordProperty.mIsPossiblyOffensive);
|
||||
}
|
||||
|
||||
public void testDictMigration() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testDictMigration(final int fromFormatVersion, final int toFormatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(fromFormatVersion);
|
||||
final int unigramProbability = 100;
|
||||
addUnigramWord(binaryDictionary, "aaa", unigramProbability);
|
||||
addUnigramWord(binaryDictionary, "bbb", unigramProbability);
|
||||
final int bigramProbability = 150;
|
||||
addBigramWords(binaryDictionary, "aaa", "bbb", bigramProbability);
|
||||
binaryDictionary.addUnigramEntry("ccc", unigramProbability,
|
||||
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
binaryDictionary.addUnigramEntry("ddd", unigramProbability,
|
||||
false /* isBeginningOfSentence */,
|
||||
true /* isNotAWord */, true /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
binaryDictionary.addNgramEntry(NgramContext.BEGINNING_OF_SENTENCE,
|
||||
"aaa", bigramProbability, 0 /* timestamp */);
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
||||
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
||||
assertEquals(fromFormatVersion, binaryDictionary.getFormatVersion());
|
||||
assertTrue(binaryDictionary.migrateTo(toFormatVersion));
|
||||
assertTrue(binaryDictionary.isValidDictionary());
|
||||
assertEquals(toFormatVersion, binaryDictionary.getFormatVersion());
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
||||
assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
|
||||
assertEquals(bigramProbability, binaryDictionary.getNgramProbability(
|
||||
NgramContext.BEGINNING_OF_SENTENCE, "aaa"));
|
||||
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
||||
WordProperty wordProperty = binaryDictionary.getWordProperty("ccc",
|
||||
false /* isBeginningOfSentence */);
|
||||
wordProperty = binaryDictionary.getWordProperty("ddd",
|
||||
false /* isBeginningOfSentence */);
|
||||
assertTrue(wordProperty.mIsPossiblyOffensive);
|
||||
assertTrue(wordProperty.mIsNotAWord);
|
||||
}
|
||||
|
||||
public void testLargeDictMigration() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testLargeDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testLargeDictMigration(final int fromFormatVersion, final int toFormatVersion) {
|
||||
final int UNIGRAM_COUNT = 3000;
|
||||
final int BIGRAM_COUNT = 3000;
|
||||
final int codePointSetSize = 50;
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(fromFormatVersion);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<>();
|
||||
final ArrayList<Pair<String, String>> bigrams = new ArrayList<>();
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
|
||||
final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
|
||||
|
||||
for (int i = 0; i < UNIGRAM_COUNT; i++) {
|
||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||
final int unigramProbability = random.nextInt(0xFF);
|
||||
addUnigramWord(binaryDictionary, word, unigramProbability);
|
||||
if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
words.add(word);
|
||||
unigramProbabilities.put(word, unigramProbability);
|
||||
}
|
||||
|
||||
for (int i = 0; i < BIGRAM_COUNT; i++) {
|
||||
final int word0Index = random.nextInt(words.size());
|
||||
final int word1Index = random.nextInt(words.size());
|
||||
if (word0Index == word1Index) {
|
||||
continue;
|
||||
}
|
||||
final String word0 = words.get(word0Index);
|
||||
final String word1 = words.get(word1Index);
|
||||
final int unigramProbability = unigramProbabilities.get(word1);
|
||||
final int bigramProbability =
|
||||
random.nextInt(0xFF - unigramProbability) + unigramProbability;
|
||||
addBigramWords(binaryDictionary, word0, word1, bigramProbability);
|
||||
if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
final Pair<String, String> bigram = new Pair<>(word0, word1);
|
||||
bigrams.add(bigram);
|
||||
bigramProbabilities.put(bigram, bigramProbability);
|
||||
}
|
||||
assertTrue(binaryDictionary.migrateTo(toFormatVersion));
|
||||
|
||||
for (final String word : words) {
|
||||
assertEquals((int)unigramProbabilities.get(word), binaryDictionary.getFrequency(word));
|
||||
}
|
||||
assertEquals(unigramProbabilities.size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForGettingStats(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||
|
||||
for (final Pair<String, String> bigram : bigrams) {
|
||||
assertEquals((int)bigramProbabilities.get(bigram),
|
||||
getBigramProbability(binaryDictionary, bigram.first, bigram.second));
|
||||
assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second));
|
||||
}
|
||||
assertEquals(bigramProbabilities.size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForGettingStats(BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||
}
|
||||
|
||||
public void testBeginningOfSentence() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testBeginningOfSentence(formatVersion);
|
||||
}
|
||||
}
|
||||
|
||||
private void testBeginningOfSentence(final int formatVersion) {
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(formatVersion);
|
||||
final BinaryDictionary binaryDictionary = getEmptyBinaryDictionary(FormatSpec.VERSION403);
|
||||
final int dummyProbability = 0;
|
||||
final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
|
||||
final int bigramProbability = 200;
|
||||
|
|
|
@ -43,22 +43,12 @@ public final class BinaryDictIOUtils {
|
|||
*/
|
||||
public static DictDecoder getDictDecoder(final File dictFile, final long offset,
|
||||
final long length, final int bufferType) {
|
||||
if (dictFile.isDirectory()) {
|
||||
return new Ver4DictDecoder(dictFile);
|
||||
} else if (dictFile.isFile()) {
|
||||
return new Ver2DictDecoder(dictFile, offset, length, bufferType);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static DictDecoder getDictDecoder(final File dictFile, final long offset,
|
||||
final long length, final DictionaryBufferFactory factory) {
|
||||
if (dictFile.isDirectory()) {
|
||||
return new Ver4DictDecoder(dictFile);
|
||||
} else if (dictFile.isFile()) {
|
||||
return new Ver2DictDecoder(dictFile, offset, length, factory);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static DictDecoder getDictDecoder(final File dictFile, final long offset,
|
||||
|
|
|
@ -1,319 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin.makedict;
|
||||
|
||||
import com.android.inputmethod.annotations.UsedForTesting;
|
||||
import com.android.inputmethod.latin.BinaryDictionary;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* An implementation of DictDecoder for version 2 binary dictionary.
|
||||
*/
|
||||
// TODO: Separate logics that are used only for testing.
|
||||
@UsedForTesting
|
||||
public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||
/**
|
||||
* A utility class for reading a PtNode.
|
||||
*/
|
||||
static class PtNodeReader {
|
||||
static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) {
|
||||
// Ver2 dicts don't contain historical information.
|
||||
return new ProbabilityInfo(dictBuffer.readUnsignedByte());
|
||||
}
|
||||
|
||||
static int readPtNodeOptionFlags(final DictBuffer dictBuffer) {
|
||||
return dictBuffer.readUnsignedByte();
|
||||
}
|
||||
|
||||
static int readChildrenAddress(final DictBuffer dictBuffer,
|
||||
final int ptNodeFlags) {
|
||||
switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
|
||||
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
|
||||
return dictBuffer.readUnsignedByte();
|
||||
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
|
||||
return dictBuffer.readUnsignedShort();
|
||||
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
|
||||
return dictBuffer.readUnsignedInt24();
|
||||
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
|
||||
default:
|
||||
return FormatSpec.NO_CHILDREN_ADDRESS;
|
||||
}
|
||||
}
|
||||
|
||||
// Reads shortcuts and returns the read length.
|
||||
static int readShortcut(final DictBuffer dictBuffer,
|
||||
final ArrayList<WeightedString> shortcutTargets) {
|
||||
final int pointerBefore = dictBuffer.position();
|
||||
dictBuffer.readUnsignedShort(); // skip the size
|
||||
while (true) {
|
||||
final int targetFlags = dictBuffer.readUnsignedByte();
|
||||
final String word = CharEncoding.readString(dictBuffer);
|
||||
shortcutTargets.add(new WeightedString(word,
|
||||
targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
|
||||
if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
|
||||
}
|
||||
return dictBuffer.position() - pointerBefore;
|
||||
}
|
||||
|
||||
static int readBigramAddresses(final DictBuffer dictBuffer,
|
||||
final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
|
||||
int readLength = 0;
|
||||
int bigramCount = 0;
|
||||
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
|
||||
final int bigramFlags = dictBuffer.readUnsignedByte();
|
||||
++readLength;
|
||||
final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
|
||||
? 1 : -1;
|
||||
int bigramAddress = baseAddress + readLength;
|
||||
switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
|
||||
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
|
||||
bigramAddress += sign * dictBuffer.readUnsignedByte();
|
||||
readLength += 1;
|
||||
break;
|
||||
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
|
||||
bigramAddress += sign * dictBuffer.readUnsignedShort();
|
||||
readLength += 2;
|
||||
break;
|
||||
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
|
||||
bigramAddress += sign * dictBuffer.readUnsignedInt24();
|
||||
readLength += 3;
|
||||
break;
|
||||
default:
|
||||
throw new RuntimeException("Has bigrams with no address");
|
||||
}
|
||||
bigrams.add(new PendingAttribute(
|
||||
bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
|
||||
bigramAddress));
|
||||
if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
|
||||
}
|
||||
return readLength;
|
||||
}
|
||||
}
|
||||
|
||||
protected final File mDictionaryBinaryFile;
|
||||
protected final long mOffset;
|
||||
protected final long mLength;
|
||||
// TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now
|
||||
// used only for testing.
|
||||
private final DictionaryBufferFactory mBufferFactory;
|
||||
protected DictBuffer mDictBuffer;
|
||||
|
||||
@UsedForTesting
|
||||
/* package */ Ver2DictDecoder(final File file, final long offset, final long length,
|
||||
final int factoryFlag) {
|
||||
mDictionaryBinaryFile = file;
|
||||
mOffset = offset;
|
||||
mLength = length;
|
||||
mDictBuffer = null;
|
||||
if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
|
||||
mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
|
||||
} else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) {
|
||||
mBufferFactory = new DictionaryBufferFromByteArrayFactory();
|
||||
} else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
|
||||
mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
|
||||
} else {
|
||||
mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
|
||||
}
|
||||
}
|
||||
|
||||
/* package */ Ver2DictDecoder(final File file, final long offset, final long length,
|
||||
final DictionaryBufferFactory factory) {
|
||||
mDictionaryBinaryFile = file;
|
||||
mOffset = offset;
|
||||
mLength = length;
|
||||
mBufferFactory = factory;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void openDictBuffer() throws FileNotFoundException, IOException {
|
||||
mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDictBufferOpen() {
|
||||
return mDictBuffer != null;
|
||||
}
|
||||
|
||||
/* package */ DictBuffer getDictBuffer() {
|
||||
return mDictBuffer;
|
||||
}
|
||||
|
||||
@UsedForTesting
|
||||
/* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException {
|
||||
openDictBuffer();
|
||||
return getDictBuffer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
|
||||
// dictType is not being used in dicttool. Passing an empty string.
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(
|
||||
mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength,
|
||||
true /* useFullEditDistance */, null /* locale */, "" /* dictType */,
|
||||
false /* isUpdatable */);
|
||||
final DictionaryHeader header = binaryDictionary.getHeader();
|
||||
binaryDictionary.close();
|
||||
if (header == null) {
|
||||
throw new IOException("Cannot read the dictionary header.");
|
||||
}
|
||||
if (header.mFormatOptions.mVersion != FormatSpec.VERSION2 &&
|
||||
header.mFormatOptions.mVersion != FormatSpec.VERSION201 &&
|
||||
header.mFormatOptions.mVersion != FormatSpec.VERSION202) {
|
||||
throw new UnsupportedFormatException("File header has a wrong version : "
|
||||
+ header.mFormatOptions.mVersion);
|
||||
}
|
||||
if (!isDictBufferOpen()) {
|
||||
openDictBuffer();
|
||||
}
|
||||
// Advance buffer reading position to the head of dictionary body.
|
||||
setPosition(header.mBodyOffset);
|
||||
return header;
|
||||
}
|
||||
|
||||
// TODO: Make this buffer multi thread safe.
|
||||
private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
|
||||
@Override
|
||||
public PtNodeInfo readPtNode(final int ptNodePos) {
|
||||
int addressPointer = ptNodePos;
|
||||
final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
|
||||
addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
|
||||
final int characters[];
|
||||
if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
|
||||
int index = 0;
|
||||
int character = CharEncoding.readChar(mDictBuffer);
|
||||
addressPointer += CharEncoding.getCharSize(character, null);
|
||||
while (FormatSpec.INVALID_CHARACTER != character) {
|
||||
// FusionDictionary is making sure that the length of the word is smaller than
|
||||
// MAX_WORD_LENGTH.
|
||||
// So we'll never write past the end of mCharacterBuffer.
|
||||
mCharacterBuffer[index++] = character;
|
||||
character = CharEncoding.readChar(mDictBuffer);
|
||||
addressPointer += CharEncoding.getCharSize(character, null);
|
||||
}
|
||||
characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
|
||||
} else {
|
||||
final int character = CharEncoding.readChar(mDictBuffer);
|
||||
addressPointer += CharEncoding.getCharSize(character, null);
|
||||
characters = new int[] { character };
|
||||
}
|
||||
final ProbabilityInfo probabilityInfo;
|
||||
if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
|
||||
probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer);
|
||||
addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
|
||||
} else {
|
||||
probabilityInfo = null;
|
||||
}
|
||||
int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags);
|
||||
if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
|
||||
childrenAddress += addressPointer;
|
||||
}
|
||||
addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags);
|
||||
final ArrayList<WeightedString> shortcutTargets;
|
||||
if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
|
||||
// readShortcut will add shortcuts to shortcutTargets.
|
||||
shortcutTargets = new ArrayList<>();
|
||||
addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
|
||||
} else {
|
||||
shortcutTargets = null;
|
||||
}
|
||||
|
||||
final ArrayList<PendingAttribute> bigrams;
|
||||
if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
|
||||
bigrams = new ArrayList<>();
|
||||
addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
|
||||
addressPointer);
|
||||
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
|
||||
throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
|
||||
+ " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
|
||||
}
|
||||
} else {
|
||||
bigrams = null;
|
||||
}
|
||||
return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo,
|
||||
childrenAddress, shortcutTargets, bigrams);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken)
|
||||
throws FileNotFoundException, IOException, UnsupportedFormatException {
|
||||
// dictType is not being used in dicttool. Passing an empty string.
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(
|
||||
mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */,
|
||||
mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */,
|
||||
null /* locale */, "" /* dictType */, false /* isUpdatable */);
|
||||
final DictionaryHeader header = readHeader();
|
||||
final FusionDictionary fusionDict =
|
||||
new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
|
||||
int token = 0;
|
||||
final ArrayList<WordProperty> wordProperties = new ArrayList<>();
|
||||
do {
|
||||
final BinaryDictionary.GetNextWordPropertyResult result =
|
||||
binaryDictionary.getNextWordProperty(token);
|
||||
final WordProperty wordProperty = result.mWordProperty;
|
||||
if (wordProperty == null) {
|
||||
binaryDictionary.close();
|
||||
if (deleteDictIfBroken) {
|
||||
mDictionaryBinaryFile.delete();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
wordProperties.add(wordProperty);
|
||||
token = result.mNextToken;
|
||||
} while (token != 0);
|
||||
|
||||
// Insert unigrams into the fusion dictionary.
|
||||
for (final WordProperty wordProperty : wordProperties) {
|
||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||
wordProperty.mIsNotAWord,
|
||||
wordProperty.mIsPossiblyOffensive);
|
||||
}
|
||||
// Insert bigrams into the fusion dictionary.
|
||||
for (final WordProperty wordProperty : wordProperties) {
|
||||
if (!wordProperty.mHasNgrams) {
|
||||
continue;
|
||||
}
|
||||
final String word0 = wordProperty.mWord;
|
||||
for (final WeightedString bigram : wordProperty.getBigrams()) {
|
||||
fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
|
||||
}
|
||||
}
|
||||
binaryDictionary.close();
|
||||
return fusionDict;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setPosition(int newPos) {
|
||||
mDictBuffer.position(newPos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPosition() {
|
||||
return mDictBuffer.position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readPtNodeCount() {
|
||||
return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
|
||||
}
|
||||
}
|
|
@ -1,150 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin.makedict;
|
||||
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
|
||||
import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
|
||||
import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFromByteArrayFactory;
|
||||
import com.android.inputmethod.latin.makedict.DictDecoder.
|
||||
DictionaryBufferFromReadOnlyByteBufferFactory;
|
||||
import com.android.inputmethod.latin.makedict.DictDecoder.
|
||||
DictionaryBufferFromWritableByteBufferFactory;
|
||||
|
||||
import android.test.AndroidTestCase;
|
||||
import android.util.Log;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Unit tests for Ver2DictDecoder
|
||||
*/
|
||||
public class Ver2DictDecoderTests extends AndroidTestCase {
|
||||
private static final String TAG = Ver2DictDecoderTests.class.getSimpleName();
|
||||
|
||||
private final byte[] data = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
||||
|
||||
// Utilities for testing
|
||||
public void writeDataToFile(final File file) {
|
||||
FileOutputStream outStream = null;
|
||||
try {
|
||||
outStream = new FileOutputStream(file);
|
||||
outStream.write(data);
|
||||
} catch (IOException e) {
|
||||
fail ("Can't write data to the test file");
|
||||
} finally {
|
||||
if (outStream != null) {
|
||||
try {
|
||||
outStream.close();
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "Failed to close the output stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void runTestOpenBuffer(final String testName, final DictionaryBufferFactory factory) {
|
||||
File testFile = null;
|
||||
try {
|
||||
testFile = File.createTempFile(testName, ".tmp", getContext().getCacheDir());
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IOException while the creating temporary file", e);
|
||||
}
|
||||
|
||||
assertNotNull(testFile);
|
||||
final Ver2DictDecoder dictDecoder = new Ver2DictDecoder(testFile, 0, testFile.length(),
|
||||
factory);
|
||||
try {
|
||||
dictDecoder.openDictBuffer();
|
||||
} catch (Exception e) {
|
||||
Log.e(TAG, "Failed to open the buffer", e);
|
||||
}
|
||||
|
||||
writeDataToFile(testFile);
|
||||
|
||||
try {
|
||||
dictDecoder.openDictBuffer();
|
||||
} catch (Exception e) {
|
||||
Log.e(TAG, "Raised the exception while opening buffer", e);
|
||||
}
|
||||
|
||||
assertEquals(testFile.length(), dictDecoder.getDictBuffer().capacity());
|
||||
}
|
||||
|
||||
public void testOpenBufferWithByteBuffer() {
|
||||
runTestOpenBuffer("testOpenBufferWithByteBuffer",
|
||||
new DictionaryBufferFromReadOnlyByteBufferFactory());
|
||||
}
|
||||
|
||||
public void testOpenBufferWithByteArray() {
|
||||
runTestOpenBuffer("testOpenBufferWithByteArray",
|
||||
new DictionaryBufferFromByteArrayFactory());
|
||||
}
|
||||
|
||||
public void testOpenBufferWithWritableByteBuffer() {
|
||||
runTestOpenBuffer("testOpenBufferWithWritableByteBuffer",
|
||||
new DictionaryBufferFromWritableByteBufferFactory());
|
||||
}
|
||||
|
||||
public void runTestGetBuffer(final String testName, final DictionaryBufferFactory factory) {
|
||||
File testFile = null;
|
||||
try {
|
||||
testFile = File.createTempFile(testName, ".tmp", getContext().getCacheDir());
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IOException while the creating temporary file", e);
|
||||
}
|
||||
|
||||
final Ver2DictDecoder dictDecoder = new Ver2DictDecoder(testFile, 0, testFile.length(),
|
||||
factory);
|
||||
|
||||
// the default return value of getBuffer() must be null.
|
||||
assertNull("the default return value of getBuffer() is not null",
|
||||
dictDecoder.getDictBuffer());
|
||||
|
||||
writeDataToFile(testFile);
|
||||
assertTrue(testFile.exists());
|
||||
Log.d(TAG, "file length = " + testFile.length());
|
||||
|
||||
DictBuffer dictBuffer = null;
|
||||
try {
|
||||
dictBuffer = dictDecoder.openAndGetDictBuffer();
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "Failed to open and get the buffer", e);
|
||||
}
|
||||
assertNotNull("the buffer must not be null", dictBuffer);
|
||||
|
||||
for (int i = 0; i < data.length; ++i) {
|
||||
assertEquals(data[i], dictBuffer.readUnsignedByte());
|
||||
}
|
||||
}
|
||||
|
||||
public void testGetBufferWithByteBuffer() {
|
||||
runTestGetBuffer("testGetBufferWithByteBuffer",
|
||||
new DictionaryBufferFromReadOnlyByteBufferFactory());
|
||||
}
|
||||
|
||||
public void testGetBufferWithByteArray() {
|
||||
runTestGetBuffer("testGetBufferWithByteArray",
|
||||
new DictionaryBufferFromByteArrayFactory());
|
||||
}
|
||||
|
||||
public void testGetBufferWithWritableByteBuffer() {
|
||||
runTestGetBuffer("testGetBufferWithWritableByteBuffer",
|
||||
new DictionaryBufferFromWritableByteBufferFactory());
|
||||
}
|
||||
}
|
|
@ -1,279 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin.makedict;
|
||||
|
||||
import com.android.inputmethod.annotations.UsedForTesting;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
|
||||
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* An implementation of DictEncoder for version 2 binary dictionary.
|
||||
*/
|
||||
@UsedForTesting
|
||||
public class Ver2DictEncoder implements DictEncoder {
|
||||
|
||||
private final File mDictFile;
|
||||
private OutputStream mOutStream;
|
||||
private byte[] mBuffer;
|
||||
private int mPosition;
|
||||
private final int mCodePointTableMode;
|
||||
public static final int CODE_POINT_TABLE_OFF = 0;
|
||||
public static final int CODE_POINT_TABLE_ON = 1;
|
||||
|
||||
@UsedForTesting
|
||||
public Ver2DictEncoder(final File dictFile, final int codePointTableMode) {
|
||||
mDictFile = dictFile;
|
||||
mOutStream = null;
|
||||
mBuffer = null;
|
||||
mCodePointTableMode = codePointTableMode;
|
||||
}
|
||||
|
||||
// This constructor is used only by BinaryDictOffdeviceUtilsTests.
|
||||
// If you want to use this in the production code, you should consider keeping consistency of
|
||||
// the interface of Ver3DictDecoder by using factory.
|
||||
@UsedForTesting
|
||||
public Ver2DictEncoder(final OutputStream outStream) {
|
||||
mDictFile = null;
|
||||
mOutStream = outStream;
|
||||
mCodePointTableMode = CODE_POINT_TABLE_OFF;
|
||||
}
|
||||
|
||||
private void openStream() throws FileNotFoundException {
|
||||
mOutStream = new FileOutputStream(mDictFile);
|
||||
}
|
||||
|
||||
private void close() throws IOException {
|
||||
if (mOutStream != null) {
|
||||
mOutStream.close();
|
||||
mOutStream = null;
|
||||
}
|
||||
}
|
||||
|
||||
// Package for testing
|
||||
static CodePointTable makeCodePointTable(final FusionDictionary dict) {
|
||||
final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
|
||||
for (final WordProperty word : dict) {
|
||||
// Store per code point occurrence
|
||||
final String wordString = word.mWord;
|
||||
for (int i = 0; i < wordString.length(); ++i) {
|
||||
final int codePoint = Character.codePointAt(wordString, i);
|
||||
if (codePointOccurrenceCounts.containsKey(codePoint)) {
|
||||
codePointOccurrenceCounts.put(codePoint,
|
||||
codePointOccurrenceCounts.get(codePoint) + 1);
|
||||
} else {
|
||||
codePointOccurrenceCounts.put(codePoint, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
|
||||
new ArrayList<>(codePointOccurrenceCounts.entrySet());
|
||||
// Descending order sort by occurrence (value side)
|
||||
Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
|
||||
@Override
|
||||
public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
|
||||
if (a.getValue() != b.getValue()) {
|
||||
return b.getValue().compareTo(a.getValue());
|
||||
}
|
||||
return b.getKey().compareTo(a.getKey());
|
||||
}
|
||||
});
|
||||
int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
|
||||
// Temporary map for writing of nodes
|
||||
final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
|
||||
for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
|
||||
// Put a relation from the original code point to the one byte code.
|
||||
codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
|
||||
if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// codePointToOneByteCodeMap for writing the trie
|
||||
// codePointOccurrenceArray for writing the header
|
||||
return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
|
||||
throws IOException, UnsupportedFormatException {
|
||||
// We no longer support anything but the latest version of v2.
|
||||
if (formatOptions.mVersion != FormatSpec.VERSION202) {
|
||||
throw new UnsupportedFormatException(
|
||||
"The given format options has wrong version number : "
|
||||
+ formatOptions.mVersion);
|
||||
}
|
||||
|
||||
if (mOutStream == null) {
|
||||
openStream();
|
||||
}
|
||||
|
||||
// Make code point conversion table ordered by occurrence of code points
|
||||
// Version 201 or later have codePointTable
|
||||
final CodePointTable codePointTable;
|
||||
if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion
|
||||
< FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
|
||||
codePointTable = new CodePointTable();
|
||||
} else {
|
||||
codePointTable = makeCodePointTable(dict);
|
||||
}
|
||||
|
||||
BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
|
||||
codePointTable.mCodePointOccurrenceArray);
|
||||
|
||||
// Addresses are limited to 3 bytes, but since addresses can be relative to each node
|
||||
// array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
|
||||
// the order of the PtNode arrays becomes a quite complicated problem, because though the
|
||||
// dictionary itself does not have a size limit, each node array must still be within 16MB
|
||||
// of all its children and parents. As long as this is ensured, the dictionary file may
|
||||
// grow to any size.
|
||||
|
||||
// Leave the choice of the optimal node order to the flattenTree function.
|
||||
MakedictLog.i("Flattening the tree...");
|
||||
ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
||||
|
||||
MakedictLog.i("Computing addresses...");
|
||||
BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
|
||||
codePointTable.mCodePointToOneByteCodeMap);
|
||||
MakedictLog.i("Checking PtNode array...");
|
||||
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
||||
|
||||
// Create a buffer that matches the final dictionary size.
|
||||
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
|
||||
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
|
||||
mBuffer = new byte[bufferSize];
|
||||
|
||||
MakedictLog.i("Writing file...");
|
||||
|
||||
for (PtNodeArray nodeArray : flatNodes) {
|
||||
BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
|
||||
codePointTable.mCodePointToOneByteCodeMap);
|
||||
}
|
||||
if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
|
||||
mOutStream.write(mBuffer, 0, mPosition);
|
||||
|
||||
MakedictLog.i("Done");
|
||||
close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setPosition(final int position) {
|
||||
if (mBuffer == null || position < 0 || position >= mBuffer.length) return;
|
||||
mPosition = position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPosition() {
|
||||
return mPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writePtNodeCount(final int ptNodeCount) {
|
||||
final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
|
||||
if (countSize != 1 && countSize != 2) {
|
||||
throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
|
||||
}
|
||||
final int encodedPtNodeCount = (countSize == 2) ?
|
||||
(ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
|
||||
countSize);
|
||||
}
|
||||
|
||||
private void writePtNodeFlags(final PtNode ptNode,
|
||||
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
|
||||
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
|
||||
codePointToOneByteCodeMap);
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
|
||||
BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
|
||||
FormatSpec.PTNODE_FLAGS_SIZE);
|
||||
}
|
||||
|
||||
private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
|
||||
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
|
||||
mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition,
|
||||
codePointToOneByteCodeMap);
|
||||
if (hasSeveralChars) {
|
||||
mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
|
||||
}
|
||||
}
|
||||
|
||||
private void writeFrequency(final int frequency) {
|
||||
if (frequency >= 0) {
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency,
|
||||
FormatSpec.PTNODE_FREQUENCY_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeChildrenPosition(final PtNode ptNode,
|
||||
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
|
||||
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
|
||||
codePointToOneByteCodeMap);
|
||||
mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
|
||||
childrenPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a bigram attributes list to mBuffer.
|
||||
*
|
||||
* @param bigrams the bigram attributes list.
|
||||
* @param dict the dictionary the node array is a part of (for relative offsets).
|
||||
*/
|
||||
private void writeBigrams(final ArrayList<WeightedString> bigrams,
|
||||
final FusionDictionary dict) {
|
||||
if (bigrams == null) return;
|
||||
|
||||
final Iterator<WeightedString> bigramIterator = bigrams.iterator();
|
||||
while (bigramIterator.hasNext()) {
|
||||
final WeightedString bigram = bigramIterator.next();
|
||||
final PtNode target =
|
||||
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
|
||||
final int addressOfBigram = target.mCachedAddressAfterUpdate;
|
||||
final int unigramFrequencyForThisWord = target.getProbability();
|
||||
final int offset = addressOfBigram
|
||||
- (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
|
||||
offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
|
||||
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||
mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
|
||||
Math.abs(offset));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
|
||||
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
|
||||
writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
|
||||
writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
|
||||
writeFrequency(ptNode.getProbability());
|
||||
writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
|
||||
writeBigrams(ptNode.mBigrams, dict);
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue