From 6a1b37353d963db4bb25ea438be504348fea7418 Mon Sep 17 00:00:00 2001 From: "Tadashi G. Takaoka" Date: Thu, 25 Oct 2018 16:24:06 +0900 Subject: [PATCH] Fix dicttool build This CL partially reverts - Id88b02b74bdfe4ca05b08181ceb6b34d5652fc0c - I05c7d8429e8d9a26139456763c77997340fea8c2 And followup (remove shortcut support) - I73b7dc008a5acaf75a31a36a2d332b5afabd82d0 Bug: 28255684 Test: make -j10 dicttool_aosp Change-Id: I2e01ed86b9517a1141aee35ea6d8ef39258981d1 --- .../latin/makedict/FormatSpec.java | 3 + .../BinaryDictDecoderEncoderTests.java | 677 ++++++++++++++++++ .../latin/makedict/BinaryDictUtils.java | 80 +++ .../latin/makedict/Ver2DictEncoder.java | 279 ++++++++ .../latin/makedict/Ver4DictEncoder.java | 133 ++++ tools/dicttool/Android.mk | 8 +- .../latin/dicttool/CombinedInputOutput.java | 25 +- .../inputmethod/latin/dicttool/Diff.java | 3 - .../inputmethod/latin/dicttool/Info.java | 20 - .../BinaryDictOffdeviceUtilsTests.java | 14 +- .../BinaryDictEncoderFlattenTreeTests.java | 10 +- .../latin/makedict/FusionDictionaryTest.java | 2 +- 12 files changed, 1193 insertions(+), 61 deletions(-) create mode 100644 tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java create mode 100644 tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java create mode 100644 tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java create mode 100644 tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index e422c4cd2..288261bf0 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -174,6 +174,9 @@ public final class FormatSpec { public static final int VERSION202 = 202; // format version for Fava Dictionaries. public static final int VERSION_DELIGHT3 = 86736212; + public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; + // Dictionary version used for testing. + public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION402 = 402; public static final int VERSION403 = 403; public static final int VERSION4 = VERSION403; diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java new file mode 100644 index 000000000..39da9fcd6 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -0,0 +1,677 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import android.test.AndroidTestCase; +import android.test.suitebuilder.annotation.LargeTest; +import android.util.Log; +import android.util.Pair; +import android.util.SparseArray; + +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.common.CodePointUtils; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; +import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; +import com.android.inputmethod.latin.utils.ByteArrayDictBuffer; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map.Entry; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; + +/** + * Unit tests for BinaryDictDecoderUtils and BinaryDictEncoderUtils. + */ +@LargeTest +public class BinaryDictDecoderEncoderTests extends AndroidTestCase { + private static final String TAG = BinaryDictDecoderEncoderTests.class.getSimpleName(); + private static final int DEFAULT_MAX_UNIGRAMS = 300; + private static final int DEFAULT_CODE_POINT_SET_SIZE = 50; + private static final int LARGE_CODE_POINT_SET_SIZE = 300; + private static final int UNIGRAM_FREQ = 10; + private static final int BIGRAM_FREQ = 50; + private static final int TOLERANCE_OF_BIGRAM_FREQ = 5; + + private static final ArrayList sWords = new ArrayList<>(); + private static final ArrayList sWordsWithVariousCodePoints = new ArrayList<>(); + private static final SparseArray> sEmptyBigrams = new SparseArray<>(); + private static final SparseArray> sStarBigrams = new SparseArray<>(); + private static final SparseArray> sChainBigrams = new SparseArray<>(); + + final Random mRandom; + + public BinaryDictDecoderEncoderTests() { + this(System.currentTimeMillis(), DEFAULT_MAX_UNIGRAMS); + } + + public BinaryDictDecoderEncoderTests(final long seed, final int maxUnigrams) { + super(); + BinaryDictionaryUtils.setCurrentTimeForTest(0); + Log.e(TAG, "Testing dictionary: seed is " + seed); + mRandom = new Random(seed); + sWords.clear(); + sWordsWithVariousCodePoints.clear(); + generateWords(maxUnigrams, mRandom); + + for (int i = 0; i < sWords.size(); ++i) { + sChainBigrams.put(i, new ArrayList()); + if (i > 0) { + sChainBigrams.get(i - 1).add(i); + } + } + + sStarBigrams.put(0, new ArrayList()); + // MAX - 1 because we added one above already + final int maxBigrams = Math.min(sWords.size(), FormatSpec.MAX_BIGRAMS_IN_A_PTNODE - 1); + for (int i = 1; i < maxBigrams; ++i) { + sStarBigrams.get(0).add(i); + } + } + + @Override + protected void setUp() throws Exception { + super.setUp(); + BinaryDictionaryUtils.setCurrentTimeForTest(0); + } + + @Override + protected void tearDown() throws Exception { + // Quit test mode. + BinaryDictionaryUtils.setCurrentTimeForTest(-1); + super.tearDown(); + } + + private static void generateWords(final int number, final Random random) { + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + random); + final Set wordSet = new HashSet<>(); + while (wordSet.size() < number) { + wordSet.add(CodePointUtils.generateWord(random, codePointSet)); + } + sWords.addAll(wordSet); + + final int[] largeCodePointSet = CodePointUtils.generateCodePointSet( + LARGE_CODE_POINT_SET_SIZE, random); + wordSet.clear(); + while (wordSet.size() < number) { + wordSet.add(CodePointUtils.generateWord(random, largeCodePointSet)); + } + sWordsWithVariousCodePoints.addAll(wordSet); + } + + /** + * Adds unigrams to the dictionary. + */ + private static void addUnigrams(final int number, final FusionDictionary dict, + final List words) { + for (int i = 0; i < number; ++i) { + final String word = words.get(i); + final ArrayList shortcuts = new ArrayList<>(); + dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ), false /* isNotAWord */, + false /* isPossiblyOffensive */); + } + } + + private static void addBigrams(final FusionDictionary dict, + final List words, + final SparseArray> bigrams) { + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (int w2 : bigrams.valueAt(i)) { + dict.setBigram(words.get(w1), words.get(w2), new ProbabilityInfo(BIGRAM_FREQ)); + } + } + } + +// The following is useful to dump the dictionary into a textual file, but it can't compile +// on-device, so it's commented out. +// private void dumpToCombinedFileForDebug(final FusionDictionary dict, final String filename) +// throws IOException { +// com.android.inputmethod.latin.dicttool.CombinedInputOutput.writeDictionaryCombined( +// new java.io.FileWriter(new File(filename)), dict); +// } + + private static long timeWritingDictToFile(final File file, final FusionDictionary dict, + final FormatSpec.FormatOptions formatOptions) { + + long now = -1, diff = -1; + + try { + final DictEncoder dictEncoder = BinaryDictUtils.getDictEncoder(file, formatOptions); + + now = System.currentTimeMillis(); + // If you need to dump the dict to a textual file, uncomment the line below and the + // function above + // dumpToCombinedFileForDebug(file, "/tmp/foo"); + dictEncoder.writeDictionary(dict, formatOptions); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IO exception while writing file", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException", e); + } + + return diff; + } + + private static void checkDictionary(final FusionDictionary dict, final List words, + final SparseArray> bigrams) { + assertNotNull(dict); + + // check unigram + for (final String word : words) { + final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word); + assertNotNull(ptNode); + } + + // check bigram + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (final int w2 : bigrams.valueAt(i)) { + final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, + words.get(w1)); + assertNotNull(words.get(w1) + "," + words.get(w2), ptNode.getBigram(words.get(w2))); + } + } + } + + private static String outputOptions(final int bufferType, + final FormatSpec.FormatOptions formatOptions) { + final String result = " : buffer type = " + + ((bufferType == BinaryDictUtils.USE_BYTE_BUFFER) ? "byte buffer" : "byte array"); + return result + " : version = " + formatOptions.mVersion; + } + + // Tests for readDictionaryBinary and writeDictionaryBinary + + private static long timeReadingAndCheckDict(final File file, final List words, + final SparseArray> bigrams, final int bufferType) { + long now, diff = -1; + + FusionDictionary dict = null; + try { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + bufferType); + now = System.currentTimeMillis(); + dict = dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IOException while reading dictionary", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "Unsupported format", e); + } + + checkDictionary(dict, words, bigrams); + return diff; + } + + // Tests for readDictionaryBinary and writeDictionaryBinary + private String runReadAndWrite(final List words, + final SparseArray> bigrams, + final int bufferType, final FormatSpec.FormatOptions formatOptions, + final String message) { + + final String dictName = "runReadAndWrite"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + checkDictionary(dict, words, bigrams); + + final long write = timeWritingDictToFile(file, dict, formatOptions); + final long read = timeReadingAndCheckDict(file, words, bigrams, bufferType); + + return "PROF: read=" + read + "ms, write=" + write + "ms :" + message + + " : " + outputOptions(bufferType, formatOptions); + } + + private void runReadAndWriteTests(final List results, final int bufferType, + final FormatSpec.FormatOptions formatOptions) { + results.add(runReadAndWrite(sWords, sEmptyBigrams, bufferType, + formatOptions, "unigram")); + results.add(runReadAndWrite(sWords, sChainBigrams, bufferType, + formatOptions, "chain")); + results.add(runReadAndWrite(sWords, sStarBigrams, bufferType, + formatOptions, "star")); + results.add(runReadAndWrite(sWords, sEmptyBigrams, bufferType, formatOptions, + "unigram with shortcuts")); + results.add(runReadAndWrite(sWords, sChainBigrams, bufferType, formatOptions, + "chain with shortcuts")); + results.add(runReadAndWrite(sWords, sStarBigrams, bufferType, formatOptions, + "star with shortcuts")); + results.add(runReadAndWrite(sWordsWithVariousCodePoints, sEmptyBigrams, + bufferType, formatOptions, + "unigram with various code points")); + } + + public void testCharacterTableIsPresent() throws IOException, UnsupportedFormatException { + final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"}; + final List words = Arrays.asList(wordSource); + final String correctCodePointTable = "toesdrniawuplgfcb "; + final String dictName = "codePointTableTest"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final String codePointTableAttribute = DictionaryHeader.CODE_POINT_TABLE_KEY; + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, + BinaryDictUtils.STATIC_OPTIONS, getContext().getCacheDir()); + + // Write a test dictionary + final DictEncoder dictEncoder = new Ver2DictEncoder(file, + Ver2DictEncoder.CODE_POINT_TABLE_ON); + final FormatSpec.FormatOptions formatOptions = + new FormatSpec.FormatOptions( + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION); + final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), sourcedict, words); + dictEncoder.writeDictionary(sourcedict, formatOptions); + + // Read the dictionary + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + DictDecoder.USE_BYTEARRAY); + final DictionaryHeader fileHeader = dictDecoder.readHeader(); + // Check if codePointTable is present + assertTrue("codePointTable is not present", + fileHeader.mDictionaryOptions.mAttributes.containsKey(codePointTableAttribute)); + final String codePointTable = + fileHeader.mDictionaryOptions.mAttributes.get(codePointTableAttribute); + // Check if codePointTable is correct + assertEquals("codePointTable is incorrect", codePointTable, correctCodePointTable); + } + + // Unit test for CharEncoding.readString and CharEncoding.writeString. + public void testCharEncoding() { + // the max length of a word in sWords is less than 50. + // See generateWords. + final byte[] buffer = new byte[50 * 3]; + final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer); + for (final String word : sWords) { + Arrays.fill(buffer, (byte) 0); + CharEncoding.writeString(buffer, 0, word, null); + dictBuffer.position(0); + final String str = CharEncoding.readString(dictBuffer); + assertEquals(word, str); + } + } + + public void testReadAndWriteWithByteBuffer() { + final List results = new ArrayList<>(); + + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testReadAndWriteWithByteArray() { + final List results = new ArrayList<>(); + + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + // Tests for readUnigramsAndBigramsBinary + + private static void checkWordMap(final List expectedWords, + final SparseArray> expectedBigrams, + final TreeMap resultWords, + final TreeMap resultFrequencies, + final TreeMap> resultBigrams, + final boolean checkProbability) { + // check unigrams + final Set actualWordsSet = new HashSet<>(resultWords.values()); + final Set expectedWordsSet = new HashSet<>(expectedWords); + assertEquals(actualWordsSet, expectedWordsSet); + if (checkProbability) { + for (int freq : resultFrequencies.values()) { + assertEquals(freq, UNIGRAM_FREQ); + } + } + + // check bigrams + final HashMap> expBigrams = new HashMap<>(); + for (int i = 0; i < expectedBigrams.size(); ++i) { + final String word1 = expectedWords.get(expectedBigrams.keyAt(i)); + for (int w2 : expectedBigrams.valueAt(i)) { + if (expBigrams.get(word1) == null) { + expBigrams.put(word1, new HashSet()); + } + expBigrams.get(word1).add(expectedWords.get(w2)); + } + } + + final HashMap> actBigrams = new HashMap<>(); + for (Entry> entry : resultBigrams.entrySet()) { + final String word1 = resultWords.get(entry.getKey()); + final int unigramFreq = resultFrequencies.get(entry.getKey()); + for (PendingAttribute attr : entry.getValue()) { + final String word2 = resultWords.get(attr.mAddress); + if (actBigrams.get(word1) == null) { + actBigrams.put(word1, new HashSet()); + } + actBigrams.get(word1).add(word2); + + if (checkProbability) { + final int bigramFreq = BinaryDictIOUtils.reconstructBigramFrequency( + unigramFreq, attr.mFrequency); + assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ); + } + } + } + assertEquals(actBigrams, expBigrams); + } + + private static long timeAndCheckReadUnigramsAndBigramsBinary(final File file, + final List words, final SparseArray> bigrams, + final int bufferType, final boolean checkProbability) { + final TreeMap resultWords = new TreeMap<>(); + final TreeMap> resultBigrams = new TreeMap<>(); + final TreeMap resultFreqs = new TreeMap<>(); + + long now = -1, diff = -1; + try { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + bufferType); + now = System.currentTimeMillis(); + dictDecoder.readUnigramsAndBigramsBinary(resultWords, resultFreqs, resultBigrams); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IOException", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException", e); + } + + checkWordMap(words, bigrams, resultWords, resultFreqs, resultBigrams, checkProbability); + return diff; + } + + private String runReadUnigramsAndBigramsBinary(final ArrayList words, + final SparseArray> bigrams, final int bufferType, + final FormatSpec.FormatOptions formatOptions, final String message) { + final String dictName = "runReadUnigrams"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + // making the dictionary from lists of words. + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + + timeWritingDictToFile(file, dict, formatOptions); + + // Caveat: Currently, the Java code to read a v4 dictionary doesn't calculate the + // probability when there's a timestamp for the entry. + // TODO: Abandon the Java code, and implement the v4 dictionary reading code in native. + long wordMap = timeAndCheckReadUnigramsAndBigramsBinary(file, words, bigrams, bufferType, + !formatOptions.mHasTimestamp /* checkProbability */); + long fullReading = timeReadingAndCheckDict(file, words, bigrams, + bufferType); + + return "readDictionaryBinary=" + fullReading + ", readUnigramsAndBigramsBinary=" + wordMap + + " : " + message + " : " + outputOptions(bufferType, formatOptions); + } + + private void runReadUnigramsAndBigramsTests(final ArrayList results, + final int bufferType, final FormatSpec.FormatOptions formatOptions) { + results.add(runReadUnigramsAndBigramsBinary(sWords, sEmptyBigrams, bufferType, + formatOptions, "unigram")); + results.add(runReadUnigramsAndBigramsBinary(sWords, sChainBigrams, bufferType, + formatOptions, "chain")); + results.add(runReadUnigramsAndBigramsBinary(sWords, sStarBigrams, bufferType, + formatOptions, "star")); + } + + public void testReadUnigramsAndBigramsBinaryWithByteBuffer() { + final ArrayList results = new ArrayList<>(); + + runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testReadUnigramsAndBigramsBinaryWithByteArray() { + final ArrayList results = new ArrayList<>(); + + runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + // Tests for getTerminalPosition + private static String getWordFromBinary(final DictDecoder dictDecoder, final int address) { + if (dictDecoder.getPosition() != 0) dictDecoder.setPosition(0); + + DictionaryHeader fileHeader = null; + try { + fileHeader = dictDecoder.readHeader(); + } catch (IOException e) { + return null; + } catch (UnsupportedFormatException e) { + return null; + } + if (fileHeader == null) return null; + return BinaryDictDecoderUtils.getWordAtPosition(dictDecoder, fileHeader.mBodyOffset, + address).mWord; + } + + private static long checkGetTerminalPosition(final DictDecoder dictDecoder, final String word, + final boolean contained) { + long diff = -1; + int position = -1; + try { + final long now = System.nanoTime(); + position = dictDecoder.getTerminalPosition(word); + diff = System.nanoTime() - now; + } catch (IOException e) { + Log.e(TAG, "IOException while getTerminalPosition", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException while getTerminalPosition", e); + } + + assertEquals(FormatSpec.NOT_VALID_WORD != position, contained); + if (contained) assertEquals(getWordFromBinary(dictDecoder, position), word); + return diff; + } + + private void runGetTerminalPosition(final ArrayList words, + final SparseArray> bigrams, final int bufferType, + final FormatOptions formatOptions, final String message) { + final String dictName = "testGetTerminalPosition"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(sWords.size(), dict, sWords); + addBigrams(dict, words, bigrams); + timeWritingDictToFile(file, dict, formatOptions); + + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + DictDecoder.USE_BYTEARRAY); + try { + dictDecoder.openDictBuffer(); + } catch (IOException e) { + Log.e(TAG, "IOException while opening the buffer", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "IOException while opening the buffer", e); + } + assertTrue("Can't get the buffer", dictDecoder.isDictBufferOpen()); + + try { + // too long word + final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(longWord)); + + // null + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(null)); + + // empty string + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition("")); + } catch (IOException e) { + } catch (UnsupportedFormatException e) { + } + + // Test a word that is contained within the dictionary. + long sum = 0; + for (int i = 0; i < sWords.size(); ++i) { + final long time = checkGetTerminalPosition(dictDecoder, sWords.get(i), true); + sum += time == -1 ? 0 : time; + } + Log.d(TAG, "per search : " + (((double)sum) / sWords.size() / 1000000) + " : " + message + + " : " + outputOptions(bufferType, formatOptions)); + + // Test a word that isn't contained within the dictionary. + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + mRandom); + for (int i = 0; i < 1000; ++i) { + final String word = CodePointUtils.generateWord(mRandom, codePointSet); + if (sWords.indexOf(word) != -1) continue; + checkGetTerminalPosition(dictDecoder, word, false); + } + } + + private void runGetTerminalPositionTests(final int bufferType, + final FormatOptions formatOptions) { + runGetTerminalPosition(sWords, sEmptyBigrams, bufferType, formatOptions, "unigram"); + } + + public void testGetTerminalPosition() { + final ArrayList results = new ArrayList<>(); + + runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testVer2DictGetWordProperty() { + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; + final ArrayList words = sWords; + final String dictName = "testGetWordProperty"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, sEmptyBigrams); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + file.delete(); + timeWritingDictToFile(file, dict, formatOptions); + final BinaryDictionary binaryDictionary = new BinaryDictionary(file.getAbsolutePath(), + 0 /* offset */, file.length(), true /* useFullEditDistance */, + Locale.ENGLISH, dictName, false /* isUpdatable */); + for (final String word : words) { + final WordProperty wordProperty = binaryDictionary.getWordProperty(word, + false /* isBeginningOfSentence */); + assertEquals(word, wordProperty.mWord); + assertEquals(UNIGRAM_FREQ, wordProperty.getProbability()); + } + } + + public void testVer2DictIteration() { + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; + final ArrayList words = sWords; + final SparseArray> bigrams = sEmptyBigrams; + final String dictName = "testGetWordProperty"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + timeWritingDictToFile(file, dict, formatOptions); + Log.d(TAG, file.getAbsolutePath()); + final BinaryDictionary binaryDictionary = new BinaryDictionary(file.getAbsolutePath(), + 0 /* offset */, file.length(), true /* useFullEditDistance */, + Locale.ENGLISH, dictName, false /* isUpdatable */); + + final HashSet wordSet = new HashSet<>(words); + final HashSet> bigramSet = new HashSet<>(); + + for (int i = 0; i < words.size(); i++) { + final List bigramList = bigrams.get(i); + if (bigramList != null) { + for (final Integer word1Index : bigramList) { + final String word1 = words.get(word1Index); + bigramSet.add(new Pair<>(words.get(i), word1)); + } + } + } + int token = 0; + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + final String word0 = wordProperty.mWord; + assertEquals(UNIGRAM_FREQ, wordProperty.mProbabilityInfo.mProbability); + wordSet.remove(word0); + if (wordProperty.mHasNgrams) { + for (final WeightedString bigramTarget : wordProperty.getBigrams()) { + final String word1 = bigramTarget.mWord; + final Pair bigram = new Pair<>(word0, word1); + assertTrue(bigramSet.contains(bigram)); + bigramSet.remove(bigram); + } + } + token = result.mNextToken; + } while (token != 0); + assertTrue(wordSet.isEmpty()); + assertTrue(bigramSet.isEmpty()); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java new file mode 100644 index 000000000..9c1e4cf84 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; + +import java.io.File; +import java.util.HashMap; + +public class BinaryDictUtils { + public static final int USE_BYTE_ARRAY = 1; + public static final int USE_BYTE_BUFFER = 2; + + public static final String TEST_DICT_FILE_EXTENSION = ".testDict"; + + public static final FormatSpec.FormatOptions STATIC_OPTIONS = + new FormatSpec.FormatOptions(FormatSpec.VERSION202); + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP = + new FormatSpec.FormatOptions(FormatSpec.VERSION4, false /* hasTimestamp */); + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITH_TIMESTAMP = + new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */); + + public static DictionaryOptions makeDictionaryOptions(final String id, final String version, + final FormatSpec.FormatOptions formatOptions) { + final DictionaryOptions options = new DictionaryOptions(new HashMap()); + options.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, "en_US"); + options.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, id); + options.mAttributes.put(DictionaryHeader.DICTIONARY_VERSION_KEY, version); + if (formatOptions.mHasTimestamp) { + options.mAttributes.put(DictionaryHeader.HAS_HISTORICAL_INFO_KEY, + DictionaryHeader.ATTRIBUTE_VALUE_TRUE); + options.mAttributes.put(DictionaryHeader.USES_FORGETTING_CURVE_KEY, + DictionaryHeader.ATTRIBUTE_VALUE_TRUE); + } + return options; + } + + public static File getDictFile(final String name, final String version, + final FormatOptions formatOptions, final File directory) { + if (formatOptions.mVersion == FormatSpec.VERSION2 + || formatOptions.mVersion == FormatSpec.VERSION201 + || formatOptions.mVersion == FormatSpec.VERSION202) { + return new File(directory, name + "." + version + TEST_DICT_FILE_EXTENSION); + } else if (formatOptions.mVersion == FormatSpec.VERSION4) { + return new File(directory, name + "." + version); + } else { + throw new RuntimeException("the format option has a wrong version : " + + formatOptions.mVersion); + } + } + + public static DictEncoder getDictEncoder(final File file, final FormatOptions formatOptions) { + if (formatOptions.mVersion == FormatSpec.VERSION4) { + if (!file.isDirectory()) { + file.mkdir(); + } + return new Ver4DictEncoder(file); + } else if (formatOptions.mVersion == FormatSpec.VERSION202) { + return new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_OFF); + } else { + throw new RuntimeException("The format option has a wrong version : " + + formatOptions.mVersion); + } + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java new file mode 100644 index 000000000..c63b972eb --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map.Entry; + +/** + * An implementation of DictEncoder for version 2 binary dictionary. + */ +@UsedForTesting +public class Ver2DictEncoder implements DictEncoder { + + private final File mDictFile; + private OutputStream mOutStream; + private byte[] mBuffer; + private int mPosition; + private final int mCodePointTableMode; + public static final int CODE_POINT_TABLE_OFF = 0; + public static final int CODE_POINT_TABLE_ON = 1; + + @UsedForTesting + public Ver2DictEncoder(final File dictFile, final int codePointTableMode) { + mDictFile = dictFile; + mOutStream = null; + mBuffer = null; + mCodePointTableMode = codePointTableMode; + } + + // This constructor is used only by BinaryDictOffdeviceUtilsTests. + // If you want to use this in the production code, you should consider keeping consistency of + // the interface of Ver3DictDecoder by using factory. + @UsedForTesting + public Ver2DictEncoder(final OutputStream outStream) { + mDictFile = null; + mOutStream = outStream; + mCodePointTableMode = CODE_POINT_TABLE_OFF; + } + + private void openStream() throws FileNotFoundException { + mOutStream = new FileOutputStream(mDictFile); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + // Package for testing + static CodePointTable makeCodePointTable(final FusionDictionary dict) { + final HashMap codePointOccurrenceCounts = new HashMap<>(); + for (final WordProperty word : dict) { + // Store per code point occurrence + final String wordString = word.mWord; + for (int i = 0; i < wordString.length(); ++i) { + final int codePoint = Character.codePointAt(wordString, i); + if (codePointOccurrenceCounts.containsKey(codePoint)) { + codePointOccurrenceCounts.put(codePoint, + codePointOccurrenceCounts.get(codePoint) + 1); + } else { + codePointOccurrenceCounts.put(codePoint, 1); + } + } + } + final ArrayList> codePointOccurrenceArray = + new ArrayList<>(codePointOccurrenceCounts.entrySet()); + // Descending order sort by occurrence (value side) + Collections.sort(codePointOccurrenceArray, new Comparator>() { + @Override + public int compare(final Entry a, final Entry b) { + if (a.getValue() != b.getValue()) { + return b.getValue().compareTo(a.getValue()); + } + return b.getKey().compareTo(a.getKey()); + } + }); + int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; + // Temporary map for writing of nodes + final HashMap codePointToOneByteCodeMap = new HashMap<>(); + for (final Entry entry : codePointOccurrenceArray) { + // Put a relation from the original code point to the one byte code. + codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); + if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { + break; + } + } + // codePointToOneByteCodeMap for writing the trie + // codePointOccurrenceArray for writing the header + return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + // We no longer support anything but the latest version of v2. + if (formatOptions.mVersion != FormatSpec.VERSION202) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + + if (mOutStream == null) { + openStream(); + } + + // Make code point conversion table ordered by occurrence of code points + // Version 201 or later have codePointTable + final CodePointTable codePointTable; + if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion + < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) { + codePointTable = new CodePointTable(); + } else { + codePointTable = makeCodePointTable(dict); + } + + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, + codePointTable.mCodePointOccurrenceArray); + + // Addresses are limited to 3 bytes, but since addresses can be relative to each node + // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding + // the order of the PtNode arrays becomes a quite complicated problem, because though the + // dictionary itself does not have a size limit, each node array must still be within 16MB + // of all its children and parents. As long as this is ensured, the dictionary file may + // grow to any size. + + // Leave the choice of the optimal node order to the flattenTree function. + MakedictLog.i("Flattening the tree..."); + ArrayList flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, + codePointTable.mCodePointToOneByteCodeMap); + MakedictLog.i("Checking PtNode array..."); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + // Create a buffer that matches the final dictionary size. + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mBuffer = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, + codePointTable.mCodePointToOneByteCodeMap); + } + if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); + mOutStream.write(mBuffer, 0, mPosition); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(final int position) { + if (mBuffer == null || position < 0 || position >= mBuffer.length) return; + mPosition = position; + } + + @Override + public int getPosition() { + return mPosition; + } + + @Override + public void writePtNodeCount(final int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, + final HashMap codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars, + final HashMap codePointToOneByteCodeMap) { + mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition, + codePointToOneByteCodeMap); + if (hasSeveralChars) { + mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeFrequency(final int frequency) { + if (frequency >= 0) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); + } + } + + private void writeChildrenPosition(final PtNode ptNode, + final HashMap codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + childrenPos); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). + */ + private void writeBigrams(final ArrayList bigrams, + final FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.getProbability(); + final int offset = addressOfBigram + - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + Math.abs(offset)); + } + } + + @Override + public void writePtNode(final PtNode ptNode, final FusionDictionary dict, + final HashMap codePointToOneByteCodeMap) { + writePtNodeFlags(ptNode, codePointToOneByteCodeMap); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap); + writeFrequency(ptNode.getProbability()); + writeChildrenPosition(ptNode, codePointToOneByteCodeMap); + writeBigrams(ptNode.mBigrams, dict); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 000000000..6e7b37d54 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.Dictionary; +import com.android.inputmethod.latin.NgramContext; +import com.android.inputmethod.latin.common.LocaleUtils; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; + +/** + * An implementation of DictEncoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictEncoder implements DictEncoder { + private final File mDictPlacedDir; + + @UsedForTesting + public Ver4DictEncoder(final File dictPlacedDir) { + mDictPlacedDir = dictPlacedDir; + } + + // TODO: This builds a FusionDictionary first and iterates it to add words to the binary + // dictionary. However, it is possible to just add words directly to the binary dictionary + // instead. + // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated + // and we can remove it. Then we'll be able to just call BinaryDictionary directly. + @Override + public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion != FormatSpec.VERSION4) { + throw new UnsupportedFormatException("File header has a wrong version number : " + + formatOptions.mVersion); + } + if (!mDictPlacedDir.isDirectory()) { + throw new UnsupportedFormatException("Given path is not a directory."); + } + if (!BinaryDictionaryUtils.createEmptyDictFile(mDictPlacedDir.getAbsolutePath(), + FormatSpec.VERSION4, LocaleUtils.constructLocaleFromString( + dict.mOptions.mAttributes.get(DictionaryHeader.DICTIONARY_LOCALE_KEY)), + dict.mOptions.mAttributes)) { + throw new IOException("Cannot create dictionary file : " + + mDictPlacedDir.getAbsolutePath()); + } + final BinaryDictionary binaryDict = new BinaryDictionary(mDictPlacedDir.getAbsolutePath(), + 0l, mDictPlacedDir.length(), true /* useFullEditDistance */, + LocaleUtils.constructLocaleFromString(dict.mOptions.mAttributes.get( + DictionaryHeader.DICTIONARY_LOCALE_KEY)), + Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */, + true /* isUpdatable */); + if (!binaryDict.isValidDictionary()) { + // Somehow createEmptyDictFile returned true, but the file was not created correctly + throw new IOException("Cannot create dictionary file"); + } + for (final WordProperty wordProperty : dict) { + if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(), + wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord, + wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) { + MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord); + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + } + } + for (final WordProperty word0Property : dict) { + if (!word0Property.mHasNgrams) continue; + // TODO: Support ngram. + for (final WeightedString word1 : word0Property.getBigrams()) { + final NgramContext ngramContext = + new NgramContext(new NgramContext.WordInfo(word0Property.mWord)); + if (!binaryDict.addNgramEntry(ngramContext, word1.mWord, + word1.getProbability(), 0 /* timestamp */)) { + MakedictLog.e("Cannot add n-gram entry for " + + ngramContext + " -> " + word1.mWord); + return; + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + } + } + } + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + binaryDict.close(); + } + + @Override + public void setPosition(int position) { + } + + @Override + public int getPosition() { + return 0; + } + + @Override + public void writePtNodeCount(int ptNodeCount) { + } + + @Override + public void writePtNode(PtNode ptNode, FusionDictionary dict, + HashMap codePointToOneByteCodeMap) { + } +} diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 49816294f..dc53cd8e3 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -47,10 +47,14 @@ LATINIME_SRC_FILES_FOR_DICTTOOL := \ latin/utils/JniUtils.java LATINIME_OVERRIDABLE_SRC_FILES_FOR_DICTTOOL := \ - latin/define/DebugFlags.java + latin/define/DebugFlags.java \ + latin/define/DecoderSpecificConstants.java LATINIME_TEST_SRC_FILES_FOR_DICTTOOL := \ - utils/ByteArrayDictBuffer.java + utils/ByteArrayDictBuffer.java \ + makedict/Ver2DictEncoder.java \ + makedict/Ver4DictEncoder.java \ + makedict/BinaryDictDecoderEncoderTests.java USED_TARGETED_SRC_FILES := \ $(addprefix $(LATINIME_BASE_SRC_DIR)/, $(LATINIME_SRC_FILES_FOR_DICTTOOL)) \ diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java index 955c5728c..5e7aca5bb 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -106,8 +106,7 @@ public class CombinedInputOutput { final String args[] = line.trim().split(","); if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { - dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, - isNotAWord, isPossiblyOffensive); + dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } @@ -148,25 +147,6 @@ public class CombinedInputOutput { break; } } - } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { - String shortcut = null; - int shortcutFreq = 0; - for (String param : args) { - final String params[] = param.split("=", 2); - if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) { - shortcut = params[1]; - } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { - shortcutFreq = WHITELIST_TAG.equals(params[1]) - ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY - : Integer.parseInt(params[1]); - } - } - if (null != shortcut) { - shortcuts.add(new WeightedString(shortcut, shortcutFreq)); - } else { - throw new RuntimeException("Wrong format : " + line); - } } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) { String secondWordOfBigram = null; ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0); @@ -200,8 +180,7 @@ public class CombinedInputOutput { } } if (null != word) { - dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord, - isPossiblyOffensive); + dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java index f97fbef2c..4ba7e1309 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java @@ -136,9 +136,6 @@ public class Diff extends Dicttool.Command { } hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord, "Bigram", word0Property.getBigrams(), word1PtNode.getBigrams()); - hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord, - "Shortcut", word0Property.mShortcutTargets, - word1PtNode.getShortcutTargets()); } } for (final WordProperty word1Property : dict1) { diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java index b8a64e31a..d516d60c3 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java @@ -48,15 +48,6 @@ public class Info extends Dicttool.Command { if (wordProperty.mHasNgrams) { bigramCount += wordProperty.mNgrams.size(); } - if (null != wordProperty.mShortcutTargets) { - shortcutCount += wordProperty.mShortcutTargets.size(); - for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { - if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY - == shortcutTarget.getProbability()) { - ++whitelistCount; - } - } - } } System.out.println("Words in the dictionary : " + wordCount); System.out.println("Bigram count : " + bigramCount); @@ -78,17 +69,6 @@ public class Info extends Dicttool.Command { if (ptNode.getIsPossiblyOffensive()) { System.out.println(" Is possibly offensive"); } - final ArrayList shortcutTargets = ptNode.getShortcutTargets(); - if (null == shortcutTargets || shortcutTargets.isEmpty()) { - System.out.println(" No shortcuts"); - } else { - for (final WeightedString shortcutTarget : shortcutTargets) { - System.out.println(" Shortcut target: " + shortcutTarget.mWord + " (" - + (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY - == shortcutTarget.getProbability() ? - "whitelist" : shortcutTarget.getProbability()) + ")"); - } - } final ArrayList bigrams = ptNode.getBigrams(); if (null == bigrams || bigrams.isEmpty()) { System.out.println(" No bigrams"); diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java index e68aeb0eb..84d36a0c3 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java @@ -85,15 +85,15 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase { testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE); testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID); final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions); - dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */, + dict.add("foo", new ProbabilityInfo(TEST_FREQ), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("fta", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("ftb", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("bar", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("fool", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); final File dst = File.createTempFile("testGetRawDict", ".tmp"); @@ -171,8 +171,8 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase { for (int i = 0; i < sWords.size(); ++i) { final String word = sWords.get(i); - dict.add(word, new ProbabilityInfo(TEST_FREQ), null /* shortcuts */, - false /* isNotAWord */, false /* isPossiblyOffensive */); + dict.add(word, new ProbabilityInfo(TEST_FREQ), false /* isNotAWord */, + false /* isPossiblyOffensive */); } File file = File.createTempFile(dictName, ".tmp"); diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictEncoderFlattenTreeTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictEncoderFlattenTreeTests.java index dc9981d1a..178fc41e2 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictEncoderFlattenTreeTests.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictEncoderFlattenTreeTests.java @@ -33,15 +33,15 @@ public class BinaryDictEncoderFlattenTreeTests extends TestCase { public void testFlattenNodes() { final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), new DictionaryOptions(new HashMap())); - dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("foo", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("fta", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("ftb", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("bar", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); - dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */, + dict.add("fool", new ProbabilityInfo(1), false /* isNotAWord */, false /* isPossiblyOffensive */); final ArrayList result = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/FusionDictionaryTest.java b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/FusionDictionaryTest.java index 1a4f096e4..626023920 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/FusionDictionaryTest.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/FusionDictionaryTest.java @@ -102,7 +102,7 @@ public class FusionDictionaryTest extends TestCase { prepare(time); for (int i = 0; i < sWords.size(); ++i) { System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i))); - dict.add(sWords.get(i), new ProbabilityInfo(180), null, false, + dict.add(sWords.get(i), new ProbabilityInfo(180), false, false /* isPossiblyOffensive */); dumpDict(dict); checkDictionary(dict, sWords, i);