[Refactor] Add DictDecoder.readUnigramsAndBigramsBinary.

Change-Id: I259db91d837c67cbcb3b6dc504b21dca23a6a5be
This commit is contained in:
Yuichiro Hanada 2013-08-23 23:30:16 +09:00
parent 918336b7e7
commit 752a33640c
5 changed files with 66 additions and 35 deletions

View file

@ -148,7 +148,7 @@ public final class BinaryDictIOUtils {
* @throws IOException if the file can't be read. * @throws IOException if the file can't be read.
* @throws UnsupportedFormatException if the format of the file is not recognized. * @throws UnsupportedFormatException if the format of the file is not recognized.
*/ */
public static void readUnigramsAndBigramsBinary(final Ver3DictDecoder dictDecoder, /* package */ static void readUnigramsAndBigramsBinary(final Ver3DictDecoder dictDecoder,
final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
UnsupportedFormatException { UnsupportedFormatException {

View file

@ -29,6 +29,8 @@ import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.TreeMap;
/** /**
* An interface of binary dictionary decoder. * An interface of binary dictionary decoder.
@ -71,6 +73,21 @@ public interface DictDecoder {
public int getTerminalPosition(final String word) public int getTerminalPosition(final String word)
throws IOException, UnsupportedFormatException; throws IOException, UnsupportedFormatException;
/**
* Reads unigrams and bigrams from the binary file into the maps supplied by the caller.
* Doesn't store a full memory representation of the dictionary.
*
* @param words the map to store the address as a key and the word as a value.
* @param frequencies the map to store the address as a key and the frequency as a value.
* @param bigrams the map to store the address as a key and the list of bigram attributes
*        ({@link PendingAttribute}) as a value.
* @throws IOException if the file can't be read.
* @throws UnsupportedFormatException if the format of the file is not recognized.
*/
public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words,
final TreeMap<Integer, Integer> frequencies,
final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams)
throws IOException, UnsupportedFormatException;
// Flags for DictionaryBufferFactory. // Flags for DictionaryBufferFactory.
public static final int USE_READONLY_BYTEBUFFER = 0x01000000; public static final int USE_READONLY_BYTEBUFFER = 0x01000000;
public static final int USE_BYTEARRAY = 0x02000000; public static final int USE_BYTEARRAY = 0x02000000;

View file

@ -31,6 +31,7 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.TreeMap;
/** /**
* An implementation of DictDecoder for version 3 binary dictionary. * An implementation of DictDecoder for version 3 binary dictionary.
@ -317,4 +318,16 @@ public class Ver3DictDecoder implements DictDecoder {
} }
return BinaryDictIOUtils.getTerminalPosition(this, word); return BinaryDictIOUtils.getTerminalPosition(this, word);
} }
/**
 * Reads unigrams and bigrams into the supplied maps.
 *
 * Calls {@code openDictBuffer()} first when {@code mDictBuffer} is still null, then hands the
 * actual reading over to {@code BinaryDictIOUtils.readUnigramsAndBigramsBinary}.
 *
 * @param words the map that receives the words.
 * @param frequencies the map that receives the frequencies.
 * @param bigrams the map that receives the bigram attribute lists.
 * @throws IOException declared by the interface; propagated from the calls made here.
 * @throws UnsupportedFormatException declared by the interface; propagated from the calls
 *         made here.
 */
@Override
public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words,
        final TreeMap<Integer, Integer> frequencies,
        final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams)
        throws IOException, UnsupportedFormatException {
    // The buffer is opened on demand so callers need not open it beforehand.
    final boolean bufferNotOpenedYet = (null == mDictBuffer);
    if (bufferNotOpenedYet) {
        openDictBuffer();
    }
    BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams);
}
} }

View file

@ -32,7 +32,8 @@ import com.android.inputmethod.latin.personalization.UserHistoryDictionaryBigram
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map.Entry;
import java.util.TreeMap;
/** /**
* Reads and writes Binary files for a UserHistoryDictionary. * Reads and writes Binary files for a UserHistoryDictionary.
@ -119,12 +120,11 @@ public final class UserHistoryDictIOUtils {
*/ */
public static void readDictionaryBinary(final Ver3DictDecoder dictDecoder, public static void readDictionaryBinary(final Ver3DictDecoder dictDecoder,
final OnAddWordListener dict) { final OnAddWordListener dict) {
final Map<Integer, String> unigrams = CollectionUtils.newTreeMap(); final TreeMap<Integer, String> unigrams = CollectionUtils.newTreeMap();
final Map<Integer, Integer> frequencies = CollectionUtils.newTreeMap(); final TreeMap<Integer, Integer> frequencies = CollectionUtils.newTreeMap();
final Map<Integer, ArrayList<PendingAttribute>> bigrams = CollectionUtils.newTreeMap(); final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams = CollectionUtils.newTreeMap();
try { try {
BinaryDictIOUtils.readUnigramsAndBigramsBinary(dictDecoder, unigrams, frequencies, dictDecoder.readUnigramsAndBigramsBinary(unigrams, frequencies, bigrams);
bigrams);
} catch (IOException e) { } catch (IOException e) {
Log.e(TAG, "IO exception while reading file", e); Log.e(TAG, "IO exception while reading file", e);
} catch (UnsupportedFormatException e) { } catch (UnsupportedFormatException e) {
@ -139,10 +139,11 @@ public final class UserHistoryDictIOUtils {
* Adds all unigrams and bigrams in maps to OnAddWordListener. * Adds all unigrams and bigrams in maps to OnAddWordListener.
*/ */
@UsedForTesting @UsedForTesting
static void addWordsFromWordMap(final Map<Integer, String> unigrams, static void addWordsFromWordMap(final TreeMap<Integer, String> unigrams,
final Map<Integer, Integer> frequencies, final TreeMap<Integer, Integer> frequencies,
final Map<Integer, ArrayList<PendingAttribute>> bigrams, final OnAddWordListener to) { final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams,
for (Map.Entry<Integer, String> entry : unigrams.entrySet()) { final OnAddWordListener to) {
for (Entry<Integer, String> entry : unigrams.entrySet()) {
final String word1 = entry.getValue(); final String word1 = entry.getValue();
final int unigramFrequency = frequencies.get(entry.getKey()); final int unigramFrequency = frequencies.get(entry.getKey());
to.setUnigram(word1, null, unigramFrequency); to.setUnigram(word1, null, unigramFrequency);

View file

@ -39,10 +39,10 @@ import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Random; import java.util.Random;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
/** /**
* Unit tests for BinaryDictDecoderUtils and BinaryDictEncoderUtils. * Unit tests for BinaryDictDecoderUtils and BinaryDictEncoderUtils.
@ -61,13 +61,13 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
private static final int USE_BYTE_ARRAY = 1; private static final int USE_BYTE_ARRAY = 1;
private static final int USE_BYTE_BUFFER = 2; private static final int USE_BYTE_BUFFER = 2;
private static final List<String> sWords = CollectionUtils.newArrayList(); private static final ArrayList<String> sWords = CollectionUtils.newArrayList();
private static final SparseArray<List<Integer>> sEmptyBigrams = private static final SparseArray<List<Integer>> sEmptyBigrams =
CollectionUtils.newSparseArray(); CollectionUtils.newSparseArray();
private static final SparseArray<List<Integer>> sStarBigrams = CollectionUtils.newSparseArray(); private static final SparseArray<List<Integer>> sStarBigrams = CollectionUtils.newSparseArray();
private static final SparseArray<List<Integer>> sChainBigrams = private static final SparseArray<List<Integer>> sChainBigrams =
CollectionUtils.newSparseArray(); CollectionUtils.newSparseArray();
private static final Map<String, List<String>> sShortcuts = CollectionUtils.newHashMap(); private static final HashMap<String, List<String>> sShortcuts = CollectionUtils.newHashMap();
private static final FormatSpec.FormatOptions VERSION2 = new FormatSpec.FormatOptions(2); private static final FormatSpec.FormatOptions VERSION2 = new FormatSpec.FormatOptions(2);
private static final FormatSpec.FormatOptions VERSION3_WITHOUT_DYNAMIC_UPDATE = private static final FormatSpec.FormatOptions VERSION3_WITHOUT_DYNAMIC_UPDATE =
@ -177,7 +177,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
* Adds unigrams to the dictionary. * Adds unigrams to the dictionary.
*/ */
private void addUnigrams(final int number, final FusionDictionary dict, private void addUnigrams(final int number, final FusionDictionary dict,
final List<String> words, final Map<String, List<String>> shortcutMap) { final List<String> words, final HashMap<String, List<String>> shortcutMap) {
for (int i = 0; i < number; ++i) { for (int i = 0; i < number; ++i) {
final String word = words.get(i); final String word = words.get(i);
final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList(); final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList();
@ -234,7 +234,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
} }
private void checkDictionary(final FusionDictionary dict, final List<String> words, private void checkDictionary(final FusionDictionary dict, final List<String> words,
final SparseArray<List<Integer>> bigrams, final Map<String, List<String>> shortcutMap) { final SparseArray<List<Integer>> bigrams,
final HashMap<String, List<String>> shortcutMap) {
assertNotNull(dict); assertNotNull(dict);
// check unigram // check unigram
@ -255,7 +256,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
// check shortcut // check shortcut
if (shortcutMap != null) { if (shortcutMap != null) {
for (final Map.Entry<String, List<String>> entry : shortcutMap.entrySet()) { for (final Entry<String, List<String>> entry : shortcutMap.entrySet()) {
assertTrue(words.contains(entry.getKey())); assertTrue(words.contains(entry.getKey()));
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray,
entry.getKey()); entry.getKey());
@ -278,8 +279,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
// Tests for readDictionaryBinary and writeDictionaryBinary // Tests for readDictionaryBinary and writeDictionaryBinary
private long timeReadingAndCheckDict(final File file, final List<String> words, private long timeReadingAndCheckDict(final File file, final List<String> words,
final SparseArray<List<Integer>> bigrams, final Map<String, List<String>> shortcutMap, final SparseArray<List<Integer>> bigrams,
final int bufferType) { final HashMap<String, List<String>> shortcutMap, final int bufferType) {
long now, diff = -1; long now, diff = -1;
FusionDictionary dict = null; FusionDictionary dict = null;
@ -302,7 +303,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
// Tests for readDictionaryBinary and writeDictionaryBinary // Tests for readDictionaryBinary and writeDictionaryBinary
private String runReadAndWrite(final List<String> words, private String runReadAndWrite(final List<String> words,
final SparseArray<List<Integer>> bigrams, final Map<String, List<String>> shortcuts, final SparseArray<List<Integer>> bigrams, final HashMap<String, List<String>> shortcuts,
final int bufferType, final FormatSpec.FormatOptions formatOptions, final int bufferType, final FormatSpec.FormatOptions formatOptions,
final String message) { final String message) {
File file = null; File file = null;
@ -387,9 +388,9 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
private void checkWordMap(final List<String> expectedWords, private void checkWordMap(final List<String> expectedWords,
final SparseArray<List<Integer>> expectedBigrams, final SparseArray<List<Integer>> expectedBigrams,
final Map<Integer, String> resultWords, final TreeMap<Integer, String> resultWords,
final Map<Integer, Integer> resultFrequencies, final TreeMap<Integer, Integer> resultFrequencies,
final Map<Integer, ArrayList<PendingAttribute>> resultBigrams) { final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams) {
// check unigrams // check unigrams
final Set<String> actualWordsSet = new HashSet<String>(resultWords.values()); final Set<String> actualWordsSet = new HashSet<String>(resultWords.values());
final Set<String> expectedWordsSet = new HashSet<String>(expectedWords); final Set<String> expectedWordsSet = new HashSet<String>(expectedWords);
@ -400,7 +401,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
} }
// check bigrams // check bigrams
final Map<String, List<String>> expBigrams = new HashMap<String, List<String>>(); final HashMap<String, List<String>> expBigrams = new HashMap<String, List<String>>();
for (int i = 0; i < expectedBigrams.size(); ++i) { for (int i = 0; i < expectedBigrams.size(); ++i) {
final String word1 = expectedWords.get(expectedBigrams.keyAt(i)); final String word1 = expectedWords.get(expectedBigrams.keyAt(i));
for (int w2 : expectedBigrams.valueAt(i)) { for (int w2 : expectedBigrams.valueAt(i)) {
@ -411,7 +412,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
} }
} }
final Map<String, List<String>> actBigrams = new HashMap<String, List<String>>(); final HashMap<String, List<String>> actBigrams = new HashMap<String, List<String>>();
for (Entry<Integer, ArrayList<PendingAttribute>> entry : resultBigrams.entrySet()) { for (Entry<Integer, ArrayList<PendingAttribute>> entry : resultBigrams.entrySet()) {
final String word1 = resultWords.get(entry.getKey()); final String word1 = resultWords.get(entry.getKey());
final int unigramFreq = resultFrequencies.get(entry.getKey()); final int unigramFreq = resultFrequencies.get(entry.getKey());
@ -435,10 +436,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
final SparseArray<List<Integer>> bigrams, final int bufferType) { final SparseArray<List<Integer>> bigrams, final int bufferType) {
FileInputStream inStream = null; FileInputStream inStream = null;
final Map<Integer, String> resultWords = CollectionUtils.newTreeMap(); final TreeMap<Integer, String> resultWords = CollectionUtils.newTreeMap();
final Map<Integer, ArrayList<PendingAttribute>> resultBigrams = final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams =
CollectionUtils.newTreeMap(); CollectionUtils.newTreeMap();
final Map<Integer, Integer> resultFreqs = CollectionUtils.newTreeMap(); final TreeMap<Integer, Integer> resultFreqs = CollectionUtils.newTreeMap();
long now = -1, diff = -1; long now = -1, diff = -1;
try { try {
@ -446,8 +447,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
dictDecoder.openDictBuffer(); dictDecoder.openDictBuffer();
assertNotNull("Can't get buffer.", dictDecoder.getDictBuffer()); assertNotNull("Can't get buffer.", dictDecoder.getDictBuffer());
now = System.currentTimeMillis(); now = System.currentTimeMillis();
BinaryDictIOUtils.readUnigramsAndBigramsBinary(dictDecoder, resultWords, resultFreqs, dictDecoder.readUnigramsAndBigramsBinary(resultWords, resultFreqs, resultBigrams);
resultBigrams);
diff = System.currentTimeMillis() - now; diff = System.currentTimeMillis() - now;
} catch (IOException e) { } catch (IOException e) {
Log.e(TAG, "IOException", e); Log.e(TAG, "IOException", e);
@ -467,7 +467,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
return diff; return diff;
} }
private String runReadUnigramsAndBigramsBinary(final List<String> words, private String runReadUnigramsAndBigramsBinary(final ArrayList<String> words,
final SparseArray<List<Integer>> bigrams, final int bufferType, final SparseArray<List<Integer>> bigrams, final int bufferType,
final FormatSpec.FormatOptions formatOptions, final String message) { final FormatSpec.FormatOptions formatOptions, final String message) {
File file = null; File file = null;
@ -496,8 +496,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
+ " : " + message + " : " + outputOptions(bufferType, formatOptions); + " : " + message + " : " + outputOptions(bufferType, formatOptions);
} }
private void runReadUnigramsAndBigramsTests(final List<String> results, final int bufferType, private void runReadUnigramsAndBigramsTests(final ArrayList<String> results,
final FormatSpec.FormatOptions formatOptions) { final int bufferType, final FormatSpec.FormatOptions formatOptions) {
results.add(runReadUnigramsAndBigramsBinary(sWords, sEmptyBigrams, bufferType, results.add(runReadUnigramsAndBigramsBinary(sWords, sEmptyBigrams, bufferType,
formatOptions, "unigram")); formatOptions, "unigram"));
results.add(runReadUnigramsAndBigramsBinary(sWords, sChainBigrams, bufferType, results.add(runReadUnigramsAndBigramsBinary(sWords, sChainBigrams, bufferType,
@ -507,7 +507,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
} }
public void testReadUnigramsAndBigramsBinaryWithByteBuffer() { public void testReadUnigramsAndBigramsBinaryWithByteBuffer() {
final List<String> results = CollectionUtils.newArrayList(); final ArrayList<String> results = CollectionUtils.newArrayList();
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION2); runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION2);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE);
@ -519,7 +519,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
} }
public void testReadUnigramsAndBigramsBinaryWithByteArray() { public void testReadUnigramsAndBigramsBinaryWithByteArray() {
final List<String> results = CollectionUtils.newArrayList(); final ArrayList<String> results = CollectionUtils.newArrayList();
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION2); runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION2);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITHOUT_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITHOUT_DYNAMIC_UPDATE);