add UserHistoryDictIOUtils.
Change-Id: I8a70e43b23f65b5fd5f0ee0b30a94ad8f5ef8a8a
This commit is contained in:
parent
9bbc7ec052
commit
666a433802
3 changed files with 444 additions and 11 deletions
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
* use this file except in compliance with the License. You may obtain a copy of
|
||||
* the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations under
|
||||
* the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin;
|
||||
|
||||
import android.util.Log;
|
||||
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
|
||||
import com.android.inputmethod.latin.makedict.PendingAttribute;
|
||||
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Reads and writes Binary files for a UserHistoryDictionary.
|
||||
*
|
||||
* All the methods in this class are static.
|
||||
*/
|
||||
public class UserHistoryDictIOUtils {
|
||||
private static final String TAG = UserHistoryDictIOUtils.class.getSimpleName();
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
/**
 * Receiver for the words decoded from a binary user history dictionary.
 *
 * Implementations are handed every unigram and every bigram found while reading.
 */
public interface OnAddWordListener {
    /**
     * Reports a single word.
     *
     * @param word the word itself
     * @param shortcutTarget the shortcut target associated with the word, may be null
     * @param frequency the frequency stored for the word
     */
    void setUnigram(String word, String shortcutTarget, int frequency);

    /**
     * Reports a pair of consecutive words.
     *
     * @param word1 the first word of the pair
     * @param word2 the word following {@code word1}
     * @param frequency the frequency of the pair
     */
    void setBigram(String word1, String word2, int frequency);
}
|
||||
|
||||
/**
 * Source of frequencies used when a user history dictionary is serialized.
 */
public interface BigramDictionaryInterface {
    /**
     * Looks up the frequency of a word pair.
     *
     * @param word1 the first word; callers in this file pass null to ask for the unigram
     *         frequency of {@code word2}
     * @param word2 the second word
     * @return the frequency to store for the pair
     */
    int getFrequency(String word1, String word2);
}
|
||||
|
||||
public static final class ByteArrayWrapper implements FusionDictionaryBufferInterface {
|
||||
private byte[] mBuffer;
|
||||
private int mPosition;
|
||||
|
||||
ByteArrayWrapper(final byte[] buffer) {
|
||||
mBuffer = buffer;
|
||||
mPosition = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readUnsignedByte() {
|
||||
return ((int)mBuffer[mPosition++]) & 0xFF;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readUnsignedShort() {
|
||||
final int retval = readUnsignedByte();
|
||||
return (retval << 8) + readUnsignedByte();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readUnsignedInt24() {
|
||||
final int retval = readUnsignedShort();
|
||||
return (retval << 8) + readUnsignedByte();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int readInt() {
|
||||
final int retval = readUnsignedShort();
|
||||
return (retval << 16) + readUnsignedShort();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int position() {
|
||||
return mPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void position(int position) {
|
||||
mPosition = position;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes dictionary to file.
|
||||
*/
|
||||
public static void writeDictionaryBinary(final OutputStream destination,
|
||||
final BigramDictionaryInterface dict, final UserHistoryDictionaryBigramList bigrams,
|
||||
final int version) {
|
||||
|
||||
final FusionDictionary fusionDict = constructFusionDictionary(dict, bigrams);
|
||||
|
||||
try {
|
||||
BinaryDictInputOutput.writeDictionaryBinary(destination, fusionDict, version);
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IO exception while writing file: " + e);
|
||||
} catch (UnsupportedFormatException e) {
|
||||
Log.e(TAG, "Unsupported fomat: " + e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new FusionDictionary from BigramDictionaryInterface.
|
||||
*/
|
||||
/* packages for test */ static FusionDictionary constructFusionDictionary(
|
||||
final BigramDictionaryInterface dict, final UserHistoryDictionaryBigramList bigrams) {
|
||||
|
||||
final FusionDictionary fusionDict = new FusionDictionary(new Node(),
|
||||
new FusionDictionary.DictionaryOptions(
|
||||
new HashMap<String,String>(), false, false));
|
||||
|
||||
for (final String word1 : bigrams.keySet()) {
|
||||
final HashMap<String, Byte> word1Bigrams = bigrams.getBigrams(word1);
|
||||
for (final String word2 : word1Bigrams.keySet()) {
|
||||
final int freq = dict.getFrequency(word1, word2);
|
||||
|
||||
if (DEBUG) {
|
||||
if (word1 == null) {
|
||||
Log.d(TAG, "add unigram: " + word2 + "," + Integer.toString(freq));
|
||||
} else {
|
||||
Log.d(TAG, "add bigram: " + word1
|
||||
+ "," + word2 + "," + Integer.toString(freq));
|
||||
}
|
||||
}
|
||||
|
||||
if (word1 == null) { // unigram
|
||||
fusionDict.add(word2, freq, null);
|
||||
} else { // bigram
|
||||
fusionDict.setBigram(word1, word2, freq);
|
||||
}
|
||||
bigrams.updateBigram(word1, word2, (byte)freq);
|
||||
}
|
||||
}
|
||||
|
||||
return fusionDict;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads dictionary from file.
|
||||
*/
|
||||
public static void readDictionaryBinary(final FusionDictionaryBufferInterface buffer,
|
||||
final OnAddWordListener dict) {
|
||||
final Map<Integer, String> unigrams = CollectionUtils.newTreeMap();
|
||||
final Map<Integer, Integer> frequencies = CollectionUtils.newTreeMap();
|
||||
final Map<Integer, ArrayList<PendingAttribute>> bigrams = CollectionUtils.newTreeMap();
|
||||
|
||||
try {
|
||||
BinaryDictInputOutput.readUnigramsAndBigramsBinary(buffer, unigrams, frequencies,
|
||||
bigrams);
|
||||
addWordsFromWordMap(unigrams, frequencies, bigrams, dict);
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IO exception while reading file: " + e);
|
||||
} catch (UnsupportedFormatException e) {
|
||||
Log.e(TAG, "Unsupported format: " + e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds all unigrams and bigrams in maps to OnAddWordListener.
|
||||
*/
|
||||
/* package for test */ static void addWordsFromWordMap(final Map<Integer, String> unigrams,
|
||||
final Map<Integer, Integer> frequencies,
|
||||
final Map<Integer, ArrayList<PendingAttribute>> bigrams, final OnAddWordListener to) {
|
||||
|
||||
for (Map.Entry<Integer, String> entry : unigrams.entrySet()) {
|
||||
final String word1 = entry.getValue();
|
||||
final int unigramFrequency = frequencies.get(entry.getKey());
|
||||
to.setUnigram(word1, null, unigramFrequency);
|
||||
|
||||
final ArrayList<PendingAttribute> attrList = bigrams.get(entry.getKey());
|
||||
|
||||
if (attrList != null) {
|
||||
for (final PendingAttribute attr : attrList) {
|
||||
to.setBigram(word1, unigrams.get(attr.mAddress),
|
||||
BinaryDictInputOutput.reconstructBigramFrequency(unigramFrequency,
|
||||
attr.mFrequency));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -189,7 +189,7 @@ public class BinaryDictInputOutput {
|
|||
// suspicion that a bug might be causing an infinite loop.
|
||||
private static final int MAX_PASSES = 24;
|
||||
|
||||
private interface FusionDictionaryBufferInterface {
|
||||
public interface FusionDictionaryBufferInterface {
|
||||
public int readUnsignedByte();
|
||||
public int readUnsignedShort();
|
||||
public int readUnsignedInt24();
|
||||
|
@ -234,7 +234,6 @@ public class BinaryDictInputOutput {
|
|||
@Override
|
||||
public void position(int newPos) {
|
||||
mBuffer.position(newPos);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1393,7 +1392,6 @@ public class BinaryDictInputOutput {
|
|||
final FusionDictionaryBufferInterface buffer, final int headerSize,
|
||||
final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
|
||||
final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
|
||||
|
||||
int[] pushedChars = new int[MAX_WORD_LENGTH + 1];
|
||||
|
||||
Stack<Position> stack = new Stack<Position>();
|
||||
|
@ -1443,8 +1441,6 @@ public class BinaryDictInputOutput {
|
|||
stack.push(childrenPos);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1462,7 +1458,6 @@ public class BinaryDictInputOutput {
|
|||
final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
|
||||
final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
|
||||
UnsupportedFormatException {
|
||||
|
||||
// Read header
|
||||
final int version = checkFormatVersion(buffer);
|
||||
final int optionsFlags = buffer.readUnsignedShort();
|
||||
|
@ -1507,10 +1502,8 @@ public class BinaryDictInputOutput {
|
|||
* @throws UnsupportedFormatException
|
||||
*/
|
||||
private static int readHeader(final FusionDictionaryBufferInterface buffer,
|
||||
final HashMap<String, String> options,
|
||||
final int version)
|
||||
final HashMap<String, String> options, final int version)
|
||||
throws IOException, UnsupportedFormatException {
|
||||
|
||||
final int headerSize;
|
||||
if (version < FIRST_VERSION_WITH_HEADER_SIZE) {
|
||||
headerSize = buffer.position();
|
||||
|
@ -1523,7 +1516,6 @@ public class BinaryDictInputOutput {
|
|||
if (headerSize < 0) {
|
||||
throw new UnsupportedFormatException("header size can't be negative.");
|
||||
}
|
||||
|
||||
return headerSize;
|
||||
}
|
||||
|
||||
|
@ -1561,7 +1553,6 @@ public class BinaryDictInputOutput {
|
|||
public static FusionDictionary readDictionaryBinary(
|
||||
final FusionDictionaryBufferInterface buffer, final FusionDictionary dict)
|
||||
throws IOException, UnsupportedFormatException {
|
||||
|
||||
// clear cache
|
||||
wordCache.clear();
|
||||
|
||||
|
|
|
@ -0,0 +1,249 @@
|
|||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin;
|
||||
|
||||
import com.android.inputmethod.latin.UserHistoryDictIOUtils.BigramDictionaryInterface;
|
||||
import com.android.inputmethod.latin.UserHistoryDictIOUtils.OnAddWordListener;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
|
||||
|
||||
import android.content.Context;
|
||||
import android.test.AndroidTestCase;
|
||||
import android.util.Log;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Unit tests for UserHistoryDictIOUtils
|
||||
*/
|
||||
public class UserHistoryDictIOUtilsTests extends AndroidTestCase
|
||||
implements BigramDictionaryInterface {
|
||||
|
||||
private static final String TAG = UserHistoryDictIOUtilsTests.class.getSimpleName();
|
||||
private static final int UNIGRAM_FREQUENCY = 50;
|
||||
private static final int BIGRAM_FREQUENCY = 100;
|
||||
private static final ArrayList<String> NOT_HAVE_BIGRAM = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Return same frequency for all words and bigrams
|
||||
*/
|
||||
@Override
|
||||
public int getFrequency(String word1, String word2) {
|
||||
if (word1 == null) return UNIGRAM_FREQUENCY;
|
||||
return BIGRAM_FREQUENCY;
|
||||
}
|
||||
|
||||
// Utilities for Testing
|
||||
|
||||
private void addWord(final String word,
|
||||
final HashMap<String, ArrayList<String> > addedWords) {
|
||||
if (!addedWords.containsKey(word)) {
|
||||
addedWords.put(word, new ArrayList<String>());
|
||||
}
|
||||
}
|
||||
|
||||
private void addBigram(final String word1, final String word2,
|
||||
final HashMap<String, ArrayList<String> > addedWords) {
|
||||
addWord(word1, addedWords);
|
||||
addWord(word2, addedWords);
|
||||
addedWords.get(word1).add(word2);
|
||||
}
|
||||
|
||||
private void addBigramToBigramList(final String word1, final String word2,
|
||||
final HashMap<String, ArrayList<String> > addedWords,
|
||||
final UserHistoryDictionaryBigramList bigramList) {
|
||||
bigramList.addBigram(null, word1);
|
||||
bigramList.addBigram(word1, word2);
|
||||
|
||||
addBigram(word1, word2, addedWords);
|
||||
}
|
||||
|
||||
private void checkWordInFusionDict(final FusionDictionary dict, final String word,
|
||||
final ArrayList<String> expectedBigrams) {
|
||||
final CharGroup group = FusionDictionary.findWordInTree(dict.mRoot, word);
|
||||
assertNotNull(group);
|
||||
assertTrue(group.isTerminal());
|
||||
|
||||
for (final String bigram : expectedBigrams) {
|
||||
assertNotNull(group.getBigram(bigram));
|
||||
}
|
||||
}
|
||||
|
||||
private void checkWordsInFusionDict(final FusionDictionary dict,
|
||||
final HashMap<String, ArrayList<String> > bigrams) {
|
||||
for (final String word : bigrams.keySet()) {
|
||||
if (bigrams.containsKey(word)) {
|
||||
checkWordInFusionDict(dict, word, bigrams.get(word));
|
||||
} else {
|
||||
checkWordInFusionDict(dict, word, NOT_HAVE_BIGRAM);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkWordInBigramList(
|
||||
final UserHistoryDictionaryBigramList bigramList, final String word,
|
||||
final ArrayList<String> expectedBigrams) {
|
||||
// check unigram
|
||||
final HashMap<String,Byte> unigramMap = bigramList.getBigrams(null);
|
||||
assertTrue(unigramMap.containsKey(word));
|
||||
|
||||
// check bigrams
|
||||
final ArrayList<String> actualBigrams = new ArrayList<String>(
|
||||
bigramList.getBigrams(word).keySet());
|
||||
|
||||
Collections.sort(expectedBigrams);
|
||||
Collections.sort(actualBigrams);
|
||||
assertEquals(expectedBigrams, actualBigrams);
|
||||
}
|
||||
|
||||
private void checkWordsInBigramList(final UserHistoryDictionaryBigramList bigramList,
|
||||
final HashMap<String, ArrayList<String> > addedWords) {
|
||||
for (final String word : addedWords.keySet()) {
|
||||
if (addedWords.containsKey(word)) {
|
||||
checkWordInBigramList(bigramList, word, addedWords.get(word));
|
||||
} else {
|
||||
checkWordInBigramList(bigramList, word, NOT_HAVE_BIGRAM);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeDictToFile(final File file,
|
||||
final UserHistoryDictionaryBigramList bigramList) {
|
||||
try {
|
||||
final FileOutputStream out = new FileOutputStream(file);
|
||||
UserHistoryDictIOUtils.writeDictionaryBinary(out, this, bigramList, 2);
|
||||
out.flush();
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IO exception while writing file: " + e);
|
||||
}
|
||||
}
|
||||
|
||||
private void readDictFromFile(final File file, final OnAddWordListener listener) {
|
||||
FileInputStream inStream = null;
|
||||
|
||||
try {
|
||||
inStream = new FileInputStream(file);
|
||||
final byte[] buffer = new byte[(int)file.length()];
|
||||
inStream.read(buffer);
|
||||
|
||||
UserHistoryDictIOUtils.readDictionaryBinary(
|
||||
new UserHistoryDictIOUtils.ByteArrayWrapper(buffer), listener);
|
||||
} catch (FileNotFoundException e) {
|
||||
Log.e(TAG, "file not found: " + e);
|
||||
} catch (IOException e) {
|
||||
Log.e(TAG, "IOException: " + e);
|
||||
} finally {
|
||||
if (inStream != null) {
|
||||
try {
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testGenerateFusionDictionary() {
|
||||
final UserHistoryDictionaryBigramList originalList = new UserHistoryDictionaryBigramList();
|
||||
|
||||
final HashMap<String, ArrayList<String> > addedWords =
|
||||
new HashMap<String, ArrayList<String>>();
|
||||
addBigramToBigramList("this", "is", addedWords, originalList);
|
||||
addBigramToBigramList("this", "was", addedWords, originalList);
|
||||
addBigramToBigramList("hello", "world", addedWords, originalList);
|
||||
|
||||
final FusionDictionary fusionDict =
|
||||
UserHistoryDictIOUtils.constructFusionDictionary(this, originalList);
|
||||
|
||||
checkWordsInFusionDict(fusionDict, addedWords);
|
||||
}
|
||||
|
||||
public void testReadAndWrite() {
|
||||
final Context context = getContext();
|
||||
|
||||
File file = null;
|
||||
try {
|
||||
file = File.createTempFile("testReadAndWrite", ".dict");
|
||||
} catch (IOException e) {
|
||||
Log.d(TAG, "IOException while creating a temporary file: " + e);
|
||||
}
|
||||
assertNotNull(file);
|
||||
|
||||
// make original dictionary
|
||||
final UserHistoryDictionaryBigramList originalList = new UserHistoryDictionaryBigramList();
|
||||
final HashMap<String, ArrayList<String>> addedWords = CollectionUtils.newHashMap();
|
||||
addBigramToBigramList("this" , "is" , addedWords, originalList);
|
||||
addBigramToBigramList("this" , "was" , addedWords, originalList);
|
||||
addBigramToBigramList("is" , "not" , addedWords, originalList);
|
||||
addBigramToBigramList("hello", "world", addedWords, originalList);
|
||||
|
||||
// write to file
|
||||
writeDictToFile(file, originalList);
|
||||
|
||||
// make result dict.
|
||||
final UserHistoryDictionaryBigramList resultList = new UserHistoryDictionaryBigramList();
|
||||
final OnAddWordListener listener = new OnAddWordListener() {
|
||||
@Override
|
||||
public void setUnigram(final String word,
|
||||
final String shortcutTarget, final int frequency) {
|
||||
Log.d(TAG, "in: setUnigram: " + word + "," + frequency);
|
||||
resultList.addBigram(null, word, (byte)frequency);
|
||||
}
|
||||
@Override
|
||||
public void setBigram(final String word1, final String word2, final int frequency) {
|
||||
Log.d(TAG, "in: setBigram: " + word1 + "," + word2 + "," + frequency);
|
||||
resultList.addBigram(word1, word2, (byte)frequency);
|
||||
}
|
||||
};
|
||||
|
||||
// load from file
|
||||
readDictFromFile(file, listener);
|
||||
checkWordsInBigramList(resultList, addedWords);
|
||||
|
||||
// add new bigram
|
||||
addBigramToBigramList("hello", "java", addedWords, resultList);
|
||||
|
||||
// rewrite
|
||||
writeDictToFile(file, resultList);
|
||||
final UserHistoryDictionaryBigramList resultList2 = new UserHistoryDictionaryBigramList();
|
||||
final OnAddWordListener listener2 = new OnAddWordListener() {
|
||||
@Override
|
||||
public void setUnigram(final String word,
|
||||
final String shortcutTarget, final int frequency) {
|
||||
Log.d(TAG, "in: setUnigram: " + word + "," + frequency);
|
||||
resultList2.addBigram(null, word, (byte)frequency);
|
||||
}
|
||||
@Override
|
||||
public void setBigram(final String word1, final String word2, final int frequency) {
|
||||
Log.d(TAG, "in: setBigram: " + word1 + "," + word2 + "," + frequency);
|
||||
resultList2.addBigram(word1, word2, (byte)frequency);
|
||||
}
|
||||
};
|
||||
|
||||
// load from file
|
||||
readDictFromFile(file, listener2);
|
||||
checkWordsInBigramList(resultList2, addedWords);
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue