Reading dictionary containing timestamps in Java Side.
Just skipping historical information fields. Bug: 11281877 Change-Id: I43d2adaa576b7da11ed3ca54990265dbb6f53b08
main
parent
a454a7b85c
commit
26bd46095a
|
@ -48,7 +48,7 @@ public abstract class AbstractDictDecoder implements DictDecoder {
|
||||||
throw new UnsupportedFormatException("Unsupported version : " + version);
|
throw new UnsupportedFormatException("Unsupported version : " + version);
|
||||||
}
|
}
|
||||||
// TODO: Remove this field.
|
// TODO: Remove this field.
|
||||||
final int optionsFlags = HeaderReader.readOptionFlags(headerBuffer);
|
HeaderReader.readOptionFlags(headerBuffer);
|
||||||
final int headerSize = HeaderReader.readHeaderSize(headerBuffer);
|
final int headerSize = HeaderReader.readHeaderSize(headerBuffer);
|
||||||
if (headerSize < 0) {
|
if (headerSize < 0) {
|
||||||
throw new UnsupportedFormatException("header size can't be negative.");
|
throw new UnsupportedFormatException("header size can't be negative.");
|
||||||
|
@ -59,8 +59,8 @@ public abstract class AbstractDictDecoder implements DictDecoder {
|
||||||
|
|
||||||
final FileHeader header = new FileHeader(headerSize,
|
final FileHeader header = new FileHeader(headerSize,
|
||||||
new FusionDictionary.DictionaryOptions(attributes),
|
new FusionDictionary.DictionaryOptions(attributes),
|
||||||
new FormatOptions(version,
|
new FormatOptions(version, FileHeader.ATTRIBUTE_VALUE_TRUE.equals(
|
||||||
0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
|
attributes.get(FileHeader.HAS_HISTORICAL_INFO_ATTRIBUTE))));
|
||||||
return header;
|
return header;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -499,7 +499,6 @@ public final class BinaryDictDecoderUtils {
|
||||||
final int nodeArrayOriginPos = dictDecoder.getPosition();
|
final int nodeArrayOriginPos = dictDecoder.getPosition();
|
||||||
|
|
||||||
do { // Scan the linked-list node.
|
do { // Scan the linked-list node.
|
||||||
final int nodeArrayHeadPos = dictDecoder.getPosition();
|
|
||||||
final int count = dictDecoder.readPtNodeCount();
|
final int count = dictDecoder.readPtNodeCount();
|
||||||
int groupPos = dictDecoder.getPosition();
|
int groupPos = dictDecoder.getPosition();
|
||||||
for (int i = count; i > 0; --i) { // Scan the array of PtNode.
|
for (int i = count; i > 0; --i) { // Scan the array of PtNode.
|
||||||
|
|
|
@ -755,14 +755,6 @@ public class BinaryDictEncoderUtils {
|
||||||
return discretizedFrequency > 0 ? discretizedFrequency : 0;
|
return discretizedFrequency > 0 ? discretizedFrequency : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Makes the 2-byte value for options flags. Unused at the moment, and always 0.
|
|
||||||
*/
|
|
||||||
private static final int makeOptionsValue(final FormatOptions formatOptions) {
|
|
||||||
// TODO: why doesn't this handle CONTAINS_TIMESTAMP_FLAG?
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Makes the flag value for a shortcut.
|
* Makes the flag value for a shortcut.
|
||||||
*
|
*
|
||||||
|
@ -949,7 +941,8 @@ public class BinaryDictEncoderUtils {
|
||||||
headerBuffer.write((byte) (0xFF & version));
|
headerBuffer.write((byte) (0xFF & version));
|
||||||
|
|
||||||
// Options flags
|
// Options flags
|
||||||
final int options = makeOptionsValue(formatOptions);
|
// TODO: Remove this field.
|
||||||
|
final int options = 0;
|
||||||
headerBuffer.write((byte) (0xFF & (options >> 8)));
|
headerBuffer.write((byte) (0xFF & (options >> 8)));
|
||||||
headerBuffer.write((byte) (0xFF & options));
|
headerBuffer.write((byte) (0xFF & options));
|
||||||
final int headerSizeOffset = headerBuffer.size();
|
final int headerSizeOffset = headerBuffer.size();
|
||||||
|
|
|
@ -192,10 +192,6 @@ public final class FormatSpec {
|
||||||
static final int MINIMUM_SUPPORTED_VERSION = VERSION2;
|
static final int MINIMUM_SUPPORTED_VERSION = VERSION2;
|
||||||
static final int MAXIMUM_SUPPORTED_VERSION = VERSION4;
|
static final int MAXIMUM_SUPPORTED_VERSION = VERSION4;
|
||||||
|
|
||||||
// These options need to be the same numeric values as the one in the native reading code.
|
|
||||||
// TODO: Make the native reading code read this variable.
|
|
||||||
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
|
|
||||||
|
|
||||||
// TODO: Make this value adaptative to content data, store it in the header, and
|
// TODO: Make this value adaptative to content data, store it in the header, and
|
||||||
// use it in the reading code.
|
// use it in the reading code.
|
||||||
static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
|
static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
|
||||||
|
@ -249,26 +245,26 @@ public final class FormatSpec {
|
||||||
static final String TRIE_FILE_EXTENSION = ".trie";
|
static final String TRIE_FILE_EXTENSION = ".trie";
|
||||||
public static final String HEADER_FILE_EXTENSION = ".header";
|
public static final String HEADER_FILE_EXTENSION = ".header";
|
||||||
static final String FREQ_FILE_EXTENSION = ".freq";
|
static final String FREQ_FILE_EXTENSION = ".freq";
|
||||||
static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp";
|
|
||||||
// tat = Terminal Address Table
|
// tat = Terminal Address Table
|
||||||
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
||||||
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
||||||
static final String SHORTCUT_FILE_EXTENSION = ".shortcut";
|
static final String SHORTCUT_FILE_EXTENSION = ".shortcut";
|
||||||
static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
|
static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
|
||||||
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
|
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
|
||||||
|
static final int FLAGS_IN_FREQ_FILE_SIZE = 1;
|
||||||
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
||||||
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
||||||
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
|
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
|
||||||
|
static final int UNIGRAM_COUNTER_SIZE = 1;
|
||||||
|
static final int UNIGRAM_LEVEL_SIZE = 1;
|
||||||
|
|
||||||
// With the English main dictionary as of October 2013, the size of bigram address table is
|
// With the English main dictionary as of October 2013, the size of bigram address table is
|
||||||
// is 345KB with the block size being 16.
|
// is 345KB with the block size being 16.
|
||||||
// This is 54% of that of full address table.
|
// This is 54% of that of full address table.
|
||||||
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
|
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
|
||||||
static final int BIGRAM_CONTENT_COUNT = 2;
|
static final int BIGRAM_CONTENT_COUNT = 1;
|
||||||
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
|
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
|
||||||
static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1;
|
|
||||||
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
|
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
|
||||||
static final String BIGRAM_TIMESTAMP_CONTENT_ID = "_timestamp";
|
|
||||||
static final int BIGRAM_TIMESTAMP_SIZE = 4;
|
static final int BIGRAM_TIMESTAMP_SIZE = 4;
|
||||||
static final int BIGRAM_COUNTER_SIZE = 1;
|
static final int BIGRAM_COUNTER_SIZE = 1;
|
||||||
static final int BIGRAM_LEVEL_SIZE = 1;
|
static final int BIGRAM_LEVEL_SIZE = 1;
|
||||||
|
|
|
@ -61,6 +61,7 @@ public final class FusionDictionary implements Iterable<Word> {
|
||||||
mData = new ArrayList<PtNode>();
|
mData = new ArrayList<PtNode>();
|
||||||
}
|
}
|
||||||
public PtNodeArray(ArrayList<PtNode> data) {
|
public PtNodeArray(ArrayList<PtNode> data) {
|
||||||
|
Collections.sort(data, PTNODE_COMPARATOR);
|
||||||
mData = data;
|
mData = data;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -143,7 +143,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
|
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
|
||||||
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
|
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
|
||||||
mBigramReader = new BigramContentReader(mDictDirectory.getName(),
|
mBigramReader = new BigramContentReader(mDictDirectory.getName(),
|
||||||
mDictDirectory, mBufferFactory, false);
|
mDictDirectory, mBufferFactory);
|
||||||
mBigramReader.openBuffers();
|
mBigramReader.openBuffers();
|
||||||
mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory,
|
mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory,
|
||||||
mBufferFactory);
|
mBufferFactory);
|
||||||
|
@ -184,39 +184,24 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
*/
|
*/
|
||||||
protected static class BigramContentReader extends SparseTableContentReader {
|
protected static class BigramContentReader extends SparseTableContentReader {
|
||||||
public BigramContentReader(final String name, final File baseDir,
|
public BigramContentReader(final String name, final File baseDir,
|
||||||
final DictionaryBufferFactory factory, final boolean hasTimestamp) {
|
final DictionaryBufferFactory factory) {
|
||||||
super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
|
super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
|
||||||
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||||
getContentFilenames(name, hasTimestamp), getContentIds(hasTimestamp), factory);
|
getContentFilenames(name), getContentIds(), factory);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Consolidate this method and BigramContentWriter.getContentFilenames.
|
// TODO: Consolidate this method and BigramContentWriter.getContentFilenames.
|
||||||
protected static String[] getContentFilenames(final String name,
|
protected static String[] getContentFilenames(final String name) {
|
||||||
final boolean hasTimestamp) {
|
return new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
|
||||||
final String[] contentFilenames;
|
|
||||||
if (hasTimestamp) {
|
|
||||||
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
|
|
||||||
name + FormatSpec.BIGRAM_FILE_EXTENSION };
|
|
||||||
} else {
|
|
||||||
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
|
|
||||||
}
|
|
||||||
return contentFilenames;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Consolidate this method and BigramContentWriter.getContentIds.
|
// TODO: Consolidate this method and BigramContentWriter.getContentIds.
|
||||||
protected static String[] getContentIds(final boolean hasTimestamp) {
|
protected static String[] getContentIds() {
|
||||||
final String[] contentIds;
|
return new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
|
||||||
if (hasTimestamp) {
|
|
||||||
contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
|
|
||||||
FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
|
|
||||||
} else {
|
|
||||||
contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
|
|
||||||
}
|
|
||||||
return contentIds;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId,
|
public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId,
|
||||||
final DictBuffer terminalAddressTableBuffer) {
|
final DictBuffer terminalAddressTableBuffer, final FormatOptions options) {
|
||||||
final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
|
final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
|
||||||
read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
|
read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
|
||||||
new SparseTableContentReaderInterface() {
|
new SparseTableContentReaderInterface() {
|
||||||
|
@ -226,14 +211,25 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
// If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
|
// If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
|
||||||
// remaining bigram entries are ignored.
|
// remaining bigram entries are ignored.
|
||||||
final int bigramFlags = buffer.readUnsignedByte();
|
final int bigramFlags = buffer.readUnsignedByte();
|
||||||
|
final int probability;
|
||||||
|
|
||||||
|
if (options.mHasTimestamp) {
|
||||||
|
probability = buffer.readUnsignedByte();
|
||||||
|
final int pos = buffer.position();
|
||||||
|
// Skip historical info.
|
||||||
|
buffer.position(pos + FormatSpec.BIGRAM_TIMESTAMP_SIZE
|
||||||
|
+ FormatSpec.BIGRAM_LEVEL_SIZE
|
||||||
|
+ FormatSpec.BIGRAM_COUNTER_SIZE);
|
||||||
|
} else {
|
||||||
|
probability = bigramFlags
|
||||||
|
& FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
|
||||||
|
}
|
||||||
final int targetTerminalId = buffer.readUnsignedInt24();
|
final int targetTerminalId = buffer.readUnsignedInt24();
|
||||||
terminalAddressTableBuffer.position(targetTerminalId
|
terminalAddressTableBuffer.position(targetTerminalId
|
||||||
* FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
|
* FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
|
||||||
final int targetAddress =
|
final int targetAddress =
|
||||||
terminalAddressTableBuffer.readUnsignedInt24();
|
terminalAddressTableBuffer.readUnsignedInt24();
|
||||||
bigrams.add(new PendingAttribute(bigramFlags
|
bigrams.add(new PendingAttribute(probability, targetAddress));
|
||||||
& FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
|
|
||||||
targetAddress));
|
|
||||||
if (0 == (bigramFlags
|
if (0 == (bigramFlags
|
||||||
& FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
|
& FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
|
||||||
break;
|
break;
|
||||||
|
@ -286,8 +282,19 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
|
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
|
||||||
protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
|
protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId,
|
||||||
frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
|
final FormatOptions formatOptions) {
|
||||||
|
final int readingPos;
|
||||||
|
if (formatOptions.mHasTimestamp) {
|
||||||
|
final int entrySize = FormatSpec.FREQUENCY_AND_FLAGS_SIZE
|
||||||
|
+ FormatSpec.UNIGRAM_TIMESTAMP_SIZE + FormatSpec.UNIGRAM_LEVEL_SIZE
|
||||||
|
+ FormatSpec.UNIGRAM_COUNTER_SIZE;
|
||||||
|
readingPos = terminalId * entrySize + FormatSpec.FLAGS_IN_FREQ_FILE_SIZE;
|
||||||
|
} else {
|
||||||
|
readingPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE
|
||||||
|
+ FormatSpec.FLAGS_IN_FREQ_FILE_SIZE;
|
||||||
|
}
|
||||||
|
frequencyBuffer.position(readingPos);
|
||||||
return frequencyBuffer.readUnsignedByte();
|
return frequencyBuffer.readUnsignedByte();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -354,12 +361,12 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) {
|
public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) {
|
||||||
final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options);
|
final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options);
|
||||||
|
|
||||||
final int frequency;
|
final int frequency;
|
||||||
if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) {
|
if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) {
|
||||||
frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId);
|
frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId, options);
|
||||||
} else {
|
} else {
|
||||||
frequency = PtNode.NOT_A_TERMINAL;
|
frequency = PtNode.NOT_A_TERMINAL;
|
||||||
}
|
}
|
||||||
|
@ -367,7 +374,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts(
|
final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts(
|
||||||
nodeInfo.mTerminalId);
|
nodeInfo.mTerminalId);
|
||||||
final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies(
|
final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies(
|
||||||
nodeInfo.mTerminalId, mTerminalAddressTableBuffer);
|
nodeInfo.mTerminalId, mTerminalAddressTableBuffer, options);
|
||||||
|
|
||||||
return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags,
|
return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags,
|
||||||
nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos,
|
nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos,
|
||||||
|
|
|
@ -21,7 +21,11 @@ import android.test.suitebuilder.annotation.LargeTest;
|
||||||
import android.util.Pair;
|
import android.util.Pair;
|
||||||
|
|
||||||
import com.android.inputmethod.latin.makedict.CodePointUtils;
|
import com.android.inputmethod.latin.makedict.CodePointUtils;
|
||||||
|
import com.android.inputmethod.latin.makedict.DictDecoder;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec;
|
import com.android.inputmethod.latin.makedict.FormatSpec;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
|
||||||
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
import com.android.inputmethod.latin.utils.FileUtils;
|
import com.android.inputmethod.latin.utils.FileUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -98,6 +102,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
getContext().getCacheDir());
|
getContext().getCacheDir());
|
||||||
FileUtils.deleteRecursively(file);
|
FileUtils.deleteRecursively(file);
|
||||||
Map<String, String> attributeMap = new HashMap<String, String>();
|
Map<String, String> attributeMap = new HashMap<String, String>();
|
||||||
|
attributeMap.put(FormatSpec.FileHeader.DICTIONARY_ID_ATTRIBUTE, dictId);
|
||||||
|
attributeMap.put(FormatSpec.FileHeader.DICTIONARY_LOCALE_ATTRIBUTE, dictId);
|
||||||
|
attributeMap.put(FormatSpec.FileHeader.DICTIONARY_VERSION_ATTRIBUTE,
|
||||||
|
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
|
||||||
attributeMap.put(FormatSpec.FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE,
|
attributeMap.put(FormatSpec.FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE,
|
||||||
FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE);
|
FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE);
|
||||||
attributeMap.put(FormatSpec.FileHeader.HAS_HISTORICAL_INFO_ATTRIBUTE,
|
attributeMap.put(FormatSpec.FileHeader.HAS_HISTORICAL_INFO_ATTRIBUTE,
|
||||||
|
@ -119,6 +127,50 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
return BinaryDictionary.setCurrentTimeForTest(-1);
|
return BinaryDictionary.setCurrentTimeForTest(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testReadDictInJavaSide() {
|
||||||
|
testReadDictInJavaSide(FormatSpec.VERSION4);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testReadDictInJavaSide(final int formatVersion) {
|
||||||
|
setCurrentTimeForTestMode(mCurrentTime);
|
||||||
|
File dictFile = null;
|
||||||
|
try {
|
||||||
|
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||||
|
addUnigramWord(binaryDictionary, "ab", DUMMY_PROBABILITY);
|
||||||
|
addUnigramWord(binaryDictionary, "aaa", DUMMY_PROBABILITY);
|
||||||
|
addBigramWords(binaryDictionary, "a", "aaa", DUMMY_PROBABILITY);
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
binaryDictionary.close();
|
||||||
|
|
||||||
|
final DictDecoder dictDecoder = FormatSpec.getDictDecoder(dictFile);
|
||||||
|
try {
|
||||||
|
final FusionDictionary dict = dictDecoder.readDictionaryBinary(null,
|
||||||
|
false /* deleteDictIfBroken */);
|
||||||
|
PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, "a");
|
||||||
|
assertNotNull(ptNode);
|
||||||
|
assertTrue(ptNode.isTerminal());
|
||||||
|
assertNotNull(ptNode.getBigram("aaa"));
|
||||||
|
ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, "ab");
|
||||||
|
assertNotNull(ptNode);
|
||||||
|
assertTrue(ptNode.isTerminal());
|
||||||
|
ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa");
|
||||||
|
assertNotNull(ptNode);
|
||||||
|
assertTrue(ptNode.isTerminal());
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while reading dictionary: " + e);
|
||||||
|
} catch (UnsupportedFormatException e) {
|
||||||
|
fail("Unsupported format: " + e);
|
||||||
|
}
|
||||||
|
dictFile.delete();
|
||||||
|
}
|
||||||
|
|
||||||
public void testControlCurrentTime() {
|
public void testControlCurrentTime() {
|
||||||
testControlCurrentTime(FormatSpec.VERSION4);
|
testControlCurrentTime(FormatSpec.VERSION4);
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,11 +17,11 @@
|
||||||
package com.android.inputmethod.latin.makedict;
|
package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
import android.test.AndroidTestCase;
|
import android.test.AndroidTestCase;
|
||||||
import android.test.MoreAsserts;
|
|
||||||
import android.test.suitebuilder.annotation.LargeTest;
|
import android.test.suitebuilder.annotation.LargeTest;
|
||||||
import android.util.Log;
|
import android.util.Log;
|
||||||
import android.util.SparseArray;
|
import android.util.SparseArray;
|
||||||
|
|
||||||
|
import com.android.inputmethod.latin.BinaryDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
|
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
||||||
|
@ -76,6 +76,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
|
|
||||||
public BinaryDictDecoderEncoderTests(final long seed, final int maxUnigrams) {
|
public BinaryDictDecoderEncoderTests(final long seed, final int maxUnigrams) {
|
||||||
super();
|
super();
|
||||||
|
BinaryDictionary.setCurrentTimeForTest(0);
|
||||||
Log.e(TAG, "Testing dictionary: seed is " + seed);
|
Log.e(TAG, "Testing dictionary: seed is " + seed);
|
||||||
final Random random = new Random(seed);
|
final Random random = new Random(seed);
|
||||||
sWords.clear();
|
sWords.clear();
|
||||||
|
@ -262,7 +263,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
getContext().getCacheDir());
|
getContext().getCacheDir());
|
||||||
|
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion));
|
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
|
||||||
addUnigrams(words.size(), dict, words, shortcuts);
|
addUnigrams(words.size(), dict, words, shortcuts);
|
||||||
addBigrams(dict, words, bigrams);
|
addBigrams(dict, words, bigrams);
|
||||||
checkDictionary(dict, words, bigrams, shortcuts);
|
checkDictionary(dict, words, bigrams, shortcuts);
|
||||||
|
@ -317,7 +318,6 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
BinaryDictUtils.VERSION4_OPTIONS_WITHOUT_TIMESTAMP);
|
BinaryDictUtils.VERSION4_OPTIONS_WITHOUT_TIMESTAMP);
|
||||||
runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER,
|
runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER,
|
||||||
BinaryDictUtils.VERSION4_OPTIONS_WITH_TIMESTAMP);
|
BinaryDictUtils.VERSION4_OPTIONS_WITH_TIMESTAMP);
|
||||||
|
|
||||||
for (final String result : results) {
|
for (final String result : results) {
|
||||||
Log.d(TAG, result);
|
Log.d(TAG, result);
|
||||||
}
|
}
|
||||||
|
@ -344,15 +344,17 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
final SparseArray<List<Integer>> expectedBigrams,
|
final SparseArray<List<Integer>> expectedBigrams,
|
||||||
final TreeMap<Integer, String> resultWords,
|
final TreeMap<Integer, String> resultWords,
|
||||||
final TreeMap<Integer, Integer> resultFrequencies,
|
final TreeMap<Integer, Integer> resultFrequencies,
|
||||||
final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams) {
|
final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams,
|
||||||
|
final boolean checkProbability) {
|
||||||
// check unigrams
|
// check unigrams
|
||||||
final Set<String> actualWordsSet = new HashSet<String>(resultWords.values());
|
final Set<String> actualWordsSet = new HashSet<String>(resultWords.values());
|
||||||
final Set<String> expectedWordsSet = new HashSet<String>(expectedWords);
|
final Set<String> expectedWordsSet = new HashSet<String>(expectedWords);
|
||||||
assertEquals(actualWordsSet, expectedWordsSet);
|
assertEquals(actualWordsSet, expectedWordsSet);
|
||||||
|
if (checkProbability) {
|
||||||
for (int freq : resultFrequencies.values()) {
|
for (int freq : resultFrequencies.values()) {
|
||||||
assertEquals(freq, UNIGRAM_FREQ);
|
assertEquals(freq, UNIGRAM_FREQ);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// check bigrams
|
// check bigrams
|
||||||
final HashMap<String, Set<String>> expBigrams = new HashMap<String, Set<String>>();
|
final HashMap<String, Set<String>> expBigrams = new HashMap<String, Set<String>>();
|
||||||
|
@ -377,16 +379,19 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
actBigrams.get(word1).add(word2);
|
actBigrams.get(word1).add(word2);
|
||||||
|
|
||||||
|
if (checkProbability) {
|
||||||
final int bigramFreq = BinaryDictIOUtils.reconstructBigramFrequency(
|
final int bigramFreq = BinaryDictIOUtils.reconstructBigramFrequency(
|
||||||
unigramFreq, attr.mFrequency);
|
unigramFreq, attr.mFrequency);
|
||||||
assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ);
|
assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
assertEquals(actBigrams, expBigrams);
|
assertEquals(actBigrams, expBigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
private long timeAndCheckReadUnigramsAndBigramsBinary(final File file, final List<String> words,
|
private long timeAndCheckReadUnigramsAndBigramsBinary(final File file, final List<String> words,
|
||||||
final SparseArray<List<Integer>> bigrams, final int bufferType) {
|
final SparseArray<List<Integer>> bigrams, final int bufferType,
|
||||||
|
final boolean checkProbability) {
|
||||||
final TreeMap<Integer, String> resultWords = CollectionUtils.newTreeMap();
|
final TreeMap<Integer, String> resultWords = CollectionUtils.newTreeMap();
|
||||||
final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams =
|
final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams =
|
||||||
CollectionUtils.newTreeMap();
|
CollectionUtils.newTreeMap();
|
||||||
|
@ -404,7 +409,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
Log.e(TAG, "UnsupportedFormatException", e);
|
Log.e(TAG, "UnsupportedFormatException", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
checkWordMap(words, bigrams, resultWords, resultFreqs, resultBigrams);
|
checkWordMap(words, bigrams, resultWords, resultFreqs, resultBigrams, checkProbability);
|
||||||
return diff;
|
return diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -418,13 +423,17 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
|
|
||||||
// making the dictionary from lists of words.
|
// making the dictionary from lists of words.
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion));
|
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
|
||||||
addUnigrams(words.size(), dict, words, null /* shortcutMap */);
|
addUnigrams(words.size(), dict, words, null /* shortcutMap */);
|
||||||
addBigrams(dict, words, bigrams);
|
addBigrams(dict, words, bigrams);
|
||||||
|
|
||||||
timeWritingDictToFile(file, dict, formatOptions);
|
timeWritingDictToFile(file, dict, formatOptions);
|
||||||
|
|
||||||
long wordMap = timeAndCheckReadUnigramsAndBigramsBinary(file, words, bigrams, bufferType);
|
// Caveat: Currently, the Java code to read a v4 dictionary doesn't calculate the
|
||||||
|
// probability when there's a timestamp for the entry.
|
||||||
|
// TODO: Abandon the Java code, and implement the v4 dictionary reading code in native.
|
||||||
|
long wordMap = timeAndCheckReadUnigramsAndBigramsBinary(file, words, bigrams, bufferType,
|
||||||
|
!formatOptions.mHasTimestamp /* checkProbability */);
|
||||||
long fullReading = timeReadingAndCheckDict(file, words, bigrams, null /* shortcutMap */,
|
long fullReading = timeReadingAndCheckDict(file, words, bigrams, null /* shortcutMap */,
|
||||||
bufferType);
|
bufferType);
|
||||||
|
|
||||||
|
@ -517,7 +526,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
getContext().getCacheDir());
|
getContext().getCacheDir());
|
||||||
|
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion));
|
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
|
||||||
addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */);
|
addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */);
|
||||||
addBigrams(dict, words, bigrams);
|
addBigrams(dict, words, bigrams);
|
||||||
timeWritingDictToFile(file, dict, formatOptions);
|
timeWritingDictToFile(file, dict, formatOptions);
|
||||||
|
|
|
@ -36,11 +36,18 @@ public class BinaryDictUtils {
|
||||||
public static final FormatSpec.FormatOptions VERSION4_OPTIONS_WITH_TIMESTAMP =
|
public static final FormatSpec.FormatOptions VERSION4_OPTIONS_WITH_TIMESTAMP =
|
||||||
new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */);
|
new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */);
|
||||||
|
|
||||||
public static DictionaryOptions makeDictionaryOptions(final String id, final String version) {
|
public static DictionaryOptions makeDictionaryOptions(final String id, final String version,
|
||||||
|
final FormatSpec.FormatOptions formatOptions) {
|
||||||
final DictionaryOptions options = new DictionaryOptions(new HashMap<String, String>());
|
final DictionaryOptions options = new DictionaryOptions(new HashMap<String, String>());
|
||||||
options.mAttributes.put(FileHeader.DICTIONARY_LOCALE_ATTRIBUTE, "en_US");
|
options.mAttributes.put(FileHeader.DICTIONARY_LOCALE_ATTRIBUTE, "en_US");
|
||||||
options.mAttributes.put(FileHeader.DICTIONARY_ID_ATTRIBUTE, id);
|
options.mAttributes.put(FileHeader.DICTIONARY_ID_ATTRIBUTE, id);
|
||||||
options.mAttributes.put(FileHeader.DICTIONARY_VERSION_ATTRIBUTE, version);
|
options.mAttributes.put(FileHeader.DICTIONARY_VERSION_ATTRIBUTE, version);
|
||||||
|
if (formatOptions.mHasTimestamp) {
|
||||||
|
options.mAttributes.put(FileHeader.HAS_HISTORICAL_INFO_ATTRIBUTE,
|
||||||
|
FileHeader.ATTRIBUTE_VALUE_TRUE);
|
||||||
|
options.mAttributes.put(FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE,
|
||||||
|
FileHeader.ATTRIBUTE_VALUE_TRUE);
|
||||||
|
}
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue