am 9514ed5c: Add the new format of bigram entries.

* commit '9514ed5c2a49e645e2d468f7191d54d77d9f127f':
  Add the new format of bigram entries.
main
Yuichiro Hanada 2013-10-10 23:57:54 -07:00 committed by Android Git Automerger
commit c068780b9d
5 changed files with 105 additions and 27 deletions

View File

@ -60,7 +60,8 @@ public abstract class AbstractDictDecoder implements DictDecoder {
0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
new FormatOptions(version, new FormatOptions(version,
0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE),
0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
return header; return header;
} }

View File

@ -37,13 +37,15 @@ public final class FormatSpec {
* sion * sion
* *
* o | * o |
* p | not used 4 bits * p | not used 3 bits
* t | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG * t | each unigram and bigram entry has a time stamp?
* i | FRENCH_LIGATURE_PROCESSING_FLAG * i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG
* o | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE * o | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
* n | GERMAN_UMLAUT_PROCESSING_FLAG * n | FRENCH_LIGATURE_PROCESSING_FLAG
* f | * f | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
* lags * l | GERMAN_UMLAUT_PROCESSING_FLAG
* a |
* gs
* *
* h | * h |
* e | size of the file header, 4bytes * e | size of the file header, 4bytes
@ -211,6 +213,8 @@ public final class FormatSpec {
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2; static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8; static final int CONTAINS_BIGRAMS_FLAG = 0x8;
// TODO: Implement timestamps for unigram.
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
// TODO: Make this value adaptive to content data, store it in the header, and // TODO: Make this value adaptive to content data, store it in the header, and
// use it in the reading code. // use it in the reading code.
@ -276,9 +280,14 @@ public final class FormatSpec {
// is 584KB with the block size being 4. // is 584KB with the block size being 4.
// This is 91% of that of full address table. // This is 91% of that of full address table.
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_CONTENT_COUNT = 2;
static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1;
static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
static final String BIGRAM_TIMESTAMP_CONTENT_ID = "_timestamp";
static final int BIGRAM_TIMESTAMP_SIZE = 4;
static final int BIGRAM_COUNTER_SIZE = 1;
static final int BIGRAM_LEVEL_SIZE = 1;
static final int SHORTCUT_CONTENT_COUNT = 1; static final int SHORTCUT_CONTENT_COUNT = 1;
static final int SHORTCUT_CONTENT_INDEX = 0; static final int SHORTCUT_CONTENT_INDEX = 0;
@ -321,6 +330,7 @@ public final class FormatSpec {
public final int mVersion; public final int mVersion;
public final boolean mSupportsDynamicUpdate; public final boolean mSupportsDynamicUpdate;
public final boolean mHasTerminalId; public final boolean mHasTerminalId;
public final boolean mHasTimestamp;
@UsedForTesting @UsedForTesting
public FormatOptions(final int version) { public FormatOptions(final int version) {
this(version, false); this(version, false);
@ -328,6 +338,11 @@ public final class FormatSpec {
@UsedForTesting @UsedForTesting
public FormatOptions(final int version, final boolean supportsDynamicUpdate) { public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
this(version, supportsDynamicUpdate, false /* hasTimestamp */);
}
public FormatOptions(final int version, final boolean supportsDynamicUpdate,
final boolean hasTimestamp) {
mVersion = version; mVersion = version;
if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) { if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
throw new RuntimeException("Dynamic updates are only supported with versions " throw new RuntimeException("Dynamic updates are only supported with versions "
@ -335,6 +350,7 @@ public final class FormatSpec {
} }
mSupportsDynamicUpdate = supportsDynamicUpdate; mSupportsDynamicUpdate = supportsDynamicUpdate;
mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID); mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
mHasTimestamp = hasTimestamp;
} }
} }

View File

@ -153,8 +153,12 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
final File contentFile = new File(mDictDirectory, mDictDirectory.getName() final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ FormatSpec.SHORTCUT_CONTENT_ID); + FormatSpec.SHORTCUT_CONTENT_ID);
final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName()
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ FormatSpec.SHORTCUT_CONTENT_ID);
mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); new File[] { contentFile, timestampsFile },
FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
} }
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {

View File

@ -69,16 +69,16 @@ public class Ver4DictEncoder implements DictEncoder {
private final File[] mContentFiles; private final File[] mContentFiles;
protected final OutputStream[] mContentOutStreams; protected final OutputStream[] mContentOutStreams;
public SparseTableContentWriter(final String name, final int contentCount, public SparseTableContentWriter(final String name, final int initialCapacity,
final int initialCapacity, final int blockSize, final File baseDir, final int blockSize, final File baseDir, final String[] contentFilenames,
final String[] contentFilenames, final String[] contentIds) { final String[] contentIds) {
if (contentFilenames.length != contentIds.length) { if (contentFilenames.length != contentIds.length) {
throw new RuntimeException("The length of contentFilenames and the length of" throw new RuntimeException("The length of contentFilenames and the length of"
+ " contentIds are different " + contentFilenames.length + ", " + " contentIds are different " + contentFilenames.length + ", "
+ contentIds.length); + contentIds.length);
} }
mContentCount = contentCount; mContentCount = contentFilenames.length;
mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount); mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
mAddressTableFiles = new File[mContentCount]; mAddressTableFiles = new File[mContentCount];
mContentFiles = new File[mContentCount]; mContentFiles = new File[mContentCount];
@ -113,16 +113,40 @@ public class Ver4DictEncoder implements DictEncoder {
} }
private static class BigramContentWriter extends SparseTableContentWriter { private static class BigramContentWriter extends SparseTableContentWriter {
private final boolean mWriteTimestamp;
public BigramContentWriter(final String name, final int initialCapacity, public BigramContentWriter(final String name, final int initialCapacity,
final File baseDir) { final File baseDir, final boolean writeTimestamp) {
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT, super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }, getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }); mWriteTimestamp = writeTimestamp;
} }
public void writeBigramsForOneWord(final int terminalId, private static String[] getContentFilenames(final String name,
final boolean writeTimestamp) {
final String[] contentFilenames;
if (writeTimestamp) {
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
name + FormatSpec.BIGRAM_FILE_EXTENSION };
} else {
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
}
return contentFilenames;
}
/**
 * Builds the list of content ids for the bigram sparse table.
 *
 * The frequency id always comes first; the timestamp id is appended after it only when
 * timestamps are written.
 *
 * @param writeTimestamp whether the timestamp content is written alongside frequencies.
 * @return a new array holding one id (frequency) or two ids (frequency, then timestamp).
 */
private static String[] getContentIds(final boolean writeTimestamp) {
    if (!writeTimestamp) {
        // Frequency-only layout.
        return new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
    }
    return new String[] {
        FormatSpec.BIGRAM_FREQ_CONTENT_ID,
        FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID
    };
}
public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
final Iterator<WeightedString> bigramIterator, final FusionDictionary dict) final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
throws IOException { throws IOException {
write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
@ -130,8 +154,16 @@ public class Ver4DictEncoder implements DictEncoder {
@Override @Override
public void write(final OutputStream outStream) throws IOException { public void write(final OutputStream outStream) throws IOException {
writeBigramsForOneWordInternal(outStream, bigramIterator, dict); writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
} }});
}); if (mWriteTimestamp) {
write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
new SparseTableContentWriterInterface() {
@Override
public void write(final OutputStream outStream) throws IOException {
initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
bigramCount);
}});
}
} }
private void writeBigramsForOneWordInternal(final OutputStream outStream, private void writeBigramsForOneWordInternal(final OutputStream outStream,
@ -151,13 +183,26 @@ public class Ver4DictEncoder implements DictEncoder {
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
} }
} }
/**
 * Writes zero-valued timestamp, counter and level fields for every bigram of one word.
 *
 * For each of the {@code bigramCount} bigrams, three unsigned integers are written to the
 * stream in this order: timestamp, counter, level. Nothing is written when
 * {@code bigramCount} is not positive.
 *
 * @param outStream the stream receiving the fields.
 * @param bigramCount how many bigram entries to emit fields for.
 * @throws IOException if writing to the stream fails.
 */
private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
        final OutputStream outStream, final int bigramCount) throws IOException {
    // Widths of the per-bigram fields, listed in the order they are written.
    final int[] fieldSizes = {
        FormatSpec.BIGRAM_TIMESTAMP_SIZE,
        FormatSpec.BIGRAM_COUNTER_SIZE,
        FormatSpec.BIGRAM_LEVEL_SIZE
    };
    // TODO: Figure out what initial values should be.
    final int initialValue = 0;
    for (int remaining = bigramCount; remaining > 0; --remaining) {
        for (final int fieldSize : fieldSizes) {
            BinaryDictEncoderUtils.writeUIntToStream(outStream, initialValue, fieldSize);
        }
    }
}
} }
private static class ShortcutContentWriter extends SparseTableContentWriter { private static class ShortcutContentWriter extends SparseTableContentWriter {
public ShortcutContentWriter(final String name, final int initialCapacity, public ShortcutContentWriter(final String name, final int initialCapacity,
final File baseDir) { final File baseDir) {
super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT, super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
new String[] { FormatSpec.SHORTCUT_CONTENT_ID }); new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
} }
@ -257,7 +302,8 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount); writeTerminalData(flatNodes, terminalCount);
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
formatOptions.mHasTimestamp);
writeBigrams(flatNodes, dict); writeBigrams(flatNodes, dict);
mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir); mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
writeShortcuts(flatNodes); writeShortcuts(flatNodes);
@ -348,7 +394,7 @@ public class Ver4DictEncoder implements DictEncoder {
for (final PtNodeArray nodeArray : flatNodes) { for (final PtNodeArray nodeArray : flatNodes) {
for (final PtNode ptNode : nodeArray.mData) { for (final PtNode ptNode : nodeArray.mData) {
if (ptNode.mBigrams != null) { if (ptNode.mBigrams != null) {
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
ptNode.mBigrams.iterator(), dict); ptNode.mBigrams.iterator(), dict);
} }
} }

View File

@ -80,6 +80,9 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
new FormatSpec.FormatOptions(4, false /* supportsDynamicUpdate */); new FormatSpec.FormatOptions(4, false /* supportsDynamicUpdate */);
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE = private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE =
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */); new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */);
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP =
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */,
true /* hasTimestamp */);
private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
@ -363,6 +366,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) { for (final String result : results) {
Log.d(TAG, result); Log.d(TAG, result);
@ -377,6 +381,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE); runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) { for (final String result : results) {
Log.d(TAG, result); Log.d(TAG, result);
@ -508,6 +513,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER,
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) { for (final String result : results) {
Log.d(TAG, result); Log.d(TAG, result);
@ -522,6 +529,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE); runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY,
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) { for (final String result : results) {
Log.d(TAG, result); Log.d(TAG, result);
@ -634,12 +643,14 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION2); runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION2);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE); runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) { for (final String result : results) {
Log.d(TAG, result); Log.d(TAG, result);