Add the new format of bigram entries.

In the new format, each bigram entry has flags (1 byte), a terminal id (3 bytes),
a timestamp (4 bytes), a counter (1 byte) and a level (1 byte).

Bug: 10920255
Bug: 10920165
Change-Id: I0f7fc125a6178e6d25a07e8462afc41a7f57e3e1
main
Yuichiro Hanada 2013-10-03 17:29:14 +09:00
parent 849942950e
commit 9514ed5c2a
5 changed files with 105 additions and 27 deletions

View File

@ -60,7 +60,8 @@ public abstract class AbstractDictDecoder implements DictDecoder {
0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
new FormatOptions(version,
0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE)));
0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE),
0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
return header;
}

View File

@ -37,13 +37,15 @@ public final class FormatSpec {
* sion
*
* o |
* p | not used 4 bits
* t | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
* i | FRENCH_LIGATURE_PROCESSING_FLAG
* o | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
* n | GERMAN_UMLAUT_PROCESSING_FLAG
* f |
* lags
* p | not used 3 bits
* t | each unigram and bigram entry has a time stamp?
* i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG
* o | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
* n | FRENCH_LIGATURE_PROCESSING_FLAG
* f | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
* l | GERMAN_UMLAUT_PROCESSING_FLAG
* a |
* gs
*
* h |
* e | size of the file header, 4bytes
@ -211,6 +213,8 @@ public final class FormatSpec {
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
// TODO: Implement timestamps for unigram.
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
// TODO: Make this value adaptive to content data, store it in the header, and
// use it in the reading code.
@ -276,9 +280,14 @@ public final class FormatSpec {
// is 584KB with the block size being 4.
// This is 91% of that of full address table.
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
static final int BIGRAM_CONTENT_COUNT = 1;
static final int BIGRAM_CONTENT_COUNT = 2;
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1;
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
static final String BIGRAM_TIMESTAMP_CONTENT_ID = "_timestamp";
static final int BIGRAM_TIMESTAMP_SIZE = 4;
static final int BIGRAM_COUNTER_SIZE = 1;
static final int BIGRAM_LEVEL_SIZE = 1;
static final int SHORTCUT_CONTENT_COUNT = 1;
static final int SHORTCUT_CONTENT_INDEX = 0;
@ -321,6 +330,7 @@ public final class FormatSpec {
public final int mVersion;
public final boolean mSupportsDynamicUpdate;
public final boolean mHasTerminalId;
public final boolean mHasTimestamp;
@UsedForTesting
public FormatOptions(final int version) {
this(version, false);
@ -328,6 +338,11 @@ public final class FormatSpec {
/**
 * Creates format options without timestamps by delegating to the three-argument
 * constructor with {@code hasTimestamp} set to {@code false}.
 *
 * @param version the format version to use
 * @param supportsDynamicUpdate whether dynamic updates are supported
 */
@UsedForTesting
public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
this(version, supportsDynamicUpdate, false /* hasTimestamp */);
}
public FormatOptions(final int version, final boolean supportsDynamicUpdate,
final boolean hasTimestamp) {
mVersion = version;
if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
throw new RuntimeException("Dynamic updates are only supported with versions "
@ -335,6 +350,7 @@ public final class FormatSpec {
}
mSupportsDynamicUpdate = supportsDynamicUpdate;
mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
mHasTimestamp = hasTimestamp;
}
}

View File

@ -153,8 +153,12 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ FormatSpec.SHORTCUT_CONTENT_ID);
final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName()
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ FormatSpec.SHORTCUT_CONTENT_ID);
mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
new File[] { contentFile, timestampsFile },
FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
}
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {

View File

@ -69,16 +69,16 @@ public class Ver4DictEncoder implements DictEncoder {
private final File[] mContentFiles;
protected final OutputStream[] mContentOutStreams;
public SparseTableContentWriter(final String name, final int contentCount,
final int initialCapacity, final int blockSize, final File baseDir,
final String[] contentFilenames, final String[] contentIds) {
public SparseTableContentWriter(final String name, final int initialCapacity,
final int blockSize, final File baseDir, final String[] contentFilenames,
final String[] contentIds) {
if (contentFilenames.length != contentIds.length) {
throw new RuntimeException("The length of contentFilenames and the length of"
+ " contentIds are different " + contentFilenames.length + ", "
+ contentIds.length);
}
mContentCount = contentCount;
mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount);
mContentCount = contentFilenames.length;
mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
mAddressTableFiles = new File[mContentCount];
mContentFiles = new File[mContentCount];
@ -113,16 +113,40 @@ public class Ver4DictEncoder implements DictEncoder {
}
private static class BigramContentWriter extends SparseTableContentWriter {
private final boolean mWriteTimestamp;
public BigramContentWriter(final String name, final int initialCapacity,
final File baseDir) {
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT,
initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION },
new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID });
final File baseDir, final boolean writeTimestamp) {
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
mWriteTimestamp = writeTimestamp;
}
public void writeBigramsForOneWord(final int terminalId,
/**
 * Builds the base filenames handed to the sparse table writer for bigram content.
 *
 * Every entry is {@code name + FormatSpec.BIGRAM_FILE_EXTENSION}; the array holds one
 * entry when timestamps are not written and two entries when they are.
 * NOTE(review): the two entries are identical, so presumably the content ids are what
 * tell the resulting files apart -- confirm against SparseTableContentWriter.
 *
 * @param name the base name of the dictionary files
 * @param writeTimestamp whether a second content table is written for timestamps
 * @return the base filenames, one per content table
 */
private static String[] getContentFilenames(final String name,
        final boolean writeTimestamp) {
    final String bigramFilename = name + FormatSpec.BIGRAM_FILE_EXTENSION;
    return writeTimestamp
            ? new String[] { bigramFilename, bigramFilename }
            : new String[] { bigramFilename };
}
/**
 * Builds the content ids handed to the sparse table writer for bigram content.
 *
 * The frequency id always comes first; the timestamp id follows it only when
 * timestamps are written.
 *
 * @param writeTimestamp whether a second content table is written for timestamps
 * @return the content ids, one per content table
 */
private static String[] getContentIds(final boolean writeTimestamp) {
    if (!writeTimestamp) {
        return new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
    }
    return new String[] {
            FormatSpec.BIGRAM_FREQ_CONTENT_ID,
            FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
}
public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
throws IOException {
write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
@ -130,8 +154,16 @@ public class Ver4DictEncoder implements DictEncoder {
@Override
public void write(final OutputStream outStream) throws IOException {
writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
}
});
}});
if (mWriteTimestamp) {
write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
new SparseTableContentWriterInterface() {
@Override
public void write(final OutputStream outStream) throws IOException {
initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
bigramCount);
}});
}
}
private void writeBigramsForOneWordInternal(final OutputStream outStream,
@ -151,13 +183,26 @@ public class Ver4DictEncoder implements DictEncoder {
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
}
}
/**
 * Writes zero-initialized timestamp, counter and level fields for each bigram of a word.
 *
 * For every bigram, a timestamp of {@code FormatSpec.BIGRAM_TIMESTAMP_SIZE} bytes, a
 * counter of {@code FormatSpec.BIGRAM_COUNTER_SIZE} bytes and a level of
 * {@code FormatSpec.BIGRAM_LEVEL_SIZE} bytes are written, in that order, all with value 0.
 *
 * @param outStream the stream to write the fields to
 * @param bigramCount the number of bigrams to write fields for
 * @throws IOException if writing to the stream fails
 */
private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
        final OutputStream outStream, final int bigramCount) throws IOException {
    // TODO: Figure out what initial values should be.
    final int initialValue = 0;
    for (int remaining = bigramCount; remaining > 0; --remaining) {
        BinaryDictEncoderUtils.writeUIntToStream(outStream, initialValue,
                FormatSpec.BIGRAM_TIMESTAMP_SIZE);
        BinaryDictEncoderUtils.writeUIntToStream(outStream, initialValue,
                FormatSpec.BIGRAM_COUNTER_SIZE);
        BinaryDictEncoderUtils.writeUIntToStream(outStream, initialValue,
                FormatSpec.BIGRAM_LEVEL_SIZE);
    }
}
}
private static class ShortcutContentWriter extends SparseTableContentWriter {
public ShortcutContentWriter(final String name, final int initialCapacity,
final File baseDir) {
super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT,
initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
}
@ -257,7 +302,8 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount);
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
formatOptions.mHasTimestamp);
writeBigrams(flatNodes, dict);
mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
writeShortcuts(flatNodes);
@ -348,7 +394,7 @@ public class Ver4DictEncoder implements DictEncoder {
for (final PtNodeArray nodeArray : flatNodes) {
for (final PtNode ptNode : nodeArray.mData) {
if (ptNode.mBigrams != null) {
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId,
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
ptNode.mBigrams.iterator(), dict);
}
}

View File

@ -80,6 +80,9 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
new FormatSpec.FormatOptions(4, false /* supportsDynamicUpdate */);
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE =
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */);
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP =
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */,
true /* hasTimestamp */);
private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
@ -363,6 +366,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) {
Log.d(TAG, result);
@ -377,6 +381,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) {
Log.d(TAG, result);
@ -508,6 +513,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER,
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) {
Log.d(TAG, result);
@ -522,6 +529,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY,
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) {
Log.d(TAG, result);
@ -634,12 +643,14 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION2);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
for (final String result : results) {
Log.d(TAG, result);