Add the new format of bigram entries.
In the new format, each bigram entry has flags (1 byte), a terminal id (3 bytes), a timestamp (4 bytes), a counter (1 byte) and a level (1 byte). Bug: 10920255 Bug: 10920165 Change-Id: I0f7fc125a6178e6d25a07e8462afc41a7f57e3e1
This commit is contained in:
parent
849942950e
commit
9514ed5c2a
5 changed files with 105 additions and 27 deletions
|
@ -60,7 +60,8 @@ public abstract class AbstractDictDecoder implements DictDecoder {
|
|||
0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
|
||||
0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
|
||||
new FormatOptions(version,
|
||||
0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE)));
|
||||
0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE),
|
||||
0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
|
||||
return header;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,13 +37,15 @@ public final class FormatSpec {
|
|||
* sion
|
||||
*
|
||||
* o |
|
||||
* p | not used 4 bits
|
||||
* t | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
|
||||
* i | FRENCH_LIGATURE_PROCESSING_FLAG
|
||||
* o | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
|
||||
* n | GERMAN_UMLAUT_PROCESSING_FLAG
|
||||
* f |
|
||||
* lags
|
||||
* p | not used 3 bits
|
||||
* t | each unigram and bigram entry has a time stamp?
|
||||
* i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG
|
||||
* o | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
|
||||
* n | FRENCH_LIGATURE_PROCESSING_FLAG
|
||||
* f | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
|
||||
* l | GERMAN_UMLAUT_PROCESSING_FLAG
|
||||
* a |
|
||||
* gs
|
||||
*
|
||||
* h |
|
||||
* e | size of the file header, 4bytes
|
||||
|
@ -211,6 +213,8 @@ public final class FormatSpec {
|
|||
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
|
||||
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
||||
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
|
||||
// TODO: Implement timestamps for unigram.
|
||||
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
|
||||
|
||||
// TODO: Make this value adaptive to content data, store it in the header, and
|
||||
// use it in the reading code.
|
||||
|
@ -276,9 +280,14 @@ public final class FormatSpec {
|
|||
// is 584KB with the block size being 4.
|
||||
// This is 91% of that of full address table.
|
||||
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
||||
static final int BIGRAM_CONTENT_COUNT = 1;
|
||||
static final int BIGRAM_CONTENT_COUNT = 2;
|
||||
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
|
||||
static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1;
|
||||
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
|
||||
static final String BIGRAM_TIMESTAMP_CONTENT_ID = "_timestamp";
|
||||
static final int BIGRAM_TIMESTAMP_SIZE = 4;
|
||||
static final int BIGRAM_COUNTER_SIZE = 1;
|
||||
static final int BIGRAM_LEVEL_SIZE = 1;
|
||||
|
||||
static final int SHORTCUT_CONTENT_COUNT = 1;
|
||||
static final int SHORTCUT_CONTENT_INDEX = 0;
|
||||
|
@ -321,6 +330,7 @@ public final class FormatSpec {
|
|||
public final int mVersion;
|
||||
public final boolean mSupportsDynamicUpdate;
|
||||
public final boolean mHasTerminalId;
|
||||
public final boolean mHasTimestamp;
|
||||
@UsedForTesting
|
||||
public FormatOptions(final int version) {
|
||||
this(version, false);
|
||||
|
@ -328,6 +338,11 @@ public final class FormatSpec {
|
|||
|
||||
/**
 * Creates format options that carry no timestamp information.
 *
 * Delegates to the three-argument constructor with hasTimestamp set to false.
 *
 * @param version the binary dictionary format version.
 * @param supportsDynamicUpdate whether the dictionary supports dynamic updates.
 */
@UsedForTesting
|
||||
public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
|
||||
this(version, supportsDynamicUpdate, false /* hasTimestamp */);
|
||||
}
|
||||
|
||||
public FormatOptions(final int version, final boolean supportsDynamicUpdate,
|
||||
final boolean hasTimestamp) {
|
||||
mVersion = version;
|
||||
if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
|
||||
throw new RuntimeException("Dynamic updates are only supported with versions "
|
||||
|
@ -335,6 +350,7 @@ public final class FormatSpec {
|
|||
}
|
||||
mSupportsDynamicUpdate = supportsDynamicUpdate;
|
||||
mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
|
||||
mHasTimestamp = hasTimestamp;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -153,8 +153,12 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
|||
final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
|
||||
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
|
||||
+ FormatSpec.SHORTCUT_CONTENT_ID);
|
||||
final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName()
|
||||
+ FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
|
||||
+ FormatSpec.SHORTCUT_CONTENT_ID);
|
||||
mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
|
||||
new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
|
||||
new File[] { contentFile, timestampsFile },
|
||||
FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
|
||||
|
|
|
@ -69,16 +69,16 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
private final File[] mContentFiles;
|
||||
protected final OutputStream[] mContentOutStreams;
|
||||
|
||||
public SparseTableContentWriter(final String name, final int contentCount,
|
||||
final int initialCapacity, final int blockSize, final File baseDir,
|
||||
final String[] contentFilenames, final String[] contentIds) {
|
||||
public SparseTableContentWriter(final String name, final int initialCapacity,
|
||||
final int blockSize, final File baseDir, final String[] contentFilenames,
|
||||
final String[] contentIds) {
|
||||
if (contentFilenames.length != contentIds.length) {
|
||||
throw new RuntimeException("The length of contentFilenames and the length of"
|
||||
+ " contentIds are different " + contentFilenames.length + ", "
|
||||
+ contentIds.length);
|
||||
}
|
||||
mContentCount = contentCount;
|
||||
mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount);
|
||||
mContentCount = contentFilenames.length;
|
||||
mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
|
||||
mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
|
||||
mAddressTableFiles = new File[mContentCount];
|
||||
mContentFiles = new File[mContentCount];
|
||||
|
@ -113,16 +113,40 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
}
|
||||
|
||||
private static class BigramContentWriter extends SparseTableContentWriter {
|
||||
private final boolean mWriteTimestamp;
|
||||
|
||||
public BigramContentWriter(final String name, final int initialCapacity,
|
||||
final File baseDir) {
|
||||
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT,
|
||||
initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||
new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION },
|
||||
new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID });
|
||||
final File baseDir, final boolean writeTimestamp) {
|
||||
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
|
||||
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||
getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
|
||||
mWriteTimestamp = writeTimestamp;
|
||||
}
|
||||
|
||||
public void writeBigramsForOneWord(final int terminalId,
|
||||
/**
 * Builds the content file base names that the BigramContentWriter constructor passes to its
 * superclass constructor.
 *
 * The array has one entry per bigram content: two entries when timestamps are written,
 * one otherwise. Every entry is the same string, name + FormatSpec.BIGRAM_FILE_EXTENSION.
 * NOTE(review): presumably the matching content id (see getContentIds) is what tells the
 * resulting files apart on disk - confirm in SparseTableContentWriter.
 *
 * @param name the base name to which the bigram file extension is appended.
 * @param writeTimestamp whether an entry for the timestamp content is included.
 * @return the content file base names, in the same order as the content ids.
 */
private static String[] getContentFilenames(final String name,
|
||||
final boolean writeTimestamp) {
|
||||
final String[] contentFilenames;
|
||||
if (writeTimestamp) {
|
||||
// Two contents (frequency and timestamp) share the same base file name.
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
|
||||
name + FormatSpec.BIGRAM_FILE_EXTENSION };
|
||||
} else {
|
||||
// Frequency content only.
contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
|
||||
}
|
||||
return contentFilenames;
|
||||
}
|
||||
|
||||
/**
 * Builds the content ids that the BigramContentWriter constructor passes to its superclass
 * constructor.
 *
 * The frequency id always comes first; the timestamp id follows it only when timestamps
 * are written.
 *
 * @param writeTimestamp whether FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID is included.
 * @return an array holding FormatSpec.BIGRAM_FREQ_CONTENT_ID, followed by
 *         FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID when writeTimestamp is true.
 */
private static String[] getContentIds(final boolean writeTimestamp) {
|
||||
final String[] contentIds;
|
||||
if (writeTimestamp) {
|
||||
contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
|
||||
FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
|
||||
} else {
|
||||
contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
|
||||
}
|
||||
return contentIds;
|
||||
}
|
||||
|
||||
public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
|
||||
final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
|
||||
throws IOException {
|
||||
write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
|
||||
|
@ -130,8 +154,16 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
@Override
|
||||
public void write(final OutputStream outStream) throws IOException {
|
||||
writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
|
||||
}
|
||||
});
|
||||
}});
|
||||
if (mWriteTimestamp) {
|
||||
write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
|
||||
new SparseTableContentWriterInterface() {
|
||||
@Override
|
||||
public void write(final OutputStream outStream) throws IOException {
|
||||
initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
|
||||
bigramCount);
|
||||
}});
|
||||
}
|
||||
}
|
||||
|
||||
private void writeBigramsForOneWordInternal(final OutputStream outStream,
|
||||
|
@ -151,13 +183,26 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes zero-initialized timestamp, counter and level fields for every bigram of one word.
 *
 * For each of the bigramCount entries, three zero values are written in order: a timestamp
 * of FormatSpec.BIGRAM_TIMESTAMP_SIZE bytes, a counter of FormatSpec.BIGRAM_COUNTER_SIZE
 * bytes and a level of FormatSpec.BIGRAM_LEVEL_SIZE bytes.
 *
 * @param outStream the stream that receives the fields.
 * @param bigramCount the number of bigram entries to emit fields for.
 * @throws IOException declared for the stream writes; propagated to the caller.
 */
private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
|
||||
final OutputStream outStream, final int bigramCount) throws IOException {
|
||||
for (int i = 0; i < bigramCount; ++i) {
|
||||
// TODO: Figure out what initial values should be.
|
||||
// Timestamp field.
BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
|
||||
FormatSpec.BIGRAM_TIMESTAMP_SIZE);
|
||||
// Counter field.
BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
|
||||
FormatSpec.BIGRAM_COUNTER_SIZE);
|
||||
// Level field.
BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
|
||||
FormatSpec.BIGRAM_LEVEL_SIZE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class ShortcutContentWriter extends SparseTableContentWriter {
|
||||
public ShortcutContentWriter(final String name, final int initialCapacity,
|
||||
final File baseDir) {
|
||||
super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT,
|
||||
initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||
super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
|
||||
FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||
new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
|
||||
new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
|
||||
}
|
||||
|
@ -257,7 +302,8 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
||||
|
||||
writeTerminalData(flatNodes, terminalCount);
|
||||
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
|
||||
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
|
||||
formatOptions.mHasTimestamp);
|
||||
writeBigrams(flatNodes, dict);
|
||||
mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
|
||||
writeShortcuts(flatNodes);
|
||||
|
@ -348,7 +394,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
for (final PtNodeArray nodeArray : flatNodes) {
|
||||
for (final PtNode ptNode : nodeArray.mData) {
|
||||
if (ptNode.mBigrams != null) {
|
||||
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId,
|
||||
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
|
||||
ptNode.mBigrams.iterator(), dict);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -80,6 +80,9 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
new FormatSpec.FormatOptions(4, false /* supportsDynamicUpdate */);
|
||||
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE =
|
||||
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */);
|
||||
private static final FormatSpec.FormatOptions VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP =
|
||||
new FormatSpec.FormatOptions(4, true /* supportsDynamicUpdate */,
|
||||
true /* hasTimestamp */);
|
||||
|
||||
private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
|
||||
|
||||
|
@ -363,6 +366,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
for (final String result : results) {
|
||||
Log.d(TAG, result);
|
||||
|
@ -377,6 +381,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runReadAndWriteTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
for (final String result : results) {
|
||||
Log.d(TAG, result);
|
||||
|
@ -508,6 +513,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_BUFFER,
|
||||
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
for (final String result : results) {
|
||||
Log.d(TAG, result);
|
||||
|
@ -522,6 +529,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runReadUnigramsAndBigramsTests(results, USE_BYTE_ARRAY,
|
||||
VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
for (final String result : results) {
|
||||
Log.d(TAG, result);
|
||||
|
@ -634,12 +643,14 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_ARRAY, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION2);
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITHOUT_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION3_WITH_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITHOUT_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE);
|
||||
runGetTerminalPositionTests(USE_BYTE_BUFFER, VERSION4_WITH_DYNAMIC_UPDATE_AND_TIMESTAMP);
|
||||
|
||||
for (final String result : results) {
|
||||
Log.d(TAG, result);
|
||||
|
|
Loading…
Reference in a new issue