Add a time stamp for unigrams.

Bug: 10920255
Change-Id: I26d2cce3c322a4ff39a614f8615f43fb7bd3baed
main
Yuichiro Hanada 2013-10-04 17:38:02 +09:00
parent 9514ed5c2a
commit c32962b8f1
2 changed files with 21 additions and 4 deletions

View File

@ -213,7 +213,6 @@ public final class FormatSpec {
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
// TODO: Implement timestamps for unigram.
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
// TODO: Make this value adaptive to content data, store it in the header, and
@ -267,6 +266,7 @@ public final class FormatSpec {
// These values are used only by version 4 or later.
static final String TRIE_FILE_EXTENSION = ".trie";
static final String FREQ_FILE_EXTENSION = ".freq";
static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp";
// tat = Terminal Address Table
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
static final String BIGRAM_FILE_EXTENSION = ".bigram";
@ -275,6 +275,7 @@ public final class FormatSpec {
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
// With the English main dictionary as of October 2013, the size of bigram address table
// is 584KB with the block size being 4.

View File

@ -45,6 +45,7 @@ public class Ver4DictEncoder implements DictEncoder {
private int mHeaderSize;
private OutputStream mTrieOutStream;
private OutputStream mFreqOutStream;
private OutputStream mUnigramTimestampOutStream;
private OutputStream mTerminalAddressTableOutStream;
private File mDictDir;
private String mBaseFilename;
@ -238,18 +239,20 @@ public class Ver4DictEncoder implements DictEncoder {
mDictDir = new File(mDictPlacedDir, mBaseFilename);
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File timestampFile = new File(mDictDir,
mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir,
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
if (!mDictDir.isDirectory()) {
if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs();
}
if (!trieFile.exists()) trieFile.createNewFile();
if (!freqFile.exists()) freqFile.createNewFile();
if (!terminalAddressTableFile.exists()) terminalAddressTableFile.createNewFile();
mTrieOutStream = new FileOutputStream(trieFile);
mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
if (formatOptions.mHasTimestamp) {
mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
}
}
private void close() throws IOException {
@ -263,6 +266,9 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTerminalAddressTableOutStream != null) {
mTerminalAddressTableOutStream.close();
}
if (mUnigramTimestampOutStream != null) {
mUnigramTimestampOutStream.close();
}
} finally {
mTrieOutStream = null;
mFreqOutStream = null;
@ -302,6 +308,9 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount);
if (formatOptions.mHasTimestamp) {
initUnigramTimestamps(terminalCount);
}
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
formatOptions.mHasTimestamp);
writeBigrams(flatNodes, dict);
@ -454,4 +463,11 @@ public class Ver4DictEncoder implements DictEncoder {
mFreqOutStream.write(freqBuf);
mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
}
/**
 * Emits the initial unigram time stamp table to the time stamp output stream.
 *
 * The table holds {@code FormatSpec.UNIGRAM_TIMESTAMP_SIZE} bytes per terminal, and every
 * time stamp starts out as 0.
 *
 * @param terminalCount the number of terminals to reserve a time stamp slot for.
 * @throws IOException if writing to the time stamp stream fails.
 */
private void initUnigramTimestamps(final int terminalCount) throws IOException {
    final int tableSizeInBytes = terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE;
    // A freshly allocated byte array is zero-filled, which is exactly the initial value
    // wanted for every time stamp, so no explicit fill is needed.
    final byte[] zeroedTimestamps = new byte[tableSizeInBytes];
    mUnigramTimestampOutStream.write(zeroedTimestamps);
}
}