Merge "Add a time stamp for unigrams."

This commit is contained in:
Ken Wakasa 2013-10-11 10:17:00 +00:00 committed by Android (Google) Code Review
commit 7abdab1c6f
2 changed files with 21 additions and 4 deletions

View file

@ -213,7 +213,6 @@ public final class FormatSpec {
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2; static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8; static final int CONTAINS_BIGRAMS_FLAG = 0x8;
// TODO: Implement timestamps for unigram.
static final int CONTAINS_TIMESTAMP_FLAG = 0x10; static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
// TODO: Make this value adaptive to content data, store it in the header, and // TODO: Make this value adaptive to content data, store it in the header, and
@ -267,6 +266,7 @@ public final class FormatSpec {
// These values are used only by version 4 or later. // These values are used only by version 4 or later.
static final String TRIE_FILE_EXTENSION = ".trie"; static final String TRIE_FILE_EXTENSION = ".trie";
static final String FREQ_FILE_EXTENSION = ".freq"; static final String FREQ_FILE_EXTENSION = ".freq";
static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp";
// tat = Terminal Address Table // tat = Terminal Address Table
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String BIGRAM_FILE_EXTENSION = ".bigram";
@ -275,6 +275,7 @@ public final class FormatSpec {
static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int FREQUENCY_AND_FLAGS_SIZE = 2;
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
// With the English main dictionary as of October 2013, the size of bigram address table is // With the English main dictionary as of October 2013, the size of bigram address table is
// 584KB with the block size being 4. // 584KB with the block size being 4.

View file

@ -45,6 +45,7 @@ public class Ver4DictEncoder implements DictEncoder {
private int mHeaderSize; private int mHeaderSize;
private OutputStream mTrieOutStream; private OutputStream mTrieOutStream;
private OutputStream mFreqOutStream; private OutputStream mFreqOutStream;
private OutputStream mUnigramTimestampOutStream;
private OutputStream mTerminalAddressTableOutStream; private OutputStream mTerminalAddressTableOutStream;
private File mDictDir; private File mDictDir;
private String mBaseFilename; private String mBaseFilename;
@ -238,18 +239,20 @@ public class Ver4DictEncoder implements DictEncoder {
mDictDir = new File(mDictPlacedDir, mBaseFilename); mDictDir = new File(mDictPlacedDir, mBaseFilename);
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION); final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File timestampFile = new File(mDictDir,
mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir, final File terminalAddressTableFile = new File(mDictDir,
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
if (!mDictDir.isDirectory()) { if (!mDictDir.isDirectory()) {
if (mDictDir.exists()) mDictDir.delete(); if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs(); mDictDir.mkdirs();
} }
if (!trieFile.exists()) trieFile.createNewFile();
if (!freqFile.exists()) freqFile.createNewFile();
if (!terminalAddressTableFile.exists()) terminalAddressTableFile.createNewFile();
mTrieOutStream = new FileOutputStream(trieFile); mTrieOutStream = new FileOutputStream(trieFile);
mFreqOutStream = new FileOutputStream(freqFile); mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
if (formatOptions.mHasTimestamp) {
mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
}
} }
private void close() throws IOException { private void close() throws IOException {
@ -263,6 +266,9 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTerminalAddressTableOutStream != null) { if (mTerminalAddressTableOutStream != null) {
mTerminalAddressTableOutStream.close(); mTerminalAddressTableOutStream.close();
} }
if (mUnigramTimestampOutStream != null) {
mUnigramTimestampOutStream.close();
}
} finally { } finally {
mTrieOutStream = null; mTrieOutStream = null;
mFreqOutStream = null; mFreqOutStream = null;
@ -302,6 +308,9 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount); writeTerminalData(flatNodes, terminalCount);
if (formatOptions.mHasTimestamp) {
initUnigramTimestamps(terminalCount);
}
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir, mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
formatOptions.mHasTimestamp); formatOptions.mHasTimestamp);
writeBigrams(flatNodes, dict); writeBigrams(flatNodes, dict);
@ -454,4 +463,11 @@ public class Ver4DictEncoder implements DictEncoder {
mFreqOutStream.write(freqBuf); mFreqOutStream.write(freqBuf);
mTerminalAddressTableOutStream.write(terminalAddressTableBuf); mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
} }
/**
 * Writes the initial unigram time stamp table to the unigram time stamp output stream.
 *
 * The table holds FormatSpec.UNIGRAM_TIMESTAMP_SIZE bytes per terminal, all of them zero.
 *
 * @param terminalCount the number of terminals to write a time stamp slot for.
 * @throws IOException if writing to the time stamp stream fails.
 */
private void initUnigramTimestamps(final int terminalCount) throws IOException {
    final int tableSize = terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE;
    // A newly allocated byte array is zero-filled, so every time stamp starts out as 0.
    final byte[] zeroedTimestamps = new byte[tableSize];
    mUnigramTimestampOutStream.write(zeroedTimestamps);
}
} }