Merge "Add a time stamp for unigrams."
This commit is contained in:
commit
7abdab1c6f
2 changed files with 21 additions and 4 deletions
|
@ -213,7 +213,6 @@ public final class FormatSpec {
|
||||||
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
|
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
|
||||||
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
||||||
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
|
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
|
||||||
// TODO: Implement timestamps for unigram.
|
|
||||||
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
|
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
|
||||||
|
|
||||||
// TODO: Make this value adaptive to content data, store it in the header, and
|
// TODO: Make this value adaptive to content data, store it in the header, and
|
||||||
|
@ -267,6 +266,7 @@ public final class FormatSpec {
|
||||||
// These values are used only by version 4 or later.
|
// These values are used only by version 4 or later.
|
||||||
static final String TRIE_FILE_EXTENSION = ".trie";
|
static final String TRIE_FILE_EXTENSION = ".trie";
|
||||||
static final String FREQ_FILE_EXTENSION = ".freq";
|
static final String FREQ_FILE_EXTENSION = ".freq";
|
||||||
|
static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp";
|
||||||
// tat = Terminal Address Table
|
// tat = Terminal Address Table
|
||||||
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
||||||
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
||||||
|
@ -275,6 +275,7 @@ public final class FormatSpec {
|
||||||
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
|
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
|
||||||
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
||||||
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
||||||
|
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
|
||||||
|
|
||||||
// With the English main dictionary as of October 2013, the size of the bigram address table
|
// With the English main dictionary as of October 2013, the size of the bigram address table
|
||||||
// is 584KB with the block size being 4.
|
// is 584KB with the block size being 4.
|
||||||
|
|
|
@ -45,6 +45,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
private int mHeaderSize;
|
private int mHeaderSize;
|
||||||
private OutputStream mTrieOutStream;
|
private OutputStream mTrieOutStream;
|
||||||
private OutputStream mFreqOutStream;
|
private OutputStream mFreqOutStream;
|
||||||
|
private OutputStream mUnigramTimestampOutStream;
|
||||||
private OutputStream mTerminalAddressTableOutStream;
|
private OutputStream mTerminalAddressTableOutStream;
|
||||||
private File mDictDir;
|
private File mDictDir;
|
||||||
private String mBaseFilename;
|
private String mBaseFilename;
|
||||||
|
@ -238,18 +239,20 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
mDictDir = new File(mDictPlacedDir, mBaseFilename);
|
mDictDir = new File(mDictPlacedDir, mBaseFilename);
|
||||||
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
|
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
|
||||||
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
|
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
|
||||||
|
final File timestampFile = new File(mDictDir,
|
||||||
|
mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
|
||||||
final File terminalAddressTableFile = new File(mDictDir,
|
final File terminalAddressTableFile = new File(mDictDir,
|
||||||
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
||||||
if (!mDictDir.isDirectory()) {
|
if (!mDictDir.isDirectory()) {
|
||||||
if (mDictDir.exists()) mDictDir.delete();
|
if (mDictDir.exists()) mDictDir.delete();
|
||||||
mDictDir.mkdirs();
|
mDictDir.mkdirs();
|
||||||
}
|
}
|
||||||
if (!trieFile.exists()) trieFile.createNewFile();
|
|
||||||
if (!freqFile.exists()) freqFile.createNewFile();
|
|
||||||
if (!terminalAddressTableFile.exists()) terminalAddressTableFile.createNewFile();
|
|
||||||
mTrieOutStream = new FileOutputStream(trieFile);
|
mTrieOutStream = new FileOutputStream(trieFile);
|
||||||
mFreqOutStream = new FileOutputStream(freqFile);
|
mFreqOutStream = new FileOutputStream(freqFile);
|
||||||
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
|
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
|
||||||
|
if (formatOptions.mHasTimestamp) {
|
||||||
|
mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void close() throws IOException {
|
private void close() throws IOException {
|
||||||
|
@ -263,6 +266,9 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
if (mTerminalAddressTableOutStream != null) {
|
if (mTerminalAddressTableOutStream != null) {
|
||||||
mTerminalAddressTableOutStream.close();
|
mTerminalAddressTableOutStream.close();
|
||||||
}
|
}
|
||||||
|
if (mUnigramTimestampOutStream != null) {
|
||||||
|
mUnigramTimestampOutStream.close();
|
||||||
|
}
|
||||||
} finally {
|
} finally {
|
||||||
mTrieOutStream = null;
|
mTrieOutStream = null;
|
||||||
mFreqOutStream = null;
|
mFreqOutStream = null;
|
||||||
|
@ -302,6 +308,9 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
||||||
|
|
||||||
writeTerminalData(flatNodes, terminalCount);
|
writeTerminalData(flatNodes, terminalCount);
|
||||||
|
if (formatOptions.mHasTimestamp) {
|
||||||
|
initUnigramTimestamps(terminalCount);
|
||||||
|
}
|
||||||
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
|
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
|
||||||
formatOptions.mHasTimestamp);
|
formatOptions.mHasTimestamp);
|
||||||
writeBigrams(flatNodes, dict);
|
writeBigrams(flatNodes, dict);
|
||||||
|
@ -454,4 +463,11 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
mFreqOutStream.write(freqBuf);
|
mFreqOutStream.write(freqBuf);
|
||||||
mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
|
mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void initUnigramTimestamps(final int terminalCount) throws IOException {
|
||||||
|
// The initial timestamp of every word is 0; a newly allocated byte array is zero-filled.
|
||||||
|
final byte[] unigramTimestampBuf =
|
||||||
|
new byte[terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE];
|
||||||
|
mUnigramTimestampOutStream.write(unigramTimestampBuf);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue