* commit 'be7db63ad91f3f1a6e78d55993adcd5c055fddea': Add SparseTableContentWriter to Ver4DictEncoder.
This commit is contained in:
commit
98d7c6cde8
3 changed files with 119 additions and 52 deletions
|
@ -266,11 +266,14 @@ public final class FormatSpec {
|
||||||
// tat = Terminal Address Table
|
// tat = Terminal Address Table
|
||||||
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
|
||||||
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
static final String BIGRAM_FILE_EXTENSION = ".bigram";
|
||||||
static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
|
static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
|
||||||
static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index";
|
static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
|
||||||
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
|
||||||
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
||||||
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
||||||
|
static final int BIGRAM_CONTENT_COUNT = 1;
|
||||||
|
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
|
||||||
|
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
|
||||||
|
|
||||||
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
|
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
|
||||||
static final int NO_PARENT_ADDRESS = 0;
|
static final int NO_PARENT_ADDRESS = 0;
|
||||||
|
|
|
@ -42,7 +42,7 @@ public class Ver4DictDecoder extends DictDecoder {
|
||||||
private static final int FILETYPE_TRIE = 1;
|
private static final int FILETYPE_TRIE = 1;
|
||||||
private static final int FILETYPE_FREQUENCY = 2;
|
private static final int FILETYPE_FREQUENCY = 2;
|
||||||
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
|
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
|
||||||
private static final int FILETYPE_BIGRAM = 4;
|
private static final int FILETYPE_BIGRAM_FREQ = 4;
|
||||||
|
|
||||||
private final File mDictDirectory;
|
private final File mDictDirectory;
|
||||||
private final DictionaryBufferFactory mBufferFactory;
|
private final DictionaryBufferFactory mBufferFactory;
|
||||||
|
@ -85,9 +85,10 @@ public class Ver4DictDecoder extends DictDecoder {
|
||||||
} else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
|
} else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
|
||||||
return new File(mDictDirectory,
|
return new File(mDictDirectory,
|
||||||
mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
||||||
} else if (fileType == FILETYPE_BIGRAM) {
|
} else if (fileType == FILETYPE_BIGRAM_FREQ) {
|
||||||
return new File(mDictDirectory,
|
return new File(mDictDirectory,
|
||||||
mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION);
|
mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
|
||||||
|
+ FormatSpec.BIGRAM_FREQ_CONTENT_ID);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("Unsupported kind of file : " + fileType);
|
throw new RuntimeException("Unsupported kind of file : " + fileType);
|
||||||
}
|
}
|
||||||
|
@ -99,7 +100,7 @@ public class Ver4DictDecoder extends DictDecoder {
|
||||||
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
|
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
|
||||||
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
|
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
|
||||||
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
|
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
|
||||||
mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM));
|
mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
|
||||||
loadBigramAddressSparseTable();
|
loadBigramAddressSparseTable();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,11 +127,12 @@ public class Ver4DictDecoder extends DictDecoder {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void loadBigramAddressSparseTable() throws IOException {
|
private void loadBigramAddressSparseTable() throws IOException {
|
||||||
final File lookupIndexFile = new File(mDictDirectory,
|
final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
|
||||||
mDictDirectory.getName() + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
|
+ FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
|
||||||
final File contentFile = new File(mDictDirectory,
|
final File freqsFile = new File(mDictDirectory, mDictDirectory.getName()
|
||||||
mDictDirectory.getName() + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
|
+ FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
|
||||||
mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile },
|
+ FormatSpec.BIGRAM_FREQ_CONTENT_ID);
|
||||||
|
mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile },
|
||||||
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
|
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
@ -44,19 +43,115 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
private byte[] mTrieBuf;
|
private byte[] mTrieBuf;
|
||||||
private int mTriePos;
|
private int mTriePos;
|
||||||
private int mHeaderSize;
|
private int mHeaderSize;
|
||||||
private SparseTable mBigramAddressTable;
|
|
||||||
private OutputStream mTrieOutStream;
|
private OutputStream mTrieOutStream;
|
||||||
private OutputStream mFreqOutStream;
|
private OutputStream mFreqOutStream;
|
||||||
private OutputStream mTerminalAddressTableOutStream;
|
private OutputStream mTerminalAddressTableOutStream;
|
||||||
private OutputStream mBigramOutStream;
|
|
||||||
private File mDictDir;
|
private File mDictDir;
|
||||||
private String mBaseFilename;
|
private String mBaseFilename;
|
||||||
|
private BigramContentWriter mBigramWriter;
|
||||||
|
|
||||||
@UsedForTesting
|
@UsedForTesting
|
||||||
public Ver4DictEncoder(final File dictPlacedDir) {
|
public Ver4DictEncoder(final File dictPlacedDir) {
|
||||||
mDictPlacedDir = dictPlacedDir;
|
mDictPlacedDir = dictPlacedDir;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private interface SparseTableContentWriterInterface {
|
||||||
|
public void write(final OutputStream outStream) throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class SparseTableContentWriter {
|
||||||
|
private final int mContentCount;
|
||||||
|
private final SparseTable mSparseTable;
|
||||||
|
private final File mLookupTableFile;
|
||||||
|
protected final File mBaseDir;
|
||||||
|
private final File[] mAddressTableFiles;
|
||||||
|
private final File[] mContentFiles;
|
||||||
|
protected final OutputStream[] mContentOutStreams;
|
||||||
|
|
||||||
|
public SparseTableContentWriter(final String name, final int contentCount,
|
||||||
|
final int initialCapacity, final int blockSize, final File baseDir,
|
||||||
|
final String[] contentFilenames, final String[] contentIds) {
|
||||||
|
if (contentFilenames.length != contentIds.length) {
|
||||||
|
throw new RuntimeException("The length of contentFilenames and the length of"
|
||||||
|
+ " contentIds are different " + contentFilenames.length + ", "
|
||||||
|
+ contentIds.length);
|
||||||
|
}
|
||||||
|
mContentCount = contentCount;
|
||||||
|
mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount);
|
||||||
|
mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
|
||||||
|
mAddressTableFiles = new File[mContentCount];
|
||||||
|
mContentFiles = new File[mContentCount];
|
||||||
|
mBaseDir = baseDir;
|
||||||
|
for (int i = 0; i < mContentCount; ++i) {
|
||||||
|
mAddressTableFiles[i] = new File(mBaseDir,
|
||||||
|
name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
|
||||||
|
mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
|
||||||
|
}
|
||||||
|
mContentOutStreams = new OutputStream[mContentCount];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void openStreams() throws FileNotFoundException {
|
||||||
|
for (int i = 0; i < mContentCount; ++i) {
|
||||||
|
mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void write(final int contentIndex, final int index,
|
||||||
|
final SparseTableContentWriterInterface writer) throws IOException {
|
||||||
|
mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
|
||||||
|
writer.write(mContentOutStreams[contentIndex]);
|
||||||
|
mContentOutStreams[contentIndex].flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void closeStreams() throws IOException {
|
||||||
|
mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
|
||||||
|
for (int i = 0; i < mContentCount; ++i) {
|
||||||
|
mContentOutStreams[i].close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class BigramContentWriter extends SparseTableContentWriter {
|
||||||
|
|
||||||
|
public BigramContentWriter(final String name, final int initialCapacity,
|
||||||
|
final File baseDir) {
|
||||||
|
super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT,
|
||||||
|
initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
|
||||||
|
new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION },
|
||||||
|
new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeBigramsForOneWord(final int terminalId,
|
||||||
|
final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
|
||||||
|
throws IOException {
|
||||||
|
write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
|
||||||
|
new SparseTableContentWriterInterface() {
|
||||||
|
@Override
|
||||||
|
public void write(final OutputStream outStream) throws IOException {
|
||||||
|
writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeBigramsForOneWordInternal(final OutputStream outStream,
|
||||||
|
final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
|
||||||
|
throws IOException {
|
||||||
|
while (bigramIterator.hasNext()) {
|
||||||
|
final WeightedString bigram = bigramIterator.next();
|
||||||
|
final PtNode target =
|
||||||
|
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
|
||||||
|
final int unigramFrequencyForThisWord = target.mFrequency;
|
||||||
|
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
|
||||||
|
bigramIterator.hasNext(), 0, bigram.mFrequency,
|
||||||
|
unigramFrequencyForThisWord, bigram.mWord);
|
||||||
|
BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags,
|
||||||
|
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||||
|
BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId,
|
||||||
|
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
|
private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
|
||||||
throws FileNotFoundException, IOException {
|
throws FileNotFoundException, IOException {
|
||||||
final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
|
final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
|
||||||
|
@ -66,8 +161,6 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
|
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
|
||||||
final File terminalAddressTableFile = new File(mDictDir,
|
final File terminalAddressTableFile = new File(mDictDir,
|
||||||
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
|
||||||
final File bigramFile = new File(mDictDir,
|
|
||||||
mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION);
|
|
||||||
if (!mDictDir.isDirectory()) {
|
if (!mDictDir.isDirectory()) {
|
||||||
if (mDictDir.exists()) mDictDir.delete();
|
if (mDictDir.exists()) mDictDir.delete();
|
||||||
mDictDir.mkdirs();
|
mDictDir.mkdirs();
|
||||||
|
@ -78,7 +171,6 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
mTrieOutStream = new FileOutputStream(trieFile);
|
mTrieOutStream = new FileOutputStream(trieFile);
|
||||||
mFreqOutStream = new FileOutputStream(freqFile);
|
mFreqOutStream = new FileOutputStream(freqFile);
|
||||||
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
|
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
|
||||||
mBigramOutStream = new FileOutputStream(bigramFile);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void close() throws IOException {
|
private void close() throws IOException {
|
||||||
|
@ -92,14 +184,10 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
if (mTerminalAddressTableOutStream != null) {
|
if (mTerminalAddressTableOutStream != null) {
|
||||||
mTerminalAddressTableOutStream.close();
|
mTerminalAddressTableOutStream.close();
|
||||||
}
|
}
|
||||||
if (mBigramOutStream != null) {
|
|
||||||
mBigramOutStream.close();
|
|
||||||
}
|
|
||||||
} finally {
|
} finally {
|
||||||
mTrieOutStream = null;
|
mTrieOutStream = null;
|
||||||
mFreqOutStream = null;
|
mFreqOutStream = null;
|
||||||
mTerminalAddressTableOutStream = null;
|
mTerminalAddressTableOutStream = null;
|
||||||
mBigramOutStream = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,10 +223,8 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
|
||||||
|
|
||||||
writeTerminalData(flatNodes, terminalCount);
|
writeTerminalData(flatNodes, terminalCount);
|
||||||
mBigramAddressTable = new SparseTable(terminalCount,
|
mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
|
||||||
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, 1 /* contentTableCount */);
|
|
||||||
writeBigrams(flatNodes, dict);
|
writeBigrams(flatNodes, dict);
|
||||||
writeBigramAddressSparseTable();
|
|
||||||
|
|
||||||
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
|
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
|
||||||
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
|
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
|
||||||
|
@ -245,40 +331,16 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
|
|
||||||
private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
|
private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
final ByteArrayOutputStream bigramBuffer = new ByteArrayOutputStream();
|
mBigramWriter.openStreams();
|
||||||
|
|
||||||
for (final PtNodeArray nodeArray : flatNodes) {
|
for (final PtNodeArray nodeArray : flatNodes) {
|
||||||
for (final PtNode ptNode : nodeArray.mData) {
|
for (final PtNode ptNode : nodeArray.mData) {
|
||||||
if (ptNode.mBigrams != null) {
|
if (ptNode.mBigrams != null) {
|
||||||
final int startPos = bigramBuffer.size();
|
mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId,
|
||||||
mBigramAddressTable.set(0 /* contentTableIndex */, ptNode.mTerminalId,
|
ptNode.mBigrams.iterator(), dict);
|
||||||
startPos);
|
|
||||||
final Iterator<WeightedString> bigramIterator = ptNode.mBigrams.iterator();
|
|
||||||
while (bigramIterator.hasNext()) {
|
|
||||||
final WeightedString bigram = bigramIterator.next();
|
|
||||||
final PtNode target =
|
|
||||||
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
|
|
||||||
final int unigramFrequencyForThisWord = target.mFrequency;
|
|
||||||
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
|
|
||||||
bigramIterator.hasNext(), 0, bigram.mFrequency,
|
|
||||||
unigramFrequencyForThisWord, bigram.mWord);
|
|
||||||
BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, bigramFlags,
|
|
||||||
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
|
||||||
BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, target.mTerminalId,
|
|
||||||
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bigramBuffer.writeTo(mBigramOutStream);
|
mBigramWriter.closeStreams();
|
||||||
}
|
|
||||||
|
|
||||||
private void writeBigramAddressSparseTable() throws IOException {
|
|
||||||
final File lookupIndexFile =
|
|
||||||
new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
|
|
||||||
final File contentFile =
|
|
||||||
new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
|
|
||||||
mBigramAddressTable.writeToFiles(lookupIndexFile, new File[] { contentFile });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in a new issue