Add SparseTableContentWriter to Ver4DictEncoder.

Bug: 10920165
Change-Id: I6372492e97297baad4c5aeeb3fb36dcccd7a944b
This commit is contained in:
Yuichiro Hanada 2013-10-01 23:08:27 +09:00
parent d6e307a4b7
commit 3dd77a6d66
3 changed files with 119 additions and 52 deletions

View file

@ -266,11 +266,14 @@ public final class FormatSpec {
// tat = Terminal Address Table // tat = Terminal Address Table
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String BIGRAM_FILE_EXTENSION = ".bigram";
static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int FREQUENCY_AND_FLAGS_SIZE = 2;
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
static final int BIGRAM_CONTENT_COUNT = 1;
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
static final int NO_PARENT_ADDRESS = 0; static final int NO_PARENT_ADDRESS = 0;

View file

@ -42,7 +42,7 @@ public class Ver4DictDecoder extends DictDecoder {
private static final int FILETYPE_TRIE = 1; private static final int FILETYPE_TRIE = 1;
private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_FREQUENCY = 2;
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
private static final int FILETYPE_BIGRAM = 4; private static final int FILETYPE_BIGRAM_FREQ = 4;
private final File mDictDirectory; private final File mDictDirectory;
private final DictionaryBufferFactory mBufferFactory; private final DictionaryBufferFactory mBufferFactory;
@ -85,9 +85,10 @@ public class Ver4DictDecoder extends DictDecoder {
} else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) { } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
return new File(mDictDirectory, return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
} else if (fileType == FILETYPE_BIGRAM) { } else if (fileType == FILETYPE_BIGRAM_FREQ) {
return new File(mDictDirectory, return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION); mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
+ FormatSpec.BIGRAM_FREQ_CONTENT_ID);
} else { } else {
throw new RuntimeException("Unsupported kind of file : " + fileType); throw new RuntimeException("Unsupported kind of file : " + fileType);
} }
@ -99,7 +100,7 @@ public class Ver4DictDecoder extends DictDecoder {
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
loadBigramAddressSparseTable(); loadBigramAddressSparseTable();
} }
@ -126,11 +127,12 @@ public class Ver4DictDecoder extends DictDecoder {
} }
private void loadBigramAddressSparseTable() throws IOException { private void loadBigramAddressSparseTable() throws IOException {
final File lookupIndexFile = new File(mDictDirectory, final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
mDictDirectory.getName() + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION); + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
final File contentFile = new File(mDictDirectory, final File freqsFile = new File(mDictDirectory, mDictDirectory.getName()
mDictDirectory.getName() + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION); + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile }, + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile },
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
} }

View file

@ -26,7 +26,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.FileOutputStream; import java.io.FileOutputStream;
@ -44,19 +43,115 @@ public class Ver4DictEncoder implements DictEncoder {
private byte[] mTrieBuf; private byte[] mTrieBuf;
private int mTriePos; private int mTriePos;
private int mHeaderSize; private int mHeaderSize;
private SparseTable mBigramAddressTable;
private OutputStream mTrieOutStream; private OutputStream mTrieOutStream;
private OutputStream mFreqOutStream; private OutputStream mFreqOutStream;
private OutputStream mTerminalAddressTableOutStream; private OutputStream mTerminalAddressTableOutStream;
private OutputStream mBigramOutStream;
private File mDictDir; private File mDictDir;
private String mBaseFilename; private String mBaseFilename;
private BigramContentWriter mBigramWriter;
@UsedForTesting @UsedForTesting
public Ver4DictEncoder(final File dictPlacedDir) { public Ver4DictEncoder(final File dictPlacedDir) {
mDictPlacedDir = dictPlacedDir; mDictPlacedDir = dictPlacedDir;
} }
/**
 * Callback that emits the bytes of one piece of content.
 *
 * An implementation is handed to SparseTableContentWriter#write, which invokes it with the
 * output stream of the content file being written.
 */
private interface SparseTableContentWriterInterface {
/**
 * Writes the content to the given stream.
 *
 * @param outStream the stream to write the content to.
 * @throws IOException if writing to the stream fails.
 */
public void write(final OutputStream outStream) throws IOException;
}
/**
 * Writes a set of content files together with the SparseTable that indexes them.
 *
 * For every call to {@link #write}, the current length of the target content file is stored in
 * the sparse table under (contentIndex, index) before the content is appended, so the table
 * maps an index to the offset at which its content starts. The lookup table and the address
 * tables are written out by {@link #closeStreams}.
 *
 * Files are created under the base directory with these names:
 * - lookup table: name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX
 * - address table i: name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]
 * - content file i: contentFilenames[i] + contentIds[i]
 */
private static class SparseTableContentWriter {
    private final int mContentCount;
    private final SparseTable mSparseTable;
    private final File mLookupTableFile;
    protected final File mBaseDir;
    private final File[] mAddressTableFiles;
    private final File[] mContentFiles;
    protected final OutputStream[] mContentOutStreams;

    /**
     * @param name base name used for the lookup table file and the address table files.
     * @param contentCount number of content files handled by this writer; also forwarded to
     *        the SparseTable constructor.
     * @param initialCapacity forwarded to the SparseTable constructor.
     * @param blockSize forwarded to the SparseTable constructor.
     * @param baseDir directory in which all files are created.
     * @param contentFilenames base file name of each content file.
     * @param contentIds suffix appended to each content file name and address table file name.
     * @throws RuntimeException if contentFilenames and contentIds have different lengths.
     */
    public SparseTableContentWriter(final String name, final int contentCount,
            final int initialCapacity, final int blockSize, final File baseDir,
            final String[] contentFilenames, final String[] contentIds) {
        if (contentFilenames.length != contentIds.length) {
            throw new RuntimeException("The length of contentFilenames and the length of"
                    + " contentIds are different " + contentFilenames.length + ", "
                    + contentIds.length);
        }
        mContentCount = contentCount;
        mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount);
        mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
        mAddressTableFiles = new File[mContentCount];
        mContentFiles = new File[mContentCount];
        mBaseDir = baseDir;
        for (int i = 0; i < mContentCount; ++i) {
            mAddressTableFiles[i] = new File(mBaseDir,
                    name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
            mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
        }
        mContentOutStreams = new OutputStream[mContentCount];
    }

    /**
     * Opens an output stream on every content file.
     *
     * If one of the files cannot be opened, the streams that were already opened by this call
     * are closed again before the exception is propagated, so no file handle is leaked.
     *
     * @throws FileNotFoundException if a content file cannot be opened for writing.
     */
    public void openStreams() throws FileNotFoundException {
        for (int i = 0; i < mContentCount; ++i) {
            try {
                mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
            } catch (final FileNotFoundException e) {
                releaseStreamsBefore(i);
                throw e;
            }
        }
    }

    /**
     * Records the start offset of a piece of content in the sparse table, then writes it.
     *
     * The offset is the current on-disk length of the content file, so the stream is flushed
     * after each write; presumably this keeps File#length() accurate for the next call.
     *
     * @param contentIndex which content file to append to.
     * @param index the sparse table index under which the offset is stored.
     * @param writer callback that emits the content bytes.
     * @throws IOException if writing or flushing fails.
     */
    protected void write(final int contentIndex, final int index,
            final SparseTableContentWriterInterface writer) throws IOException {
        mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
        writer.write(mContentOutStreams[contentIndex]);
        mContentOutStreams[contentIndex].flush();
    }

    /**
     * Writes the sparse table to its files and closes every content stream.
     *
     * The content streams are closed even when writing the sparse table fails, and a failure
     * to close one stream does not prevent the remaining streams from being closed.
     *
     * @throws IOException if writing the sparse table or closing a stream fails.
     */
    public void closeStreams() throws IOException {
        try {
            mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
        } finally {
            closeContentStreams();
        }
    }

    /** Closes all opened content streams, then rethrows the first close failure, if any. */
    private void closeContentStreams() throws IOException {
        IOException firstFailure = null;
        for (int i = 0; i < mContentCount; ++i) {
            final OutputStream stream = mContentOutStreams[i];
            if (stream == null) {
                // Never opened (or already released); nothing to close.
                continue;
            }
            try {
                stream.close();
            } catch (final IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /** Best-effort close of the streams in [0, end) after a failed openStreams(). */
    private void releaseStreamsBefore(final int end) {
        for (int i = 0; i < end; ++i) {
            try {
                mContentOutStreams[i].close();
            } catch (final IOException ignored) {
                // The open failure is what gets reported to the caller; a secondary close
                // failure here carries no additional information.
            } finally {
                mContentOutStreams[i] = null;
            }
        }
    }
}
/**
 * SparseTableContentWriter specialised for bigram data.
 *
 * It manages FormatSpec.BIGRAM_CONTENT_COUNT content files; the bigram list of a word is
 * written to the content at FormatSpec.BIGRAM_FREQ_CONTENT_INDEX and indexed by the word's
 * terminal id.
 */
private static class BigramContentWriter extends SparseTableContentWriter {

    /**
     * @param name base file name; FormatSpec.BIGRAM_FILE_EXTENSION is appended to it.
     * @param initialCapacity forwarded to the parent as the sparse table's initial capacity.
     * @param baseDir directory in which the files are created.
     */
    public BigramContentWriter(final String name, final int initialCapacity,
            final File baseDir) {
        super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
                FormatSpec.BIGRAM_CONTENT_COUNT,
                initialCapacity,
                FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
                baseDir,
                new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION },
                new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID });
    }

    /**
     * Writes all bigrams of one word and registers their start offset under its terminal id.
     *
     * @param terminalId terminal id of the word owning the bigrams.
     * @param bigramIterator iterator over the bigrams to write.
     * @param dict dictionary in which the bigram target words are looked up.
     * @throws IOException if writing fails.
     */
    public void writeBigramsForOneWord(final int terminalId,
            final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
            throws IOException {
        final SparseTableContentWriterInterface bigramEmitter =
                new SparseTableContentWriterInterface() {
                    @Override
                    public void write(final OutputStream outStream) throws IOException {
                        writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
                    }
                };
        write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, bigramEmitter);
    }

    /**
     * Emits, for each remaining bigram, its flags followed by the terminal id of its target.
     */
    private void writeBigramsForOneWordInternal(final OutputStream outStream,
            final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
            throws IOException {
        while (bigramIterator.hasNext()) {
            final WeightedString entry = bigramIterator.next();
            final PtNode targetNode =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, entry.mWord);
            final int targetUnigramFrequency = targetNode.mFrequency;
            // Queried after next(): tells the flag builder whether more bigrams follow.
            final boolean hasFollowingBigram = bigramIterator.hasNext();
            final int flags = BinaryDictEncoderUtils.makeBigramFlags(
                    hasFollowingBigram, 0, entry.mFrequency,
                    targetUnigramFrequency, entry.mWord);
            BinaryDictEncoderUtils.writeUIntToStream(outStream, flags,
                    FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            BinaryDictEncoderUtils.writeUIntToStream(outStream, targetNode.mTerminalId,
                    FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
        }
    }
}
private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
throws FileNotFoundException, IOException { throws FileNotFoundException, IOException {
final FileHeader header = new FileHeader(0, dictOptions, formatOptions); final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
@ -66,8 +161,6 @@ public class Ver4DictEncoder implements DictEncoder {
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir, final File terminalAddressTableFile = new File(mDictDir,
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
final File bigramFile = new File(mDictDir,
mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION);
if (!mDictDir.isDirectory()) { if (!mDictDir.isDirectory()) {
if (mDictDir.exists()) mDictDir.delete(); if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs(); mDictDir.mkdirs();
@ -78,7 +171,6 @@ public class Ver4DictEncoder implements DictEncoder {
mTrieOutStream = new FileOutputStream(trieFile); mTrieOutStream = new FileOutputStream(trieFile);
mFreqOutStream = new FileOutputStream(freqFile); mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
mBigramOutStream = new FileOutputStream(bigramFile);
} }
private void close() throws IOException { private void close() throws IOException {
@ -92,14 +184,10 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTerminalAddressTableOutStream != null) { if (mTerminalAddressTableOutStream != null) {
mTerminalAddressTableOutStream.close(); mTerminalAddressTableOutStream.close();
} }
if (mBigramOutStream != null) {
mBigramOutStream.close();
}
} finally { } finally {
mTrieOutStream = null; mTrieOutStream = null;
mFreqOutStream = null; mFreqOutStream = null;
mTerminalAddressTableOutStream = null; mTerminalAddressTableOutStream = null;
mBigramOutStream = null;
} }
} }
@ -135,10 +223,8 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount); writeTerminalData(flatNodes, terminalCount);
mBigramAddressTable = new SparseTable(terminalCount, mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, 1 /* contentTableCount */);
writeBigrams(flatNodes, dict); writeBigrams(flatNodes, dict);
writeBigramAddressSparseTable();
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
@ -245,40 +331,16 @@ public class Ver4DictEncoder implements DictEncoder {
private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
throws IOException { throws IOException {
final ByteArrayOutputStream bigramBuffer = new ByteArrayOutputStream(); mBigramWriter.openStreams();
for (final PtNodeArray nodeArray : flatNodes) { for (final PtNodeArray nodeArray : flatNodes) {
for (final PtNode ptNode : nodeArray.mData) { for (final PtNode ptNode : nodeArray.mData) {
if (ptNode.mBigrams != null) { if (ptNode.mBigrams != null) {
final int startPos = bigramBuffer.size(); mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId,
mBigramAddressTable.set(0 /* contentTableIndex */, ptNode.mTerminalId, ptNode.mBigrams.iterator(), dict);
startPos);
final Iterator<WeightedString> bigramIterator = ptNode.mBigrams.iterator();
while (bigramIterator.hasNext()) {
final WeightedString bigram = bigramIterator.next();
final PtNode target =
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
final int unigramFrequencyForThisWord = target.mFrequency;
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
bigramIterator.hasNext(), 0, bigram.mFrequency,
unigramFrequencyForThisWord, bigram.mWord);
BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, bigramFlags,
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, target.mTerminalId,
FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
} }
} }
} }
} mBigramWriter.closeStreams();
bigramBuffer.writeTo(mBigramOutStream);
}
private void writeBigramAddressSparseTable() throws IOException {
final File lookupIndexFile =
new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
final File contentFile =
new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
mBigramAddressTable.writeToFiles(lookupIndexFile, new File[] { contentFile });
} }
@Override @Override