diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index 9481a8c14..a5516bd41 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -266,11 +266,14 @@ public final class FormatSpec { // tat = Terminal Address Table static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; - static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup"; - static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index"; + static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; + static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; + static final int BIGRAM_CONTENT_COUNT = 1; + static final int BIGRAM_FREQ_CONTENT_INDEX = 0; + static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 624b2784f..5089687da 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -42,7 +42,7 @@ public class Ver4DictDecoder extends DictDecoder { private static final int FILETYPE_TRIE = 1; private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; - private static final int FILETYPE_BIGRAM = 4; + private static final int FILETYPE_BIGRAM_FREQ = 4; private final File mDictDirectory; private final DictionaryBufferFactory mBufferFactory; @@ -85,9 +85,10 @@ public class Ver4DictDecoder extends DictDecoder { } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); - } else if (fileType == FILETYPE_BIGRAM) { + } else if (fileType == FILETYPE_BIGRAM_FREQ) { return new File(mDictDirectory, - mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION); + mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); } else { throw new RuntimeException("Unsupported kind of file : " + fileType); } @@ -99,7 +100,7 @@ public class Ver4DictDecoder extends DictDecoder { mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); - mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM)); + mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); loadBigramAddressSparseTable(); } @@ -126,11 +127,12 @@ public class Ver4DictDecoder extends DictDecoder { } private void loadBigramAddressSparseTable() throws IOException { - final File lookupIndexFile = new File(mDictDirectory, - mDictDirectory.getName() + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION); - final File contentFile = new File(mDictDirectory, - mDictDirectory.getName() + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION); - mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile }, + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File freqsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile }, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java index a403e25db..b38c33019 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -26,7 +26,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; @@ -44,19 +43,115 @@ public class Ver4DictEncoder implements DictEncoder { private byte[] mTrieBuf; private int mTriePos; private int mHeaderSize; - private SparseTable mBigramAddressTable; private OutputStream mTrieOutStream; private OutputStream mFreqOutStream; private OutputStream mTerminalAddressTableOutStream; - private OutputStream mBigramOutStream; private File mDictDir; private String mBaseFilename; + private BigramContentWriter mBigramWriter; @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { mDictPlacedDir = dictPlacedDir; } + private interface SparseTableContentWriterInterface { + public void write(final OutputStream outStream) throws IOException; + } + + private static class SparseTableContentWriter { + private final int mContentCount; + private final SparseTable mSparseTable; + private final File mLookupTableFile; + protected final File mBaseDir; + private final File[] mAddressTableFiles; + private final File[] mContentFiles; + protected final OutputStream[] mContentOutStreams; + + public SparseTableContentWriter(final String name, final int contentCount, + final int initialCapacity, final int blockSize, final File baseDir, + final String[] contentFilenames, final String[] contentIds) { + if (contentFilenames.length != contentIds.length) { + throw new RuntimeException("The length of contentFilenames and the length of" + + " contentIds are different " + contentFilenames.length + ", " + + contentIds.length); + } + mContentCount = contentCount; + mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount); + mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + mAddressTableFiles = new File[mContentCount]; + mContentFiles = new File[mContentCount]; + mBaseDir = baseDir; + for (int i = 0; i < mContentCount; ++i) { + mAddressTableFiles[i] = new File(mBaseDir, + name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]); + mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]); + } + mContentOutStreams = new OutputStream[mContentCount]; + } + + public void openStreams() throws FileNotFoundException { + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]); + } + } + + protected void write(final int contentIndex, final int index, + final SparseTableContentWriterInterface writer) throws IOException { + mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length()); + writer.write(mContentOutStreams[contentIndex]); + mContentOutStreams[contentIndex].flush(); + } + + public void closeStreams() throws IOException { + mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles); + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i].close(); + } + } + } + + private static class BigramContentWriter extends SparseTableContentWriter { + + public BigramContentWriter(final String name, final int initialCapacity, + final File baseDir) { + super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT, + initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, + new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }, + new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }); + } + + public void writeBigramsForOneWord(final int terminalId, + final Iterator bigramIterator, final FusionDictionary dict) + throws IOException { + write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, + new SparseTableContentWriterInterface() { + @Override + public void write(final OutputStream outStream) throws IOException { + writeBigramsForOneWordInternal(outStream, bigramIterator, dict); + } + }); + } + + private void writeBigramsForOneWordInternal(final OutputStream outStream, + final Iterator bigramIterator, final FusionDictionary dict) + throws IOException { + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int unigramFrequencyForThisWord = target.mFrequency; + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags( + bigramIterator.hasNext(), 0, bigram.mFrequency, + unigramFrequencyForThisWord, bigram.mWord); + BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId, + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); + } + } + } + private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) throws FileNotFoundException, IOException { final FileHeader header = new FileHeader(0, dictOptions, formatOptions); @@ -66,8 +161,6 @@ public class Ver4DictEncoder implements DictEncoder { final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); final File terminalAddressTableFile = new File(mDictDir, mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); - final File bigramFile = new File(mDictDir, - mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION); if (!mDictDir.isDirectory()) { if (mDictDir.exists()) mDictDir.delete(); mDictDir.mkdirs(); @@ -78,7 +171,6 @@ public class Ver4DictEncoder implements DictEncoder { mTrieOutStream = new FileOutputStream(trieFile); mFreqOutStream = new FileOutputStream(freqFile); mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); - mBigramOutStream = new FileOutputStream(bigramFile); } private void close() throws IOException { @@ -92,14 +184,10 @@ public class Ver4DictEncoder implements DictEncoder { if (mTerminalAddressTableOutStream != null) { mTerminalAddressTableOutStream.close(); } - if (mBigramOutStream != null) { - mBigramOutStream.close(); - } } finally { mTrieOutStream = null; mFreqOutStream = null; mTerminalAddressTableOutStream = null; - mBigramOutStream = null; } } @@ -135,10 +223,8 @@ public class Ver4DictEncoder implements DictEncoder { if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); writeTerminalData(flatNodes, terminalCount); - mBigramAddressTable = new SparseTable(terminalCount, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, 1 /* contentTableCount */); + mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); writeBigrams(flatNodes, dict); - writeBigramAddressSparseTable(); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; @@ -245,40 +331,16 @@ public class Ver4DictEncoder implements DictEncoder { private void writeBigrams(final ArrayList flatNodes, final FusionDictionary dict) throws IOException { - final ByteArrayOutputStream bigramBuffer = new ByteArrayOutputStream(); - + mBigramWriter.openStreams(); for (final PtNodeArray nodeArray : flatNodes) { for (final PtNode ptNode : nodeArray.mData) { if (ptNode.mBigrams != null) { - final int startPos = bigramBuffer.size(); - mBigramAddressTable.set(0 /* contentTableIndex */, ptNode.mTerminalId, - startPos); - final Iterator bigramIterator = ptNode.mBigrams.iterator(); - while (bigramIterator.hasNext()) { - final WeightedString bigram = bigramIterator.next(); - final PtNode target = - FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); - final int unigramFrequencyForThisWord = target.mFrequency; - final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags( - bigramIterator.hasNext(), 0, bigram.mFrequency, - unigramFrequencyForThisWord, bigram.mWord); - BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, bigramFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, target.mTerminalId, - FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); - } + mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, + ptNode.mBigrams.iterator(), dict); } } } - bigramBuffer.writeTo(mBigramOutStream); - } - - private void writeBigramAddressSparseTable() throws IOException { - final File lookupIndexFile = - new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION); - final File contentFile = - new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION); - mBigramAddressTable.writeToFiles(lookupIndexFile, new File[] { contentFile }); + mBigramWriter.closeStreams(); } @Override