Add ShortcutContentReader.

Change-Id: I7c2d0d86d85775065a9bcb2b4a8463bb4969579e
This commit is contained in:
Yuichiro Hanada 2013-10-14 12:46:22 +09:00
parent aaabce8d30
commit 36dd126ab4

View file

@ -51,9 +51,8 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
protected DictBuffer mDictBuffer; protected DictBuffer mDictBuffer;
private DictBuffer mFrequencyBuffer; private DictBuffer mFrequencyBuffer;
private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mTerminalAddressTableBuffer;
private DictBuffer mShortcutBuffer;
private BigramContentReader mBigramReader; private BigramContentReader mBigramReader;
private SparseTable mShortcutAddressTable; private ShortcutContentReader mShortcutReader;
@UsedForTesting @UsedForTesting
/* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
@ -110,8 +109,9 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
mBigramReader = new BigramContentReader(mDictDirectory.getName(), mBigramReader = new BigramContentReader(mDictDirectory.getName(),
mDictDirectory, mBufferFactory, false); mDictDirectory, mBufferFactory, false);
mBigramReader.openBuffers(); mBigramReader.openBuffers();
mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory,
loadShortcutAddressSparseTable(); mBufferFactory);
mShortcutReader.openBuffers();
} }
@Override @Override
@ -136,21 +136,6 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
return header; return header;
} }
// TODO: Let's have something like SparseTableContentsReader in this class.
/**
 * Builds the shortcut address sparse table from files located in the dictionary directory
 * and stores the result in {@code mShortcutAddressTable}.
 *
 * @throws IOException declared here; presumably propagated from
 *         {@code SparseTable.readFromFiles} (confirm against that method).
 */
private void loadShortcutAddressSparseTable() throws IOException {
    // Every file name used here starts with "<dictionary directory name><shortcut extension>".
    final String shortcutPrefix =
            mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION;
    final String contentTableName = shortcutPrefix
            + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + FormatSpec.SHORTCUT_CONTENT_ID;

    final File lookupTable =
            new File(mDictDirectory, shortcutPrefix + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
    // NOTE(review): the content file and the timestamps file resolve to the very same path.
    // This may be a copy-paste slip; confirm which file the second entry should point to.
    final File[] contentTables = {
        new File(mDictDirectory, contentTableName),
        new File(mDictDirectory, contentTableName),
    };

    mShortcutAddressTable = SparseTable.readFromFiles(lookupTable, contentTables,
            FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
}
/** /**
* An auxiliary class for reading bigrams. * An auxiliary class for reading bigrams.
*/ */
@ -194,34 +179,71 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
new SparseTableContentReaderInterface() { new SparseTableContentReaderInterface() {
@Override @Override
public void read(final DictBuffer buffer) { public void read(final DictBuffer buffer) {
while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
// If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
// remaining bigram entries are ignored. // remaining bigram entries are ignored.
final int bigramFlags = buffer.readUnsignedByte(); final int bigramFlags = buffer.readUnsignedByte();
final int targetTerminalId = buffer.readUnsignedInt24(); final int targetTerminalId = buffer.readUnsignedInt24();
terminalAddressTableBuffer.position( terminalAddressTableBuffer.position(targetTerminalId
targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
final int targetAddress = terminalAddressTableBuffer.readUnsignedInt24(); final int targetAddress =
bigrams.add(new PendingAttribute( terminalAddressTableBuffer.readUnsignedInt24();
bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, bigrams.add(new PendingAttribute(bigramFlags
targetAddress)); & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { targetAddress));
break; if (0 == (bigramFlags
& FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
break;
}
}
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
throw new RuntimeException("Too many bigrams in a PtNode ("
+ bigrams.size() + " but max is "
+ FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
}
} }
} });
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
+ " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
}
}
});
if (bigrams.isEmpty()) return null; if (bigrams.isEmpty()) return null;
return bigrams; return bigrams;
} }
} }
/**
 * An auxiliary class for reading shortcuts.
 *
 * Shortcut entries are decoded by {@link #readShortcuts(int)} through the {@code read}
 * method inherited from {@code SparseTableContentReader}.
 */
protected static class ShortcutContentReader extends SparseTableContentReader {
    /**
     * @param name the dictionary name; the shortcut file extension is appended to it to form
     *        the names handed to the parent reader.
     * @param baseDir the directory passed on to the parent reader.
     * @param factory the buffer factory passed on to the parent reader.
     */
    public ShortcutContentReader(final String name, final File baseDir,
            final DictionaryBufferFactory factory) {
        super(name + FormatSpec.SHORTCUT_FILE_EXTENSION,
                FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
                baseDir,
                new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
                new String[] { FormatSpec.SHORTCUT_CONTENT_ID },
                factory);
    }

    /**
     * Reads the shortcut targets stored for a terminal.
     *
     * @param terminalId the id of the terminal whose shortcuts are requested.
     * @return the decoded shortcuts, or null when no shortcut was collected.
     */
    public ArrayList<WeightedString> readShortcuts(final int terminalId) {
        final ArrayList<WeightedString> targets = CollectionUtils.newArrayList();
        final SparseTableContentReaderInterface shortcutListDecoder =
                new SparseTableContentReaderInterface() {
                    @Override
                    public void read(final DictBuffer buffer) {
                        // Each entry is one flag byte followed by the target string. The
                        // entry whose flags lack HAS_NEXT is the last one of the list.
                        int entryFlags;
                        do {
                            entryFlags = buffer.readUnsignedByte();
                            final String target = CharEncoding.readString(buffer);
                            final int frequency =
                                    entryFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
                            targets.add(new WeightedString(target, frequency));
                        } while (0 != (entryFlags
                                & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT));
                    }
                };
        read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, shortcutListDecoder);
        // NOTE(review): the list stays empty only if the decoder above is never invoked,
        // presumably when the terminal has no shortcut entry; confirm in the parent class.
        return targets.isEmpty() ? null : targets;
    }
}
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
@ -233,23 +255,6 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
} }
} }
/**
 * Reads the shortcut targets attached to the given terminal from {@code mShortcutBuffer}.
 *
 * @param terminalId the terminal id looked up in {@code mShortcutAddressTable}.
 * @return the decoded shortcuts, or null when the address table yields
 *         {@code SparseTable.NOT_EXIST} for this terminal.
 */
private ArrayList<WeightedString> readShortcuts(final int terminalId) {
    // NOTE(review): the existence check uses content index 0 while the position lookup uses
    // FormatSpec.SHORTCUT_CONTENT_INDEX; presumably these are the same index, to be confirmed.
    if (SparseTable.NOT_EXIST == mShortcutAddressTable.get(0, terminalId)) {
        return null;
    }
    final int shortcutListPosition =
            mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId);
    mShortcutBuffer.position(shortcutListPosition);

    final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
    // Each entry is one flag byte followed by the target string. The entry whose flags lack
    // HAS_NEXT terminates the list.
    int entryFlags;
    do {
        entryFlags = mShortcutBuffer.readUnsignedByte();
        final String target = CharEncoding.readString(mShortcutBuffer);
        final int frequency = entryFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
        shortcutTargets.add(new WeightedString(target, frequency));
    } while (0 != (entryFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT));
    return shortcutTargets;
}
// TODO: Make this buffer thread safe. // TODO: Make this buffer thread safe.
// TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
@ -300,7 +305,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
childrenAddress += addressPointer; childrenAddress += addressPointer;
} }
addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts(terminalId);
final ArrayList<PendingAttribute> bigrams = final ArrayList<PendingAttribute> bigrams =
mBigramReader.readTargetsAndFrequencies(terminalId, mBigramReader.readTargetsAndFrequencies(terminalId,
mTerminalAddressTableBuffer); mTerminalAddressTableBuffer);