(1/2) Implement insertWord in Ver4DictUpdater.

Change-Id: Ia3079d5ef00ca7d831c91fb9220ad9c17038c5a3
main
Yuichiro Hanada 2013-10-04 15:33:25 +09:00
parent 27106487ba
commit 520f612849
8 changed files with 722 additions and 29 deletions

View File

@ -169,6 +169,14 @@ public final class BinaryDictDecoderUtils {
return size;
}
/**
 * Sums the encoded size of the code points in a sub-range of an array.
 *
 * @param chars the array holding the code points.
 * @param start index of the first code point to measure (inclusive).
 * @param end index after the last code point to measure (exclusive).
 * @return the sum of getCharSize() over the range; 0 when the range is empty.
 */
static int getCharArraySize(final int[] chars, final int start, final int end) {
    int totalSize = 0;
    int index = start;
    while (index < end) {
        totalSize += getCharSize(chars[index]);
        index++;
    }
    return totalSize;
}
/**
* Writes a char array to a byte buffer.
*
@ -247,6 +255,40 @@ public final class BinaryDictDecoderUtils {
return written;
}
/**
 * Encodes a range of code points with our character format and writes them to an
 * OutputStream.
 *
 * A code point whose getCharSize() is 1 is written on a single byte; any other code point
 * is written on three bytes, most significant byte first. The terminator byte is appended
 * only when the range holds more than one code point.
 *
 * @param stream the OutputStream to write to.
 * @param codePoints the array holding the code points.
 * @param startIndex index of the first code point to write (inclusive).
 * @param endIndex index after the last code point to write (exclusive).
 * @return the size written, in bytes, including the terminator when one was written.
 */
// TODO: Merge this method with writeCharArray and rename the various write* methods to
// make the difference clear.
static int writeCodePoints(final OutputStream stream, final int[] codePoints,
        final int startIndex, final int endIndex)
        throws IOException {
    int totalSize = 0;
    int index = startIndex;
    while (index < endIndex) {
        final int cp = codePoints[index];
        final int cpSize = getCharSize(cp);
        if (cpSize != 1) {
            // Three-byte form, big endian.
            stream.write((byte) (0xFF & (cp >> 16)));
            stream.write((byte) (0xFF & (cp >> 8)));
            stream.write((byte) (0xFF & cp));
        } else {
            stream.write((byte) cp);
        }
        totalSize += cpSize;
        index++;
    }
    final boolean needsTerminator = endIndex - startIndex > 1;
    if (needsTerminator) {
        stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
        totalSize += FormatSpec.PTNODE_TERMINATOR_SIZE;
    }
    return totalSize;
}
/**
* Reads a string from a DictBuffer. This is the converse of the above method.
*/

View File

@ -17,6 +17,7 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
@ -245,6 +246,26 @@ public class BinaryDictEncoderUtils {
}
}
/**
 * Writes the low {@code size} bytes of {@code value} to a DictBuffer, most significant byte
 * first.
 *
 * @param dictBuffer the buffer to write to, at its current position.
 * @param value the value to write.
 * @param size the number of bytes to write, from 1 to 4. Any other size writes nothing.
 */
static void writeUIntToDictBuffer(final DictBuffer dictBuffer, final int value,
        final int size) {
    if (size < 1 || size > 4) {
        // Unsupported sizes are silently ignored.
        return;
    }
    for (int shift = 8 * (size - 1); shift >= 0; shift -= 8) {
        dictBuffer.put((byte) ((value >> shift) & 0xFF));
    }
}
// End utility methods
// This method is responsible for finding a nice ordering of the nodes that favors run-time

View File

@ -245,8 +245,7 @@ public final class BinaryDictIOUtils {
/**
* @return the size written, in bytes. Always 3 bytes.
*/
static int writeSInt24ToBuffer(final DictBuffer dictBuffer,
final int value) {
static int writeSInt24ToBuffer(final DictBuffer dictBuffer, final int value) {
final int absValue = Math.abs(value);
dictBuffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
dictBuffer.put((byte)((absValue >> 8) & 0xFF));
@ -415,6 +414,25 @@ public final class BinaryDictIOUtils {
return size;
}
/**
 * Writes a PtNode count to the stream, using the size reported by getPtNodeCountSize().
 *
 * @param destination the stream to write to.
 * @param ptNodeCount the count to write.
 * @return the size written in bytes (1 or 2).
 * @throws RuntimeException if getPtNodeCountSize() reports a size other than 1 or 2.
 */
static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
        throws IOException {
    final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
    // The count must fit on one byte or two bytes.
    // Please see comments in FormatSpec.
    final boolean isSupportedSize = (countSize == 1) || (countSize == 2);
    if (!isSupportedSize) {
        throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
    }
    BinaryDictEncoderUtils.writeUIntToStream(destination, ptNodeCount, countSize);
    return countSize;
}
/**
* Write a node array to the stream.
*
@ -425,18 +443,7 @@ public final class BinaryDictIOUtils {
*/
static int writeNodes(final OutputStream destination, final PtNodeInfo[] infos)
throws IOException {
int size = getPtNodeCountSize(infos.length);
switch (getPtNodeCountSize(infos.length)) {
case 1:
destination.write((byte)infos.length);
break;
case 2:
destination.write((byte)(infos.length >> 8));
destination.write((byte)(infos.length & 0xFF));
break;
default:
throw new RuntimeException("Invalid node count size.");
}
int size = writePtNodeCount(destination, infos.length);
for (final PtNodeInfo info : infos) size += writePtNode(destination, info);
writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;

View File

@ -35,6 +35,7 @@ import java.util.TreeMap;
/**
* An interface of binary dictionary decoders.
*/
// TODO: Straighten out responsibility for the buffer's file pointer.
public interface DictDecoder {
/**

View File

@ -37,7 +37,7 @@ import java.util.Arrays;
@UsedForTesting
public final class DynamicBinaryDictIOUtils {
private static final boolean DBG = false;
private static final int MAX_JUMPS = 10000;
static final int MAX_JUMPS = 10000;
private DynamicBinaryDictIOUtils() {
// This utility class is not publicly instantiable.

View File

@ -40,17 +40,17 @@ import java.util.Arrays;
public class Ver4DictDecoder extends AbstractDictDecoder {
private static final String TAG = Ver4DictDecoder.class.getSimpleName();
private static final int FILETYPE_TRIE = 1;
private static final int FILETYPE_FREQUENCY = 2;
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
private static final int FILETYPE_BIGRAM_FREQ = 4;
private static final int FILETYPE_SHORTCUT = 5;
protected static final int FILETYPE_TRIE = 1;
protected static final int FILETYPE_FREQUENCY = 2;
protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
protected static final int FILETYPE_BIGRAM_FREQ = 4;
protected static final int FILETYPE_SHORTCUT = 5;
private final File mDictDirectory;
private final DictionaryBufferFactory mBufferFactory;
protected final DictionaryBufferFactory mBufferFactory;
protected DictBuffer mDictBuffer;
private DictBuffer mFrequencyBuffer;
private DictBuffer mTerminalAddressTableBuffer;
protected DictBuffer mFrequencyBuffer;
protected DictBuffer mTerminalAddressTableBuffer;
private BigramContentReader mBigramReader;
private ShortcutContentReader mShortcutReader;
@ -64,6 +64,8 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
public final int mChildrenPos;
public final int mParentPos;
public final int mNodeSize;
public int mStartIndexOfCharacters;
public int mEndIndexOfCharacters; // exclusive
public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId,
final int childrenPos, final int parentPos, final int nodeSize) {
@ -73,6 +75,8 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
mChildrenPos = childrenPos;
mParentPos = parentPos;
mNodeSize = nodeSize;
mStartIndexOfCharacters = 0;
mEndIndexOfCharacters = characters.length;
}
}
@ -99,7 +103,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
mDictBuffer = mFrequencyBuffer = null;
}
private File getFile(final int fileType) {
protected File getFile(final int fileType) {
if (fileType == FILETYPE_TRIE) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION);
@ -141,6 +145,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
return mDictBuffer != null;
}
/**
 * Returns the DictBuffer currently held by this decoder.
 *
 * @return the value of mDictBuffer, which may be null.
 */
@UsedForTesting
/* package */ DictBuffer getDictBuffer() {
return mDictBuffer;
}

View File

@ -17,23 +17,37 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import android.util.Log;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
/**
* An implementation of DictUpdater for version 4 binary dictionary.
*/
@UsedForTesting
public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
private static final String TAG = Ver4DictUpdater.class.getSimpleName();
private OutputStream mDictStream;
private final File mFrequencyFile;
/**
 * Creates an updater for the version 4 dictionary in the given directory.
 *
 * @param dictDirectory the dictionary directory, passed through to Ver4DictDecoder.
 * @param factoryType the requested buffer factory type. After masking with
 *        MASK_DICTBUFFER, anything other than USE_BYTEARRAY is replaced by
 *        USE_WRITABLE_BYTEBUFFER.
 */
@UsedForTesting
public Ver4DictUpdater(final File dictDirectory, final int factoryType) {
// DictUpdater must have an updatable DictBuffer.
super(dictDirectory, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY)
? USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER);
// Resolved once here; getNewTerminalId() and insertFrequency() both use this file.
mFrequencyFile = getFile(FILETYPE_FREQUENCY);
}
@Override
@ -49,11 +63,595 @@ public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
}
}
@Override
/**
 * Computes a terminal id that is not used yet.
 *
 * @return the current length of the frequency file divided by the size of one entry.
 */
private int getNewTerminalId() {
    // The size of frequency file is FormatSpec.FREQUENCY_AND_FLAGS_SIZE * number of terminals
    // because each terminal always has a frequency.
    // So the number of entries already stored is a fresh terminal id.
    // CAVEAT: we are reading the file size from the disk each time: beware of race conditions,
    // even on one thread.
    final long frequencyFileLength = mFrequencyFile.length();
    return (int) (frequencyFileLength / FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
}
/**
 * Rewrites the parent offset of the PtNode at nodePos, unless its flags mark it as moved.
 * The buffer position is restored before returning.
 *
 * @param nodePos the position of the head of the PtNode.
 * @param newParentPos the absolute position of the new parent.
 * @param formatOptions the format options.
 */
private void updateParentPosIfNotMoved(final int nodePos, final int newParentPos,
        final FormatOptions formatOptions) {
    final int savedPos = getPosition();
    setPosition(nodePos);
    final int optionFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    final boolean alreadyMoved = BinaryDictIOUtils.isMovedPtNode(optionFlags, formatOptions);
    if (!alreadyMoved) {
        // The offset is stored relative to the head of the PtNode, right after the flags.
        BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newParentPos - nodePos);
    }
    setPosition(savedPos);
}
/**
 * Updates the parent offset of every non-moved PtNode reachable from a PtNodeArray,
 * following forward links. The buffer position is restored before returning.
 *
 * @param nodeArrayPos the position of the head of the first PtNodeArray.
 * @param newParentPos the absolute position of the new parent.
 * @param formatOptions the format options.
 */
private void updateParentPositions(final int nodeArrayPos, final int newParentPos,
        final FormatOptions formatOptions) {
    final int savedPos = mDictBuffer.position();
    mDictBuffer.position(nodeArrayPos);
    // Visits at most MAX_JUMPS + 1 arrays.
    for (int jumps = 0; jumps <= DynamicBinaryDictIOUtils.MAX_JUMPS; ++jumps) {
        for (int remaining = readPtNodeCount(); remaining > 0; --remaining) {
            updateParentPosIfNotMoved(getPosition(), newParentPos, formatOptions);
            skipPtNode(formatOptions);
        }
        if (!readAndFollowForwardLink()) {
            break;
        }
    }
    setPosition(savedPos);
}
/**
 * Rewrites the children offset of the PtNode at nodePos.
 * The buffer position is restored before returning.
 *
 * @param nodePos the position of the head of the PtNode.
 * @param newChildrenPos the absolute position of the new children PtNodeArray.
 * @param options the format options.
 */
private void updateChildrenPos(final int nodePos, final int newChildrenPos,
        final FormatOptions options) {
    final int savedPos = getPosition();
    setPosition(nodePos);
    final int optionFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    final boolean hasMultipleChars =
            (optionFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0;
    final boolean isTerminal = (optionFlags & FormatSpec.FLAG_IS_TERMINAL) != 0;
    // Skip over the fields that precede the children address.
    PtNodeReader.readParentAddress(mDictBuffer, options);
    BinaryDictIOUtils.skipString(mDictBuffer, hasMultipleChars);
    if (isTerminal) {
        PtNodeReader.readTerminalId(mDictBuffer);
    }
    // The offset is stored relative to the position of the children address field itself.
    final int childrenAddressPos = getPosition();
    BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newChildrenPos - childrenAddressPos);
    setPosition(savedPos);
}
/**
 * Overwrites the entry of a terminal in the terminal address table buffer.
 * Does nothing for PtNode.NOT_A_TERMINAL or for an id whose entry lies at or beyond the
 * limit of the buffer.
 *
 * @param terminalId the terminal id whose entry should be updated.
 * @param position the value to store in the entry.
 */
private void updateTerminalPosition(final int terminalId, final int position) {
    if (terminalId == PtNode.NOT_A_TERMINAL) {
        return;
    }
    final int entryOffset = terminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
    if (entryOffset >= mTerminalAddressTableBuffer.limit()) {
        return;
    }
    mTerminalAddressTableBuffer.position(entryOffset);
    BinaryDictEncoderUtils.writeUIntToDictBuffer(mTerminalAddressTableBuffer, position,
            FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
}
/**
 * Walks the chain of PtNodeArrays starting at nodeArrayPos and overwrites the first forward
 * link that cannot be followed so that it points to newForwardLink.
 * Gives up after MAX_JUMPS arrays. The buffer position is restored before returning.
 *
 * @param nodeArrayPos the position of the head of the first PtNodeArray.
 * @param newForwardLink the absolute position the last forward link should point to.
 * @param formatOptions the format options.
 */
private void updateForwardLink(final int nodeArrayPos, final int newForwardLink,
        final FormatOptions formatOptions) {
    final int savedPos = getPosition();
    setPosition(nodeArrayPos);
    for (int jumps = 0; jumps < DynamicBinaryDictIOUtils.MAX_JUMPS; ++jumps) {
        for (int remaining = readPtNodeCount(); remaining > 0; --remaining) {
            skipPtNode(formatOptions);
        }
        final int forwardLinkPos = getPosition();
        if (readAndFollowForwardLink()) {
            // Not the end of the chain yet.
            continue;
        }
        // The link is stored relative to its own position.
        setPosition(forwardLinkPos);
        BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newForwardLink - forwardLinkPos);
        break;
    }
    setPosition(savedPos);
}
/**
 * Flags the PtNode at nodePos as moved and stores the offset to its new location right
 * after the flags. The buffer position is restored before returning.
 *
 * @param nodePos the position of the head of the PtNode to mark.
 * @param newNodePos the absolute position the PtNode was moved to.
 * @param options the format options.
 */
private void markPtNodeAsMoved(final int nodePos, final int newNodePos,
        final FormatOptions options) {
    final int savedPos = getPosition();
    updateParentPosIfNotMoved(nodePos, newNodePos, options);
    setPosition(nodePos);
    final int oldFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    final int movedFlags = FormatSpec.FLAG_IS_MOVED
            | (oldFlags & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG));
    // Rewind to the head of the PtNode to overwrite the flags byte.
    setPosition(nodePos);
    mDictBuffer.put((byte) movedFlags);
    BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newNodePos - nodePos);
    setPosition(savedPos);
}
/**
 * Serializes a Ver4PtNodeInfo to mDictStream: flags, parent offset, characters, terminal id
 * (terminals only) and children offset, in this order.
 *
 * @param nodePos the position of the head of the PtNode.
 * @param info the PtNode info to be written.
 * @return the size written, in bytes.
 * @throws RuntimeException if FLAG_HAS_MULTIPLE_CHARS disagrees with the length of the
 *         character range. The flags and the parent offset have been written by then.
 */
private int writePtNode(final int nodePos, final Ver4PtNodeInfo info) throws IOException {
    final boolean hasMultipleChars = (info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0;
    final boolean isTerminal = (info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0;
    final int charCount = info.mEndIndexOfCharacters - info.mStartIndexOfCharacters;

    // Flags.
    mDictStream.write((byte) (info.mFlags & 0xFF));
    int size = FormatSpec.PTNODE_FLAGS_SIZE;

    // Parent position, relative to the head of this PtNode.
    final int parentOffset;
    if (info.mParentPos == FormatSpec.NO_PARENT_ADDRESS) {
        parentOffset = FormatSpec.NO_PARENT_ADDRESS;
    } else {
        parentOffset = info.mParentPos - nodePos;
    }
    BinaryDictIOUtils.writeSInt24ToStream(mDictStream, parentOffset);
    size += FormatSpec.PARENT_ADDRESS_SIZE;

    // Characters.
    if (hasMultipleChars != (charCount > 1)) {
        throw new RuntimeException("Inconsistent flags : hasMultipleChars = "
                + hasMultipleChars + ", length = "
                + charCount);
    }
    size += CharEncoding.writeCodePoints(mDictStream, info.mCharacters,
            info.mStartIndexOfCharacters, info.mEndIndexOfCharacters);

    // Terminal id.
    if (isTerminal) {
        BinaryDictEncoderUtils.writeUIntToStream(mDictStream, info.mTerminalId,
                FormatSpec.PTNODE_TERMINAL_ID_SIZE);
        size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    }

    // Children position, relative to the children address field itself.
    final int childrenOffset;
    if (info.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS) {
        childrenOffset = 0;
    } else {
        childrenOffset = info.mChildrenPos - (nodePos + size);
    }
    BinaryDictIOUtils.writeSInt24ToStream(mDictStream, childrenOffset);
    size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
    return size;
}
/**
 * Helper method that writes the result of a split: a PtNodeArray holding the new parent,
 * immediately followed by a PtNodeArray holding the new children. The original PtNode is
 * then marked as moved and the forward link chain of its array is pointed at the new
 * parent array.
 *
 * @param ptNodeArrayPos the position of PtNodeArray which contains the split and moved PtNode.
 * @param splittedPtNodeToMovePos the position of the split and moved PtNode.
 * @param newParent the parent PtNode after splitting.
 * @param newChildren the children PtNodes after splitting.
 * @param newParentStartPos where to write the new parent.
 * @param formatOptions the format options.
 */
private void writeSplittedPtNodes(final int ptNodeArrayPos, final int splittedPtNodeToMovePos,
        final Ver4PtNodeInfo newParent, final Ver4PtNodeInfo[] newChildren,
        final int newParentStartPos,
        final FormatOptions formatOptions) throws IOException {
    final int newParentNodePos = newParentStartPos + 1 /* size of PtNodeCount */;
    updateTerminalPosition(newParent.mTerminalId, newParentNodePos);
    final int parentArraySize = writePtNodeArray(newParentStartPos,
            new Ver4PtNodeInfo[] { newParent }, FormatSpec.NO_FORWARD_LINK_ADDRESS);

    final int childrenStartPos = newParentStartPos + parentArraySize;
    writePtNodeArray(childrenStartPos, newChildren, FormatSpec.NO_FORWARD_LINK_ADDRESS);

    // Register the position of each child in the terminal address table.
    int childNodePos = childrenStartPos + 1 /* size of PtNodeCount */;
    for (int i = 0; i < newChildren.length; ++i) {
        final Ver4PtNodeInfo child = newChildren[i];
        updateTerminalPosition(child.mTerminalId, childNodePos);
        final boolean childIsTerminal = (child.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0;
        childNodePos += computePtNodeSize(child.mCharacters, child.mStartIndexOfCharacters,
                child.mEndIndexOfCharacters, childIsTerminal);
    }

    // Mark as moved.
    markPtNodeAsMoved(splittedPtNodeToMovePos, newParentNodePos, formatOptions);
    updateForwardLink(ptNodeArrayPos, newParentStartPos, formatOptions);
}
/**
 * Writes a node array to mDictStream: the PtNode count, each PtNode, then the forward link.
 *
 * @param nodeArrayPos the position of the head of the node array.
 * @param infos an array of Ver4PtNodeInfo to be written.
 * @param forwardLink the value to write as the forward link of the array.
 * @return the written length in bytes.
 */
private int writePtNodeArray(final int nodeArrayPos, final Ver4PtNodeInfo[] infos,
        final int forwardLink) throws IOException {
    int size = BinaryDictIOUtils.writePtNodeCount(mDictStream, infos.length);
    for (final Ver4PtNodeInfo info : infos) {
        final int nodePos = nodeArrayPos + size;
        size += writePtNode(nodePos, info);
    }
    BinaryDictIOUtils.writeSInt24ToStream(mDictStream, forwardLink);
    return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
/**
 * Computes the serialized size of a PtNode without writing it.
 *
 * @param codePoints the array holding the characters of the PtNode.
 * @param startIndex index of the first character (inclusive).
 * @param endIndex index after the last character (exclusive).
 * @param isTerminal whether the PtNode carries a terminal id.
 * @return the sum of the sizes of the flags, parent address, characters, terminator (only
 *         for more than one character), terminal id (only for terminals) and children
 *         address.
 */
private int computePtNodeSize(final int[] codePoints, final int startIndex, final int endIndex,
        final boolean isTerminal) {
    int size = FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE;
    size += CharEncoding.getCharArraySize(codePoints, startIndex, endIndex);
    if (endIndex - startIndex > 1) {
        size += FormatSpec.PTNODE_TERMINATOR_SIZE;
    }
    if (isTerminal) {
        size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
    }
    return size + FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
}
/**
 * Appends a PtNodeArray holding a single childless PtNode to mDictStream. The array is
 * laid out as if it started at the current limit of mDictBuffer.
 *
 * @param codePoints the characters of the new PtNode.
 * @param hasShortcuts whether the PtNode should be flagged as having shortcuts.
 * @param terminalId the terminal id, or PtNode.NOT_A_TERMINAL for a non-terminal PtNode.
 * @param hasBigrams whether the PtNode should be flagged as having bigrams.
 * @param isNotAWord whether the PtNode should be flagged as not a word.
 * @param isBlackListEntry whether the PtNode should be flagged as a black list entry.
 * @param parentPos the absolute position of the parent PtNode.
 * @param formatOptions the format options.
 */
private void writeNewSinglePtNodeWithAttributes(final int[] codePoints,
        final boolean hasShortcuts, final int terminalId, final boolean hasBigrams,
        final boolean isNotAWord, final boolean isBlackListEntry, final int parentPos,
        final FormatOptions formatOptions) throws IOException {
    final int arrayPos = mDictBuffer.limit();
    final boolean hasMultipleChars = codePoints.length > 1;
    final boolean isTerminal = terminalId != PtNode.NOT_A_TERMINAL;
    final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars,
            isTerminal, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts,
            hasBigrams, isNotAWord, isBlackListEntry, formatOptions);
    final Ver4PtNodeInfo[] singleNode = {
        new Ver4PtNodeInfo(flags, codePoints, terminalId,
                FormatSpec.NO_CHILDREN_ADDRESS, parentPos, 0 /* nodeSize */)
    };
    writePtNodeArray(arrayPos, singleNode, FormatSpec.NO_FORWARD_LINK_ADDRESS);
}
/**
 * Returns a copy of the flags with FLAG_HAS_MULTIPLE_CHARS set or cleared.
 *
 * @param currentFlags the flags to start from.
 * @param hasMultipleChars true to set the flag, false to clear it.
 * @return the resulting flags.
 */
private int setMultipleCharsInFlags(final int currentFlags, final boolean hasMultipleChars) {
    return hasMultipleChars
            ? (currentFlags | FormatSpec.FLAG_HAS_MULTIPLE_CHARS)
            : (currentFlags & (~FormatSpec.FLAG_HAS_MULTIPLE_CHARS));
}
/**
 * Returns a copy of the flags with FLAG_IS_NOT_A_WORD set or cleared.
 *
 * @param currentFlags the flags to start from.
 * @param isNotAWord true to set the flag, false to clear it.
 * @return the resulting flags.
 */
private int setIsNotAWordInFlags(final int currentFlags, final boolean isNotAWord) {
    return isNotAWord
            ? (currentFlags | FormatSpec.FLAG_IS_NOT_A_WORD)
            : (currentFlags & (~FormatSpec.FLAG_IS_NOT_A_WORD));
}
/**
 * Returns a copy of the flags with FLAG_IS_BLACKLISTED set or cleared.
 *
 * @param currentFlags the flags to start from.
 * @param isBlackListEntry true to set the flag, false to clear it.
 * @return the resulting flags.
 */
private int setIsBlackListEntryInFlags(final int currentFlags, final boolean isBlackListEntry) {
    return isBlackListEntry
            ? (currentFlags | FormatSpec.FLAG_IS_BLACKLISTED)
            : (currentFlags & (~FormatSpec.FLAG_IS_BLACKLISTED));
}
/**
 * Splits a PtNode.
 *
 * abcd - ef
 *
 * -> inserting "abc"
 *
 * abc - d - ef
 *
 * The new parent ("abc") is a terminal carrying newTerminalId and the attributes of the
 * inserted word. The new child ("d") keeps the terminal id, the flags (apart from the
 * multiple-chars bit) and the children of the PtNode being split.
 *
 * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
 * @param nodeToSplitPos the position of the PtNode to split.
 * @param nodeToSplitInfo the information of the PtNode to split.
 * @param indexToSplit the index where to split in the code points array.
 * @param parentOfNodeToSplitPos the absolute position of a parent of the node to split.
 * @param newTerminalId the terminal id of the inserted node (corresponds to "abc").
 * @param hasShortcuts whether the inserted word should have shortcuts.
 * @param hasBigrams whether the inserted word should have bigrams.
 * @param isNotAWord whether the inserted word should be not a word.
 * @param isBlackListEntry whether the inserted word should be a black list entry.
 * @param formatOptions the format options.
 */
private void splitOnly(final int nodeArrayToSplitPos, final int nodeToSplitPos,
final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
final int parentOfNodeToSplitPos, final int newTerminalId, final boolean hasShortcuts,
final boolean hasBigrams, final boolean isNotAWord, final boolean isBlackListEntry,
final FormatOptions formatOptions) throws IOException {
// The new PtNodeArrays are laid out starting at the current limit of the buffer.
final int parentNodeArrayStartPos = mDictBuffer.limit();
final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(indexToSplit > 1,
true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, formatOptions);
// The children position of the parent is the first byte after the parent's single-node
// array: parent PtNode followed by the forward link.
final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
nodeToSplitInfo.mCharacters, newTerminalId, parentNodeStartPos
+ computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, true)
+ FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
parentOfNodeToSplitPos, 0 /* nodeSize */);
// The parent shares the character array and only uses [0, indexToSplit).
parentInfo.mStartIndexOfCharacters = 0;
parentInfo.mEndIndexOfCharacters = indexToSplit;
// Prepare the child, which uses the remaining characters [indexToSplit, length).
final int childrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
final Ver4PtNodeInfo childrenInfo = new Ver4PtNodeInfo(childrenFlags,
nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
childrenInfo.mStartIndexOfCharacters = indexToSplit;
childrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
// Existing grandchildren must point to the new child, which sits right after the
// PtNodeCount byte of the children array.
if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
updateParentPositions(nodeToSplitInfo.mChildrenPos,
parentInfo.mChildrenPos + 1 /* size of PtNodeCount */, formatOptions);
}
writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
new Ver4PtNodeInfo[] { childrenInfo }, parentNodeArrayStartPos, formatOptions);
}
/**
 * Split and branch a PtNode.
 *
 * ab - cd
 *
 * -> inserting "ac"
 *
 * a - b - cd
 * |
 * - c
 *
 * The new parent ("a") is a non-terminal PtNode without attributes. Its children array
 * holds, in this order, the suffix of the inserted word ("c") and the remainder of the
 * PtNode being split ("b").
 *
 * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
 * @param nodeToSplitPos the position of the PtNode to split.
 * @param nodeToSplitInfo the information of the PtNode to split.
 * @param indexToSplit the index where to split in the code points array.
 * @param parentOfNodeToSplitPos the absolute position of parent of the node to split.
 * @param newWordSuffixCodePoints the suffix of the newly inserted word (corresponds to "c").
 * @param startIndexOfNewWordSuffixCodePoints the start index in newWordSuffixCodePoints where
 * the suffix starts.
 * @param newTerminalId the terminal id of the inserted node (correspond to "c").
 * @param hasShortcuts whether the inserted word should have shortcuts.
 * @param hasBigrams whether the inserted word should have bigrams.
 * @param isNotAWord whether the inserted word should be not a word.
 * @param isBlackListEntry whether the inserted word should be a black list entry.
 * @param formatOptions the format options.
 */
private void splitAndBranch(final int nodeArrayToSplitPos, final int nodeToSplitPos,
final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
final int parentOfNodeToSplitPos, final int[] newWordSuffixCodePoints,
final int startIndexOfNewWordSuffixCodePoints,
final int newTerminalId,
final boolean hasShortcuts, final boolean hasBigrams, final boolean isNotAWord,
final boolean isBlackListEntry, final FormatOptions formatOptions) throws IOException {
// The new PtNodeArrays are laid out starting at the current limit of the buffer.
final int parentNodeArrayStartPos = mDictBuffer.limit();
final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(
indexToSplit > 1,
false /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED,
false /* hasShortcut */, false /* hasBigrams */,
false /* isNotAWord */, false /* isBlackListEntry */, formatOptions);
// The children position of the parent is the first byte after the parent's single-node
// array: parent PtNode followed by the forward link.
final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
nodeToSplitInfo.mCharacters, PtNode.NOT_A_TERMINAL,
parentNodeStartPos
+ computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
+ FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
parentOfNodeToSplitPos, 0 /* nodeSize */);
// The parent shares the character array and only uses [0, indexToSplit).
parentInfo.mStartIndexOfCharacters = 0;
parentInfo.mEndIndexOfCharacters = indexToSplit;
// Same expression as the children position given to parentInfo above.
final int childrenNodeArrayStartPos = parentNodeStartPos
+ computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
+ FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
// First child: the suffix of the inserted word, a terminal with the new attributes.
final int firstChildrenFlags = BinaryDictEncoderUtils.makePtNodeFlags(
newWordSuffixCodePoints.length - startIndexOfNewWordSuffixCodePoints > 1,
true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, formatOptions);
final Ver4PtNodeInfo firstChildrenInfo = new Ver4PtNodeInfo(firstChildrenFlags,
newWordSuffixCodePoints, newTerminalId,
FormatSpec.NO_CHILDREN_ADDRESS, parentNodeStartPos,
0 /* nodeSize */);
firstChildrenInfo.mStartIndexOfCharacters = startIndexOfNewWordSuffixCodePoints;
firstChildrenInfo.mEndIndexOfCharacters = newWordSuffixCodePoints.length;
// Second child: the remainder of the split PtNode, placed right after the first child.
final int secondChildrenStartPos = childrenNodeArrayStartPos + 1 /* size of ptNodeCount */
+ computePtNodeSize(newWordSuffixCodePoints, startIndexOfNewWordSuffixCodePoints,
newWordSuffixCodePoints.length, true /* isTerminal */);
final int secondChildrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
final Ver4PtNodeInfo secondChildrenInfo = new Ver4PtNodeInfo(secondChildrenFlags,
nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
secondChildrenInfo.mStartIndexOfCharacters = indexToSplit;
secondChildrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
// Existing grandchildren must point to the second child, which takes over the children
// of the split PtNode.
if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
updateParentPositions(nodeToSplitInfo.mChildrenPos, secondChildrenStartPos,
formatOptions);
}
writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
new Ver4PtNodeInfo[] { firstChildrenInfo, secondChildrenInfo },
parentNodeArrayStartPos, formatOptions);
}
/**
 * Inserts a word into the trie file and returns the position of inserted terminal node.
 * If the insertion is failed, returns FormatSpec.NOT_VALID_WORD.
 *
 * The trie is walked from the position that follows the header. Depending on how the word
 * compares with the PtNodes met on the way, the word is inserted by splitting and
 * branching a PtNode, by splitting a PtNode, by adding a children array to a childless
 * PtNode, or by appending a new PtNodeArray to the forward link chain.
 *
 * @param word the word to insert.
 * @param newTerminalId the terminal id to give to the inserted PtNode.
 * @param isNotAWord whether the inserted word should be not a word.
 * @param isBlackListEntry whether the inserted word should be a black list entry.
 * @param hasBigrams whether the inserted word should have bigrams.
 * @param hasShortcuts whether the inserted word should have shortcuts.
 * @return the position of the inserted terminal, or FormatSpec.NOT_VALID_WORD.
 */
@UsedForTesting
private int insertWordToTrie(final String word, final int newTerminalId,
final boolean isNotAWord, final boolean isBlackListEntry, final boolean hasBigrams,
final boolean hasShortcuts) throws IOException, UnsupportedFormatException {
setPosition(0);
final FileHeader header = readHeader();
final int[] codePoints = FusionDictionary.getCodePoints(word);
final int wordLen = codePoints.length;
// Index in codePoints of the first character that has not been matched yet.
int wordPos = 0;
// depth only advances when descending into the children of a matched PtNode.
for (int depth = 0; depth < FormatSpec.MAX_WORD_LENGTH; /* nop */) {
final int nodeArrayPos = getPosition();
final int ptNodeCount = readPtNodeCount();
boolean goToChildren = false;
int parentPos = FormatSpec.NO_PARENT_ADDRESS;
for (int i = 0; i < ptNodeCount; ++i) {
final int nodePos = getPosition();
final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(nodePos, header.mFormatOptions);
if (BinaryDictIOUtils.isMovedPtNode(nodeInfo.mFlags, header.mFormatOptions)) {
continue;
}
// Remember the absolute parent position, for use when adding a sibling or splitting.
if (nodeInfo.mParentPos != FormatSpec.NO_PARENT_ADDRESS) {
parentPos = nodePos + nodeInfo.mParentPos;
}
final boolean firstCharacterMatched =
codePoints[wordPos] == nodeInfo.mCharacters[0];
boolean allCharactersMatched = true;
int firstDifferentCharacterIndex = -1;
// Compare the PtNode characters with the word; stops early when the word runs out.
for (int p = 0; p < nodeInfo.mCharacters.length; ++p) {
if (wordPos + p >= codePoints.length) break;
if (codePoints[wordPos + p] != nodeInfo.mCharacters[p]) {
if (firstDifferentCharacterIndex == -1) {
firstDifferentCharacterIndex = p;
}
allCharactersMatched = false;
}
}
if (!firstCharacterMatched) {
// Go to the next sibling node.
continue;
}
if (!allCharactersMatched) {
// The word and the PtNode share a prefix and then diverge.
final int parentNodeArrayStartPos = mDictBuffer.limit();
splitAndBranch(nodeArrayPos, nodePos, nodeInfo, firstDifferentCharacterIndex,
parentPos, codePoints, wordPos + firstDifferentCharacterIndex,
newTerminalId, hasShortcuts, hasBigrams, isNotAWord,
isBlackListEntry, header.mFormatOptions);
// NOTE(review): this sum equals the start of the children PtNodeArray as laid out by
// splitAndBranch (array start + PtNodeCount + parent PtNode + forward link), whereas
// writeSplittedPtNodes places the first child PtNode one byte later, after the
// children PtNodeCount. Confirm which position is intended here.
return parentNodeArrayStartPos + computePtNodeSize(codePoints, wordPos,
wordPos + firstDifferentCharacterIndex, false)
+ FormatSpec.FORWARD_LINK_ADDRESS_SIZE + 1 /* size of PtNodeCount */;
}
if (wordLen - wordPos < nodeInfo.mCharacters.length) {
// The rest of the word is a strict prefix of the PtNode.
final int parentNodeArrayStartPos = mDictBuffer.limit();
splitOnly(nodeArrayPos, nodePos, nodeInfo, wordLen - wordPos, parentPos,
newTerminalId, hasShortcuts, hasBigrams, isNotAWord, isBlackListEntry,
header.mFormatOptions);
// Return the position of the inserted word.
return parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
}
// The whole PtNode matched; consume its characters.
wordPos += nodeInfo.mCharacters.length;
if (wordPos == wordLen) {
// This dictionary already contains the word.
Log.e(TAG, "Something went wrong. If the word is already contained, "
+ " there is no need to insert new PtNode.");
return FormatSpec.NOT_VALID_WORD;
}
if (nodeInfo.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS) {
// There are no children.
// We need to add a new node as a child of this node.
final int newNodeArrayPos = mDictBuffer.limit();
final int[] newNodeCodePoints = Arrays.copyOfRange(codePoints, wordPos,
codePoints.length);
writeNewSinglePtNodeWithAttributes(newNodeCodePoints, hasShortcuts,
newTerminalId, hasBigrams, isNotAWord, isBlackListEntry, nodePos,
header.mFormatOptions);
updateChildrenPos(nodePos, newNodeArrayPos, header.mFormatOptions);
return newNodeArrayPos + 1 /* size of PtNodeCount */;
} else {
// Found the matched node.
// Go to the children of this node.
setPosition(nodeInfo.mChildrenPos);
goToChildren = true;
depth++;
break;
}
}
if (goToChildren) continue;
// No PtNode of this array starts with the next character: try the next array of the chain.
if (!readAndFollowForwardLink()) {
// Add a new node that contains [wordPos, word.length()-1].
// and update the forward link.
final int newNodeArrayPos = mDictBuffer.limit();
final int[] newCodePoints = Arrays.copyOfRange(codePoints, wordPos,
codePoints.length);
writeNewSinglePtNodeWithAttributes(newCodePoints, hasShortcuts, newTerminalId,
hasBigrams, isNotAWord, isBlackListEntry, parentPos, header.mFormatOptions);
updateForwardLink(nodeArrayPos, newNodeArrayPos, header.mFormatOptions);
return newNodeArrayPos + 1 /* size of PtNodeCount */;
}
}
// Reached FormatSpec.MAX_WORD_LENGTH levels without finding a place for the word.
return FormatSpec.NOT_VALID_WORD;
}
/**
 * Overwrites the frequency entry of an existing terminal in the frequency buffer.
 *
 * @param terminalId the terminal id whose entry should be overwritten.
 * @param frequency the value to store.
 */
private void updateFrequency(final int terminalId, final int frequency) {
    final int entryOffset = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE;
    mFrequencyBuffer.position(entryOffset);
    BinaryDictEncoderUtils.writeUIntToDictBuffer(mFrequencyBuffer, frequency,
            FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
}
/**
 * Appends a frequency entry to the frequency file.
 *
 * @param frequency the value to append.
 * @throws IOException if the file cannot be opened, written to or closed.
 */
private void insertFrequency(final int frequency) throws IOException {
    final OutputStream frequencyStream = new FileOutputStream(mFrequencyFile,
            true /* append */);
    try {
        BinaryDictEncoderUtils.writeUIntToStream(frequencyStream, frequency,
                FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
    } finally {
        // Always release the file descriptor, even when the write fails.
        frequencyStream.close();
    }
}
/**
 * Appends an entry to the terminal address table file.
 *
 * @param posOfTerminal the position to append.
 * @throws IOException if the file cannot be opened, written to or closed.
 */
private void insertTerminalPosition(final int posOfTerminal) throws IOException {
    final OutputStream terminalPosStream = new FileOutputStream(
            getFile(FILETYPE_TERMINAL_ADDRESS_TABLE), true /* append */);
    try {
        BinaryDictEncoderUtils.writeUIntToStream(terminalPosStream, posOfTerminal,
                FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
    } finally {
        // Always release the file descriptor, even when the write fails.
        terminalPosStream.close();
    }
}
/**
 * Placeholder for storing the bigrams of a terminal. Currently does nothing.
 *
 * @param terminalId the terminal id the bigrams belong to.
 * @param bigrams the bigrams to store.
 */
private void insertBigrams(final int terminalId, final ArrayList<PendingAttribute> bigrams) {
// TODO: Implement.
}
/**
 * Placeholder for storing the shortcuts of a terminal. Currently does nothing.
 *
 * @param terminalId the terminal id the shortcuts belong to.
 * @param shortcuts the shortcuts to store.
 */
private void insertShortcuts(final int terminalId, final ArrayList<WeightedString> shortcuts) {
// TODO: Implement.
}
/**
 * Opens the dictionary buffer, then opens mDictStream on the trie file in append mode.
 *
 * @throws IOException if the buffer or the stream cannot be opened.
 */
private void openBuffersAndStream() throws IOException {
    openDictBuffer();
    final File trieFile = getFile(FILETYPE_TRIE);
    mDictStream = new FileOutputStream(trieFile, true /* append */);
}
/**
 * Closes mDictStream and drops the references to the dictionary, frequency and terminal
 * address table buffers.
 *
 * The buffer references are dropped even when closing the stream fails, so that a failed
 * close cannot leave stale buffers behind.
 *
 * @throws IOException if closing the stream fails.
 */
private void close() throws IOException {
    try {
        if (mDictStream != null) {
            mDictStream.close();
        }
    } finally {
        mDictBuffer = null;
        mFrequencyBuffer = null;
        mTerminalAddressTableBuffer = null;
    }
}
/**
 * Updates the attributes of a word that is already in the dictionary: rewrites the
 * not-a-word and black list bits of its flags byte, overwrites its frequency, then hands
 * its bigrams and shortcuts over to insertBigrams() and insertShortcuts().
 *
 * @param posOfWord the position of the terminal PtNode of the word.
 * @param frequency the new frequency.
 * @param bigramStrings the bigrams of the word.
 * @param shortcuts the shortcuts of the word.
 * @param isNotAWord whether the word should be flagged as not a word.
 * @param isBlackListEntry whether the word should be flagged as a black list entry.
 */
private void updateAttributes(final int posOfWord, final int frequency,
        final ArrayList<WeightedString> bigramStrings,
        final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
        final boolean isBlackListEntry) throws IOException, UnsupportedFormatException {
    mDictBuffer.position(0);
    final FileHeader header = readHeader();
    mDictBuffer.position(posOfWord);
    final Ver4PtNodeInfo wordInfo = readVer4PtNodeInfo(posOfWord, header.mFormatOptions);
    final int terminalId = wordInfo.mTerminalId;

    // Update the flags.
    final int flagsWithBlackList =
            setIsBlackListEntryInFlags(wordInfo.mFlags, isBlackListEntry);
    final int updatedFlags = setIsNotAWordInFlags(flagsWithBlackList, isNotAWord);
    mDictBuffer.position(posOfWord);
    mDictBuffer.put((byte) updatedFlags);

    updateFrequency(terminalId, frequency);

    final ArrayList<PendingAttribute> resolvedBigrams =
            DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings);
    insertBigrams(terminalId, resolvedBigrams);
    insertShortcuts(terminalId, shortcuts);
}
/**
 * Inserts a word into the dictionary, or updates its attributes if it already exists.
 *
 * If {@code getTerminalPosition(word)} finds the word, its attributes are updated in
 * place. Otherwise a new PtNode is inserted into the trie and the frequency and terminal
 * position are appended to their files. The buffers and the trie stream opened for the
 * operation are closed on every path, including when an exception is thrown.
 *
 * @param word the word to insert.
 * @param frequency the frequency of the word.
 * @param bigramStrings the bigrams of the word; may be null.
 * @param shortcuts the shortcuts of the word; may be null.
 * @param isNotAWord whether the entry is flagged as "not a word".
 * @param isBlackListEntry whether the entry is flagged as a blacklist entry.
 * @throws IOException if reading or writing the dictionary files fails.
 * @throws UnsupportedFormatException if the dictionary format is not supported.
 */
@Override @UsedForTesting
public void insertWord(final String word, final int frequency,
        final ArrayList<WeightedString> bigramStrings, final ArrayList<WeightedString> shortcuts,
        final boolean isNotAWord, final boolean isBlackListEntry)
        throws IOException, UnsupportedFormatException {
    // TODO: Bigrams and shortcuts are not stored yet; insertBigrams and insertShortcuts
    // still have to be implemented.
    final int newTerminalId = getNewTerminalId();
    openBuffersAndStream();
    try {
        final int posOfWord = getTerminalPosition(word);
        if (posOfWord != FormatSpec.NOT_VALID_WORD) {
            // The word is already contained in the dictionary.
            updateAttributes(posOfWord, frequency, bigramStrings, shortcuts, isNotAWord,
                    isBlackListEntry);
            return;
        }
        // Insert new PtNode into trie.
        final int posOfTerminal = insertWordToTrie(word, newTerminalId, isNotAWord,
                isBlackListEntry, bigramStrings != null && !bigramStrings.isEmpty(),
                shortcuts != null && !shortcuts.isEmpty());
        insertFrequency(frequency);
        insertTerminalPosition(posOfTerminal);
    } finally {
        // Release the trie stream and the buffers even if a step above failed.
        close();
    }
    // As before, bigrams and shortcuts of a new word are handled after close().
    insertBigrams(newTerminalId,
            DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings));
    insertShortcuts(newTerminalId, shortcuts);
}
}

View File

@ -248,6 +248,7 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, "abcd"));
insertAndCheckWord(file, "abcde", 10, false, null, null, formatOptions);
checkReverseLookup(file, "abcde", getWordPosition(file, "abcde"));
insertAndCheckWord(file, "abcdefghijklmn", 10, false, null, null, formatOptions);
checkReverseLookup(file, "abcdefghijklmn", getWordPosition(file, "abcdefghijklmn"));
@ -257,12 +258,26 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
// update the existing word.
insertAndCheckWord(file, "abcdabcd", 15, true, null, null, formatOptions);
checkReverseLookup(file, "abcdabcd", getWordPosition(file, "abcdabcd"));
// split 1
// Testing splitOnly
insertAndCheckWord(file, "ab", 20, false, null, null, formatOptions);
checkReverseLookup(file, "ab", getWordPosition(file, "ab"));
checkReverseLookup(file, "abcdabcd", getWordPosition(file, "abcdabcd"));
checkReverseLookup(file, "abcde", getWordPosition(file, "abcde"));
checkReverseLookup(file, "abcdefghijklmn", getWordPosition(file, "abcdefghijklmn"));
// split 2
// Testing splitAndBranch
insertAndCheckWord(file, "ami", 30, false, null, null, formatOptions);
checkReverseLookup(file, "ami", getWordPosition(file, "ami"));
checkReverseLookup(file, "ab", getWordPosition(file, "ab"));
checkReverseLookup(file, "abcdabcd", getWordPosition(file, "abcdabcd"));
checkReverseLookup(file, "abcde", getWordPosition(file, "abcde"));
checkReverseLookup(file, "abcdefghijklmn", getWordPosition(file, "abcdefghijklmn"));
checkReverseLookup(file, "ami", getWordPosition(file, "ami"));
insertAndCheckWord(file, "abcdefzzzz", 40, false, null, null, formatOptions);
checkReverseLookup(file, "abcdefzzzz", getWordPosition(file, "abcdefzzzz"));
deleteWord(file, "ami", formatOptions);
assertEquals(FormatSpec.NOT_VALID_WORD, getWordPosition(file, "ami"));
@ -275,6 +290,7 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
/**
 * Runs the word insertion scenario of runTestInsertWord once with the version 3 and once
 * with the version 4 dynamic-update format options.
 */
public void testInsertWord() {
runTestInsertWord(BinaryDictUtils.VERSION3_WITH_DYNAMIC_UPDATE);
runTestInsertWord(BinaryDictUtils.VERSION4_WITH_DYNAMIC_UPDATE);
}
private void runTestInsertWordWithBigrams(final FormatOptions formatOptions) {
@ -314,6 +330,8 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
/**
 * Runs the insertion-with-bigrams scenario with the version 3 dynamic-update format
 * options only; the version 4 call is commented out until a test is added for it.
 */
public void testInsertWordWithBigrams() {
runTestInsertWordWithBigrams(BinaryDictUtils.VERSION3_WITH_DYNAMIC_UPDATE);
// TODO: Add a test for version 4.
// runTestInsertWordWithBigrams(BinaryDictUtils.VERSION4_WITH_DYNAMIC_UPDATE);
}
private void runTestRandomWords(final FormatOptions formatOptions) {
@ -353,7 +371,7 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word));
}
Log.d(TAG, "Test version " + formatOptions);
Log.d(TAG, "Test version " + formatOptions.mVersion);
Log.d(TAG, "max = " + ((double)maxTimeToInsert/1000000) + " ms.");
Log.d(TAG, "min = " + ((double)minTimeToInsert/1000000) + " ms.");
Log.d(TAG, "avg = " + ((double)sum/mMaxUnigrams/1000000) + " ms.");
@ -361,5 +379,6 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
/**
 * Runs the random-words scenario of runTestRandomWords once with the version 3 and once
 * with the version 4 dynamic-update format options.
 */
public void testRandomWords() {
runTestRandomWords(BinaryDictUtils.VERSION3_WITH_DYNAMIC_UPDATE);
runTestRandomWords(BinaryDictUtils.VERSION4_WITH_DYNAMIC_UPDATE);
}
}