Merge "Rename CharGroup to PtNode."

main
Ken Wakasa 2013-08-26 07:11:11 +00:00 committed by Android (Google) Code Review
commit 800225e0b1
15 changed files with 710 additions and 697 deletions

View File

@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
@ -41,6 +41,7 @@ import java.util.TreeMap;
*
* TODO: Remove calls from classes except Ver3DictDecoder
* TODO: Move this file to makedict/internal.
* TODO: Rename this class to DictDecoderUtils.
*/
public final class BinaryDictDecoderUtils {
@ -213,7 +214,7 @@ public final class BinaryDictDecoderUtils {
buffer[index++] = (byte)(0xFF & codePoint);
}
}
buffer[index++] = FormatSpec.GROUP_CHARACTERS_TERMINATOR;
buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
return index - origin;
}
@ -237,7 +238,7 @@ public final class BinaryDictDecoderUtils {
buffer.write((byte) (0xFF & codePoint));
}
}
buffer.write(FormatSpec.GROUP_CHARACTERS_TERMINATOR);
buffer.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
}
/**
@ -264,7 +265,7 @@ public final class BinaryDictDecoderUtils {
static int readChar(final DictBuffer dictBuffer) {
int character = dictBuffer.readUnsignedByte();
if (!fitsOnOneByte(character)) {
if (FormatSpec.GROUP_CHARACTERS_TERMINATOR == character) {
if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
return FormatSpec.INVALID_CHARACTER;
}
character <<= 16;
@ -295,14 +296,14 @@ public final class BinaryDictDecoderUtils {
}
}
int address;
switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
return dictBuffer.readUnsignedByte();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
return dictBuffer.readUnsignedShort();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
return dictBuffer.readUnsignedInt24();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
default:
return FormatSpec.NO_CHILDREN_ADDRESS;
}
@ -320,14 +321,14 @@ public final class BinaryDictDecoderUtils {
}
/**
* Reads and returns the char group count out of a buffer and forwards the pointer.
* Reads and returns the PtNode count out of a buffer and forwards the pointer.
*/
public static int readCharGroupCount(final DictBuffer dictBuffer) {
public static int readPtNodeCount(final DictBuffer dictBuffer) {
final int msb = dictBuffer.readUnsignedByte();
if (FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
return msb;
} else {
return ((FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
+ dictBuffer.readUnsignedByte();
}
}
@ -369,18 +370,18 @@ public final class BinaryDictDecoderUtils {
final StringBuilder builder = new StringBuilder();
// the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
CharGroupInfo currentInfo;
PtNodeInfo currentInfo;
int loopCounter = 0;
do {
dictBuffer.position(currentPos);
currentInfo = dictDecoder.readPtNode(currentPos, options);
if (BinaryDictIOUtils.isMovedGroup(currentInfo.mFlags, options)) {
if (BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, options)) {
currentPos = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
}
if (DBG && loopCounter++ > MAX_JUMPS) {
MakedictLog.d("Too many jumps - probably a bug");
}
} while (BinaryDictIOUtils.isMovedGroup(currentInfo.mFlags, options));
} while (BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, options));
if (Integer.MIN_VALUE == frequency) frequency = currentInfo.mFrequency;
builder.insert(0,
new String(currentInfo.mCharacters, 0, currentInfo.mCharacters.length));
@ -395,14 +396,14 @@ public final class BinaryDictDecoderUtils {
final FormatOptions options) {
final DictBuffer dictBuffer = dictDecoder.getDictBuffer();
dictBuffer.position(headerSize);
final int count = readCharGroupCount(dictBuffer);
int groupPos = headerSize + BinaryDictIOUtils.getGroupCountSize(count);
final int count = readPtNodeCount(dictBuffer);
int groupPos = headerSize + BinaryDictIOUtils.getPtNodeCountSize(count);
final StringBuilder builder = new StringBuilder();
WeightedString result = null;
CharGroupInfo last = null;
PtNodeInfo last = null;
for (int i = count - 1; i >= 0; --i) {
CharGroupInfo info = dictDecoder.readPtNode(groupPos, options);
PtNodeInfo info = dictDecoder.readPtNode(groupPos, options);
groupPos = info.mEndAddress;
if (info.mOriginalAddress == pos) {
builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
@ -414,8 +415,8 @@ public final class BinaryDictDecoderUtils {
if (null == last) continue;
builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
dictBuffer.position(last.mChildrenAddress);
i = readCharGroupCount(dictBuffer);
groupPos = last.mChildrenAddress + BinaryDictIOUtils.getGroupCountSize(i);
i = readPtNodeCount(dictBuffer);
groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
last = null;
continue;
}
@ -424,8 +425,8 @@ public final class BinaryDictDecoderUtils {
if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
dictBuffer.position(last.mChildrenAddress);
i = readCharGroupCount(dictBuffer);
groupPos = last.mChildrenAddress + BinaryDictIOUtils.getGroupCountSize(i);
i = readPtNodeCount(dictBuffer);
groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
last = null;
continue;
}
@ -444,25 +445,25 @@ public final class BinaryDictDecoderUtils {
* @param dictDecoder the dict decoder, correctly positioned at the start of a node array.
* @param headerSize the size, in bytes, of the file header.
* @param reverseNodeArrayMap a mapping from addresses to already read node arrays.
* @param reverseGroupMap a mapping from addresses to already read character groups.
* @param reversePtNodeMap a mapping from addresses to already read PtNodes.
* @param options file format options.
* @return the read node array with all his children already read.
*/
private static PtNodeArray readNodeArray(final Ver3DictDecoder dictDecoder,
final int headerSize, final Map<Integer, PtNodeArray> reverseNodeArrayMap,
final Map<Integer, CharGroup> reverseGroupMap, final FormatOptions options)
final Map<Integer, PtNode> reversePtNodeMap, final FormatOptions options)
throws IOException {
final DictBuffer dictBuffer = dictDecoder.getDictBuffer();
final ArrayList<CharGroup> nodeArrayContents = new ArrayList<CharGroup>();
final ArrayList<PtNode> nodeArrayContents = new ArrayList<PtNode>();
final int nodeArrayOriginPos = dictBuffer.position();
do { // Scan the linked-list node.
final int nodeArrayHeadPos = dictBuffer.position();
final int count = readCharGroupCount(dictBuffer);
int groupOffsetPos = nodeArrayHeadPos + BinaryDictIOUtils.getGroupCountSize(count);
for (int i = count; i > 0; --i) { // Scan the array of CharGroup.
CharGroupInfo info = dictDecoder.readPtNode(groupOffsetPos, options);
if (BinaryDictIOUtils.isMovedGroup(info.mFlags, options)) continue;
final int count = readPtNodeCount(dictBuffer);
int groupOffsetPos = nodeArrayHeadPos + BinaryDictIOUtils.getPtNodeCountSize(count);
for (int i = count; i > 0; --i) { // Scan the array of PtNode.
PtNodeInfo info = dictDecoder.readPtNode(groupOffsetPos, options);
if (BinaryDictIOUtils.isMovedPtNode(info.mFlags, options)) continue;
ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
ArrayList<WeightedString> bigrams = null;
if (null != info.mBigrams) {
@ -482,17 +483,17 @@ public final class BinaryDictDecoderUtils {
final int currentPosition = dictBuffer.position();
dictBuffer.position(info.mChildrenAddress);
children = readNodeArray(dictDecoder, headerSize, reverseNodeArrayMap,
reverseGroupMap, options);
reversePtNodeMap, options);
dictBuffer.position(currentPosition);
}
nodeArrayContents.add(
new CharGroup(info.mCharacters, shortcutTargets, bigrams,
new PtNode(info.mCharacters, shortcutTargets, bigrams,
info.mFrequency,
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED), children));
} else {
nodeArrayContents.add(
new CharGroup(info.mCharacters, shortcutTargets, bigrams,
new PtNode(info.mCharacters, shortcutTargets, bigrams,
info.mFrequency,
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED)));
@ -566,9 +567,9 @@ public final class BinaryDictDecoderUtils {
final FileHeader fileHeader = dictDecoder.readHeader();
Map<Integer, PtNodeArray> reverseNodeArrayMapping = new TreeMap<Integer, PtNodeArray>();
Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
Map<Integer, PtNode> reversePtNodeMapping = new TreeMap<Integer, PtNode>();
final PtNodeArray root = readNodeArray(dictDecoder, fileHeader.mHeaderSize,
reverseNodeArrayMapping, reverseGroupMapping, fileHeader.mFormatOptions);
reverseNodeArrayMapping, reversePtNodeMapping, fileHeader.mFormatOptions);
FusionDictionary newDict = new FusionDictionary(root, fileHeader.mDictionaryOptions);
if (null != dict) {

View File

@ -18,7 +18,7 @@ package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
@ -33,6 +33,8 @@ import java.util.Iterator;
* Encodes binary files for a FusionDictionary.
*
* All the methods in this class are static.
*
* TODO: Rename this class to DictEncoderUtils.
*/
public class BinaryDictEncoderUtils {
@ -58,46 +60,46 @@ public class BinaryDictEncoderUtils {
* @param characters the character array
* @return the size of the char array, including the terminator if any
*/
static int getGroupCharactersSize(final int[] characters) {
static int getPtNodeCharactersSize(final int[] characters) {
int size = CharEncoding.getCharArraySize(characters);
if (characters.length > 1) size += FormatSpec.GROUP_TERMINATOR_SIZE;
if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE;
return size;
}
/**
* Compute the binary size of the character array in a group
* Compute the binary size of the character array in a PtNode
*
* If only one character, this is the size of this character. If many, it's the sum of their
* sizes + 1 byte for the terminator.
*
* @param group the group
* @param ptNode the PtNode
* @return the size of the char array, including the terminator if any
*/
private static int getGroupCharactersSize(final CharGroup group) {
return getGroupCharactersSize(group.mChars);
private static int getPtNodeCharactersSize(final PtNode ptNode) {
return getPtNodeCharactersSize(ptNode.mChars);
}
/**
* Compute the binary size of the group count for a node array.
* Compute the binary size of the PtNode count for a node array.
* @param nodeArray the nodeArray
* @return the size of the group count, either 1 or 2 bytes.
* @return the size of the PtNode count, either 1 or 2 bytes.
*/
private static int getGroupCountSize(final PtNodeArray nodeArray) {
return BinaryDictIOUtils.getGroupCountSize(nodeArray.mData.size());
private static int getPtNodeCountSize(final PtNodeArray nodeArray) {
return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size());
}
/**
* Compute the size of a shortcut in bytes.
*/
private static int getShortcutSize(final WeightedString shortcut) {
int size = FormatSpec.GROUP_ATTRIBUTE_FLAGS_SIZE;
int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
final String word = shortcut.mWord;
final int length = word.length();
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i);
size += CharEncoding.getCharSize(codePoint);
}
size += FormatSpec.GROUP_TERMINATOR_SIZE;
size += FormatSpec.PTNODE_TERMINATOR_SIZE;
return size;
}
@ -109,7 +111,7 @@ public class BinaryDictEncoderUtils {
*/
static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
if (null == shortcutList || shortcutList.isEmpty()) return 0;
int size = FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
for (final WeightedString shortcut : shortcutList) {
size += getShortcutSize(shortcut);
}
@ -117,60 +119,60 @@ public class BinaryDictEncoderUtils {
}
/**
* Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything.
* Compute the maximum size of a PtNode, assuming 3-byte addresses for everything.
*
* @param group the CharGroup to compute the size of.
* @param ptNode the PtNode to compute the size of.
* @param options file format options.
* @return the maximum size of the group.
* @return the maximum size of the PtNode.
*/
private static int getCharGroupMaximumSize(final CharGroup group, final FormatOptions options) {
int size = getGroupHeaderSize(group, options);
private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) {
int size = getNodeHeaderSize(ptNode, options);
// If terminal, one byte for the frequency
if (group.isTerminal()) size += FormatSpec.GROUP_FREQUENCY_SIZE;
size += FormatSpec.GROUP_MAX_ADDRESS_SIZE; // For children address
size += getShortcutListSize(group.mShortcutTargets);
if (null != group.mBigrams) {
size += (FormatSpec.GROUP_ATTRIBUTE_FLAGS_SIZE
+ FormatSpec.GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
* group.mBigrams.size();
if (ptNode.isTerminal()) size += FormatSpec.PTNODE_FREQUENCY_SIZE;
size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
size += getShortcutListSize(ptNode.mShortcutTargets);
if (null != ptNode.mBigrams) {
size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE
+ FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE)
* ptNode.mBigrams.size();
}
return size;
}
/**
* Compute the maximum size of each node of a node array, assuming 3-byte addresses for
* Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for
* everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of
* the containing node array, and cache it it its 'mCachedSize' member.
*
* @param nodeArray the node array to compute the maximum size of.
* @param ptNodeArray the node array to compute the maximum size of.
* @param options file format options.
*/
private static void calculateNodeArrayMaximumSize(final PtNodeArray nodeArray,
private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray,
final FormatOptions options) {
int size = getGroupCountSize(nodeArray);
for (CharGroup g : nodeArray.mData) {
final int groupSize = getCharGroupMaximumSize(g, options);
g.mCachedSize = groupSize;
size += groupSize;
int size = getPtNodeCountSize(ptNodeArray);
for (PtNode node : ptNodeArray.mData) {
final int nodeSize = getPtNodeMaximumSize(node, options);
node.mCachedSize = nodeSize;
size += nodeSize;
}
if (options.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
nodeArray.mCachedSize = size;
ptNodeArray.mCachedSize = size;
}
/**
* Compute the size of the header (flag + [parent address] + characters size) of a CharGroup.
* Compute the size of the header (flag + [parent address] + characters size) of a PtNode.
*
* @param group the group of which to compute the size of the header
* @param ptNode the PtNode of which to compute the size of the header
* @param options file format options.
*/
private static int getGroupHeaderSize(final CharGroup group, final FormatOptions options) {
private static int getNodeHeaderSize(final PtNode ptNode, final FormatOptions options) {
if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
return FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ getGroupCharactersSize(group);
return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ getPtNodeCharactersSize(ptNode);
} else {
return FormatSpec.GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode);
}
}
@ -203,14 +205,14 @@ public class BinaryDictEncoderUtils {
// cache performance and dictionary size.
/* package for tests */ static ArrayList<PtNodeArray> flattenTree(
final PtNodeArray rootNodeArray) {
final int treeSize = FusionDictionary.countCharGroups(rootNodeArray);
final int treeSize = FusionDictionary.countPtNodes(rootNodeArray);
MakedictLog.i("Counted nodes : " + treeSize);
final ArrayList<PtNodeArray> flatTree = new ArrayList<PtNodeArray>(treeSize);
return flattenTreeInner(flatTree, rootNodeArray);
}
private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list,
final PtNodeArray nodeArray) {
final PtNodeArray ptNodeArray) {
// Removing the node is necessary if the tails are merged, because we would then
// add the same node several times when we only want it once. A number of places in
// the code also depends on any node being only once in the list.
@ -228,11 +230,11 @@ public class BinaryDictEncoderUtils {
// this simple list.remove operation O(n*n) overall. On Android this overhead is very
// high.
// For future reference, the code to remove duplicate is a simple : list.remove(node);
list.add(nodeArray);
final ArrayList<CharGroup> branches = nodeArray.mData;
list.add(ptNodeArray);
final ArrayList<PtNode> branches = ptNodeArray.mData;
final int nodeSize = branches.size();
for (CharGroup group : branches) {
if (null != group.mChildren) flattenTreeInner(list, group.mChildren);
for (PtNode ptNode : branches) {
if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren);
}
return list;
}
@ -248,7 +250,7 @@ public class BinaryDictEncoderUtils {
* from the new position in the current node array to the new position in the target node
* array.
*
* @param currentNodeArray node array containing the CharGroup where the offset will be written
* @param currentNodeArray node array containing the PtNode where the offset will be written
* @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
* @param targetNodeArray the target node array to get the offset to
* @return the offset to the target node array
@ -269,20 +271,20 @@ public class BinaryDictEncoderUtils {
}
/**
* Get the offset from a position inside a current node array to a target CharGroup, during
* Get the offset from a position inside a current node array to a target PtNode, during
* update.
*
* @param currentNodeArray node array containing the CharGroup where the offset will be written
* @param currentNodeArray node array containing the PtNode where the offset will be written
* @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
* @param targetCharGroup the target CharGroup to get the offset to
* @return the offset to the target CharGroup
* @param targetPtNode the target PtNode to get the offset to
* @return the offset to the target PtNode
*/
// TODO: is there any way to factorize this method with the one above?
private static int getOffsetToTargetCharGroupDuringUpdate(final PtNodeArray currentNodeArray,
final int offsetFromStartOfCurrentNodeArray, final CharGroup targetCharGroup) {
private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray,
final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) {
final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate
+ offsetFromStartOfCurrentNodeArray;
final boolean isTargetBeforeCurrent = (targetCharGroup.mCachedAddressBeforeUpdate
final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate
< oldOffsetBasePoint);
// If the target is before the current node array, then its address has already been
// updated. We can use the AfterUpdate member, and compare it to our own member after
@ -292,9 +294,9 @@ public class BinaryDictEncoderUtils {
if (isTargetBeforeCurrent) {
final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate
+ offsetFromStartOfCurrentNodeArray;
return targetCharGroup.mCachedAddressAfterUpdate - newOffsetBasePoint;
return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint;
} else {
return targetCharGroup.mCachedAddressBeforeUpdate - oldOffsetBasePoint;
return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint;
}
}
@ -308,49 +310,49 @@ public class BinaryDictEncoderUtils {
* contents (as in, any of the addresses stored in the cache fields) have changed with
* respect to their previous value.
*
* @param nodeArray the node array to compute the size of.
* @param ptNodeArray the node array to compute the size of.
* @param dict the dictionary in which the word/attributes are to be found.
* @param formatOptions file format options.
* @return false if none of the cached addresses inside the node array changed, true otherwise.
*/
private static boolean computeActualNodeArraySize(final PtNodeArray nodeArray,
private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray,
final FusionDictionary dict, final FormatOptions formatOptions) {
boolean changed = false;
int size = getGroupCountSize(nodeArray);
for (CharGroup group : nodeArray.mData) {
group.mCachedAddressAfterUpdate = nodeArray.mCachedAddressAfterUpdate + size;
if (group.mCachedAddressAfterUpdate != group.mCachedAddressBeforeUpdate) {
int size = getPtNodeCountSize(ptNodeArray);
for (PtNode ptNode : ptNodeArray.mData) {
ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size;
if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) {
changed = true;
}
int groupSize = getGroupHeaderSize(group, formatOptions);
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else if (null != group.mChildren) {
int nodeSize = getNodeHeaderSize(ptNode, formatOptions);
if (ptNode.isTerminal()) nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
if (null == ptNode.mChildren && formatOptions.mSupportsDynamicUpdate) {
nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else if (null != ptNode.mChildren) {
if (formatOptions.mSupportsDynamicUpdate) {
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else {
groupSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(nodeArray,
groupSize + size, group.mChildren));
nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
nodeSize + size, ptNode.mChildren));
}
}
groupSize += getShortcutListSize(group.mShortcutTargets);
if (null != group.mBigrams) {
for (WeightedString bigram : group.mBigrams) {
final int offset = getOffsetToTargetCharGroupDuringUpdate(nodeArray,
groupSize + size + FormatSpec.GROUP_FLAGS_SIZE,
nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
if (null != ptNode.mBigrams) {
for (WeightedString bigram : ptNode.mBigrams) {
final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
nodeSize + size + FormatSpec.PTNODE_FLAGS_SIZE,
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord));
groupSize += getByteSize(offset) + FormatSpec.GROUP_FLAGS_SIZE;
nodeSize += getByteSize(offset) + FormatSpec.PTNODE_FLAGS_SIZE;
}
}
group.mCachedSize = groupSize;
size += groupSize;
ptNode.mCachedSize = nodeSize;
size += nodeSize;
}
if (formatOptions.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
if (nodeArray.mCachedSize != size) {
nodeArray.mCachedSize = size;
if (ptNodeArray.mCachedSize != size) {
ptNodeArray.mCachedSize = size;
changed = true;
}
return changed;
@ -363,19 +365,19 @@ public class BinaryDictEncoderUtils {
* @param formatOptions file format options.
* @return the byte size of the entire stack.
*/
private static int initializeNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes,
private static int initializePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes,
final FormatOptions formatOptions) {
int nodeArrayOffset = 0;
for (final PtNodeArray nodeArray : flatNodes) {
nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset;
int groupCountSize = getGroupCountSize(nodeArray);
int groupOffset = 0;
for (final CharGroup g : nodeArray.mData) {
g.mCachedAddressBeforeUpdate = g.mCachedAddressAfterUpdate =
groupCountSize + nodeArrayOffset + groupOffset;
groupOffset += g.mCachedSize;
int nodeCountSize = getPtNodeCountSize(nodeArray);
int nodeffset = 0;
for (final PtNode ptNode : nodeArray.mData) {
ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate =
nodeCountSize + nodeArrayOffset + nodeffset;
nodeffset += ptNode.mCachedSize;
}
final int nodeSize = groupCountSize + groupOffset
final int nodeSize = nodeCountSize + nodeffset
+ (formatOptions.mSupportsDynamicUpdate
? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0);
nodeArrayOffset += nodeArray.mCachedSize;
@ -388,11 +390,11 @@ public class BinaryDictEncoderUtils {
*
* @param flatNodes the list of node arrays.
*/
private static void updateNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) {
private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) {
for (final PtNodeArray nodeArray : flatNodes) {
nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate;
for (final CharGroup g : nodeArray.mData) {
g.mCachedAddressBeforeUpdate = g.mCachedAddressAfterUpdate;
for (final PtNode ptNode : nodeArray.mData) {
ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate;
}
}
}
@ -407,38 +409,38 @@ public class BinaryDictEncoderUtils {
*/
private static void computeParentAddresses(final ArrayList<PtNodeArray> flatNodes) {
for (final PtNodeArray nodeArray : flatNodes) {
for (final CharGroup group : nodeArray.mData) {
if (null != group.mChildren) {
for (final PtNode ptNode : nodeArray.mData) {
if (null != ptNode.mChildren) {
// Assign my address to children's parent address
// Here BeforeUpdate and AfterUpdate addresses have the same value, so it
// does not matter which we use.
group.mChildren.mCachedParentAddress = group.mCachedAddressAfterUpdate
- group.mChildren.mCachedAddressAfterUpdate;
ptNode.mChildren.mCachedParentAddress = ptNode.mCachedAddressAfterUpdate
- ptNode.mChildren.mCachedAddressAfterUpdate;
}
}
}
}
/**
* Compute the addresses and sizes of an ordered list of node arrays.
* Compute the addresses and sizes of an ordered list of PtNode arrays.
*
* This method takes a list of node arrays and will update their cached address and size
* This method takes a list of PtNode arrays and will update their cached address and size
* values so that they can be written into a file. It determines the smallest size each of the
* nodes arrays can be given the addresses of its children and attributes, and store that into
* each node.
* The order of the node is given by the order of the array. This method makes no effort
* PtNode arrays can be given the addresses of its children and attributes, and store that into
* each PtNode.
* The order of the PtNode is given by the order of the array. This method makes no effort
* to find a good order; it only mechanically computes the size this order results in.
*
* @param dict the dictionary
* @param flatNodes the ordered list of nodes arrays
* @param flatNodes the ordered list of PtNode arrays
* @param formatOptions file format options.
* @return the same array it was passed. The nodes have been updated for address and size.
*/
private static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict,
final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions) {
// First get the worst possible sizes and offsets
for (final PtNodeArray n : flatNodes) calculateNodeArrayMaximumSize(n, formatOptions);
final int offset = initializeNodeArraysCachedAddresses(flatNodes, formatOptions);
for (final PtNodeArray n : flatNodes) calculatePtNodeArrayMaximumSize(n, formatOptions);
final int offset = initializePtNodeArraysCachedAddresses(flatNodes, formatOptions);
MakedictLog.i("Compressing the array addresses. Original size : " + offset);
MakedictLog.i("(Recursively seen size : " + offset + ")");
@ -447,19 +449,20 @@ public class BinaryDictEncoderUtils {
boolean changesDone = false;
do {
changesDone = false;
int nodeArrayStartOffset = 0;
for (final PtNodeArray nodeArray : flatNodes) {
nodeArray.mCachedAddressAfterUpdate = nodeArrayStartOffset;
final int oldNodeArraySize = nodeArray.mCachedSize;
final boolean changed = computeActualNodeArraySize(nodeArray, dict, formatOptions);
final int newNodeArraySize = nodeArray.mCachedSize;
int ptNodeArrayStartOffset = 0;
for (final PtNodeArray ptNodeArray : flatNodes) {
ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset;
final int oldNodeArraySize = ptNodeArray.mCachedSize;
final boolean changed =
computeActualPtNodeArraySize(ptNodeArray, dict, formatOptions);
final int newNodeArraySize = ptNodeArray.mCachedSize;
if (oldNodeArraySize < newNodeArraySize) {
throw new RuntimeException("Increased size ?!");
}
nodeArrayStartOffset += newNodeArraySize;
ptNodeArrayStartOffset += newNodeArraySize;
changesDone |= changed;
}
updateNodeArraysCachedAddresses(flatNodes);
updatePtNodeArraysCachedAddresses(flatNodes);
++passes;
if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
} while (changesDone);
@ -467,10 +470,10 @@ public class BinaryDictEncoderUtils {
if (formatOptions.mSupportsDynamicUpdate) {
computeParentAddresses(flatNodes);
}
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1);
MakedictLog.i("Compression complete in " + passes + " passes.");
MakedictLog.i("After address compression : "
+ (lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize));
+ (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize));
return flatNodes;
}
@ -478,25 +481,26 @@ public class BinaryDictEncoderUtils {
/**
* Sanity-checking method.
*
* This method checks a list of node arrays for juxtaposition, that is, it will do
* This method checks a list of PtNode arrays for juxtaposition, that is, it will do
* nothing if each node array's cached address is actually the previous node array's address
* plus the previous node's size.
* If this is not the case, it will throw an exception.
*
* @param arrays the list of node arrays to check
*/
private static void checkFlatNodeArrayList(final ArrayList<PtNodeArray> arrays) {
private static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) {
int offset = 0;
int index = 0;
for (final PtNodeArray nodeArray : arrays) {
for (final PtNodeArray ptNodeArray : arrays) {
// BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter
// which we use.
if (nodeArray.mCachedAddressAfterUpdate != offset) {
if (ptNodeArray.mCachedAddressAfterUpdate != offset) {
throw new RuntimeException("Wrong address for node " + index
+ " : expected " + offset + ", got " + nodeArray.mCachedAddressAfterUpdate);
+ " : expected " + offset + ", got " +
ptNodeArray.mCachedAddressAfterUpdate);
}
++index;
offset += nodeArray.mCachedSize;
offset += ptNodeArray.mCachedSize;
}
}
@ -552,19 +556,19 @@ public class BinaryDictEncoderUtils {
}
/**
* Makes the flag value for a char group.
* Makes the flag value for a PtNode.
*
* @param hasMultipleChars whether the group has multiple chars.
* @param isTerminal whether the group is terminal.
* @param hasMultipleChars whether the PtNode has multiple chars.
* @param isTerminal whether the PtNode is terminal.
* @param childrenAddressSize the size of a children address.
* @param hasShortcuts whether the group has shortcuts.
* @param hasBigrams whether the group has bigrams.
* @param isNotAWord whether the group is not a word.
* @param isBlackListEntry whether the group is a blacklist entry.
* @param hasShortcuts whether the PtNode has shortcuts.
* @param hasBigrams whether the PtNode has bigrams.
* @param isNotAWord whether the PtNode is not a word.
* @param isBlackListEntry whether the PtNode is a blacklist entry.
* @param formatOptions file format options.
* @return the flags
*/
static int makeCharGroupFlags(final boolean hasMultipleChars, final boolean isTerminal,
static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
final boolean isNotAWord, final boolean isBlackListEntry,
final FormatOptions formatOptions) {
@ -576,16 +580,16 @@ public class BinaryDictEncoderUtils {
} else if (true) {
switch (childrenAddressSize) {
case 1:
flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE;
break;
case 2:
flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES;
break;
case 3:
flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES;
break;
case 0:
flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS;
flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS;
break;
default:
throw new RuntimeException("Node with a strange address");
@ -598,12 +602,12 @@ public class BinaryDictEncoderUtils {
return flags;
}
private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
private static byte makePtNodeFlags(final PtNode node, final int ptNodeAddress,
final int childrenOffset, final FormatOptions formatOptions) {
return (byte) makeCharGroupFlags(group.mChars.length > 1, group.mFrequency >= 0,
return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0,
getByteSize(childrenOffset),
group.mShortcutTargets != null && !group.mShortcutTargets.isEmpty(),
group.mBigrams != null, group.mIsNotAWord, group.mIsBlacklistEntry, formatOptions);
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions);
}
/**
@ -618,17 +622,17 @@ public class BinaryDictEncoderUtils {
*/
private static final int makeBigramFlags(final boolean more, final int offset,
int bigramFrequency, final int unigramFrequency, final String word) {
int bigramFlags = (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0)
+ (offset < 0 ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
+ (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0);
switch (getByteSize(offset)) {
case 1:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE;
break;
case 2:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES;
break;
case 3:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES;
break;
default:
throw new RuntimeException("Strange offset size");
@ -673,7 +677,7 @@ public class BinaryDictEncoderUtils {
// small over-estimation that we get in this case. TODO: actually remove this bigram
// if discretizedFrequency < 0.
final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
bigramFlags += finalBigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
return bigramFlags;
}
@ -698,8 +702,8 @@ public class BinaryDictEncoderUtils {
* @return the flags
*/
static final int makeShortcutFlags(final boolean more, final int frequency) {
return (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0)
+ (frequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY);
return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
+ (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY);
}
private static final int writeParentAddress(final byte[] buffer, final int index,
@ -722,68 +726,69 @@ public class BinaryDictEncoderUtils {
}
/**
* Write a node array to memory. The node array is expected to have its final position cached.
* Write a PtNodeArray to memory. The PtNodeArray is expected to have its final position cached.
*
* @param dict the dictionary the node array is a part of (for relative offsets).
* @param buffer the memory buffer to write to.
* @param nodeArray the node array to write.
* @param ptNodeArray the node array to write.
* @param formatOptions file format options.
* @return the address of the END of the node.
*/
@SuppressWarnings("unused")
private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
final PtNodeArray nodeArray, final FormatOptions formatOptions) {
// TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup
int index = nodeArray.mCachedAddressAfterUpdate;
final PtNodeArray ptNodeArray, final FormatOptions formatOptions) {
// TODO: Make the code in common with BinaryDictIOUtils#writePtNode
int index = ptNodeArray.mCachedAddressAfterUpdate;
final int groupCount = nodeArray.mData.size();
final int countSize = getGroupCountSize(nodeArray);
final int parentAddress = nodeArray.mCachedParentAddress;
final int ptNodeCount = ptNodeArray.mData.size();
final int countSize = getPtNodeCountSize(ptNodeArray);
final int parentAddress = ptNodeArray.mCachedParentAddress;
if (1 == countSize) {
buffer[index++] = (byte)groupCount;
buffer[index++] = (byte)ptNodeCount;
} else if (2 == countSize) {
// We need to signal 2-byte size by setting the top bit of the MSB to 1, so
// we | 0x80 to do this.
buffer[index++] = (byte)((groupCount >> 8) | 0x80);
buffer[index++] = (byte)(groupCount & 0xFF);
buffer[index++] = (byte)((ptNodeCount >> 8) | 0x80);
buffer[index++] = (byte)(ptNodeCount & 0xFF);
} else {
throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
}
int groupAddress = index;
for (int i = 0; i < groupCount; ++i) {
final CharGroup group = nodeArray.mData.get(i);
if (index != group.mCachedAddressAfterUpdate) {
int ptNodeAddress = index;
for (int i = 0; i < ptNodeCount; ++i) {
final PtNode ptNode = ptNodeArray.mData.get(i);
if (index != ptNode.mCachedAddressAfterUpdate) {
throw new RuntimeException("Bug: write index is not the same as the cached address "
+ "of the group : " + index + " <> " + group.mCachedAddressAfterUpdate);
+ "of the node : " + index + " <> " + ptNode.mCachedAddressAfterUpdate);
}
groupAddress += getGroupHeaderSize(group, formatOptions);
ptNodeAddress += getNodeHeaderSize(ptNode, formatOptions);
// Sanity checks.
if (DBG && group.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
if (DBG && ptNode.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
throw new RuntimeException("A node has a frequency > "
+ FormatSpec.MAX_TERMINAL_FREQUENCY
+ " : " + group.mFrequency);
+ " : " + ptNode.mFrequency);
}
if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE;
final int childrenOffset = null == group.mChildren
if (ptNode.mFrequency >= 0) ptNodeAddress += FormatSpec.PTNODE_FREQUENCY_SIZE;
final int childrenOffset = null == ptNode.mChildren
? FormatSpec.NO_CHILDREN_ADDRESS
: group.mChildren.mCachedAddressAfterUpdate - groupAddress;
: ptNode.mChildren.mCachedAddressAfterUpdate - ptNodeAddress;
buffer[index++] =
makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
makePtNodeFlags(ptNode, ptNodeAddress, childrenOffset, formatOptions);
if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
index = writeParentAddress(buffer, index, parentAddress, formatOptions);
} else {
index = writeParentAddress(buffer, index, parentAddress
+ (nodeArray.mCachedAddressAfterUpdate - group.mCachedAddressAfterUpdate),
+ (ptNodeArray.mCachedAddressAfterUpdate
- ptNode.mCachedAddressAfterUpdate),
formatOptions);
}
index = CharEncoding.writeCharArray(group.mChars, buffer, index);
if (group.hasSeveralChars()) {
buffer[index++] = FormatSpec.GROUP_CHARACTERS_TERMINATOR;
index = CharEncoding.writeCharArray(ptNode.mChars, buffer, index);
if (ptNode.hasSeveralChars()) {
buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
}
if (group.mFrequency >= 0) {
buffer[index++] = (byte) group.mFrequency;
if (ptNode.mFrequency >= 0) {
buffer[index++] = (byte) ptNode.mFrequency;
}
final int shift;
@ -793,23 +798,24 @@ public class BinaryDictEncoderUtils {
shift = writeVariableAddress(buffer, index, childrenOffset);
}
index += shift;
groupAddress += shift;
ptNodeAddress += shift;
// Write shortcuts
if (null != group.mShortcutTargets && !group.mShortcutTargets.isEmpty()) {
if (null != ptNode.mShortcutTargets && !ptNode.mShortcutTargets.isEmpty()) {
final int indexOfShortcutByteSize = index;
index += FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
groupAddress += FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator();
index += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
ptNodeAddress += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
final Iterator<WeightedString> shortcutIterator =
ptNode.mShortcutTargets.iterator();
while (shortcutIterator.hasNext()) {
final WeightedString target = shortcutIterator.next();
++groupAddress;
++ptNodeAddress;
int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(),
target.mFrequency);
buffer[index++] = (byte)shortcutFlags;
final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord);
index += shortcutShift;
groupAddress += shortcutShift;
ptNodeAddress += shortcutShift;
}
final int shortcutByteSize = index - indexOfShortcutByteSize;
if (shortcutByteSize > 0xFFFF) {
@ -819,22 +825,22 @@ public class BinaryDictEncoderUtils {
buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF);
}
// Write bigrams
if (null != group.mBigrams) {
final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator();
if (null != ptNode.mBigrams) {
final Iterator<WeightedString> bigramIterator = ptNode.mBigrams.iterator();
while (bigramIterator.hasNext()) {
final WeightedString bigram = bigramIterator.next();
final CharGroup target =
final PtNode target =
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
final int addressOfBigram = target.mCachedAddressAfterUpdate;
final int unigramFrequencyForThisWord = target.mFrequency;
++groupAddress;
final int offset = addressOfBigram - groupAddress;
++ptNodeAddress;
final int offset = addressOfBigram - ptNodeAddress;
int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
buffer[index++] = (byte)bigramFlags;
final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
index += bigramShift;
groupAddress += bigramShift;
ptNodeAddress += bigramShift;
}
}
@ -844,64 +850,64 @@ public class BinaryDictEncoderUtils {
= FormatSpec.NO_FORWARD_LINK_ADDRESS;
index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
if (index != nodeArray.mCachedAddressAfterUpdate + nodeArray.mCachedSize) {
if (index != ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize) {
throw new RuntimeException(
"Not the same size : written " + (index - nodeArray.mCachedAddressAfterUpdate)
+ " bytes from a node that should have " + nodeArray.mCachedSize + " bytes");
"Not the same size : written " + (index - ptNodeArray.mCachedAddressAfterUpdate)
+ " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes");
}
return index;
}
/**
* Dumps a collection of useful statistics about a list of node arrays.
* Dumps a collection of useful statistics about a list of PtNode arrays.
*
* This prints purely informative stuff, like the total estimated file size, the
* number of node arrays, of character groups, the repartition of each address size, etc
* number of PtNode arrays, of PtNodes, the repartition of each address size, etc
*
* @param nodeArrays the list of node arrays.
* @param ptNodeArrays the list of PtNode arrays.
*/
private static void showStatistics(ArrayList<PtNodeArray> nodeArrays) {
private static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) {
int firstTerminalAddress = Integer.MAX_VALUE;
int lastTerminalAddress = Integer.MIN_VALUE;
int size = 0;
int charGroups = 0;
int maxGroups = 0;
int ptNodes = 0;
int maxNodes = 0;
int maxRuns = 0;
for (final PtNodeArray nodeArray : nodeArrays) {
if (maxGroups < nodeArray.mData.size()) maxGroups = nodeArray.mData.size();
for (final CharGroup cg : nodeArray.mData) {
++charGroups;
if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
if (cg.mFrequency >= 0) {
if (nodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
firstTerminalAddress = nodeArray.mCachedAddressAfterUpdate;
if (nodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
lastTerminalAddress = nodeArray.mCachedAddressAfterUpdate;
for (final PtNodeArray ptNodeArray : ptNodeArrays) {
if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size();
for (final PtNode ptNode : ptNodeArray.mData) {
++ptNodes;
if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length;
if (ptNode.mFrequency >= 0) {
if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
}
}
if (nodeArray.mCachedAddressAfterUpdate + nodeArray.mCachedSize > size) {
size = nodeArray.mCachedAddressAfterUpdate + nodeArray.mCachedSize;
if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) {
size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize;
}
}
final int[] groupCounts = new int[maxGroups + 1];
final int[] ptNodeCounts = new int[maxNodes + 1];
final int[] runCounts = new int[maxRuns + 1];
for (final PtNodeArray nodeArray : nodeArrays) {
++groupCounts[nodeArray.mData.size()];
for (final CharGroup cg : nodeArray.mData) {
++runCounts[cg.mChars.length];
for (final PtNodeArray ptNodeArray : ptNodeArrays) {
++ptNodeCounts[ptNodeArray.mData.size()];
for (final PtNode ptNode : ptNodeArray.mData) {
++runCounts[ptNode.mChars.length];
}
}
MakedictLog.i("Statistics:\n"
+ " total file size " + size + "\n"
+ " " + nodeArrays.size() + " node arrays\n"
+ " " + charGroups + " groups (" + ((float)charGroups / nodeArrays.size())
+ " groups per node)\n"
+ " " + ptNodeArrays.size() + " node arrays\n"
+ " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size())
+ " PtNodes per node)\n"
+ " first terminal at " + firstTerminalAddress + "\n"
+ " last terminal at " + lastTerminalAddress + "\n"
+ " Group stats : max = " + maxGroups);
for (int i = 0; i < groupCounts.length; ++i) {
MakedictLog.i(" " + i + " : " + groupCounts[i]);
+ " PtNode stats : max = " + maxNodes);
for (int i = 0; i < ptNodeCounts.length; ++i) {
MakedictLog.i(" " + i + " : " + ptNodeCounts[i]);
}
MakedictLog.i(" Character run stats : max = " + maxRuns);
for (int i = 0; i < runCounts.length; ++i) {
@ -922,7 +928,7 @@ public class BinaryDictEncoderUtils {
// Addresses are limited to 3 bytes, but since addresses can be relative to each node
// array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
// the order of the node arrays becomes a quite complicated problem, because though the
// the order of the PtNode arrays becomes a quite complicated problem, because though the
// dictionary itself does not have a size limit, each node array must still be within 16MB
// of all its children and parents. As long as this is ensured, the dictionary file may
// grow to any size.
@ -980,8 +986,8 @@ public class BinaryDictEncoderUtils {
MakedictLog.i("Computing addresses...");
computeAddresses(dict, flatNodes, formatOptions);
MakedictLog.i("Checking array...");
if (DBG) checkFlatNodeArrayList(flatNodes);
MakedictLog.i("Checking PtNode array...");
if (DBG) checkFlatPtNodeArrayList(flatNodes);
// Create a buffer that matches the final dictionary size.
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);

View File

@ -22,7 +22,7 @@ import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncodin
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.ByteArrayDictBuffer;
@ -44,17 +44,17 @@ public final class BinaryDictIOUtils {
}
private static final class Position {
public static final int NOT_READ_GROUPCOUNT = -1;
public static final int NOT_READ_PTNODE_COUNT = -1;
public int mAddress;
public int mNumOfCharGroup;
public int mNumOfPtNode;
public int mPosition;
public int mLength;
public Position(int address, int length) {
mAddress = address;
mLength = length;
mNumOfCharGroup = NOT_READ_GROUPCOUNT;
mNumOfPtNode = NOT_READ_PTNODE_COUNT;
}
}
@ -79,45 +79,45 @@ public final class BinaryDictIOUtils {
Position p = stack.peek();
if (DBG) {
MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" +
p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength);
MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
}
if (dictBuffer.position() != p.mAddress) dictBuffer.position(p.mAddress);
if (index != p.mLength) index = p.mLength;
if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) {
p.mNumOfCharGroup = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
p.mAddress += getGroupCountSize(p.mNumOfCharGroup);
if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
p.mNumOfPtNode = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
p.mAddress += getPtNodeCountSize(p.mNumOfPtNode);
p.mPosition = 0;
}
if (p.mNumOfCharGroup == 0) {
if (p.mNumOfPtNode == 0) {
stack.pop();
continue;
}
CharGroupInfo info = dictDecoder.readPtNode(p.mAddress, formatOptions);
PtNodeInfo info = dictDecoder.readPtNode(p.mAddress, formatOptions);
for (int i = 0; i < info.mCharacters.length; ++i) {
pushedChars[index++] = info.mCharacters[i];
}
p.mPosition++;
final boolean isMovedGroup = isMovedGroup(info.mFlags,
final boolean isMovedPtNode = isMovedPtNode(info.mFlags,
formatOptions);
final boolean isDeletedGroup = isDeletedGroup(info.mFlags,
final boolean isDeletedPtNode = isDeletedPtNode(info.mFlags,
formatOptions);
if (!isMovedGroup && !isDeletedGroup
&& info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
if (!isMovedPtNode && !isDeletedPtNode
&& info.mFrequency != FusionDictionary.PtNode.NOT_A_TERMINAL) {// found word
words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
frequencies.put(info.mOriginalAddress, info.mFrequency);
if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
}
if (p.mPosition == p.mNumOfCharGroup) {
if (p.mPosition == p.mNumOfPtNode) {
if (formatOptions.mSupportsDynamicUpdate) {
final int forwardLinkAddress = dictBuffer.readUnsignedInt24();
if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
// The node array has a forward link.
p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
p.mNumOfPtNode = Position.NOT_READ_PTNODE_COUNT;
p.mAddress = forwardLinkAddress;
} else {
stack.pop();
@ -126,11 +126,11 @@ public final class BinaryDictIOUtils {
stack.pop();
}
} else {
// The node array has more groups.
// The Ptnode array has more PtNodes.
p.mAddress = dictBuffer.position();
}
if (!isMovedGroup && hasChildrenAddress(info.mChildrenAddress)) {
if (!isMovedPtNode && hasChildrenAddress(info.mChildrenAddress)) {
final Position childrenPos = new Position(info.mChildrenAddress, index);
stack.push(childrenPos);
}
@ -159,7 +159,7 @@ public final class BinaryDictIOUtils {
}
/**
* Gets the address of the last CharGroup of the exact matching word in the dictionary.
* Gets the address of the last PtNode of the exact matching word in the dictionary.
* If no match is found, returns NOT_VALID_WORD.
*
* @param dictDecoder the dict decoder.
@ -182,17 +182,17 @@ public final class BinaryDictIOUtils {
if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
do {
final int charGroupCount = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
boolean foundNextCharGroup = false;
for (int i = 0; i < charGroupCount; ++i) {
final int charGroupPos = dictBuffer.position();
final CharGroupInfo currentInfo = dictDecoder.readPtNode(charGroupPos,
final int ptNodeCount = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
boolean foundNextPtNode = false;
for (int i = 0; i < ptNodeCount; ++i) {
final int ptNodePos = dictBuffer.position();
final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos,
header.mFormatOptions);
final boolean isMovedGroup = isMovedGroup(currentInfo.mFlags,
final boolean isMovedNode = isMovedPtNode(currentInfo.mFlags,
header.mFormatOptions);
final boolean isDeletedGroup = isDeletedGroup(currentInfo.mFlags,
final boolean isDeletedNode = isDeletedPtNode(currentInfo.mFlags,
header.mFormatOptions);
if (isMovedGroup) continue;
if (isMovedNode) continue;
boolean same = true;
for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
p < currentInfo.mCharacters.length;
@ -205,30 +205,30 @@ public final class BinaryDictIOUtils {
}
if (same) {
// found the group matches the word.
// found the PtNode matches the word.
if (wordPos + currentInfo.mCharacters.length == wordLen) {
if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL
|| isDeletedGroup) {
if (currentInfo.mFrequency == PtNode.NOT_A_TERMINAL
|| isDeletedNode) {
return FormatSpec.NOT_VALID_WORD;
} else {
return charGroupPos;
return ptNodePos;
}
}
wordPos += currentInfo.mCharacters.length;
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
return FormatSpec.NOT_VALID_WORD;
}
foundNextCharGroup = true;
foundNextPtNode = true;
dictBuffer.position(currentInfo.mChildrenAddress);
break;
}
}
// If we found the next char group, it is under the file pointer.
// If we found the next PtNode, it is under the file pointer.
// But if not, we are at the end of this node array so we expect to have
// a forward link address that we need to consult and possibly resume
// search on the next node array in the linked list.
if (foundNextCharGroup) break;
if (foundNextPtNode) break;
if (!header.mFormatOptions.mSupportsDynamicUpdate) {
return FormatSpec.NOT_VALID_WORD;
}
@ -289,8 +289,7 @@ public final class BinaryDictIOUtils {
return BinaryDictEncoderUtils.getByteSize(value);
}
static void skipCharGroup(final DictBuffer dictBuffer,
final FormatOptions formatOptions) {
static void skipPtNode(final DictBuffer dictBuffer, final FormatOptions formatOptions) {
final int flags = dictBuffer.readUnsignedByte();
BinaryDictDecoderUtils.readParentAddress(dictBuffer, formatOptions);
skipString(dictBuffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
@ -299,27 +298,27 @@ public final class BinaryDictIOUtils {
if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
final int shortcutsSize = dictBuffer.readUnsignedShort();
dictBuffer.position(dictBuffer.position() + shortcutsSize
- FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE);
- FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
}
if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) {
int bigramCount = 0;
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
final int bigramFlags = dictBuffer.readUnsignedByte();
switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
dictBuffer.readUnsignedByte();
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
dictBuffer.readUnsignedShort();
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
dictBuffer.readUnsignedInt24();
break;
}
if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break;
if ((bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) == 0) break;
}
if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
throw new RuntimeException("Too many bigrams in a group.");
if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
throw new RuntimeException("Too many bigrams in a PtNode.");
}
}
}
@ -360,24 +359,24 @@ public final class BinaryDictIOUtils {
size += 3;
}
}
destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
size += FormatSpec.GROUP_TERMINATOR_SIZE;
destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
size += FormatSpec.PTNODE_TERMINATOR_SIZE;
return size;
}
/**
* Write a char group to an output stream from a CharGroupInfo.
* A char group is an in-memory representation of a node in the patricia trie.
* A char group info is a container for low-level information about how the
* char group is stored in the binary format.
* Write a PtNode to an output stream from a PtNodeInfo.
* A PtNode is an in-memory representation of a node in the patricia trie.
* A PtNode info is a container for low-level information about how the
* PtNode is stored in the binary format.
*
* @param destination the stream to write.
* @param info the char group info to be written.
* @param info the PtNode info to be written.
* @return the size written, in bytes.
*/
private static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
private static int writePtNode(final OutputStream destination, final PtNodeInfo info)
throws IOException {
int size = FormatSpec.GROUP_FLAGS_SIZE;
int size = FormatSpec.PTNODE_FLAGS_SIZE;
destination.write((byte)info.mFlags);
final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
@ -392,7 +391,7 @@ public final class BinaryDictIOUtils {
}
}
if (info.mCharacters.length > 1) {
destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
size++;
}
@ -402,7 +401,7 @@ public final class BinaryDictIOUtils {
}
if (DBG) {
MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size
MakedictLog.d("writePtNode origin=" + info.mOriginalAddress + ", size=" + size
+ ", child=" + info.mChildrenAddress + ", characters ="
+ new String(info.mCharacters, 0, info.mCharacters.length));
}
@ -434,23 +433,23 @@ public final class BinaryDictIOUtils {
final int bigramFrequency = info.mBigrams.get(i).mFrequency;
int bigramFlags = (i < info.mBigrams.size() - 1)
? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0;
size++;
final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress
+ size);
bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0;
switch (BinaryDictEncoderUtils.getByteSize(bigramOffset)) {
case 1:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE;
break;
case 2:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES;
break;
case 3:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES;
break;
}
bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
bigramFlags |= bigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
destination.write((byte)bigramFlags);
size += writeVariableAddress(destination, Math.abs(bigramOffset));
}
@ -459,21 +458,21 @@ public final class BinaryDictIOUtils {
}
/**
* Compute the size of the char group.
* Compute the size of the PtNode.
*/
static int computeGroupSize(final CharGroupInfo info, final FormatOptions formatOptions) {
int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ BinaryDictEncoderUtils.getGroupCharactersSize(info.mCharacters)
static int computePtNodeSize(final PtNodeInfo info, final FormatOptions formatOptions) {
int size = FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ BinaryDictEncoderUtils.getPtNodeCharactersSize(info.mCharacters)
+ getChildrenAddressSize(info.mFlags, formatOptions);
if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
size += FormatSpec.GROUP_FREQUENCY_SIZE;
size += FormatSpec.PTNODE_FREQUENCY_SIZE;
}
if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) {
size += BinaryDictEncoderUtils.getShortcutListSize(info.mShortcutTargets);
}
if (info.mBigrams != null) {
for (final PendingAttribute attr : info.mBigrams) {
size += FormatSpec.GROUP_FLAGS_SIZE;
size += FormatSpec.PTNODE_FLAGS_SIZE;
size += BinaryDictEncoderUtils.getByteSize(attr.mAddress);
}
}
@ -484,14 +483,14 @@ public final class BinaryDictIOUtils {
* Write a node array to the stream.
*
* @param destination the stream to write.
* @param infos an array of CharGroupInfo to be written.
* @param infos an array of PtNodeInfo to be written.
* @return the size written, in bytes.
* @throws IOException
*/
static int writeNodes(final OutputStream destination, final CharGroupInfo[] infos)
static int writeNodes(final OutputStream destination, final PtNodeInfo[] infos)
throws IOException {
int size = getGroupCountSize(infos.length);
switch (getGroupCountSize(infos.length)) {
int size = getPtNodeCountSize(infos.length);
switch (getPtNodeCountSize(infos.length)) {
case 1:
destination.write((byte)infos.length);
break;
@ -500,9 +499,9 @@ public final class BinaryDictIOUtils {
destination.write((byte)(infos.length & 0xFF));
break;
default:
throw new RuntimeException("Invalid group count size.");
throw new RuntimeException("Invalid node count size.");
}
for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info);
for (final PtNodeInfo info : infos) size += writePtNode(destination, info);
writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
@ -560,11 +559,11 @@ public final class BinaryDictIOUtils {
}
/**
* Helper method to check whether the group is moved.
* Helper method to check whether the node is moved.
*/
public static boolean isMovedGroup(final int flags, final FormatOptions options) {
public static boolean isMovedPtNode(final int flags, final FormatOptions options) {
return options.mSupportsDynamicUpdate
&& ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED);
&& ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED);
}
/**
@ -576,26 +575,26 @@ public final class BinaryDictIOUtils {
}
/**
* Helper method to check whether the group is deleted.
* Helper method to check whether the node is deleted.
*/
public static boolean isDeletedGroup(final int flags, final FormatOptions formatOptions) {
public static boolean isDeletedPtNode(final int flags, final FormatOptions formatOptions) {
return formatOptions.mSupportsDynamicUpdate
&& ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED);
&& ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED);
}
/**
* Compute the binary size of the group count
* @param count the group count
* @return the size of the group count, either 1 or 2 bytes.
* Compute the binary size of the node count
* @param count the node count
* @return the size of the node count, either 1 or 2 bytes.
*/
public static int getGroupCountSize(final int count) {
if (FormatSpec.MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) {
public static int getPtNodeCountSize(final int count) {
if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
return 1;
} else if (FormatSpec.MAX_CHARGROUPS_IN_A_PT_NODE_ARRAY >= count) {
} else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
return 2;
} else {
throw new RuntimeException("Can't have more than "
+ FormatSpec.MAX_CHARGROUPS_IN_A_PT_NODE_ARRAY + " groups in a node (found "
+ FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
+ count + ")");
}
}
@ -603,14 +602,14 @@ public final class BinaryDictIOUtils {
static int getChildrenAddressSize(final int optionFlags,
final FormatOptions formatOptions) {
if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
return 1;
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
return 2;
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
return 3;
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
default:
return 0;
}

View File

@ -40,9 +40,9 @@ public interface DictDecoder {
* Reads PtNode from nodeAddress.
* @param ptNodePos the position of PtNode.
* @param formatOptions the format options.
* @return CharGroupInfo.
* @return PtNodeInfo.
*/
public CharGroupInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions);
public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions);
/**
* Reads a buffer and returns the memory representation of the dictionary.

View File

@ -43,7 +43,7 @@ public final class DynamicBinaryDictIOUtils {
}
private static int markAsDeleted(final int flags) {
return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
return (flags & (~FormatSpec.MASK_CHILDREN_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
}
/**
@ -70,56 +70,56 @@ public final class DynamicBinaryDictIOUtils {
}
/**
* Update a parent address in a CharGroup that is referred to by groupOriginAddress.
* Update a parent address in a PtNode that is referred to by ptNodeOriginAddress.
*
* @param dictBuffer the DictBuffer to write.
* @param groupOriginAddress the address of the group.
* @param ptNodeOriginAddress the address of the PtNode.
* @param newParentAddress the absolute address of the parent.
* @param formatOptions file format options.
*/
public static void updateParentAddress(final DictBuffer dictBuffer,
final int groupOriginAddress, final int newParentAddress,
final int ptNodeOriginAddress, final int newParentAddress,
final FormatOptions formatOptions) {
final int originalPosition = dictBuffer.position();
dictBuffer.position(groupOriginAddress);
dictBuffer.position(ptNodeOriginAddress);
if (!formatOptions.mSupportsDynamicUpdate) {
throw new RuntimeException("this file format does not support parent addresses");
}
final int flags = dictBuffer.readUnsignedByte();
if (BinaryDictIOUtils.isMovedGroup(flags, formatOptions)) {
// If the group is moved, the parent address is stored in the destination group.
// We are guaranteed to process the destination group later, so there is no need to
if (BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) {
// If the node is moved, the parent address is stored in the destination node.
// We are guaranteed to process the destination node later, so there is no need to
// update anything here.
dictBuffer.position(originalPosition);
return;
}
if (DBG) {
MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress);
MakedictLog.d("update parent address flags=" + flags + ", " + ptNodeOriginAddress);
}
final int parentOffset = newParentAddress - groupOriginAddress;
final int parentOffset = newParentAddress - ptNodeOriginAddress;
BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, parentOffset);
dictBuffer.position(originalPosition);
}
/**
* Update parent addresses in a node array stored at nodeOriginAddress.
* Update parent addresses in a node array stored at ptNodeOriginAddress.
*
* @param dictBuffer the DictBuffer to be modified.
* @param nodeOriginAddress the address of the node array to update.
* @param ptNodeOriginAddress the address of the node array to update.
* @param newParentAddress the address to be written.
* @param formatOptions file format options.
*/
public static void updateParentAddresses(final DictBuffer dictBuffer,
final int nodeOriginAddress, final int newParentAddress,
final int ptNodeOriginAddress, final int newParentAddress,
final FormatOptions formatOptions) {
final int originalPosition = dictBuffer.position();
dictBuffer.position(nodeOriginAddress);
dictBuffer.position(ptNodeOriginAddress);
do {
final int count = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
final int count = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
for (int i = 0; i < count; ++i) {
updateParentAddress(dictBuffer, dictBuffer.position(), newParentAddress,
formatOptions);
BinaryDictIOUtils.skipCharGroup(dictBuffer, formatOptions);
BinaryDictIOUtils.skipPtNode(dictBuffer, formatOptions);
}
final int forwardLinkAddress = dictBuffer.readUnsignedInt24();
dictBuffer.position(forwardLinkAddress);
@ -129,18 +129,18 @@ public final class DynamicBinaryDictIOUtils {
}
/**
* Update a children address in a CharGroup that is addressed by groupOriginAddress.
* Update a children address in a PtNode that is addressed by ptNodeOriginAddress.
*
* @param dictBuffer the DictBuffer to write.
* @param groupOriginAddress the address of the group.
* @param ptNodeOriginAddress the address of the PtNode.
* @param newChildrenAddress the absolute address of the child.
* @param formatOptions file format options.
*/
public static void updateChildrenAddress(final DictBuffer dictBuffer,
final int groupOriginAddress, final int newChildrenAddress,
final int ptNodeOriginAddress, final int newChildrenAddress,
final FormatOptions formatOptions) {
final int originalPosition = dictBuffer.position();
dictBuffer.position(groupOriginAddress);
dictBuffer.position(ptNodeOriginAddress);
final int flags = dictBuffer.readUnsignedByte();
final int parentAddress = BinaryDictDecoderUtils.readParentAddress(dictBuffer,
formatOptions);
@ -153,21 +153,21 @@ public final class DynamicBinaryDictIOUtils {
}
/**
* Helper method to move a char group to the tail of the file.
* Helper method to move a PtNode to the tail of the file.
*/
private static int moveCharGroup(final OutputStream destination,
final DictBuffer dictBuffer, final CharGroupInfo info,
final int nodeArrayOriginAddress, final int oldGroupAddress,
private static int movePtNode(final OutputStream destination,
final DictBuffer dictBuffer, final PtNodeInfo info,
final int nodeArrayOriginAddress, final int oldNodeAddress,
final FormatOptions formatOptions) throws IOException {
updateParentAddress(dictBuffer, oldGroupAddress, dictBuffer.limit() + 1, formatOptions);
dictBuffer.position(oldGroupAddress);
updateParentAddress(dictBuffer, oldNodeAddress, dictBuffer.limit() + 1, formatOptions);
dictBuffer.position(oldNodeAddress);
final int currentFlags = dictBuffer.readUnsignedByte();
dictBuffer.position(oldGroupAddress);
dictBuffer.position(oldNodeAddress);
dictBuffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags
& (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
int size = FormatSpec.GROUP_FLAGS_SIZE;
int size = FormatSpec.PTNODE_FLAGS_SIZE;
updateForwardLink(dictBuffer, nodeArrayOriginAddress, dictBuffer.limit(), formatOptions);
size += BinaryDictIOUtils.writeNodes(destination, new CharGroupInfo[] { info });
size += BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { info });
return size;
}
@ -178,9 +178,9 @@ public final class DynamicBinaryDictIOUtils {
dictBuffer.position(nodeArrayOriginAddress);
int jumpCount = 0;
while (jumpCount++ < MAX_JUMPS) {
final int count = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
final int count = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
for (int i = 0; i < count; ++i) {
BinaryDictIOUtils.skipCharGroup(dictBuffer, formatOptions);
BinaryDictIOUtils.skipPtNode(dictBuffer, formatOptions);
}
final int forwardLinkAddress = dictBuffer.readUnsignedInt24();
if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
@ -196,43 +196,43 @@ public final class DynamicBinaryDictIOUtils {
}
/**
* Move a group that is referred to by oldGroupOrigin to the tail of the file, and set the
* children address to the byte after the group
* Move a PtNode that is referred to by oldPtNodeOrigin to the tail of the file, and set the
* children address to the byte after the PtNode.
*
* @param fileEndAddress the address of the tail of the file.
* @param codePoints the characters to put inside the group.
* @param codePoints the characters to put inside the PtNode.
* @param length how many code points to read from codePoints.
* @param flags the flags for this group.
* @param flags the flags for this PtNode.
* @param frequency the frequency of this terminal.
* @param parentAddress the address of the parent group of this group.
* @param shortcutTargets the shortcut targets for this group.
* @param bigrams the bigrams for this group.
* @param parentAddress the address of the parent PtNode of this PtNode.
* @param shortcutTargets the shortcut targets for this PtNode.
* @param bigrams the bigrams for this PtNode.
* @param destination the stream representing the tail of the file.
* @param dictBuffer the DictBuffer representing the (constant-size) body of the file.
* @param oldNodeArrayOrigin the origin of the old node array this group was a part of.
* @param oldGroupOrigin the old origin where this group used to be stored.
* @param oldPtNodeArrayOrigin the origin of the old PtNode array this PtNode was a part of.
* @param oldPtNodeOrigin the old origin where this PtNode used to be stored.
* @param formatOptions format options for this dictionary.
* @return the size written, in bytes.
* @throws IOException if the file can't be accessed
*/
private static int moveGroup(final int fileEndAddress, final int[] codePoints,
private static int movePtNode(final int fileEndAddress, final int[] codePoints,
final int length, final int flags, final int frequency, final int parentAddress,
final ArrayList<WeightedString> shortcutTargets,
final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
final DictBuffer dictBuffer, final int oldNodeArrayOrigin,
final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
final DictBuffer dictBuffer, final int oldPtNodeArrayOrigin,
final int oldPtNodeOrigin, final FormatOptions formatOptions) throws IOException {
int size = 0;
final int newGroupOrigin = fileEndAddress + 1;
final int newPtNodeOrigin = fileEndAddress + 1;
final int[] writtenCharacters = Arrays.copyOfRange(codePoints, 0, length);
final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
final PtNodeInfo tmpInfo = new PtNodeInfo(newPtNodeOrigin, -1 /* endAddress */,
flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
shortcutTargets, bigrams);
size = BinaryDictIOUtils.computeGroupSize(tmpInfo, formatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
size = BinaryDictIOUtils.computePtNodeSize(tmpInfo, formatOptions);
final PtNodeInfo newInfo = new PtNodeInfo(newPtNodeOrigin, newPtNodeOrigin + size,
flags, writtenCharacters, frequency, parentAddress,
fileEndAddress + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
bigrams);
moveCharGroup(destination, dictBuffer, newInfo, oldNodeArrayOrigin, oldGroupOrigin,
movePtNode(destination, dictBuffer, newInfo, oldPtNodeArrayOrigin, oldPtNodeOrigin,
formatOptions);
return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
@ -288,16 +288,16 @@ public final class DynamicBinaryDictIOUtils {
if (wordPos >= wordLen) break;
nodeOriginAddress = dictBuffer.position();
int nodeParentAddress = -1;
final int charGroupCount = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
boolean foundNextGroup = false;
final int ptNodeCount = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
boolean foundNextNode = false;
for (int i = 0; i < charGroupCount; ++i) {
for (int i = 0; i < ptNodeCount; ++i) {
address = dictBuffer.position();
final CharGroupInfo currentInfo = dictDecoder.readPtNode(address,
final PtNodeInfo currentInfo = dictDecoder.readPtNode(address,
fileHeader.mFormatOptions);
final boolean isMovedGroup = BinaryDictIOUtils.isMovedGroup(currentInfo.mFlags,
final boolean isMovedNode = BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags,
fileHeader.mFormatOptions);
if (isMovedGroup) continue;
if (isMovedNode) continue;
nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
boolean matched = true;
@ -314,10 +314,10 @@ public final class DynamicBinaryDictIOUtils {
* abc - d - ef
*/
final int newNodeAddress = dictBuffer.limit();
final int flags = BinaryDictEncoderUtils.makeCharGroupFlags(p > 1,
final int flags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1,
isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */,
false /* isBlackListEntry */, fileHeader.mFormatOptions);
int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, flags,
frequency, nodeParentAddress, shortcuts, bigrams, destination,
dictBuffer, nodeOriginAddress, address, fileHeader.mFormatOptions);
@ -327,12 +327,12 @@ public final class DynamicBinaryDictIOUtils {
updateParentAddresses(dictBuffer, currentInfo.mChildrenAddress,
newNodeAddress + written + 1, fileHeader.mFormatOptions);
}
final CharGroupInfo newInfo2 = new CharGroupInfo(
final PtNodeInfo newInfo2 = new PtNodeInfo(
newNodeAddress + written + 1, -1 /* endAddress */,
currentInfo.mFlags, characters2, currentInfo.mFrequency,
newNodeAddress + 1, currentInfo.mChildrenAddress,
currentInfo.mShortcutTargets, currentInfo.mBigrams);
BinaryDictIOUtils.writeNodes(destination, new CharGroupInfo[] { newInfo2 });
BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo2 });
return;
} else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
if (p > 0) {
@ -353,12 +353,12 @@ public final class DynamicBinaryDictIOUtils {
final int childrenAddress = currentInfo.mChildrenAddress;
// move prefix
final int prefixFlags = BinaryDictEncoderUtils.makeCharGroupFlags(p > 1,
final int prefixFlags = BinaryDictEncoderUtils.makePtNodeFlags(p > 1,
false /* isTerminal */, 0 /* childrenAddressSize*/,
false /* hasShortcut */, false /* hasBigrams */,
false /* isNotAWord */, false /* isBlackListEntry */,
fileHeader.mFormatOptions);
int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p,
prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
destination, dictBuffer, nodeOriginAddress, address,
fileHeader.mFormatOptions);
@ -369,7 +369,7 @@ public final class DynamicBinaryDictIOUtils {
updateParentAddresses(dictBuffer, currentInfo.mChildrenAddress,
newNodeAddress + written + 1, fileHeader.mFormatOptions);
}
final int suffixFlags = BinaryDictEncoderUtils.makeCharGroupFlags(
final int suffixFlags = BinaryDictEncoderUtils.makePtNodeFlags(
suffixCharacters.length > 1,
(currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
0 /* childrenAddressSize */,
@ -377,26 +377,26 @@ public final class DynamicBinaryDictIOUtils {
!= 0,
(currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
isNotAWord, isBlackListEntry, fileHeader.mFormatOptions);
final CharGroupInfo suffixInfo = new CharGroupInfo(
final PtNodeInfo suffixInfo = new PtNodeInfo(
newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
currentInfo.mBigrams);
written += BinaryDictIOUtils.computeGroupSize(suffixInfo,
written += BinaryDictIOUtils.computePtNodeSize(suffixInfo,
fileHeader.mFormatOptions) + 1;
final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
codePoints.length);
final int flags = BinaryDictEncoderUtils.makeCharGroupFlags(
final int flags = BinaryDictEncoderUtils.makePtNodeFlags(
newCharacters.length > 1, isTerminal,
0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, fileHeader.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(
final PtNodeInfo newInfo = new PtNodeInfo(
newNodeAddress + written, -1 /* endAddress */, flags,
newCharacters, frequency, newNodeAddress + 1,
FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
BinaryDictIOUtils.writeNodes(destination,
new CharGroupInfo[] { suffixInfo, newInfo });
new PtNodeInfo[] { suffixInfo, newInfo });
return;
}
matched = false;
@ -407,17 +407,17 @@ public final class DynamicBinaryDictIOUtils {
if (matched) {
if (wordPos + currentInfo.mCharacters.length == wordLen) {
// the word exists in the dictionary.
// only update group.
// only update the PtNode.
final int newNodeAddress = dictBuffer.limit();
final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
final int flags = BinaryDictEncoderUtils.makeCharGroupFlags(hasMultipleChars,
final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, fileHeader.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1,
-1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
bigrams);
moveCharGroup(destination, dictBuffer, newInfo, nodeOriginAddress, address,
movePtNode(destination, dictBuffer, newInfo, nodeOriginAddress, address,
fileHeader.mFormatOptions);
return;
}
@ -425,7 +425,7 @@ public final class DynamicBinaryDictIOUtils {
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
/*
* found the prefix of the word.
* make new node and link to the node from this group.
* make new PtNode and link to the PtNode from this PtNode.
*
* before
* ab - cd
@ -435,28 +435,28 @@ public final class DynamicBinaryDictIOUtils {
* after
* ab - cd - e
*/
final int newNodeAddress = dictBuffer.limit();
updateChildrenAddress(dictBuffer, address, newNodeAddress,
final int newNodeArrayAddress = dictBuffer.limit();
updateChildrenAddress(dictBuffer, address, newNodeArrayAddress,
fileHeader.mFormatOptions);
final int newGroupAddress = newNodeAddress + 1;
final int newNodeAddress = newNodeArrayAddress + 1;
final boolean hasMultipleChars = (wordLen - wordPos) > 1;
final int flags = BinaryDictEncoderUtils.makeCharGroupFlags(hasMultipleChars,
final int flags = BinaryDictEncoderUtils.makePtNodeFlags(hasMultipleChars,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, fileHeader.mFormatOptions);
final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress, -1, flags,
characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
shortcuts, bigrams);
BinaryDictIOUtils.writeNodes(destination, new CharGroupInfo[] { newInfo });
BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { newInfo });
return;
}
dictBuffer.position(currentInfo.mChildrenAddress);
foundNextGroup = true;
foundNextNode = true;
break;
}
}
if (foundNextGroup) continue;
if (foundNextNode) continue;
// reached the end of the array.
final int linkAddressPosition = dictBuffer.position();
@ -485,13 +485,13 @@ public final class DynamicBinaryDictIOUtils {
BinaryDictIOUtils.writeSInt24ToBuffer(dictBuffer, newNodeAddress);
final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
final int flags = BinaryDictEncoderUtils.makeCharGroupFlags(characters.length > 1,
final int flags = BinaryDictEncoderUtils.makePtNodeFlags(characters.length > 1,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, fileHeader.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
final PtNodeInfo newInfo = new PtNodeInfo(newNodeAddress + 1,
-1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
BinaryDictIOUtils.writeNodes(destination, new CharGroupInfo[]{ newInfo });
BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[]{ newInfo });
return;
} else {
depth--;

View File

@ -62,19 +62,19 @@ public final class FormatSpec {
/*
* Node array (FusionDictionary.PtNodeArray) layout is as follows:
*
* g |
* r | the number of groups, 1 or 2 bytes.
* o | 1 byte = bbbbbbbb match
* u | case 1xxxxxxx => xxxxxxx << 8 + next byte
* p | otherwise => bbbbbbbb
* c |
* ount
* n |
* o | the number of PtNodes, 1 or 2 bytes.
* d | 1 byte = bbbbbbbb match
* e | case 1xxxxxxx => xxxxxxx << 8 + next byte
* c | otherwise => bbbbbbbb
* o |
* unt
*
* g |
* r | sequence of groups,
* o | the layout of each group is described below.
* u |
* ps
* n |
* o | sequence of PtNodes,
* d | the layout of each PtNode is described below.
* e |
* s
*
* f |
* o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
@ -86,19 +86,19 @@ public final class FormatSpec {
* linkaddress
*/
/* Node (FusionDictionary.CharGroup) layout is as follows:
/* Node (FusionDictionary.PtNode) layout is as follows:
* | IF !SUPPORTS_DYNAMIC_UPDATE
* | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE
* | 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
* f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
* l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
* a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
* | addressType xx : mask with MASK_CHILDREN_ADDRESS_TYPE
* | 2 bits, 00 = no children : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS
* f | 01 = 1 byte : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE
* l | 10 = 2 bytes : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES
* a | 11 = 3 bytes : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
* g | ELSE
* s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
* | This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
* | 01 = yes : FLAG_IS_MOVED
* s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
* | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
* | 01 = yes : FLAG_IS_MOVED
* | the new address is stored in the same place as the parent address
* | is deleted? 10 = yes : FLAG_IS_DELETED
* | is deleted? 10 = yes : FLAG_IS_DELETED
* | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
* | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
@ -116,7 +116,7 @@ public final class FormatSpec {
* ddress
*
* c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
* a | end 1 byte, = 0
* r | ELSE
* s | char 1 or 3 bytes
@ -127,17 +127,22 @@ public final class FormatSpec {
* e | frequency 1 byte
* q |
*
* c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
* h | // nothing
* i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
* l | children address, 1 byte
* d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
* r | children address, 2 bytes
* e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
* n | children address, 3 bytes
* A | END
* d
* dress
* c | IF SUPPORTS_DYNAMIC_UPDATE
* h | children address, 3 bytes
* i | 1 byte = bbbbbbbb match
* l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
* d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte
* r | ELSIF 00 = FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS == addressType
* e | // nothing
* n | ELSIF 01 = FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE == addressType
* A | children address, 1 byte
* d | ELSIF 10 = FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES == addressType
* d | children address, 2 bytes
* r | ELSE // 11 = FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = addressType
* e | children address, 3 bytes
* s | END
* s
* ress
*
* | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
* | shortcut string list
@ -156,33 +161,33 @@ public final class FormatSpec {
* characters which should never happen anyway (and still work, but take 3 bytes).
*
* bigram address list is:
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
* | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
* | addressSign = 1 bit, : FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE
* | 1 = must take -address, 0 = must take +address
* | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
* | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
* | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
* | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
* | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
* | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
* <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
* | xx : mask with MASK_BIGRAM_ATTR_ADDRESS_TYPE
* | addressFormat = 2 bits, 00 = unused : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE
* | 01 = 1 byte : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE
* | 10 = 2 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES
* | 11 = 3 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES
* | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
* <address> | IF (01 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE == addressFormat)
* | read 1 byte, add top 4 bits
* | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
* | ELSIF (10 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES == addressFormat)
* | read 2 bytes, add top 4 bits
* | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
* | ELSE // 11 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES == addressFormat
* | read 3 bytes, add top 4 bits
* | END
* | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
* if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
* | if (FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) then address = -address
* if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto bigram_and_shortcut_address_list_is
*
* shortcut string list is:
* <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
* <byte size> = PTNODE_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
* | reserved = 3 bits, must be 0
* | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
* | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
* <shortcut> = | string of characters at the char format described above, with the terminator
* | used to signal the end of the string.
* if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
* if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT goto flags
*/
public static final int MAGIC_NUMBER = 0x9BC13AFE;
@ -206,11 +211,11 @@ public final class FormatSpec {
static final int FORWARD_LINK_ADDRESS_SIZE = 3;
// These flags are used only in the static dictionary.
static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0;
static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00;
static final int FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE = 0x40;
static final int FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES = 0x80;
static final int FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = 0xC0;
static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
@ -227,32 +232,32 @@ public final class FormatSpec {
static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
static final int FLAG_IS_DELETED = 0x80;
static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
static final int MASK_BIGRAM_ATTR_ADDRESS_TYPE = 0x30;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE = 0x10;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES = 0x20;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES = 0x30;
static final int FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY = 0x0F;
static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
static final int PTNODE_CHARACTERS_TERMINATOR = 0x1F;
static final int GROUP_TERMINATOR_SIZE = 1;
static final int GROUP_FLAGS_SIZE = 1;
static final int GROUP_FREQUENCY_SIZE = 1;
static final int GROUP_MAX_ADDRESS_SIZE = 3;
static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
static final int PTNODE_TERMINATOR_SIZE = 1;
static final int PTNODE_FLAGS_SIZE = 1;
static final int PTNODE_FREQUENCY_SIZE = 1;
static final int PTNODE_MAX_ADDRESS_SIZE = 3;
static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1;
static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
static final int NO_PARENT_ADDRESS = 0;
static final int NO_FORWARD_LINK_ADDRESS = 0;
static final int INVALID_CHARACTER = -1;
static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
static final int MAX_CHARGROUPS_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
static final int MAX_BIGRAMS_IN_A_GROUP = 10000;
static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127
static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
static final int MAX_BIGRAMS_IN_A_PTNODE = 10000;
static final int MAX_TERMINAL_FREQUENCY = 255;
static final int MAX_BIGRAM_FREQUENCY = 15;

View File

@ -37,15 +37,15 @@ public final class FusionDictionary implements Iterable<Word> {
private static int CHARACTER_NOT_FOUND_INDEX = -1;
/**
* A node array of the dictionary, containing several CharGroups.
* A node array of the dictionary, containing several PtNodes.
*
* A PtNodeArray is but an ordered array of CharGroups, which essentially contain all the
* A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
* real information.
* This class also contains fields to cache size and address, to help with binary
* generation.
*/
public static final class PtNodeArray {
ArrayList<CharGroup> mData;
ArrayList<PtNode> mData;
// To help with binary generation
int mCachedSize = Integer.MIN_VALUE;
// mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
@ -58,9 +58,9 @@ public final class FusionDictionary implements Iterable<Word> {
int mCachedParentAddress = 0;
public PtNodeArray() {
mData = new ArrayList<CharGroup>();
mData = new ArrayList<PtNode>();
}
public PtNodeArray(ArrayList<CharGroup> data) {
public PtNodeArray(ArrayList<PtNode> data) {
mData = data;
}
}
@ -93,18 +93,19 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
* A group of characters, with a frequency, shortcut targets, bigrams, and children.
* PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children
* (Pt means Patricia Trie).
*
* This is the central class of the in-memory representation. A CharGroup is what can
* This is the central class of the in-memory representation. A PtNode is what can
* be seen as a traditional "trie node", except it can hold several characters at the
* same time. A CharGroup essentially represents one or several characters in the middle
* same time. A PtNode essentially represents one or several characters in the middle
* of the trie tree; as such, it can be a terminal, and it can have children.
* In this in-memory representation, whether the CharGroup is a terminal or not is represented
* In this in-memory representation, whether the PtNode is a terminal or not is represented
* in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
* value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
* bigrams, but a non-terminal may not. Moreover, children, if present, are null.
*/
public static final class CharGroup {
public static final class PtNode {
public static final int NOT_A_TERMINAL = -1;
final int mChars[];
ArrayList<WeightedString> mShortcutTargets;
@ -119,11 +120,11 @@ public final class FusionDictionary implements Iterable<Word> {
// same time. Updating will update the AfterUpdate value, and the code will move them
// to BeforeUpdate before the next update pass.
// The update process does not need two versions of mCachedSize.
int mCachedSize; // The size, in bytes, of this char group.
int mCachedAddressBeforeUpdate; // The address of this char group (before update)
int mCachedAddressAfterUpdate; // The address of this char group (after update)
int mCachedSize; // The size, in bytes, of this PtNode.
int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency,
final boolean isNotAWord, final boolean isBlacklistEntry) {
mChars = chars;
@ -135,7 +136,7 @@ public final class FusionDictionary implements Iterable<Word> {
mIsBlacklistEntry = isBlacklistEntry;
}
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency,
final boolean isNotAWord, final boolean isBlacklistEntry,
final PtNodeArray children) {
@ -148,7 +149,7 @@ public final class FusionDictionary implements Iterable<Word> {
mIsBlacklistEntry = isBlacklistEntry;
}
public void addChild(CharGroup n) {
public void addChild(PtNode n) {
if (null == mChildren) {
mChildren = new PtNodeArray();
}
@ -245,7 +246,7 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
* Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to
* Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
* updated if they are higher than the existing ones.
*/
@ -407,13 +408,13 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
* Sanity check for a node array.
* Sanity check for a PtNode array.
*
* This method checks that all CharGroups in a node array are ordered as expected.
* This method checks that all PtNodes in a node array are ordered as expected.
* If they are, nothing happens. If they aren't, an exception is thrown.
*/
private void checkStack(PtNodeArray nodeArray) {
ArrayList<CharGroup> stack = nodeArray.mData;
private void checkStack(PtNodeArray ptNodeArray) {
ArrayList<PtNode> stack = ptNodeArray.mData;
int lastValue = -1;
for (int i = 0; i < stack.size(); ++i) {
int currentValue = stack.get(i).mChars[0];
@ -432,18 +433,18 @@ public final class FusionDictionary implements Iterable<Word> {
* @param frequency the bigram frequency
*/
public void setBigram(final String word1, final String word2, final int frequency) {
CharGroup charGroup = findWordInTree(mRootNodeArray, word1);
if (charGroup != null) {
final CharGroup charGroup2 = findWordInTree(mRootNodeArray, word2);
if (charGroup2 == null) {
PtNode ptNode = findWordInTree(mRootNodeArray, word1);
if (ptNode != null) {
final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2);
if (ptNode2 == null) {
add(getCodePoints(word2), 0, null, false /* isNotAWord */,
false /* isBlacklistEntry */);
// The chargroup for the first word may have moved by the above insertion,
// The PtNode for the first word may have moved by the above insertion,
// if word1 and word2 share a common stem that happens not to have been
// a cutting point until now. In this case, we need to refresh charGroup.
charGroup = findWordInTree(mRootNodeArray, word1);
// a cutting point until now. In this case, we need to refresh ptNode.
ptNode = findWordInTree(mRootNodeArray, word1);
}
charGroup.addBigram(word2, frequency);
ptNode.addBigram(word2, frequency);
} else {
throw new RuntimeException("First word of bigram not found");
}
@ -473,84 +474,83 @@ public final class FusionDictionary implements Iterable<Word> {
PtNodeArray currentNodeArray = mRootNodeArray;
int charIndex = 0;
CharGroup currentGroup = null;
PtNode currentPtNode = null;
int differentCharIndex = 0; // Set by the loop to the index of the char that differs
int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
currentGroup = currentNodeArray.mData.get(nodeIndex);
differentCharIndex = compareCharArrays(currentGroup.mChars, word, charIndex);
currentPtNode = currentNodeArray.mData.get(nodeIndex);
differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
if (ARRAYS_ARE_EQUAL != differentCharIndex
&& differentCharIndex < currentGroup.mChars.length) break;
if (null == currentGroup.mChildren) break;
charIndex += currentGroup.mChars.length;
&& differentCharIndex < currentPtNode.mChars.length) break;
if (null == currentPtNode.mChildren) break;
charIndex += currentPtNode.mChars.length;
if (charIndex >= word.length) break;
currentNodeArray = currentGroup.mChildren;
currentNodeArray = currentPtNode.mChildren;
nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
}
if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
// No node at this point to accept the word. Create one.
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
final CharGroup newGroup = new CharGroup(
Arrays.copyOfRange(word, charIndex, word.length),
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
currentNodeArray.mData.add(insertionIndex, newGroup);
currentNodeArray.mData.add(insertionIndex, newPtNode);
if (DBG) checkStack(currentNodeArray);
} else {
// There is a word with a common prefix.
if (differentCharIndex == currentGroup.mChars.length) {
if (differentCharIndex == currentPtNode.mChars.length) {
if (charIndex + differentCharIndex >= word.length) {
// The new word is a prefix of an existing word, but the node on which it
// should end already exists as is. Since the old CharGroup was not a terminal,
// should end already exists as is. Since the old PtNode was not a terminal,
// make it one by filling in its frequency and other attributes
currentGroup.update(frequency, shortcutTargets, null, isNotAWord,
currentPtNode.update(frequency, shortcutTargets, null, isNotAWord,
isBlacklistEntry);
} else {
// The new word matches the full old word and extends past it.
// We only have to create a new node and add it to the end of this.
final CharGroup newNode = new CharGroup(
final PtNode newNode = new PtNode(
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
shortcutTargets, null /* bigrams */, frequency, isNotAWord,
isBlacklistEntry);
currentGroup.mChildren = new PtNodeArray();
currentGroup.mChildren.mData.add(newNode);
currentPtNode.mChildren = new PtNodeArray();
currentPtNode.mChildren.mData.add(newNode);
}
} else {
if (0 == differentCharIndex) {
// Exact same word. Update the frequency if higher. This will also add the
// new shortcuts to the existing shortcut list if it already exists.
currentGroup.update(frequency, shortcutTargets, null,
currentGroup.mIsNotAWord && isNotAWord,
currentGroup.mIsBlacklistEntry || isBlacklistEntry);
currentPtNode.update(frequency, shortcutTargets, null,
currentPtNode.mIsNotAWord && isNotAWord,
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
} else {
// Partial prefix match only. We have to replace the current node with a node
// containing the current prefix and create two new ones for the tails.
PtNodeArray newChildren = new PtNodeArray();
final CharGroup newOldWord = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
currentGroup.mChars.length), currentGroup.mShortcutTargets,
currentGroup.mBigrams, currentGroup.mFrequency,
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry,
currentGroup.mChildren);
final PtNode newOldWord = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
currentPtNode.mBigrams, currentPtNode.mFrequency,
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
currentPtNode.mChildren);
newChildren.mData.add(newOldWord);
final CharGroup newParent;
final PtNode newParent;
if (charIndex + differentCharIndex >= word.length) {
newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
newParent = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
shortcutTargets, null /* bigrams */, frequency,
isNotAWord, isBlacklistEntry, newChildren);
} else {
newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
newParent = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
null /* shortcutTargets */, null /* bigrams */, -1,
false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
charIndex + differentCharIndex, word.length),
shortcutTargets, null /* bigrams */, frequency,
isNotAWord, isBlacklistEntry);
final int addIndex = word[charIndex + differentCharIndex]
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
newChildren.mData.add(addIndex, newWord);
}
currentNodeArray.mData.set(nodeIndex, newParent);
@ -589,29 +589,29 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
* Helper class that compares and sorts two chargroups according to their
* Helper class that compares and sorts two PtNodes according to their
* first element only. I repeat: ONLY the first element is considered, the rest
* is ignored.
* This comparator imposes orderings that are inconsistent with equals.
*/
static private final class CharGroupComparator implements java.util.Comparator<CharGroup> {
static private final class PtNodeComparator implements java.util.Comparator<PtNode> {
@Override
public int compare(CharGroup c1, CharGroup c2) {
if (c1.mChars[0] == c2.mChars[0]) return 0;
return c1.mChars[0] < c2.mChars[0] ? -1 : 1;
public int compare(PtNode p1, PtNode p2) {
if (p1.mChars[0] == p2.mChars[0]) return 0;
return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
}
}
final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator();
final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
/**
* Finds the insertion index of a character within a node array.
*/
private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
final ArrayList<CharGroup> data = nodeArray.mData;
final CharGroup reference = new CharGroup(new int[] { character },
final ArrayList<PtNode> data = nodeArray.mData;
final PtNode reference = new PtNode(new int[] { character },
null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
false /* isBlacklistEntry */);
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
return result >= 0 ? result : -result - 1;
}
@ -633,35 +633,37 @@ public final class FusionDictionary implements Iterable<Word> {
* Helper method to find a word in a given branch.
*/
@SuppressWarnings("unused")
public static CharGroup findWordInTree(PtNodeArray nodeArray, final String string) {
public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) {
int index = 0;
final StringBuilder checker = DBG ? new StringBuilder() : null;
final int[] codePoints = getCodePoints(string);
CharGroup currentGroup;
PtNode currentPtNode;
do {
int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
currentGroup = nodeArray.mData.get(indexOfGroup);
currentPtNode = nodeArray.mData.get(indexOfGroup);
if (codePoints.length - index < currentGroup.mChars.length) return null;
if (codePoints.length - index < currentPtNode.mChars.length) return null;
int newIndex = index;
while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) {
if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null;
while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
newIndex++;
}
index = newIndex;
if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
if (DBG) {
checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
}
if (index < codePoints.length) {
nodeArray = currentGroup.mChildren;
nodeArray = currentPtNode.mChildren;
}
} while (null != nodeArray && index < codePoints.length);
if (index < codePoints.length) return null;
if (!currentGroup.isTerminal()) return null;
if (!currentPtNode.isTerminal()) return null;
if (DBG && !string.equals(checker.toString())) return null;
return currentGroup;
return currentPtNode;
}
/**
@ -675,18 +677,18 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
* Recursively count the number of character groups in a given branch of the trie.
* Recursively count the number of PtNodes in a given branch of the trie.
*
* @param nodeArray the parent node.
* @return the number of char groups in all the branch under this node.
* @return the number of PtNodes in all the branch under this node.
*/
public static int countCharGroups(final PtNodeArray nodeArray) {
public static int countPtNodes(final PtNodeArray nodeArray) {
final int nodeSize = nodeArray.mData.size();
int size = nodeSize;
for (int i = nodeSize - 1; i >= 0; --i) {
CharGroup group = nodeArray.mData.get(i);
if (null != group.mChildren)
size += countCharGroups(group.mChildren);
PtNode ptNode = nodeArray.mData.get(i);
if (null != ptNode.mChildren)
size += countPtNodes(ptNode.mChildren);
}
return size;
}
@ -700,9 +702,9 @@ public final class FusionDictionary implements Iterable<Word> {
public static int countNodeArrays(final PtNodeArray nodeArray) {
int size = 1;
for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
CharGroup group = nodeArray.mData.get(i);
if (null != group.mChildren)
size += countNodeArrays(group.mChildren);
PtNode ptNode = nodeArray.mData.get(i);
if (null != ptNode.mChildren)
size += countNodeArrays(ptNode.mChildren);
}
return size;
}
@ -713,9 +715,9 @@ public final class FusionDictionary implements Iterable<Word> {
private static boolean hasBigramsInternal(final PtNodeArray nodeArray) {
if (null == nodeArray) return false;
for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
CharGroup group = nodeArray.mData.get(i);
if (null != group.mBigrams) return true;
if (hasBigramsInternal(group.mChildren)) return true;
PtNode ptNode = nodeArray.mData.get(i);
if (null != ptNode.mBigrams) return true;
if (hasBigramsInternal(ptNode.mChildren)) return true;
}
return false;
}
@ -748,8 +750,8 @@ public final class FusionDictionary implements Iterable<Word> {
MakedictLog.i("Do not merge tails");
return;
// MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root));
// MakedictLog.i("Number of groups : " + countCharGroups(root));
// MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root));
// MakedictLog.i("Number of PtNodes : " + countPtNodes(root));
//
// final HashMap<String, ArrayList<PtNodeArray>> repository =
// new HashMap<String, ArrayList<PtNodeArray>>();
@ -771,25 +773,25 @@ public final class FusionDictionary implements Iterable<Word> {
// if (a.data.size() != b.data.size()) return false;
// final int size = a.data.size();
// for (int i = size - 1; i >= 0; --i) {
// CharGroup aGroup = a.data.get(i);
// CharGroup bGroup = b.data.get(i);
// if (aGroup.frequency != bGroup.frequency) return false;
// if (aGroup.alternates == null && bGroup.alternates != null) return false;
// if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false;
// if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false;
// if (!isEqual(aGroup.children, bGroup.children)) return false;
// PtNode aPtNode = a.data.get(i);
// PtNode bPtNode = b.data.get(i);
// if (aPtNode.frequency != bPtNode.frequency) return false;
// if (aPtNode.alternates == null && bPtNode.alternates != null) return false;
// if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false;
// if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false;
// if (!isEqual(aPtNode.children, bPtNode.children)) return false;
// }
// return true;
// }
// static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner(
// final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) {
// final ArrayList<CharGroup> branches = nodeArray.data;
// final ArrayList<PtNode> branches = nodeArray.data;
// final int nodeSize = branches.size();
// for (int i = 0; i < nodeSize; ++i) {
// CharGroup group = branches.get(i);
// if (null != group.children) {
// String pseudoHash = getPseudoHash(group.children);
// PtNode ptNode = branches.get(i);
// if (null != ptNode.children) {
// String pseudoHash = getPseudoHash(ptNode.children);
// ArrayList<PtNodeArray> similarList = map.get(pseudoHash);
// if (null == similarList) {
// similarList = new ArrayList<PtNodeArray>();
@ -797,16 +799,16 @@ public final class FusionDictionary implements Iterable<Word> {
// }
// boolean merged = false;
// for (PtNodeArray similar : similarList) {
// if (isEqual(group.children, similar)) {
// group.children = similar;
// if (isEqual(ptNode.children, similar)) {
// ptNode.children = similar;
// merged = true;
// break;
// }
// }
// if (!merged) {
// similarList.add(group.children);
// similarList.add(ptNode.children);
// }
// mergeTailsInner(map, group.children);
// mergeTailsInner(map, ptNode.children);
// }
// }
// return map;
@ -814,9 +816,9 @@ public final class FusionDictionary implements Iterable<Word> {
// private static String getPseudoHash(final PtNodeArray nodeArray) {
// StringBuilder s = new StringBuilder();
// for (CharGroup g : nodeArray.data) {
// s.append(g.frequency);
// for (int ch : g.chars) {
// for (PtNode ptNode : nodeArray.data) {
// s.append(ptNode.frequency);
// for (int ch : ptNode.chars) {
// s.append(Character.toChars(ch));
// }
// }
@ -830,20 +832,20 @@ public final class FusionDictionary implements Iterable<Word> {
*/
public static final class DictionaryIterator implements Iterator<Word> {
private static final class Position {
public Iterator<CharGroup> pos;
public Iterator<PtNode> pos;
public int length;
public Position(ArrayList<CharGroup> groups) {
pos = groups.iterator();
public Position(ArrayList<PtNode> ptNodes) {
pos = ptNodes.iterator();
length = 0;
}
}
final StringBuilder mCurrentString;
final LinkedList<Position> mPositions;
public DictionaryIterator(ArrayList<CharGroup> root) {
public DictionaryIterator(ArrayList<PtNode> ptRoot) {
mCurrentString = new StringBuilder();
mPositions = new LinkedList<Position>();
final Position rootPos = new Position(root);
final Position rootPos = new Position(ptRoot);
mPositions.add(rootPos);
}
@ -864,20 +866,20 @@ public final class FusionDictionary implements Iterable<Word> {
do {
if (currentPos.pos.hasNext()) {
final CharGroup currentGroup = currentPos.pos.next();
final PtNode currentPtNode = currentPos.pos.next();
currentPos.length = mCurrentString.length();
for (int i : currentGroup.mChars) {
for (int i : currentPtNode.mChars) {
mCurrentString.append(Character.toChars(i));
}
if (null != currentGroup.mChildren) {
currentPos = new Position(currentGroup.mChildren.mData);
if (null != currentPtNode.mChildren) {
currentPos = new Position(currentPtNode.mChildren.mData);
currentPos.length = mCurrentString.length();
mPositions.addLast(currentPos);
}
if (currentGroup.mFrequency >= 0) {
return new Word(mCurrentString.toString(), currentGroup.mFrequency,
currentGroup.mShortcutTargets, currentGroup.mBigrams,
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry);
if (currentPtNode.mFrequency >= 0) {
return new Word(mCurrentString.toString(), currentPtNode.mFrequency,
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
}
} else {
mPositions.removeLast();

View File

@ -21,9 +21,9 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.util.ArrayList;
/**
* Raw char group info straight out of a file. This will contain numbers for addresses.
* Raw PtNode info straight out of a file. This will contain numbers for addresses.
*/
public final class CharGroupInfo {
public final class PtNodeInfo {
public final int mOriginalAddress;
public final int mEndAddress;
@ -35,7 +35,7 @@ public final class CharGroupInfo {
public final ArrayList<WeightedString> mShortcutTargets;
public final ArrayList<PendingAttribute> mBigrams;
public CharGroupInfo(final int originalAddress, final int endAddress, final int flags,
public PtNodeInfo(final int originalAddress, final int endAddress, final int flags,
final int[] characters, final int frequency, final int parentAddress,
final int childrenAddress, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<PendingAttribute> bigrams) {

View File

@ -21,7 +21,7 @@ import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncodin
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.JniUtils;
@ -99,14 +99,14 @@ public class Ver3DictDecoder implements DictDecoder {
if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS;
return address;
} else {
switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
return dictBuffer.readUnsignedByte();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
return dictBuffer.readUnsignedShort();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
return dictBuffer.readUnsignedInt24();
case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
default:
return FormatSpec.NO_CHILDREN_ADDRESS;
}
@ -122,8 +122,8 @@ public class Ver3DictDecoder implements DictDecoder {
final int targetFlags = dictBuffer.readUnsignedByte();
final String word = CharEncoding.readString(dictBuffer);
shortcutTargets.add(new WeightedString(word,
targetFlags & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY));
if (0 == (targetFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break;
targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
}
return dictBuffer.position() - pointerBefore;
}
@ -132,22 +132,22 @@ public class Ver3DictDecoder implements DictDecoder {
final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
int readLength = 0;
int bigramCount = 0;
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
final int bigramFlags = dictBuffer.readUnsignedByte();
++readLength;
final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE)
final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
? 1 : -1;
int bigramAddress = baseAddress + readLength;
switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
bigramAddress += sign * dictBuffer.readUnsignedByte();
readLength += 1;
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
bigramAddress += sign * dictBuffer.readUnsignedShort();
readLength += 2;
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
final int offset = (dictBuffer.readUnsignedByte() << 16)
+ dictBuffer.readUnsignedShort();
bigramAddress += sign * offset;
@ -156,9 +156,10 @@ public class Ver3DictDecoder implements DictDecoder {
default:
throw new RuntimeException("Has bigrams with no address");
}
bigrams.add(new PendingAttribute(bigramFlags & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY,
bigrams.add(new PendingAttribute(
bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
bigramAddress));
if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break;
if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
}
return readLength;
}
@ -236,7 +237,7 @@ public class Ver3DictDecoder implements DictDecoder {
// TODO: Make this buffer multi thread safe.
private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
@Override
public CharGroupInfo readPtNode(final int ptNodePos, final FormatOptions options) {
public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) {
int addressPointer = ptNodePos;
final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
++addressPointer;
@ -270,7 +271,7 @@ public class Ver3DictDecoder implements DictDecoder {
++addressPointer;
frequency = PtNodeReader.readFrequency(mDictBuffer);
} else {
frequency = CharGroup.NOT_A_TERMINAL;
frequency = PtNode.NOT_A_TERMINAL;
}
int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
@ -290,14 +291,13 @@ public class Ver3DictDecoder implements DictDecoder {
if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
bigrams = new ArrayList<PendingAttribute>();
addressPointer += PtNodeReader.readBigrams(mDictBuffer, bigrams, addressPointer);
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
MakedictLog.d("too many bigrams in a group.");
if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
MakedictLog.d("too many bigrams in a PtNode.");
}
} else {
bigrams = null;
}
return new CharGroupInfo(ptNodePos, addressPointer, flags, characters, frequency,
return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency,
parentAddress, childrenAddress, shortcutTargets, bigrams);
}

View File

@ -25,7 +25,7 @@ import android.util.SparseArray;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.ByteArrayDictBuffer;
@ -239,17 +239,17 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
// check unigram
for (final String word : words) {
final CharGroup cg = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(cg);
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(ptNode);
}
// check bigram
for (int i = 0; i < bigrams.size(); ++i) {
final int w1 = bigrams.keyAt(i);
for (final int w2 : bigrams.valueAt(i)) {
final CharGroup cg = FusionDictionary.findWordInTree(dict.mRootNodeArray,
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray,
words.get(w1));
assertNotNull(words.get(w1) + "," + words.get(w2), cg.getBigram(words.get(w2)));
assertNotNull(words.get(w1) + "," + words.get(w2), ptNode.getBigram(words.get(w2)));
}
}
@ -257,11 +257,11 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
if (shortcutMap != null) {
for (final Map.Entry<String, List<String>> entry : shortcutMap.entrySet()) {
assertTrue(words.contains(entry.getKey()));
final CharGroup group = FusionDictionary.findWordInTree(dict.mRootNodeArray,
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray,
entry.getKey());
for (final String word : entry.getValue()) {
assertNotNull("shortcut not found: " + entry.getKey() + ", " + word,
group.getShortcut(word));
ptNode.getShortcut(word));
}
}
}

View File

@ -86,8 +86,8 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
return builder.toString();
}
private static void printCharGroup(final CharGroupInfo info) {
Log.d(TAG, " CharGroup at " + info.mOriginalAddress);
private static void printPtNode(final PtNodeInfo info) {
Log.d(TAG, " PtNode at " + info.mOriginalAddress);
Log.d(TAG, " flags = " + info.mFlags);
Log.d(TAG, " parentAddress = " + info.mParentAddress);
Log.d(TAG, " characters = " + new String(info.mCharacters, 0,
@ -115,12 +115,12 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
final FormatSpec.FormatOptions formatOptions) {
final DictBuffer dictBuffer = dictDecoder.getDictBuffer();
Log.d(TAG, "Node at " + dictBuffer.position());
final int count = BinaryDictDecoderUtils.readCharGroupCount(dictBuffer);
Log.d(TAG, " charGroupCount = " + count);
final int count = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer);
Log.d(TAG, " ptNodeCount = " + count);
for (int i = 0; i < count; ++i) {
final CharGroupInfo currentInfo = dictDecoder.readPtNode(dictBuffer.position(),
final PtNodeInfo currentInfo = dictDecoder.readPtNode(dictBuffer.position(),
formatOptions);
printCharGroup(currentInfo);
printPtNode(currentInfo);
}
if (formatOptions.mSupportsDynamicUpdate) {
final int forwardLinkAddress = dictBuffer.readUnsignedInt24();
@ -155,11 +155,11 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
*
* @param dictDecoder the dict decoder
* @param word the word searched
* @return the found group
* @return the found ptNodeInfo
* @throws IOException
* @throws UnsupportedFormatException
*/
private static CharGroupInfo findWordByBinaryDictReader(final Ver3DictDecoder dictDecoder,
private static PtNodeInfo findWordByBinaryDictReader(final Ver3DictDecoder dictDecoder,
final String word) throws IOException, UnsupportedFormatException {
int position = BinaryDictIOUtils.getTerminalPosition(dictDecoder, word);
final DictBuffer dictBuffer = dictDecoder.getDictBuffer();
@ -172,10 +172,10 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
return null;
}
private CharGroupInfo findWordFromFile(final File file, final String word) {
CharGroupInfo info = null;
private PtNodeInfo findWordFromFile(final File file, final String word) {
final Ver3DictDecoder dictDecoder = new Ver3DictDecoder(file);
PtNodeInfo info = null;
try {
final Ver3DictDecoder dictDecoder = new Ver3DictDecoder(file);
dictDecoder.openDictBuffer();
info = findWordByBinaryDictReader(dictDecoder, word);
} catch (IOException e) {
@ -328,7 +328,7 @@ public class BinaryDictIOUtilsTests extends AndroidTestCase {
insertAndCheckWord(file, "banana", 0, false, null, null);
insertAndCheckWord(file, "recursive", 60, true, banana, null);
final CharGroupInfo info = findWordFromFile(file, "recursive");
final PtNodeInfo info = findWordFromFile(file, "recursive");
int bananaPos = getWordPosition(file, "banana");
assertNotNull(info.mBigrams);
assertEquals(info.mBigrams.size(), 1);

View File

@ -25,7 +25,7 @@ import com.android.inputmethod.latin.makedict.DictDecoder;
import com.android.inputmethod.latin.makedict.DictEncoder;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.Ver3DictDecoder;
import com.android.inputmethod.latin.makedict.Ver3DictEncoder;
import com.android.inputmethod.latin.personalization.UserHistoryDictionaryBigramList;
@ -89,12 +89,12 @@ public class UserHistoryDictIOUtilsTests extends AndroidTestCase
private void checkWordInFusionDict(final FusionDictionary dict, final String word,
final ArrayList<String> expectedBigrams) {
final CharGroup group = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(group);
assertTrue(group.isTerminal());
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(ptNode);
assertTrue(ptNode.isTerminal());
for (final String bigram : expectedBigrams) {
assertNotNull(group.getBigram(bigram));
assertNotNull(ptNode.getBigram(bigram));
}
}

View File

@ -17,7 +17,7 @@
package com.android.inputmethod.latin.dicttool;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.makedict.Word;
@ -121,7 +121,7 @@ public class Diff extends Dicttool.Command {
private static void diffWords(final FusionDictionary dict0, final FusionDictionary dict1) {
boolean hasDifferences = false;
for (final Word word0 : dict0) {
final CharGroup word1 = FusionDictionary.findWordInTree(dict1.mRootNodeArray,
final PtNode word1 = FusionDictionary.findWordInTree(dict1.mRootNodeArray,
word0.mWord);
if (null == word1) {
// This word is not in dict1
@ -151,7 +151,7 @@ public class Diff extends Dicttool.Command {
}
}
for (final Word word1 : dict1) {
final CharGroup word0 = FusionDictionary.findWordInTree(dict0.mRootNodeArray,
final PtNode word0 = FusionDictionary.findWordInTree(dict0.mRootNodeArray,
word1.mWord);
if (null == word0) {
// This word is not in dict0

View File

@ -18,7 +18,7 @@ package com.android.inputmethod.latin.dicttool;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.makedict.Word;
@ -65,20 +65,20 @@ public class Info extends Dicttool.Command {
private static void showWordInfo(final FusionDictionary dict, final String word,
final boolean plumbing) {
final CharGroup group = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
if (null == group) {
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
if (null == ptNode) {
System.out.println(word + " is not in the dictionary");
return;
}
System.out.println("Word: " + word);
System.out.println(" Freq: " + group.getFrequency());
if (group.getIsNotAWord()) {
System.out.println(" Freq: " + ptNode.getFrequency());
if (ptNode.getIsNotAWord()) {
System.out.println(" Is not a word");
}
if (group.getIsBlacklistEntry()) {
if (ptNode.getIsBlacklistEntry()) {
System.out.println(" Is a blacklist entry");
}
final ArrayList<WeightedString> shortcutTargets = group.getShortcutTargets();
final ArrayList<WeightedString> shortcutTargets = ptNode.getShortcutTargets();
if (null == shortcutTargets || shortcutTargets.isEmpty()) {
System.out.println(" No shortcuts");
} else {
@ -88,7 +88,7 @@ public class Info extends Dicttool.Command {
? "whitelist" : shortcutTarget.mFrequency) + ")");
}
}
final ArrayList<WeightedString> bigrams = group.getBigrams();
final ArrayList<WeightedString> bigrams = ptNode.getBigrams();
if (null == bigrams || bigrams.isEmpty()) {
System.out.println(" No bigrams");
} else {

View File

@ -17,7 +17,7 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.Word;
@ -72,8 +72,8 @@ public class FusionDictionaryTest extends TestCase {
assertNotNull(dict);
for (final String word : words) {
if (--limit < 0) return;
final CharGroup cg = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(cg);
final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word);
assertNotNull(ptNode);
}
}