Merge "[FD4] Separate cached address before/after update for groups"
This commit is contained in:
commit
289df0ecad
2 changed files with 80 additions and 54 deletions
|
@ -518,14 +518,56 @@ public final class BinaryDictInputOutput {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds the absolute address of a word in the dictionary.
|
* Get the offset from a position inside a current node to a target node, during update.
|
||||||
*
|
*
|
||||||
* @param dict the dictionary in which to search.
|
* If the current node is before the target node, the target node has not been updated yet,
|
||||||
* @param word the word we are searching for.
|
* so we should return the offset from the old position of the current node to the old position
|
||||||
* @return the word address. If it is not found, an exception is thrown.
|
* of the target node. If on the other hand the target is before the current node, it already
|
||||||
|
* has been updated, so we should return the offset from the new position in the current node
|
||||||
|
* to the new position in the target node.
|
||||||
|
* @param currentNode the node containing the CharGroup where the offset will be written
|
||||||
|
* @param offsetFromStartOfCurrentNode the offset, in bytes, from the start of currentNode
|
||||||
|
* @param targetNode the target node to get the offset to
|
||||||
|
* @return the offset to the target node
|
||||||
*/
|
*/
|
||||||
private static int findAddressOfWord(final FusionDictionary dict, final String word) {
|
private static int getOffsetToTargetNodeDuringUpdate(final Node currentNode,
|
||||||
return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress;
|
final int offsetFromStartOfCurrentNode, final Node targetNode) {
|
||||||
|
final boolean isTargetBeforeCurrent = (targetNode.mCachedAddressBeforeUpdate
|
||||||
|
< currentNode.mCachedAddressBeforeUpdate);
|
||||||
|
if (isTargetBeforeCurrent) {
|
||||||
|
return targetNode.mCachedAddressAfterUpdate
|
||||||
|
- (currentNode.mCachedAddressAfterUpdate + offsetFromStartOfCurrentNode);
|
||||||
|
} else {
|
||||||
|
return targetNode.mCachedAddressBeforeUpdate
|
||||||
|
- (currentNode.mCachedAddressBeforeUpdate + offsetFromStartOfCurrentNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the offset from a position inside a current node to a target CharGroup, during update.
|
||||||
|
* @param currentNode the node containing the CharGroup where the offset will be written
|
||||||
|
* @param offsetFromStartOfCurrentNode the offset, in bytes, from the start of currentNode
|
||||||
|
* @param targetCharGroup the target CharGroup to get the offset to
|
||||||
|
* @return the offset to the target CharGroup
|
||||||
|
*/
|
||||||
|
// TODO: is there any way to factorize this method with the one above?
|
||||||
|
private static int getOffsetToTargetCharGroupDuringUpdate(final Node currentNode,
|
||||||
|
final int offsetFromStartOfCurrentNode, final CharGroup targetCharGroup) {
|
||||||
|
final int oldOffsetBasePoint = currentNode.mCachedAddressBeforeUpdate
|
||||||
|
+ offsetFromStartOfCurrentNode;
|
||||||
|
final boolean isTargetBeforeCurrent = (targetCharGroup.mCachedAddressBeforeUpdate
|
||||||
|
< oldOffsetBasePoint);
|
||||||
|
// If the target is before the current node, then its address has already been updated.
|
||||||
|
// We can use the AfterUpdate member, and compare it to our own member after update.
|
||||||
|
// Otherwise, the AfterUpdate member is not updated yet, so we need to use the BeforeUpdate
|
||||||
|
// member, and of course we have to compare this to our own address before update.
|
||||||
|
if (isTargetBeforeCurrent) {
|
||||||
|
final int newOffsetBasePoint = currentNode.mCachedAddressAfterUpdate
|
||||||
|
+ offsetFromStartOfCurrentNode;
|
||||||
|
return targetCharGroup.mCachedAddressAfterUpdate - newOffsetBasePoint;
|
||||||
|
} else {
|
||||||
|
return targetCharGroup.mCachedAddressBeforeUpdate - oldOffsetBasePoint;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -548,30 +590,28 @@ public final class BinaryDictInputOutput {
|
||||||
boolean changed = false;
|
boolean changed = false;
|
||||||
int size = getGroupCountSize(node);
|
int size = getGroupCountSize(node);
|
||||||
for (CharGroup group : node.mData) {
|
for (CharGroup group : node.mData) {
|
||||||
if (group.mCachedAddress != node.mCachedAddressBeforeUpdate + size) {
|
group.mCachedAddressAfterUpdate = node.mCachedAddressAfterUpdate + size;
|
||||||
|
if (group.mCachedAddressAfterUpdate != group.mCachedAddressBeforeUpdate) {
|
||||||
changed = true;
|
changed = true;
|
||||||
group.mCachedAddress = node.mCachedAddressBeforeUpdate + size;
|
|
||||||
}
|
}
|
||||||
int groupSize = getGroupHeaderSize(group, formatOptions);
|
int groupSize = getGroupHeaderSize(group, formatOptions);
|
||||||
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
|
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
|
||||||
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
|
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
|
||||||
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
||||||
} else if (null != group.mChildren) {
|
} else if (null != group.mChildren) {
|
||||||
final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size;
|
|
||||||
final int offset = group.mChildren.mCachedAddressBeforeUpdate - offsetBasePoint;
|
|
||||||
if (formatOptions.mSupportsDynamicUpdate) {
|
if (formatOptions.mSupportsDynamicUpdate) {
|
||||||
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
||||||
} else {
|
} else {
|
||||||
groupSize += getByteSize(offset);
|
groupSize += getByteSize(getOffsetToTargetNodeDuringUpdate(node,
|
||||||
|
groupSize + size, group.mChildren));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
groupSize += getShortcutListSize(group.mShortcutTargets);
|
groupSize += getShortcutListSize(group.mShortcutTargets);
|
||||||
if (null != group.mBigrams) {
|
if (null != group.mBigrams) {
|
||||||
for (WeightedString bigram : group.mBigrams) {
|
for (WeightedString bigram : group.mBigrams) {
|
||||||
final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size
|
final int offset = getOffsetToTargetCharGroupDuringUpdate(node,
|
||||||
+ FormatSpec.GROUP_FLAGS_SIZE;
|
groupSize + size + FormatSpec.GROUP_FLAGS_SIZE,
|
||||||
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
|
FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord));
|
||||||
final int offset = addressOfBigram - offsetBasePoint;
|
|
||||||
groupSize += getByteSize(offset) + FormatSpec.GROUP_FLAGS_SIZE;
|
groupSize += getByteSize(offset) + FormatSpec.GROUP_FLAGS_SIZE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -603,7 +643,8 @@ public final class BinaryDictInputOutput {
|
||||||
int groupCountSize = getGroupCountSize(n);
|
int groupCountSize = getGroupCountSize(n);
|
||||||
int groupOffset = 0;
|
int groupOffset = 0;
|
||||||
for (final CharGroup g : n.mData) {
|
for (final CharGroup g : n.mData) {
|
||||||
g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
|
g.mCachedAddressBeforeUpdate = g.mCachedAddressAfterUpdate =
|
||||||
|
groupCountSize + nodeOffset + groupOffset;
|
||||||
groupOffset += g.mCachedSize;
|
groupOffset += g.mCachedSize;
|
||||||
}
|
}
|
||||||
final int nodeSize = groupCountSize + groupOffset
|
final int nodeSize = groupCountSize + groupOffset
|
||||||
|
@ -618,36 +659,14 @@ public final class BinaryDictInputOutput {
|
||||||
* Updates the cached addresses of nodes after recomputing their new positions.
|
* Updates the cached addresses of nodes after recomputing their new positions.
|
||||||
*
|
*
|
||||||
* @param flatNodes the array of nodes.
|
* @param flatNodes the array of nodes.
|
||||||
* @param formatOptions file format options.
|
|
||||||
* @return the byte size of the entire stack.
|
|
||||||
*/
|
*/
|
||||||
private static int updateNodeCachedAddresses(final ArrayList<Node> flatNodes,
|
private static void updateNodeCachedAddresses(final ArrayList<Node> flatNodes) {
|
||||||
final FormatOptions formatOptions) {
|
|
||||||
int nodeOffset = 0;
|
|
||||||
for (final Node n : flatNodes) {
|
for (final Node n : flatNodes) {
|
||||||
n.mCachedAddressBeforeUpdate = n.mCachedAddressAfterUpdate;
|
n.mCachedAddressBeforeUpdate = n.mCachedAddressAfterUpdate;
|
||||||
int groupCountSize = getGroupCountSize(n);
|
|
||||||
int groupOffset = 0;
|
|
||||||
for (final CharGroup g : n.mData) {
|
for (final CharGroup g : n.mData) {
|
||||||
// TODO: just copy cached address after update into cached address before update
|
g.mCachedAddressBeforeUpdate = g.mCachedAddressAfterUpdate;
|
||||||
// when the two fields are separated.
|
|
||||||
g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
|
|
||||||
groupOffset += g.mCachedSize;
|
|
||||||
}
|
}
|
||||||
final int nodeSize = groupCountSize + groupOffset
|
|
||||||
+ (formatOptions.mSupportsDynamicUpdate
|
|
||||||
? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0);
|
|
||||||
if (nodeSize != n.mCachedSize) {
|
|
||||||
// TODO: remove this test when the addresses are separated
|
|
||||||
throw new RuntimeException("Bug : Stored and computed node size differ");
|
|
||||||
}
|
|
||||||
if (nodeOffset != n.mCachedAddressAfterUpdate) {
|
|
||||||
// TODO: remove this test when the code is well tested
|
|
||||||
throw new RuntimeException("Bug : Stored and computed node address differ");
|
|
||||||
}
|
|
||||||
nodeOffset += n.mCachedSize;
|
|
||||||
}
|
}
|
||||||
return nodeOffset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -665,7 +684,7 @@ public final class BinaryDictInputOutput {
|
||||||
// Assign my address to children's parent address
|
// Assign my address to children's parent address
|
||||||
// Here BeforeUpdate and AfterUpdate addresses have the same value, so it
|
// Here BeforeUpdate and AfterUpdate addresses have the same value, so it
|
||||||
// does not matter which we use.
|
// does not matter which we use.
|
||||||
group.mChildren.mCachedParentAddress = group.mCachedAddress
|
group.mChildren.mCachedParentAddress = group.mCachedAddressAfterUpdate
|
||||||
- group.mChildren.mCachedAddressAfterUpdate;
|
- group.mChildren.mCachedAddressAfterUpdate;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -710,7 +729,7 @@ public final class BinaryDictInputOutput {
|
||||||
nodeStartOffset += newNodeSize;
|
nodeStartOffset += newNodeSize;
|
||||||
changesDone |= changed;
|
changesDone |= changed;
|
||||||
}
|
}
|
||||||
updateNodeCachedAddresses(flatNodes, formatOptions);
|
updateNodeCachedAddresses(flatNodes);
|
||||||
++passes;
|
++passes;
|
||||||
if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
|
if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
|
||||||
} while (changesDone);
|
} while (changesDone);
|
||||||
|
@ -1003,10 +1022,11 @@ public final class BinaryDictInputOutput {
|
||||||
}
|
}
|
||||||
int groupAddress = index;
|
int groupAddress = index;
|
||||||
for (int i = 0; i < groupCount; ++i) {
|
for (int i = 0; i < groupCount; ++i) {
|
||||||
CharGroup group = node.mData.get(i);
|
final CharGroup group = node.mData.get(i);
|
||||||
if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not "
|
if (index != group.mCachedAddressAfterUpdate) {
|
||||||
+ "the same as the cached address of the group : "
|
throw new RuntimeException("Bug: write index is not the same as the cached address "
|
||||||
+ index + " <> " + group.mCachedAddress);
|
+ "of the group : " + index + " <> " + group.mCachedAddressAfterUpdate);
|
||||||
|
}
|
||||||
groupAddress += getGroupHeaderSize(group, formatOptions);
|
groupAddress += getGroupHeaderSize(group, formatOptions);
|
||||||
// Sanity checks.
|
// Sanity checks.
|
||||||
if (DBG && group.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
|
if (DBG && group.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
|
||||||
|
@ -1018,14 +1038,14 @@ public final class BinaryDictInputOutput {
|
||||||
final int childrenOffset = null == group.mChildren
|
final int childrenOffset = null == group.mChildren
|
||||||
? FormatSpec.NO_CHILDREN_ADDRESS
|
? FormatSpec.NO_CHILDREN_ADDRESS
|
||||||
: group.mChildren.mCachedAddressAfterUpdate - groupAddress;
|
: group.mChildren.mCachedAddressAfterUpdate - groupAddress;
|
||||||
byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
|
buffer[index++] =
|
||||||
buffer[index++] = flags;
|
makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
|
||||||
|
|
||||||
if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
|
if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
|
||||||
index = writeParentAddress(buffer, index, parentAddress, formatOptions);
|
index = writeParentAddress(buffer, index, parentAddress, formatOptions);
|
||||||
} else {
|
} else {
|
||||||
index = writeParentAddress(buffer, index,
|
index = writeParentAddress(buffer, index, parentAddress
|
||||||
parentAddress + (node.mCachedAddressAfterUpdate - group.mCachedAddress),
|
+ (node.mCachedAddressAfterUpdate - group.mCachedAddressAfterUpdate),
|
||||||
formatOptions);
|
formatOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1076,7 +1096,7 @@ public final class BinaryDictInputOutput {
|
||||||
final WeightedString bigram = bigramIterator.next();
|
final WeightedString bigram = bigramIterator.next();
|
||||||
final CharGroup target =
|
final CharGroup target =
|
||||||
FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord);
|
FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord);
|
||||||
final int addressOfBigram = target.mCachedAddress;
|
final int addressOfBigram = target.mCachedAddressAfterUpdate;
|
||||||
final int unigramFrequencyForThisWord = target.mFrequency;
|
final int unigramFrequencyForThisWord = target.mFrequency;
|
||||||
++groupAddress;
|
++groupAddress;
|
||||||
final int offset = addressOfBigram - groupAddress;
|
final int offset = addressOfBigram - groupAddress;
|
||||||
|
|
|
@ -111,9 +111,15 @@ public final class FusionDictionary implements Iterable<Word> {
|
||||||
Node mChildren;
|
Node mChildren;
|
||||||
boolean mIsNotAWord; // Only a shortcut
|
boolean mIsNotAWord; // Only a shortcut
|
||||||
boolean mIsBlacklistEntry;
|
boolean mIsBlacklistEntry;
|
||||||
// The two following members to help with binary generation
|
// mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
|
||||||
int mCachedSize;
|
// generation. Before and After always hold the same value except during dictionary
|
||||||
int mCachedAddress;
|
// address compression, where the update process needs to know about both values at the
|
||||||
|
// same time. Updating will update the AfterUpdate value, and the code will move them
|
||||||
|
// to BeforeUpdate before the next update pass.
|
||||||
|
// The update process does not need two versions of mCachedSize.
|
||||||
|
int mCachedSize; // The size, in bytes, of this char group.
|
||||||
|
int mCachedAddressBeforeUpdate; // The address of this char group (before update)
|
||||||
|
int mCachedAddressAfterUpdate; // The address of this char group (after update)
|
||||||
|
|
||||||
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final int frequency,
|
final ArrayList<WeightedString> bigrams, final int frequency,
|
||||||
|
|
Loading…
Reference in a new issue