am 2ef87aee
: Merge "Make PtNode have ProbabilityInfo instead of raw value."
* commit '2ef87aee3e36e2c1121b454e321e59b4b450dd82': Make PtNode have ProbabilityInfo instead of raw value.
This commit is contained in:
commit
b4fbf9ac2f
20 changed files with 196 additions and 140 deletions
|
@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
import com.android.inputmethod.latin.utils.CollectionUtils;
|
import com.android.inputmethod.latin.utils.CollectionUtils;
|
||||||
|
|
||||||
|
@ -56,22 +57,23 @@ public class DictionaryWriter extends AbstractDictionaryWriter {
|
||||||
// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
|
// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
|
||||||
// considering performance regression.
|
// considering performance regression.
|
||||||
@Override
|
@Override
|
||||||
public void addUnigramWord(final String word, final String shortcutTarget, final int frequency,
|
public void addUnigramWord(final String word, final String shortcutTarget,
|
||||||
final int shortcutFreq, final boolean isNotAWord) {
|
final int probability, final int shortcutProbability, final boolean isNotAWord) {
|
||||||
if (shortcutTarget == null) {
|
if (shortcutTarget == null) {
|
||||||
mFusionDictionary.add(word, frequency, null, isNotAWord);
|
mFusionDictionary.add(word, new ProbabilityInfo(probability), null, isNotAWord);
|
||||||
} else {
|
} else {
|
||||||
// TODO: Do this in the subclass, with this class taking an arraylist.
|
// TODO: Do this in the subclass, with this class taking an arraylist.
|
||||||
final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
|
final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
|
||||||
shortcutTargets.add(new WeightedString(shortcutTarget, shortcutFreq));
|
shortcutTargets.add(new WeightedString(shortcutTarget, shortcutProbability));
|
||||||
mFusionDictionary.add(word, frequency, shortcutTargets, isNotAWord);
|
mFusionDictionary.add(word, new ProbabilityInfo(probability), shortcutTargets,
|
||||||
|
isNotAWord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void addBigramWords(final String word0, final String word1, final int frequency,
|
public void addBigramWords(final String word0, final String word1, final int probability,
|
||||||
final boolean isValid, final long lastModifiedTime) {
|
final boolean isValid, final long lastModifiedTime) {
|
||||||
mFusionDictionary.setBigram(word0, word1, frequency);
|
mFusionDictionary.setBigram(word0, word1, new ProbabilityInfo(probability));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -408,7 +408,7 @@ public final class BinaryDictDecoderUtils {
|
||||||
private static WeightedString getWordAtPositionWithParentAddress(final DictDecoder dictDecoder,
|
private static WeightedString getWordAtPositionWithParentAddress(final DictDecoder dictDecoder,
|
||||||
final int pos, final FormatOptions options) {
|
final int pos, final FormatOptions options) {
|
||||||
int currentPos = pos;
|
int currentPos = pos;
|
||||||
int frequency = Integer.MIN_VALUE;
|
ProbabilityInfo probabilityInfo = null;
|
||||||
final StringBuilder builder = new StringBuilder();
|
final StringBuilder builder = new StringBuilder();
|
||||||
// the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
|
// the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
|
||||||
for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
|
for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
|
||||||
|
@ -424,13 +424,15 @@ public final class BinaryDictDecoderUtils {
|
||||||
MakedictLog.d("Too many jumps - probably a bug");
|
MakedictLog.d("Too many jumps - probably a bug");
|
||||||
}
|
}
|
||||||
} while (BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, options));
|
} while (BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, options));
|
||||||
if (Integer.MIN_VALUE == frequency) frequency = currentInfo.mFrequency;
|
if (probabilityInfo == null) {
|
||||||
|
probabilityInfo = currentInfo.mProbabilityInfo;
|
||||||
|
}
|
||||||
builder.insert(0,
|
builder.insert(0,
|
||||||
new String(currentInfo.mCharacters, 0, currentInfo.mCharacters.length));
|
new String(currentInfo.mCharacters, 0, currentInfo.mCharacters.length));
|
||||||
if (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) break;
|
if (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) break;
|
||||||
currentPos = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
|
currentPos = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
|
||||||
}
|
}
|
||||||
return new WeightedString(builder.toString(), frequency);
|
return new WeightedString(builder.toString(), probabilityInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static WeightedString getWordAtPositionWithoutParentAddress(
|
private static WeightedString getWordAtPositionWithoutParentAddress(
|
||||||
|
@ -448,7 +450,7 @@ public final class BinaryDictDecoderUtils {
|
||||||
groupPos = info.mEndAddress;
|
groupPos = info.mEndAddress;
|
||||||
if (info.mOriginalAddress == pos) {
|
if (info.mOriginalAddress == pos) {
|
||||||
builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
|
builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
|
||||||
result = new WeightedString(builder.toString(), info.mFrequency);
|
result = new WeightedString(builder.toString(), info.mProbabilityInfo);
|
||||||
break; // and return
|
break; // and return
|
||||||
}
|
}
|
||||||
if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
|
if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
|
||||||
|
@ -527,13 +529,13 @@ public final class BinaryDictDecoderUtils {
|
||||||
}
|
}
|
||||||
nodeArrayContents.add(
|
nodeArrayContents.add(
|
||||||
new PtNode(info.mCharacters, shortcutTargets, bigrams,
|
new PtNode(info.mCharacters, shortcutTargets, bigrams,
|
||||||
info.mFrequency,
|
info.mProbabilityInfo,
|
||||||
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
|
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
|
||||||
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED), children));
|
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED), children));
|
||||||
} else {
|
} else {
|
||||||
nodeArrayContents.add(
|
nodeArrayContents.add(
|
||||||
new PtNode(info.mCharacters, shortcutTargets, bigrams,
|
new PtNode(info.mCharacters, shortcutTargets, bigrams,
|
||||||
info.mFrequency,
|
info.mProbabilityInfo,
|
||||||
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
|
0 != (info.mFlags & FormatSpec.FLAG_IS_NOT_A_WORD),
|
||||||
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED)));
|
0 != (info.mFlags & FormatSpec.FLAG_IS_BLACKLISTED)));
|
||||||
}
|
}
|
||||||
|
@ -611,7 +613,7 @@ public final class BinaryDictDecoderUtils {
|
||||||
newDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
newDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
||||||
wordProperty.mIsNotAWord);
|
wordProperty.mIsNotAWord);
|
||||||
} else {
|
} else {
|
||||||
newDict.add(wordProperty.mWord, wordProperty.getProbability(),
|
newDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -620,7 +622,7 @@ public final class BinaryDictDecoderUtils {
|
||||||
// words that are not also registered as unigrams so we don't have to avoid
|
// words that are not also registered as unigrams so we don't have to avoid
|
||||||
// them explicitly here.
|
// them explicitly here.
|
||||||
for (final WeightedString bigram : wordProperty.mBigrams) {
|
for (final WeightedString bigram : wordProperty.mBigrams) {
|
||||||
newDict.setBigram(wordProperty.mWord, bigram.mWord, bigram.getProbability());
|
newDict.setBigram(wordProperty.mWord, bigram.mWord, bigram.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -673,7 +673,7 @@ public class BinaryDictEncoderUtils {
|
||||||
|
|
||||||
/* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset,
|
/* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset,
|
||||||
final FormatOptions formatOptions) {
|
final FormatOptions formatOptions) {
|
||||||
return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0,
|
return (byte) makePtNodeFlags(node.mChars.length > 1, node.isTerminal(),
|
||||||
getByteSize(childrenOffset),
|
getByteSize(childrenOffset),
|
||||||
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
|
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
|
||||||
node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions);
|
node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions);
|
||||||
|
@ -833,10 +833,10 @@ public class BinaryDictEncoderUtils {
|
||||||
+ ptNode.mCachedAddressAfterUpdate);
|
+ ptNode.mCachedAddressAfterUpdate);
|
||||||
}
|
}
|
||||||
// Sanity checks.
|
// Sanity checks.
|
||||||
if (DBG && ptNode.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
|
if (DBG && ptNode.getProbability() > FormatSpec.MAX_TERMINAL_FREQUENCY) {
|
||||||
throw new RuntimeException("A node has a frequency > "
|
throw new RuntimeException("A node has a frequency > "
|
||||||
+ FormatSpec.MAX_TERMINAL_FREQUENCY
|
+ FormatSpec.MAX_TERMINAL_FREQUENCY
|
||||||
+ " : " + ptNode.mFrequency);
|
+ " : " + ptNode.mProbabilityInfo.toString());
|
||||||
}
|
}
|
||||||
dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
|
dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
|
||||||
}
|
}
|
||||||
|
@ -871,7 +871,7 @@ public class BinaryDictEncoderUtils {
|
||||||
for (final PtNode ptNode : ptNodeArray.mData) {
|
for (final PtNode ptNode : ptNodeArray.mData) {
|
||||||
++ptNodes;
|
++ptNodes;
|
||||||
if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length;
|
if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length;
|
||||||
if (ptNode.mFrequency >= 0) {
|
if (ptNode.isTerminal()) {
|
||||||
if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
|
if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
|
||||||
firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
|
firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
|
||||||
if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
|
if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
|
||||||
|
|
|
@ -91,21 +91,23 @@ public final class BinaryDictIOUtils {
|
||||||
stack.pop();
|
stack.pop();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
PtNodeInfo info = dictDecoder.readPtNode(p.mAddress, formatOptions);
|
final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress, formatOptions);
|
||||||
for (int i = 0; i < info.mCharacters.length; ++i) {
|
for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
|
||||||
pushedChars[index++] = info.mCharacters[i];
|
pushedChars[index++] = ptNodeInfo.mCharacters[i];
|
||||||
}
|
}
|
||||||
p.mPosition++;
|
p.mPosition++;
|
||||||
|
|
||||||
final boolean isMovedPtNode = isMovedPtNode(info.mFlags,
|
final boolean isMovedPtNode = isMovedPtNode(ptNodeInfo.mFlags,
|
||||||
formatOptions);
|
formatOptions);
|
||||||
final boolean isDeletedPtNode = isDeletedPtNode(info.mFlags,
|
final boolean isDeletedPtNode = isDeletedPtNode(ptNodeInfo.mFlags,
|
||||||
formatOptions);
|
formatOptions);
|
||||||
if (!isMovedPtNode && !isDeletedPtNode
|
if (!isMovedPtNode && !isDeletedPtNode && ptNodeInfo.isTerminal()) {// found word
|
||||||
&& info.mFrequency != FusionDictionary.PtNode.NOT_A_TERMINAL) {// found word
|
words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
|
||||||
words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
|
frequencies.put(
|
||||||
frequencies.put(info.mOriginalAddress, info.mFrequency);
|
ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
|
||||||
if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
|
if (ptNodeInfo.mBigrams != null) {
|
||||||
|
bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p.mPosition == p.mNumOfPtNode) {
|
if (p.mPosition == p.mNumOfPtNode) {
|
||||||
|
@ -127,8 +129,8 @@ public final class BinaryDictIOUtils {
|
||||||
p.mAddress = dictDecoder.getPosition();
|
p.mAddress = dictDecoder.getPosition();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isMovedPtNode && hasChildrenAddress(info.mChildrenAddress)) {
|
if (!isMovedPtNode && hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
|
||||||
final Position childrenPos = new Position(info.mChildrenAddress, index);
|
final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
|
||||||
stack.push(childrenPos);
|
stack.push(childrenPos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -203,8 +205,7 @@ public final class BinaryDictIOUtils {
|
||||||
if (same) {
|
if (same) {
|
||||||
// found the PtNode matches the word.
|
// found the PtNode matches the word.
|
||||||
if (wordPos + currentInfo.mCharacters.length == wordLen) {
|
if (wordPos + currentInfo.mCharacters.length == wordLen) {
|
||||||
if (currentInfo.mFrequency == PtNode.NOT_A_TERMINAL
|
if (!currentInfo.isTerminal() || isDeletedNode) {
|
||||||
|| isDeletedNode) {
|
|
||||||
return FormatSpec.NOT_VALID_WORD;
|
return FormatSpec.NOT_VALID_WORD;
|
||||||
} else {
|
} else {
|
||||||
return ptNodePos;
|
return ptNodePos;
|
||||||
|
|
|
@ -107,24 +107,26 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children
|
* PtNode is a group of characters, with probability information, shortcut targets, bigrams,
|
||||||
* (Pt means Patricia Trie).
|
* and children (Pt means Patricia Trie).
|
||||||
*
|
*
|
||||||
* This is the central class of the in-memory representation. A PtNode is what can
|
* This is the central class of the in-memory representation. A PtNode is what can
|
||||||
* be seen as a traditional "trie node", except it can hold several characters at the
|
* be seen as a traditional "trie node", except it can hold several characters at the
|
||||||
* same time. A PtNode essentially represents one or several characters in the middle
|
* same time. A PtNode essentially represents one or several characters in the middle
|
||||||
* of the trie tree; as such, it can be a terminal, and it can have children.
|
* of the trie tree; as such, it can be a terminal, and it can have children.
|
||||||
* In this in-memory representation, whether the PtNode is a terminal or not is represented
|
* In this in-memory representation, whether the PtNode is a terminal or not is represented
|
||||||
* in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
|
* by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
|
||||||
* value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
|
* PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
|
||||||
* bigrams, but a non-terminal may not. Moreover, children, if present, are null.
|
* shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
|
||||||
|
* are non-null.
|
||||||
*/
|
*/
|
||||||
public static final class PtNode {
|
public static final class PtNode {
|
||||||
public static final int NOT_A_TERMINAL = -1;
|
private static final int NOT_A_TERMINAL = -1;
|
||||||
final int mChars[];
|
final int mChars[];
|
||||||
ArrayList<WeightedString> mShortcutTargets;
|
ArrayList<WeightedString> mShortcutTargets;
|
||||||
ArrayList<WeightedString> mBigrams;
|
ArrayList<WeightedString> mBigrams;
|
||||||
int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
|
// null == mProbabilityInfo indicates this is not a terminal.
|
||||||
|
ProbabilityInfo mProbabilityInfo;
|
||||||
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
|
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
|
||||||
PtNodeArray mChildren;
|
PtNodeArray mChildren;
|
||||||
boolean mIsNotAWord; // Only a shortcut
|
boolean mIsNotAWord; // Only a shortcut
|
||||||
|
@ -140,11 +142,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
|
int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
|
||||||
|
|
||||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final int frequency,
|
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mFrequency = frequency;
|
mProbabilityInfo = probabilityInfo;
|
||||||
mTerminalId = frequency;
|
mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = null;
|
mChildren = null;
|
||||||
|
@ -153,11 +155,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final int frequency,
|
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry,
|
final boolean isNotAWord, final boolean isBlacklistEntry,
|
||||||
final PtNodeArray children) {
|
final PtNodeArray children) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mFrequency = frequency;
|
mProbabilityInfo = probabilityInfo;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = children;
|
mChildren = children;
|
||||||
|
@ -177,11 +179,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isTerminal() {
|
public boolean isTerminal() {
|
||||||
return NOT_A_TERMINAL != mFrequency;
|
return mProbabilityInfo != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getFrequency() {
|
public int getProbability() {
|
||||||
return mFrequency;
|
if (isTerminal()) {
|
||||||
|
return mProbabilityInfo.mProbability;
|
||||||
|
} else {
|
||||||
|
return NOT_A_TERMINAL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean getIsNotAWord() {
|
public boolean getIsNotAWord() {
|
||||||
|
@ -213,18 +219,18 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds a word to the bigram list. Updates the probability if the word already
|
* Adds a word to the bigram list. Updates the probability information if the word already
|
||||||
* exists.
|
* exists.
|
||||||
*/
|
*/
|
||||||
public void addBigram(final String word, final int probability) {
|
public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
|
||||||
if (mBigrams == null) {
|
if (mBigrams == null) {
|
||||||
mBigrams = new ArrayList<WeightedString>();
|
mBigrams = new ArrayList<WeightedString>();
|
||||||
}
|
}
|
||||||
WeightedString bigram = getBigram(word);
|
WeightedString bigram = getBigram(word);
|
||||||
if (bigram != null) {
|
if (bigram != null) {
|
||||||
bigram.setProbability(probability);
|
bigram.mProbabilityInfo = probabilityInfo;
|
||||||
} else {
|
} else {
|
||||||
bigram = new WeightedString(word, probability);
|
bigram = new WeightedString(word, probabilityInfo);
|
||||||
mBigrams.add(bigram);
|
mBigrams.add(bigram);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -270,12 +276,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
|
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
|
||||||
* updated if they are higher than the existing ones.
|
* updated if they are higher than the existing ones.
|
||||||
*/
|
*/
|
||||||
public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
|
private void update(final ProbabilityInfo probabilityInfo,
|
||||||
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams,
|
final ArrayList<WeightedString> bigrams,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
if (frequency > mFrequency) {
|
mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
|
||||||
mFrequency = frequency;
|
|
||||||
}
|
|
||||||
if (shortcutTargets != null) {
|
if (shortcutTargets != null) {
|
||||||
if (mShortcutTargets == null) {
|
if (mShortcutTargets == null) {
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
|
@ -286,8 +291,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final WeightedString existingShortcut = getShortcut(shortcut.mWord);
|
final WeightedString existingShortcut = getShortcut(shortcut.mWord);
|
||||||
if (existingShortcut == null) {
|
if (existingShortcut == null) {
|
||||||
mShortcutTargets.add(shortcut);
|
mShortcutTargets.add(shortcut);
|
||||||
} else if (existingShortcut.getProbability() < shortcut.getProbability()) {
|
} else {
|
||||||
existingShortcut.setProbability(shortcut.getProbability());
|
existingShortcut.mProbabilityInfo = ProbabilityInfo.max(
|
||||||
|
existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -302,8 +308,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final WeightedString existingBigram = getBigram(bigram.mWord);
|
final WeightedString existingBigram = getBigram(bigram.mWord);
|
||||||
if (existingBigram == null) {
|
if (existingBigram == null) {
|
||||||
mBigrams.add(bigram);
|
mBigrams.add(bigram);
|
||||||
} else if (existingBigram.getProbability() < bigram.getProbability()) {
|
} else {
|
||||||
existingBigram.setProbability(bigram.getProbability());
|
existingBigram.mProbabilityInfo = ProbabilityInfo.max(
|
||||||
|
existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -393,13 +400,13 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
* they will be added to the dictionary as necessary.
|
* they will be added to the dictionary as necessary.
|
||||||
*
|
*
|
||||||
* @param word the word to add.
|
* @param word the word to add.
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param probabilityInfo probability information of the word.
|
||||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||||
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
||||||
*/
|
*/
|
||||||
public void add(final String word, final int frequency,
|
public void add(final String word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||||
add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
|
add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
|
||||||
false /* isBlacklistEntry */);
|
false /* isBlacklistEntry */);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -412,7 +419,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
*/
|
*/
|
||||||
public void addBlacklistEntry(final String word,
|
public void addBlacklistEntry(final String word,
|
||||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||||
add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
|
add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
|
||||||
|
true /* isBlacklistEntry */);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -438,21 +446,22 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
*
|
*
|
||||||
* @param word0 the previous word of the context
|
* @param word0 the previous word of the context
|
||||||
* @param word1 the next word of the context
|
* @param word1 the next word of the context
|
||||||
* @param frequency the bigram frequency
|
* @param probabilityInfo the bigram probability info
|
||||||
*/
|
*/
|
||||||
public void setBigram(final String word0, final String word1, final int frequency) {
|
public void setBigram(final String word0, final String word1,
|
||||||
|
final ProbabilityInfo probabilityInfo) {
|
||||||
PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
|
PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
|
||||||
if (ptNode0 != null) {
|
if (ptNode0 != null) {
|
||||||
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
|
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
|
||||||
if (ptNode1 == null) {
|
if (ptNode1 == null) {
|
||||||
add(getCodePoints(word1), 0, null, false /* isNotAWord */,
|
add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
|
||||||
false /* isBlacklistEntry */);
|
false /* isBlacklistEntry */);
|
||||||
// The PtNode for the first word may have moved by the above insertion,
|
// The PtNode for the first word may have moved by the above insertion,
|
||||||
// if word1 and word2 share a common stem that happens not to have been
|
// if word1 and word2 share a common stem that happens not to have been
|
||||||
// a cutting point until now. In this case, we need to refresh ptNode.
|
// a cutting point until now. In this case, we need to refresh ptNode.
|
||||||
ptNode0 = findWordInTree(mRootNodeArray, word0);
|
ptNode0 = findWordInTree(mRootNodeArray, word0);
|
||||||
}
|
}
|
||||||
ptNode0.addBigram(word1, frequency);
|
ptNode0.addBigram(word1, probabilityInfo);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("First word of bigram not found " + word0);
|
throw new RuntimeException("First word of bigram not found " + word0);
|
||||||
}
|
}
|
||||||
|
@ -465,15 +474,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
* an exception is thrown.
|
* an exception is thrown.
|
||||||
*
|
*
|
||||||
* @param word the word, as an int array.
|
* @param word the word, as an int array.
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param probabilityInfo the probability information of the word.
|
||||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||||
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
||||||
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
|
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
|
||||||
*/
|
*/
|
||||||
private void add(final int[] word, final int frequency,
|
private void add(final int[] word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
assert(frequency >= 0 && frequency <= 255);
|
assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
|
||||||
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
|
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
|
||||||
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
|
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
|
||||||
return;
|
return;
|
||||||
|
@ -501,7 +510,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
// No node at this point to accept the word. Create one.
|
// No node at this point to accept the word. Create one.
|
||||||
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
|
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
|
||||||
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
|
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
|
shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
|
||||||
|
isBlacklistEntry);
|
||||||
currentNodeArray.mData.add(insertionIndex, newPtNode);
|
currentNodeArray.mData.add(insertionIndex, newPtNode);
|
||||||
if (DBG) checkStack(currentNodeArray);
|
if (DBG) checkStack(currentNodeArray);
|
||||||
} else {
|
} else {
|
||||||
|
@ -511,15 +521,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
// The new word is a prefix of an existing word, but the node on which it
|
// The new word is a prefix of an existing word, but the node on which it
|
||||||
// should end already exists as is. Since the old PtNode was not a terminal,
|
// should end already exists as is. Since the old PtNode was not a terminal,
|
||||||
// make it one by filling in its frequency and other attributes
|
// make it one by filling in its frequency and other attributes
|
||||||
currentPtNode.update(frequency, shortcutTargets, null, isNotAWord,
|
currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
|
||||||
isBlacklistEntry);
|
isBlacklistEntry);
|
||||||
} else {
|
} else {
|
||||||
// The new word matches the full old word and extends past it.
|
// The new word matches the full old word and extends past it.
|
||||||
// We only have to create a new node and add it to the end of this.
|
// We only have to create a new node and add it to the end of this.
|
||||||
final PtNode newNode = new PtNode(
|
final PtNode newNode = new PtNode(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency, isNotAWord,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isBlacklistEntry);
|
isNotAWord, isBlacklistEntry);
|
||||||
currentPtNode.mChildren = new PtNodeArray();
|
currentPtNode.mChildren = new PtNodeArray();
|
||||||
currentPtNode.mChildren.mData.add(newNode);
|
currentPtNode.mChildren.mData.add(newNode);
|
||||||
}
|
}
|
||||||
|
@ -527,7 +537,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
if (0 == differentCharIndex) {
|
if (0 == differentCharIndex) {
|
||||||
// Exact same word. Update the frequency if higher. This will also add the
|
// Exact same word. Update the frequency if higher. This will also add the
|
||||||
// new shortcuts to the existing shortcut list if it already exists.
|
// new shortcuts to the existing shortcut list if it already exists.
|
||||||
currentPtNode.update(frequency, shortcutTargets, null,
|
currentPtNode.update(probabilityInfo, shortcutTargets, null,
|
||||||
currentPtNode.mIsNotAWord && isNotAWord,
|
currentPtNode.mIsNotAWord && isNotAWord,
|
||||||
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
|
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
|
||||||
} else {
|
} else {
|
||||||
|
@ -537,7 +547,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final PtNode newOldWord = new PtNode(
|
final PtNode newOldWord = new PtNode(
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
|
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
|
||||||
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
|
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
|
||||||
currentPtNode.mBigrams, currentPtNode.mFrequency,
|
currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
|
||||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
|
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
|
||||||
currentPtNode.mChildren);
|
currentPtNode.mChildren);
|
||||||
newChildren.mData.add(newOldWord);
|
newChildren.mData.add(newOldWord);
|
||||||
|
@ -546,16 +556,17 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
if (charIndex + differentCharIndex >= word.length) {
|
if (charIndex + differentCharIndex >= word.length) {
|
||||||
newParent = new PtNode(
|
newParent = new PtNode(
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||||
shortcutTargets, null /* bigrams */, frequency,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isNotAWord, isBlacklistEntry, newChildren);
|
isNotAWord, isBlacklistEntry, newChildren);
|
||||||
} else {
|
} else {
|
||||||
newParent = new PtNode(
|
newParent = new PtNode(
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||||
null /* shortcutTargets */, null /* bigrams */, -1,
|
null /* shortcutTargets */, null /* bigrams */,
|
||||||
false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
|
null /* probabilityInfo */, false /* isNotAWord */,
|
||||||
|
false /* isBlacklistEntry */, newChildren);
|
||||||
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
|
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
|
||||||
charIndex + differentCharIndex, word.length),
|
charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isNotAWord, isBlacklistEntry);
|
isNotAWord, isBlacklistEntry);
|
||||||
final int addIndex = word[charIndex + differentCharIndex]
|
final int addIndex = word[charIndex + differentCharIndex]
|
||||||
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
|
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
|
||||||
|
@ -617,8 +628,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
|
private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
|
||||||
final ArrayList<PtNode> data = nodeArray.mData;
|
final ArrayList<PtNode> data = nodeArray.mData;
|
||||||
final PtNode reference = new PtNode(new int[] { character },
|
final PtNode reference = new PtNode(new int[] { character },
|
||||||
null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
|
null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
|
||||||
false /* isBlacklistEntry */);
|
false /* isNotAWord */, false /* isBlacklistEntry */);
|
||||||
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
|
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
|
||||||
return result >= 0 ? result : -result - 1;
|
return result >= 0 ? result : -result - 1;
|
||||||
}
|
}
|
||||||
|
@ -752,8 +763,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
currentPos.length = mCurrentString.length();
|
currentPos.length = mCurrentString.length();
|
||||||
mPositions.addLast(currentPos);
|
mPositions.addLast(currentPos);
|
||||||
}
|
}
|
||||||
if (currentPtNode.mFrequency >= 0) {
|
if (currentPtNode.isTerminal()) {
|
||||||
return new WordProperty(mCurrentString.toString(), currentPtNode.mFrequency,
|
return new WordProperty(mCurrentString.toString(),
|
||||||
|
currentPtNode.mProbabilityInfo,
|
||||||
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
|
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
|
||||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
|
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,6 +30,21 @@ public final class ProbabilityInfo {
|
||||||
public final int mLevel;
|
public final int mLevel;
|
||||||
public final int mCount;
|
public final int mCount;
|
||||||
|
|
||||||
|
public static ProbabilityInfo max(final ProbabilityInfo probabilityInfo1,
|
||||||
|
final ProbabilityInfo probabilityInfo2) {
|
||||||
|
if (probabilityInfo1 == null) {
|
||||||
|
return probabilityInfo2;
|
||||||
|
}
|
||||||
|
if (probabilityInfo2 == null) {
|
||||||
|
return probabilityInfo1;
|
||||||
|
}
|
||||||
|
if (probabilityInfo1.mProbability > probabilityInfo2.mProbability) {
|
||||||
|
return probabilityInfo1;
|
||||||
|
} else {
|
||||||
|
return probabilityInfo2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public ProbabilityInfo(final int probability) {
|
public ProbabilityInfo(final int probability) {
|
||||||
this(probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP, 0, 0);
|
this(probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,24 +29,28 @@ public final class PtNodeInfo {
|
||||||
public final int mEndAddress;
|
public final int mEndAddress;
|
||||||
public final int mFlags;
|
public final int mFlags;
|
||||||
public final int[] mCharacters;
|
public final int[] mCharacters;
|
||||||
public final int mFrequency;
|
public final ProbabilityInfo mProbabilityInfo;
|
||||||
public final int mChildrenAddress;
|
public final int mChildrenAddress;
|
||||||
public final int mParentAddress;
|
public final int mParentAddress;
|
||||||
public final ArrayList<WeightedString> mShortcutTargets;
|
public final ArrayList<WeightedString> mShortcutTargets;
|
||||||
public final ArrayList<PendingAttribute> mBigrams;
|
public final ArrayList<PendingAttribute> mBigrams;
|
||||||
|
|
||||||
public PtNodeInfo(final int originalAddress, final int endAddress, final int flags,
|
public PtNodeInfo(final int originalAddress, final int endAddress, final int flags,
|
||||||
final int[] characters, final int frequency, final int parentAddress,
|
final int[] characters, final ProbabilityInfo probabilityInfo, final int parentAddress,
|
||||||
final int childrenAddress, final ArrayList<WeightedString> shortcutTargets,
|
final int childrenAddress, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<PendingAttribute> bigrams) {
|
final ArrayList<PendingAttribute> bigrams) {
|
||||||
mOriginalAddress = originalAddress;
|
mOriginalAddress = originalAddress;
|
||||||
mEndAddress = endAddress;
|
mEndAddress = endAddress;
|
||||||
mFlags = flags;
|
mFlags = flags;
|
||||||
mCharacters = characters;
|
mCharacters = characters;
|
||||||
mFrequency = frequency;
|
mProbabilityInfo = probabilityInfo;
|
||||||
mParentAddress = parentAddress;
|
mParentAddress = parentAddress;
|
||||||
mChildrenAddress = childrenAddress;
|
mChildrenAddress = childrenAddress;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isTerminal() {
|
||||||
|
return mProbabilityInfo != null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,8 +39,9 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||||
private static final String TAG = Ver2DictDecoder.class.getSimpleName();
|
private static final String TAG = Ver2DictDecoder.class.getSimpleName();
|
||||||
|
|
||||||
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
|
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
|
||||||
private static int readFrequency(final DictBuffer dictBuffer) {
|
private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) {
|
||||||
return dictBuffer.readUnsignedByte();
|
// Ver2 dicts don't contain historical information.
|
||||||
|
return new ProbabilityInfo(dictBuffer.readUnsignedByte());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -133,12 +134,12 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||||
addressPointer += CharEncoding.getCharSize(character);
|
addressPointer += CharEncoding.getCharSize(character);
|
||||||
characters = new int[] { character };
|
characters = new int[] { character };
|
||||||
}
|
}
|
||||||
final int frequency;
|
final ProbabilityInfo probabilityInfo;
|
||||||
if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
|
if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
|
||||||
frequency = PtNodeReader.readFrequency(mDictBuffer);
|
probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer);
|
||||||
addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
|
addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
|
||||||
} else {
|
} else {
|
||||||
frequency = PtNode.NOT_A_TERMINAL;
|
probabilityInfo = null;
|
||||||
}
|
}
|
||||||
int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
|
int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
|
||||||
if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
|
if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
|
||||||
|
@ -166,7 +167,7 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||||
} else {
|
} else {
|
||||||
bigrams = null;
|
bigrams = null;
|
||||||
}
|
}
|
||||||
return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency,
|
return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo,
|
||||||
parentAddress, childrenAddress, shortcutTargets, bigrams);
|
parentAddress, childrenAddress, shortcutTargets, bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -231,7 +232,9 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||||
BinaryDictIOUtils.skipString(mDictBuffer,
|
BinaryDictIOUtils.skipString(mDictBuffer,
|
||||||
(flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
|
(flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
|
||||||
PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
|
PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
|
||||||
if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readFrequency(mDictBuffer);
|
if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
|
||||||
|
PtNodeReader.readProbabilityInfo(mDictBuffer);
|
||||||
|
}
|
||||||
if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
|
if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
|
||||||
final int shortcutsSize = mDictBuffer.readUnsignedShort();
|
final int shortcutsSize = mDictBuffer.readUnsignedShort();
|
||||||
mDictBuffer.position(mDictBuffer.position() + shortcutsSize
|
mDictBuffer.position(mDictBuffer.position() + shortcutsSize
|
||||||
|
|
|
@ -227,7 +227,7 @@ public class Ver2DictEncoder implements DictEncoder {
|
||||||
final PtNode target =
|
final PtNode target =
|
||||||
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
|
FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
|
||||||
final int addressOfBigram = target.mCachedAddressAfterUpdate;
|
final int addressOfBigram = target.mCachedAddressAfterUpdate;
|
||||||
final int unigramFrequencyForThisWord = target.mFrequency;
|
final int unigramFrequencyForThisWord = target.getProbability();
|
||||||
final int offset = addressOfBigram
|
final int offset = addressOfBigram
|
||||||
- (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
- (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||||
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
|
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
|
||||||
|
@ -251,7 +251,7 @@ public class Ver2DictEncoder implements DictEncoder {
|
||||||
writePtNodeFlags(ptNode, formatOptions);
|
writePtNodeFlags(ptNode, formatOptions);
|
||||||
writeParentPosition(parentPosition, ptNode, formatOptions);
|
writeParentPosition(parentPosition, ptNode, formatOptions);
|
||||||
writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
|
writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
|
||||||
writeFrequency(ptNode.mFrequency);
|
writeFrequency(ptNode.getProbability());
|
||||||
writeChildrenPosition(ptNode, formatOptions);
|
writeChildrenPosition(ptNode, formatOptions);
|
||||||
writeShortcuts(ptNode.mShortcutTargets);
|
writeShortcuts(ptNode.mShortcutTargets);
|
||||||
writeBigrams(ptNode.mBigrams, dict);
|
writeBigrams(ptNode.mBigrams, dict);
|
||||||
|
|
|
@ -81,14 +81,11 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
|
|
||||||
// Insert unigrams to the fusion dictionary.
|
// Insert unigrams to the fusion dictionary.
|
||||||
for (final WordProperty wordProperty : wordProperties) {
|
for (final WordProperty wordProperty : wordProperties) {
|
||||||
// TODO: Support probability that is -1.
|
|
||||||
final int probability = wordProperty.getProbability() < 0 ?
|
|
||||||
0 : wordProperty.getProbability();
|
|
||||||
if (wordProperty.mIsBlacklistEntry) {
|
if (wordProperty.mIsBlacklistEntry) {
|
||||||
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
||||||
wordProperty.mIsNotAWord);
|
wordProperty.mIsNotAWord);
|
||||||
} else {
|
} else {
|
||||||
fusionDict.add(wordProperty.mWord, probability,
|
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -99,7 +96,7 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
}
|
}
|
||||||
final String word0 = wordProperty.mWord;
|
final String word0 = wordProperty.mWord;
|
||||||
for (final WeightedString bigram : wordProperty.mBigrams) {
|
for (final WeightedString bigram : wordProperty.mBigrams) {
|
||||||
fusionDict.setBigram(word0, bigram.mWord, bigram.getProbability());
|
fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return fusionDict;
|
return fusionDict;
|
||||||
|
|
|
@ -43,12 +43,12 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
|
|
||||||
private int mHashCode = 0;
|
private int mHashCode = 0;
|
||||||
|
|
||||||
public WordProperty(final String word, final int probability,
|
public WordProperty(final String word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams,
|
final ArrayList<WeightedString> bigrams,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
mWord = word;
|
mWord = word;
|
||||||
mProbabilityInfo = new ProbabilityInfo(probability);
|
mProbabilityInfo = probabilityInfo;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
|
|
|
@ -20,6 +20,7 @@ import android.test.AndroidTestCase;
|
||||||
import android.test.suitebuilder.annotation.SmallTest;
|
import android.test.suitebuilder.annotation.SmallTest;
|
||||||
|
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
@ -33,16 +34,16 @@ public class FusionDictionaryTests extends AndroidTestCase {
|
||||||
FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
new FusionDictionary.DictionaryOptions(new HashMap<String,String>()));
|
new FusionDictionary.DictionaryOptions(new HashMap<String,String>()));
|
||||||
|
|
||||||
dict.add("abc", 10, null, false /* isNotAWord */);
|
dict.add("abc", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "abc"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "abc"));
|
||||||
|
|
||||||
dict.add("aa", 10, null, false /* isNotAWord */);
|
dict.add("aa", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aa"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aa"));
|
||||||
|
|
||||||
dict.add("babcd", 10, null, false /* isNotAWord */);
|
dict.add("babcd", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||||
dict.add("bacde", 10, null, false /* isNotAWord */);
|
dict.add("bacde", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "ba"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "ba"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "babcd"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "babcd"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "bacde"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "bacde"));
|
||||||
|
|
|
@ -151,8 +151,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ));
|
shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dict.add(word, UNIGRAM_FREQ, (shortcutMap == null) ? null : shortcuts,
|
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||||
false /* isNotAWord */);
|
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -162,7 +162,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
for (int i = 0; i < bigrams.size(); ++i) {
|
for (int i = 0; i < bigrams.size(); ++i) {
|
||||||
final int w1 = bigrams.keyAt(i);
|
final int w1 = bigrams.keyAt(i);
|
||||||
for (int w2 : bigrams.valueAt(i)) {
|
for (int w2 : bigrams.valueAt(i)) {
|
||||||
dict.setBigram(words.get(w1), words.get(w2), BIGRAM_FREQ);
|
dict.setBigram(words.get(w1), words.get(w2), new ProbabilityInfo(BIGRAM_FREQ));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.WordProperty;
|
import com.android.inputmethod.latin.makedict.WordProperty;
|
||||||
import com.android.inputmethod.latin.utils.CombinedFormatUtils;
|
import com.android.inputmethod.latin.utils.CombinedFormatUtils;
|
||||||
|
|
||||||
|
@ -112,7 +113,7 @@ public class CombinedInputOutput {
|
||||||
|
|
||||||
String line;
|
String line;
|
||||||
String word = null;
|
String word = null;
|
||||||
int freq = 0;
|
ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
|
||||||
boolean isNotAWord = false;
|
boolean isNotAWord = false;
|
||||||
ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
|
ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
|
||||||
ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
|
ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
|
||||||
|
@ -121,9 +122,10 @@ public class CombinedInputOutput {
|
||||||
final String args[] = line.trim().split(",");
|
final String args[] = line.trim().split(",");
|
||||||
if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
|
if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
|
||||||
if (null != word) {
|
if (null != word) {
|
||||||
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
|
||||||
|
isNotAWord);
|
||||||
for (WeightedString s : bigrams) {
|
for (WeightedString s : bigrams) {
|
||||||
dict.setBigram(word, s.mWord, s.getProbability());
|
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
|
if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
|
||||||
|
@ -135,14 +137,19 @@ public class CombinedInputOutput {
|
||||||
if (CombinedFormatUtils.WORD_TAG.equals(params[0])) {
|
if (CombinedFormatUtils.WORD_TAG.equals(params[0])) {
|
||||||
word = params[1];
|
word = params[1];
|
||||||
} else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
|
} else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
|
||||||
freq = Integer.parseInt(params[1]);
|
probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
|
||||||
|
probabilityInfo.mTimestamp, probabilityInfo.mLevel,
|
||||||
|
probabilityInfo.mCount);
|
||||||
} else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
|
} else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
|
||||||
final String[] historicalInfoParams =
|
final String[] historicalInfoParams =
|
||||||
params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
|
params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
|
||||||
if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
|
if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
|
||||||
throw new RuntimeException("Wrong format (historical info) : " + line);
|
throw new RuntimeException("Wrong format (historical info) : " + line);
|
||||||
}
|
}
|
||||||
// TODO: Use parsed historical info.
|
probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability,
|
||||||
|
Integer.parseInt(historicalInfoParams[0]),
|
||||||
|
Integer.parseInt(historicalInfoParams[1]),
|
||||||
|
Integer.parseInt(historicalInfoParams[2]));
|
||||||
} else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) {
|
} else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) {
|
||||||
isNotAWord = "true".equals(params[1]);
|
isNotAWord = "true".equals(params[1]);
|
||||||
}
|
}
|
||||||
|
@ -168,34 +175,40 @@ public class CombinedInputOutput {
|
||||||
}
|
}
|
||||||
} else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
|
} else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
|
||||||
String secondWordOfBigram = null;
|
String secondWordOfBigram = null;
|
||||||
int bigramFreq = 0;
|
ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
|
||||||
for (String param : args) {
|
for (String param : args) {
|
||||||
final String params[] = param.split("=", 2);
|
final String params[] = param.split("=", 2);
|
||||||
if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
|
if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
|
||||||
if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
|
if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
|
||||||
secondWordOfBigram = params[1];
|
secondWordOfBigram = params[1];
|
||||||
} else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
|
} else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
|
||||||
bigramFreq = Integer.parseInt(params[1]);
|
bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
|
||||||
|
bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
|
||||||
|
bigramProbabilityInfo.mCount);
|
||||||
} else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
|
} else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
|
||||||
final String[] historicalInfoParams =
|
final String[] historicalInfoParams =
|
||||||
params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
|
params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
|
||||||
if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
|
if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
|
||||||
throw new RuntimeException("Wrong format (historical info) : " + line);
|
throw new RuntimeException("Wrong format (historical info) : " + line);
|
||||||
}
|
}
|
||||||
// TODO: Use parsed historical info.
|
bigramProbabilityInfo = new ProbabilityInfo(
|
||||||
|
bigramProbabilityInfo.mProbability,
|
||||||
|
Integer.parseInt(historicalInfoParams[0]),
|
||||||
|
Integer.parseInt(historicalInfoParams[1]),
|
||||||
|
Integer.parseInt(historicalInfoParams[2]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (null != secondWordOfBigram) {
|
if (null != secondWordOfBigram) {
|
||||||
bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq));
|
bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("Wrong format : " + line);
|
throw new RuntimeException("Wrong format : " + line);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (null != word) {
|
if (null != word) {
|
||||||
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||||
for (WeightedString s : bigrams) {
|
for (WeightedString s : bigrams) {
|
||||||
dict.setBigram(word, s.mWord, s.getProbability());
|
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -118,9 +118,10 @@ public class Diff extends Dicttool.Command {
|
||||||
hasDifferences = true;
|
hasDifferences = true;
|
||||||
} else {
|
} else {
|
||||||
// We found the word. Compare frequencies, shortcuts, bigrams
|
// We found the word. Compare frequencies, shortcuts, bigrams
|
||||||
if (word0Property.getProbability() != word1PtNode.getFrequency()) {
|
if (word0Property.getProbability() != word1PtNode.getProbability()) {
|
||||||
System.out.println("Probability changed: " + word0Property.mWord + " "
|
System.out.println("Probability changed: " + word0Property.mWord + " "
|
||||||
+ word0Property.getProbability() + " -> " + word1PtNode.getFrequency());
|
+ word0Property.getProbability() + " -> "
|
||||||
|
+ word1PtNode.getProbability());
|
||||||
hasDifferences = true;
|
hasDifferences = true;
|
||||||
}
|
}
|
||||||
if (word0Property.mIsNotAWord != word1PtNode.getIsNotAWord()) {
|
if (word0Property.mIsNotAWord != word1PtNode.getIsNotAWord()) {
|
||||||
|
|
|
@ -72,7 +72,7 @@ public class Info extends Dicttool.Command {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
System.out.println("Word: " + word);
|
System.out.println("Word: " + word);
|
||||||
System.out.println(" Freq: " + ptNode.getFrequency());
|
System.out.println(" Freq: " + ptNode.getProbability());
|
||||||
if (ptNode.getIsNotAWord()) {
|
if (ptNode.getIsNotAWord()) {
|
||||||
System.out.println(" Is not a word");
|
System.out.println(" Is not a word");
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.WordProperty;
|
import com.android.inputmethod.latin.makedict.WordProperty;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
@ -66,6 +67,7 @@ public class XmlDictInputOutput {
|
||||||
private static final int START = 1;
|
private static final int START = 1;
|
||||||
private static final int WORD = 2;
|
private static final int WORD = 2;
|
||||||
private static final int UNKNOWN = 3;
|
private static final int UNKNOWN = 3;
|
||||||
|
private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;
|
||||||
|
|
||||||
FusionDictionary mDictionary;
|
FusionDictionary mDictionary;
|
||||||
int mState; // the state of the parser
|
int mState; // the state of the parser
|
||||||
|
@ -90,7 +92,8 @@ public class XmlDictInputOutput {
|
||||||
final FusionDictionary dict = mDictionary;
|
final FusionDictionary dict = mDictionary;
|
||||||
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
||||||
if (dict.hasWord(shortcutOnly)) continue;
|
if (dict.hasWord(shortcutOnly)) continue;
|
||||||
dict.add(shortcutOnly, 1, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
|
dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
|
||||||
|
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
|
||||||
}
|
}
|
||||||
mDictionary = null;
|
mDictionary = null;
|
||||||
mShortcutsMap.clear();
|
mShortcutsMap.clear();
|
||||||
|
@ -138,7 +141,8 @@ public class XmlDictInputOutput {
|
||||||
@Override
|
@Override
|
||||||
public void endElement(String uri, String localName, String qName) {
|
public void endElement(String uri, String localName, String qName) {
|
||||||
if (WORD == mState) {
|
if (WORD == mState) {
|
||||||
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */);
|
mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
|
||||||
|
false /* isNotAWord */);
|
||||||
mState = START;
|
mState = START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -319,7 +323,7 @@ public class XmlDictInputOutput {
|
||||||
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
|
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
|
||||||
for (final WeightedString bigram : bigramList) {
|
for (final WeightedString bigram : bigramList) {
|
||||||
if (!dict.hasWord(bigram.mWord)) continue;
|
if (!dict.hasWord(bigram.mWord)) continue;
|
||||||
dict.setBigram(firstWord, bigram.mWord, bigram.getProbability());
|
dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return dict;
|
return dict;
|
||||||
|
|
|
@ -24,6 +24,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
|
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
|
||||||
|
|
||||||
|
@ -53,11 +54,11 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase {
|
||||||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE);
|
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE);
|
||||||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID);
|
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID);
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions);
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions);
|
||||||
dict.add("foo", TEST_FREQ, null, false /* isNotAWord */);
|
dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */);
|
||||||
dict.add("fta", 1, null, false /* isNotAWord */);
|
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("ftb", 1, null, false /* isNotAWord */);
|
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("bar", 1, null, false /* isNotAWord */);
|
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("fool", 1, null, false /* isNotAWord */);
|
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
|
|
||||||
final File dst = File.createTempFile("testGetRawDict", ".tmp");
|
final File dst = File.createTempFile("testGetRawDict", ".tmp");
|
||||||
dst.deleteOnExit();
|
dst.deleteOnExit();
|
||||||
|
@ -87,7 +88,7 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase {
|
||||||
assertEquals("Wrong id attribute", ID, resultDict.mOptions.mAttributes.get(
|
assertEquals("Wrong id attribute", ID, resultDict.mOptions.mAttributes.get(
|
||||||
DictionaryHeader.DICTIONARY_ID_KEY));
|
DictionaryHeader.DICTIONARY_ID_KEY));
|
||||||
assertEquals("Dictionary can't be read back correctly",
|
assertEquals("Dictionary can't be read back correctly",
|
||||||
FusionDictionary.findWordInTree(resultDict.mRootNodeArray, "foo").getFrequency(),
|
FusionDictionary.findWordInTree(resultDict.mRootNodeArray, "foo").getProbability(),
|
||||||
TEST_FREQ);
|
TEST_FREQ);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,11 +33,11 @@ public class BinaryDictEncoderFlattenTreeTests extends TestCase {
|
||||||
public void testFlattenNodes() {
|
public void testFlattenNodes() {
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
new DictionaryOptions(new HashMap<String, String>()));
|
new DictionaryOptions(new HashMap<String, String>()));
|
||||||
dict.add("foo", 1, null, false /* isNotAWord */);
|
dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("fta", 1, null, false /* isNotAWord */);
|
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("ftb", 1, null, false /* isNotAWord */);
|
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("bar", 1, null, false /* isNotAWord */);
|
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
dict.add("fool", 1, null, false /* isNotAWord */);
|
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||||
final ArrayList<PtNodeArray> result =
|
final ArrayList<PtNodeArray> result =
|
||||||
BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
||||||
assertEquals(4, result.size());
|
assertEquals(4, result.size());
|
||||||
|
|
|
@ -101,7 +101,7 @@ public class FusionDictionaryTest extends TestCase {
|
||||||
prepare(time);
|
prepare(time);
|
||||||
for (int i = 0; i < sWords.size(); ++i) {
|
for (int i = 0; i < sWords.size(); ++i) {
|
||||||
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
|
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
|
||||||
dict.add(sWords.get(i), 180, null, false);
|
dict.add(sWords.get(i), new ProbabilityInfo(180), null, false);
|
||||||
dumpDict(dict);
|
dumpDict(dict);
|
||||||
checkDictionary(dict, sWords, i);
|
checkDictionary(dict, sWords, i);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue