Make WeightedString have ProbabilityInfo.
Bug: 11281877
Bug: 12810574
Change-Id: I265e3d8654c75766cd0e0d09d67ef62b4566298a
This commit is contained in:
parent
75a3df30f6
commit
df1d3e733e
11 changed files with 80 additions and 46 deletions
|
@ -511,7 +511,7 @@ public final class BinaryDictDecoderUtils {
|
|||
final WeightedString word = getWordAtPosition(dictDecoder, headerSize,
|
||||
bigram.mAddress, options);
|
||||
final int reconstructedFrequency =
|
||||
BinaryDictIOUtils.reconstructBigramFrequency(word.mFrequency,
|
||||
BinaryDictIOUtils.reconstructBigramFrequency(word.getProbability(),
|
||||
bigram.mFrequency);
|
||||
bigrams.add(new WeightedString(word.mWord, reconstructedFrequency));
|
||||
}
|
||||
|
@ -618,7 +618,7 @@ public final class BinaryDictDecoderUtils {
|
|||
// words that are not also registered as unigrams so we don't have to avoid
|
||||
// them explicitly here.
|
||||
for (final WeightedString bigram : w.mBigrams) {
|
||||
newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
|
||||
newDict.setBigram(w.mWord, bigram.mWord, bigram.getProbability());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,29 +67,40 @@ public final class FusionDictionary implements Iterable<Word> {
|
|||
}
|
||||
|
||||
/**
|
||||
* A string with a frequency.
|
||||
* A string with a probability.
|
||||
*
|
||||
* This represents an "attribute", that is either a bigram or a shortcut.
|
||||
*/
|
||||
public static final class WeightedString {
|
||||
public final String mWord;
|
||||
public int mFrequency;
|
||||
public WeightedString(String word, int frequency) {
|
||||
public ProbabilityInfo mProbabilityInfo;
|
||||
|
||||
public WeightedString(final String word, final int probability) {
|
||||
mWord = word;
|
||||
mFrequency = frequency;
|
||||
mProbabilityInfo = new ProbabilityInfo(probability);
|
||||
}
|
||||
|
||||
public int getProbability() {
|
||||
return mProbabilityInfo.mProbability;
|
||||
}
|
||||
|
||||
public void setProbability(final int probability) {
|
||||
mProbabilityInfo = new ProbabilityInfo(probability);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Arrays.hashCode(new Object[] { mWord, mFrequency });
|
||||
return Arrays.hashCode(new Object[] { mWord, mProbabilityInfo.mProbability,
|
||||
mProbabilityInfo.mTimestamp, mProbabilityInfo.mLevel,
|
||||
mProbabilityInfo.mCount });
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof WeightedString)) return false;
|
||||
WeightedString w = (WeightedString)o;
|
||||
return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
|
||||
final WeightedString w = (WeightedString)o;
|
||||
return mWord.equals(w.mWord) && mProbabilityInfo.equals(w.mProbabilityInfo);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -200,18 +211,18 @@ public final class FusionDictionary implements Iterable<Word> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Adds a word to the bigram list. Updates the frequency if the word already
|
||||
* Adds a word to the bigram list. Updates the probability if the word already
|
||||
* exists.
|
||||
*/
|
||||
public void addBigram(final String word, final int frequency) {
|
||||
public void addBigram(final String word, final int probability) {
|
||||
if (mBigrams == null) {
|
||||
mBigrams = new ArrayList<WeightedString>();
|
||||
}
|
||||
WeightedString bigram = getBigram(word);
|
||||
if (bigram != null) {
|
||||
bigram.mFrequency = frequency;
|
||||
bigram.setProbability(probability);
|
||||
} else {
|
||||
bigram = new WeightedString(word, frequency);
|
||||
bigram = new WeightedString(word, probability);
|
||||
mBigrams.add(bigram);
|
||||
}
|
||||
}
|
||||
|
@ -273,8 +284,8 @@ public final class FusionDictionary implements Iterable<Word> {
|
|||
final WeightedString existingShortcut = getShortcut(shortcut.mWord);
|
||||
if (existingShortcut == null) {
|
||||
mShortcutTargets.add(shortcut);
|
||||
} else if (existingShortcut.mFrequency < shortcut.mFrequency) {
|
||||
existingShortcut.mFrequency = shortcut.mFrequency;
|
||||
} else if (existingShortcut.getProbability() < shortcut.getProbability()) {
|
||||
existingShortcut.setProbability(shortcut.getProbability());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -289,8 +300,8 @@ public final class FusionDictionary implements Iterable<Word> {
|
|||
final WeightedString existingBigram = getBigram(bigram.mWord);
|
||||
if (existingBigram == null) {
|
||||
mBigrams.add(bigram);
|
||||
} else if (existingBigram.mFrequency < bigram.mFrequency) {
|
||||
existingBigram.mFrequency = bigram.mFrequency;
|
||||
} else if (existingBigram.getProbability() < bigram.getProbability()) {
|
||||
existingBigram.setProbability(bigram.getProbability());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package com.android.inputmethod.latin.makedict;
|
||||
|
||||
import com.android.inputmethod.latin.BinaryDictionary;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||
|
||||
public final class ProbabilityInfo {
|
||||
public final int mProbability;
|
||||
|
@ -39,8 +40,24 @@ public final class ProbabilityInfo {
|
|||
mCount = count;
|
||||
}
|
||||
|
||||
public boolean hasHistoricalInfo() {
|
||||
return mTimestamp != BinaryDictionary.NOT_A_VALID_TIMESTAMP;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return mTimestamp + ":" + mLevel + ":" + mCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof ProbabilityInfo)) return false;
|
||||
final ProbabilityInfo p = (ProbabilityInfo)o;
|
||||
if (!hasHistoricalInfo() && !p.hasHistoricalInfo()) {
|
||||
return mProbability == p.mProbability;
|
||||
}
|
||||
return mProbability == p.mProbability && mTimestamp == p.mTimestamp && mLevel == p.mLevel
|
||||
&& mCount == p.mCount;
|
||||
}
|
||||
}
|
|
@ -197,7 +197,7 @@ public class Ver2DictEncoder implements DictEncoder {
|
|||
final WeightedString target = shortcutIterator.next();
|
||||
final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
|
||||
shortcutIterator.hasNext(),
|
||||
target.mFrequency);
|
||||
target.getProbability());
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags,
|
||||
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||
final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord);
|
||||
|
@ -231,7 +231,7 @@ public class Ver2DictEncoder implements DictEncoder {
|
|||
final int offset = addressOfBigram
|
||||
- (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||
final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
|
||||
offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
|
||||
offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
|
||||
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
|
||||
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
|
||||
mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
|
||||
|
|
|
@ -78,7 +78,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
} else {
|
||||
for (final WeightedString shortcutTarget : word.mShortcutTargets) {
|
||||
binaryDict.addUnigramWord(word.mWord, word.mFrequency,
|
||||
shortcutTarget.mWord, shortcutTarget.mFrequency,
|
||||
shortcutTarget.mWord, shortcutTarget.getProbability(),
|
||||
word.mIsNotAWord, word.mIsBlacklistEntry, 0 /* timestamp */);
|
||||
}
|
||||
}
|
||||
|
@ -89,7 +89,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
for (final Word word0 : dict) {
|
||||
if (null == word0.mBigrams) continue;
|
||||
for (final WeightedString word1 : word0.mBigrams) {
|
||||
binaryDict.addBigramWords(word0.mWord, word1.mWord, word1.mFrequency,
|
||||
binaryDict.addBigramWords(word0.mWord, word1.mWord, word1.getProbability(),
|
||||
0 /* timestamp */);
|
||||
if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDict.flushWithGC();
|
||||
|
|
|
@ -108,7 +108,7 @@ public class WordProperty {
|
|||
for (int i = 0; i < mBigramTargets.size(); i++) {
|
||||
builder.append(" bigram=" + mBigramTargets.get(i).mWord);
|
||||
builder.append(",");
|
||||
builder.append("f=" + mBigramTargets.get(i).mFrequency);
|
||||
builder.append("f=" + mBigramTargets.get(i).getProbability());
|
||||
if (mBigramProbabilityInfo.get(i).mTimestamp
|
||||
!= BinaryDictionary.NOT_A_VALID_TIMESTAMP) {
|
||||
builder.append(",");
|
||||
|
@ -119,7 +119,7 @@ public class WordProperty {
|
|||
for (int i = 0; i < mShortcutTargets.size(); i++) {
|
||||
builder.append(" shortcut=" + mShortcutTargets.get(i).mWord);
|
||||
builder.append(",");
|
||||
builder.append("f=" + mShortcutTargets.get(i).mFrequency);
|
||||
builder.append("f=" + mShortcutTargets.get(i).getProbability());
|
||||
builder.append("\n");
|
||||
}
|
||||
return builder.toString();
|
||||
|
|
|
@ -962,7 +962,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
for (int j = 0; j < unigramProperty.mBigramTargets.size(); j++) {
|
||||
final String word1 = unigramProperty.mBigramTargets.get(j).mWord;
|
||||
assertTrue(bigramWord1s.contains(word1));
|
||||
final int probability = unigramProperty.mBigramTargets.get(j).mFrequency;
|
||||
final int probability = unigramProperty.mBigramTargets.get(j).getProbability();
|
||||
assertEquals((int)bigramProbabilities.get(new Pair<String, String>(word0, word1)),
|
||||
probability);
|
||||
assertEquals(unigramProperty.mBigramProbabilityInfo.get(j).mProbability,
|
||||
|
@ -1053,7 +1053,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
for (int j = 0; j < wordProperty.mBigramTargets.size(); j++) {
|
||||
final String word1 = wordProperty.mBigramTargets.get(j).mWord;
|
||||
assertTrue(bigramWord1s.contains(word1));
|
||||
final int probability = wordProperty.mBigramTargets.get(j).mFrequency;
|
||||
final int probability = wordProperty.mBigramTargets.get(j).getProbability();
|
||||
final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
|
||||
assertEquals((int)bigramProbabilitiesToCheckLater.get(bigram), probability);
|
||||
bigramSet.remove(bigram);
|
||||
|
@ -1087,7 +1087,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
|
||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
||||
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).mFrequency);
|
||||
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());
|
||||
final int updatedShortcutProbability = 2;
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
|
||||
updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
|
@ -1096,7 +1096,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
||||
assertEquals(updatedShortcutProbability,
|
||||
wordProperty.mShortcutTargets.get(0).mFrequency);
|
||||
wordProperty.mShortcutTargets.get(0).getProbability());
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy",
|
||||
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* timestamp */);
|
||||
|
@ -1107,7 +1107,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(2, wordProperty.mShortcutTargets.size());
|
||||
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
|
||||
shortcutTarget.getProbability());
|
||||
shortcutTargets.remove(shortcutTarget.mWord);
|
||||
}
|
||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||
|
@ -1117,7 +1118,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(2, wordProperty.mShortcutTargets.size());
|
||||
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord),
|
||||
shortcutTarget.getProbability());
|
||||
shortcutTargets.remove(shortcutTarget.mWord);
|
||||
}
|
||||
}
|
||||
|
@ -1193,7 +1195,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
||||
final String targetCodePonts = shortcutTarget.mWord;
|
||||
assertEquals((int)shortcutTargets.get(word).get(targetCodePonts),
|
||||
shortcutTarget.mFrequency);
|
||||
shortcutTarget.getProbability());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,7 +127,7 @@ public class CombinedInputOutput {
|
|||
if (null != word) {
|
||||
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||
for (WeightedString s : bigrams) {
|
||||
dict.setBigram(word, s.mWord, s.mFrequency);
|
||||
dict.setBigram(word, s.mWord, s.getProbability());
|
||||
}
|
||||
}
|
||||
if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
|
||||
|
@ -185,7 +185,7 @@ public class CombinedInputOutput {
|
|||
if (null != word) {
|
||||
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||
for (WeightedString s : bigrams) {
|
||||
dict.setBigram(word, s.mWord, s.mFrequency);
|
||||
dict.setBigram(word, s.mWord, s.getProbability());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -222,13 +222,13 @@ public class CombinedInputOutput {
|
|||
if (null != word.mShortcutTargets) {
|
||||
for (WeightedString target : word.mShortcutTargets) {
|
||||
destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + ","
|
||||
+ FREQUENCY_TAG + "=" + target.mFrequency + "\n");
|
||||
+ FREQUENCY_TAG + "=" + target.getProbability() + "\n");
|
||||
}
|
||||
}
|
||||
if (null != word.mBigrams) {
|
||||
for (WeightedString bigram : word.mBigrams) {
|
||||
destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + ","
|
||||
+ FREQUENCY_TAG + "=" + bigram.mFrequency + "\n");
|
||||
+ FREQUENCY_TAG + "=" + bigram.getProbability() + "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,7 +159,7 @@ public class Diff extends Dicttool.Command {
|
|||
if (null == list0) return false;
|
||||
for (final WeightedString attribute0 : list0) {
|
||||
System.out.println(type + " removed: " + word + " " + attribute0.mWord + " "
|
||||
+ attribute0.mFrequency);
|
||||
+ attribute0.getProbability());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -175,8 +175,8 @@ public class Diff extends Dicttool.Command {
|
|||
for (final WeightedString attribute1 : list1) {
|
||||
if (attribute0.mWord.equals(attribute1.mWord)) {
|
||||
System.out.println(type + " freq changed: " + word + " "
|
||||
+ attribute0.mWord + " " + attribute0.mFrequency + " -> "
|
||||
+ attribute1.mFrequency);
|
||||
+ attribute0.mWord + " " + attribute0.getProbability() + " -> "
|
||||
+ attribute1.getProbability());
|
||||
list1.remove(attribute1);
|
||||
foundString = true;
|
||||
break;
|
||||
|
@ -185,7 +185,7 @@ public class Diff extends Dicttool.Command {
|
|||
if (!foundString) {
|
||||
// We come here if we haven't found any matching string.
|
||||
System.out.println(type + " removed: " + word + " " + attribute0.mWord + " "
|
||||
+ attribute0.mFrequency);
|
||||
+ attribute0.getProbability());
|
||||
}
|
||||
} else {
|
||||
list1.remove(attribute0);
|
||||
|
@ -197,7 +197,7 @@ public class Diff extends Dicttool.Command {
|
|||
for (final WeightedString attribute1 : list1) {
|
||||
hasDifferences = true;
|
||||
System.out.println(type + " added: " + word + " " + attribute1.mWord + " "
|
||||
+ attribute1.mFrequency);
|
||||
+ attribute1.getProbability());
|
||||
}
|
||||
return hasDifferences;
|
||||
}
|
||||
|
|
|
@ -51,7 +51,8 @@ public class Info extends Dicttool.Command {
|
|||
if (null != w.mShortcutTargets) {
|
||||
shortcutCount += w.mShortcutTargets.size();
|
||||
for (WeightedString shortcutTarget : w.mShortcutTargets) {
|
||||
if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency) {
|
||||
if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
|
||||
== shortcutTarget.getProbability()) {
|
||||
++whitelistCount;
|
||||
}
|
||||
}
|
||||
|
@ -84,8 +85,9 @@ public class Info extends Dicttool.Command {
|
|||
} else {
|
||||
for (final WeightedString shortcutTarget : shortcutTargets) {
|
||||
System.out.println(" Shortcut target: " + shortcutTarget.mWord + " ("
|
||||
+ (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency
|
||||
? "whitelist" : shortcutTarget.mFrequency) + ")");
|
||||
+ (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY
|
||||
== shortcutTarget.getProbability() ?
|
||||
"whitelist" : shortcutTarget.getProbability()) + ")");
|
||||
}
|
||||
}
|
||||
final ArrayList<WeightedString> bigrams = ptNode.getBigrams();
|
||||
|
@ -93,7 +95,8 @@ public class Info extends Dicttool.Command {
|
|||
System.out.println(" No bigrams");
|
||||
} else {
|
||||
for (final WeightedString bigram : bigrams) {
|
||||
System.out.println(" Bigram: " + bigram.mWord + " (" + bigram.mFrequency + ")");
|
||||
System.out.println(
|
||||
" Bigram: " + bigram.mWord + " (" + bigram.getProbability() + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -319,7 +319,7 @@ public class XmlDictInputOutput {
|
|||
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
|
||||
for (final WeightedString bigram : bigramList) {
|
||||
if (!dict.hasWord(bigram.mWord)) continue;
|
||||
dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
|
||||
dict.setBigram(firstWord, bigram.mWord, bigram.getProbability());
|
||||
}
|
||||
}
|
||||
return dict;
|
||||
|
@ -369,7 +369,7 @@ public class XmlDictInputOutput {
|
|||
destination.write("\n");
|
||||
for (WeightedString target : word.mShortcutTargets) {
|
||||
destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\""
|
||||
+ target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG
|
||||
+ target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG
|
||||
+ ">\n");
|
||||
}
|
||||
destination.write(" ");
|
||||
|
@ -378,7 +378,8 @@ public class XmlDictInputOutput {
|
|||
destination.write("\n");
|
||||
for (WeightedString bigram : word.mBigrams) {
|
||||
destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
|
||||
+ bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n");
|
||||
+ bigram.getProbability() + "\">" + bigram.mWord
|
||||
+ "</" + BIGRAM_TAG + ">\n");
|
||||
}
|
||||
destination.write(" ");
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue