Merge "Write the bigram frequency following the new formula" into jb-dev
This commit is contained in:
commit
59e6ad3874
1 changed files with 21 additions and 5 deletions
|
@ -174,6 +174,7 @@ public class BinaryDictInputOutput {
|
||||||
private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
|
private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
|
||||||
|
|
||||||
private static final int MAX_TERMINAL_FREQUENCY = 255;
|
private static final int MAX_TERMINAL_FREQUENCY = 255;
|
||||||
|
private static final int MAX_BIGRAM_FREQUENCY = 15;
|
||||||
|
|
||||||
// Arbitrary limit to how many passes we consider address size compression should
|
// Arbitrary limit to how many passes we consider address size compression should
|
||||||
// terminate in. At the time of this writing, our largest dictionary completes
|
// terminate in. At the time of this writing, our largest dictionary completes
|
||||||
|
@ -726,12 +727,13 @@ public class BinaryDictInputOutput {
|
||||||
*
|
*
|
||||||
* @param more whether there are more bigrams after this one.
|
* @param more whether there are more bigrams after this one.
|
||||||
* @param offset the offset of the bigram.
|
* @param offset the offset of the bigram.
|
||||||
* @param bigramFrequency the frequency of the bigram, 0..15.
|
* @param bigramFrequency the frequency of the bigram, 0..255.
|
||||||
* @param unigramFrequency the unigram frequency of the same word.
|
* @param unigramFrequency the unigram frequency of the same word, 0..255.
|
||||||
|
* @param word the second word of the bigram, for debugging purposes
|
||||||
* @return the flags
|
* @return the flags
|
||||||
*/
|
*/
|
||||||
private static final int makeBigramFlags(final boolean more, final int offset,
|
private static final int makeBigramFlags(final boolean more, final int offset,
|
||||||
final int bigramFrequency, final int unigramFrequency) {
|
int bigramFrequency, final int unigramFrequency, final String word) {
|
||||||
int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0)
|
int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0)
|
||||||
+ (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
|
+ (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
|
||||||
switch (getByteSize(offset)) {
|
switch (getByteSize(offset)) {
|
||||||
|
@ -747,7 +749,21 @@ public class BinaryDictInputOutput {
|
||||||
default:
|
default:
|
||||||
throw new RuntimeException("Strange offset size");
|
throw new RuntimeException("Strange offset size");
|
||||||
}
|
}
|
||||||
bigramFlags += bigramFrequency & FLAG_ATTRIBUTE_FREQUENCY;
|
if (unigramFrequency > bigramFrequency) {
|
||||||
|
MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
|
||||||
|
+ "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
|
||||||
|
+ word + " is " + unigramFrequency);
|
||||||
|
bigramFrequency = unigramFrequency;
|
||||||
|
}
|
||||||
|
// We compute the difference between 255 (which means probability = 1) and the
|
||||||
|
// unigram score. We split this into 16 discrete steps, and this is the value
|
||||||
|
// we store into the 4 bits of the bigrams frequency.
|
||||||
|
final float bigramRatio = (float)(bigramFrequency - unigramFrequency)
|
||||||
|
/ (MAX_TERMINAL_FREQUENCY - unigramFrequency);
|
||||||
|
// TODO: if the bigram freq is very close to the unigram frequency, we don't want
|
||||||
|
// to include the bigram in the binary dictionary at all.
|
||||||
|
final int discretizedFrequency = Math.round(bigramRatio * MAX_BIGRAM_FREQUENCY);
|
||||||
|
bigramFlags += discretizedFrequency & FLAG_ATTRIBUTE_FREQUENCY;
|
||||||
return bigramFlags;
|
return bigramFlags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -862,7 +878,7 @@ public class BinaryDictInputOutput {
|
||||||
++groupAddress;
|
++groupAddress;
|
||||||
final int offset = addressOfBigram - groupAddress;
|
final int offset = addressOfBigram - groupAddress;
|
||||||
int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
|
int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
|
||||||
bigram.mFrequency, unigramFrequencyForThisWord);
|
bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
|
||||||
buffer[index++] = (byte)bigramFlags;
|
buffer[index++] = (byte)bigramFlags;
|
||||||
final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
|
final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
|
||||||
index += bigramShift;
|
index += bigramShift;
|
||||||
|
|
Loading…
Reference in a new issue