Merge "Support bigram historical information migration."

This commit is contained in:
Keisuke Kuroyanagi 2014-05-13 02:56:36 +00:00 committed by Android (Google) Code Review
commit 61fc329901
15 changed files with 71 additions and 56 deletions

View file

@ -335,7 +335,7 @@ static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz,
if (!shortcutTargetCodePoints.empty()) {
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
}
// Use 1 for count to indicate the word has inputed.
// Use 1 for count to indicate the word has inputted.
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
dictionary->addUnigramWord(codePoints, codePointCount, &unigramProperty);
@ -353,8 +353,12 @@ static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz,
jsize word1Length = env->GetArrayLength(word1);
int word1CodePoints[word1Length];
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
dictionary->addBigramWords(word0CodePoints, word0Length, word1CodePoints,
word1Length, probability, timestamp);
const std::vector<int> bigramTargetCodePoints(
word1CodePoints, word1CodePoints + word1Length);
// Use 1 for count to indicate the bigram has inputted.
const BigramProperty bigramProperty(&bigramTargetCodePoints, probability,
timestamp, 0 /* level */, 1 /* count */);
dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
}
static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass clazz, jlong dict,
@ -437,14 +441,18 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
env->GetIntField(languageModelParam, shortcutProbabilityFieldId);
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
}
// Use 1 for count to indicate the word has inputed.
// Use 1 for count to indicate the word has inputted.
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
dictionary->addUnigramWord(word1CodePoints, word1Length, &unigramProperty);
if (word0) {
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
dictionary->addBigramWords(word0CodePoints, word0Length, word1CodePoints, word1Length,
bigramProbability, timestamp);
const std::vector<int> bigramTargetCodePoints(
word1CodePoints, word1CodePoints + word1Length);
// Use 1 for count to indicate the bigram has inputted.
const BigramProperty bigramProperty(&bigramTargetCodePoints, bigramProbability,
timestamp, 0 /* level */, 1 /* count */);
dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
}
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
return i + 1;
@ -558,11 +566,9 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false;
}
}
for (const BigramProperty &bigarmProperty : *wordProperty.getBigramProperties()) {
const std::vector<int> *targetCodePoints = bigarmProperty.getTargetCodePoints();
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
if (!dictionaryStructureWithBufferPolicy->addBigramWords(wordCodePoints, wordLength,
targetCodePoints->data(), targetCodePoints->size(),
bigarmProperty.getProbability(), bigarmProperty.getTimestamp())) {
&bigramProperty)) {
LogUtils::logToJava(env, "Cannot add bigram to the new dict.");
return false;
}

View file

@ -88,11 +88,10 @@ void Dictionary::addUnigramWord(const int *const word, const int length,
mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, unigramProperty);
}
void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1, const int probability, const int timestamp) {
void Dictionary::addBigramWords(const int *const word0, const int length0,
const BigramProperty *const bigramProperty) {
TimeKeeper::setCurrentTime();
mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1,
probability, timestamp);
mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, bigramProperty);
}
void Dictionary::removeBigramWords(const int *const word0, const int length0,

View file

@ -76,8 +76,8 @@ class Dictionary {
void addUnigramWord(const int *const codePoints, const int codePointCount,
const UnigramProperty *const unigramProperty);
void addBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1, const int probability, const int timestamp);
void addBigramWords(const int *const word0, const int length0,
const BigramProperty *const bigramProperty);
void removeBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1);

View file

@ -73,8 +73,8 @@ class DictionaryStructureWithBufferPolicy {
const UnigramProperty *const unigramProperty) = 0;
// Returns whether the update was success or not.
virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1, const int probability, const int timestamp) = 0;
virtual bool addBigramWords(const int *const word0, const int length0,
const BigramProperty *const bigramProperty) = 0;
// Returns whether the update was success or not.
virtual bool removeBigramWords(const int *const word0, const int length0,

View file

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
#include "suggest/core/dictionary/property/bigram_property.h"
#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
@ -49,13 +50,12 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
}
bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
const int newProbability, const int timestamp, bool *const outAddedNewEntry) {
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
// 1. The word has no bigrams yet.
// 2. The word has bigrams, and there is the target in the list.
// 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
// 4. The word has bigrams. We have to append new bigram entry to the list.
// 5. Same as 4, but the list is the last entry of the content file.
if (outAddedNewEntry) {
*outAddedNewEntry = false;
}
@ -69,7 +69,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
newProbability, timestamp);
bigramProperty);
// Write an entry.
const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
@ -102,7 +102,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
&newBigramEntry, newProbability, timestamp);
&newBigramEntry, bigramProperty);
if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
return false;
}
@ -128,7 +128,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry updatedBigramEntry =
originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
&updatedBigramEntry, newProbability, timestamp);
&updatedBigramEntry, bigramProperty);
return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
}
@ -253,19 +253,19 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
}
const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
const BigramEntry *const originalBigramEntry, const int newProbability,
const int timestamp) const {
const BigramEntry *const originalBigramEntry,
const BigramProperty *const bigramProperty) const {
// TODO: Consolidate historical info and probability.
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
// Use 1 for count to indicate the bigram has inputed.
const HistoricalInfo historicalInfoForUpdate(timestamp, 0 /* level */, 1 /* count */);
const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(),
bigramProperty->getLevel(), bigramProperty->getCount());
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
originalBigramEntry->getHistoricalInfo(), newProbability,
originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(),
&historicalInfoForUpdate, mHeaderPolicy);
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
} else {
return originalBigramEntry->updateProbabilityAndGetEntry(newProbability);
return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability());
}
}

View file

@ -24,6 +24,7 @@
namespace latinime {
class BigramDictContent;
class BigramProperty;
class HeaderPolicy;
class TerminalPositionLookupTable;
@ -43,8 +44,8 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
// Do nothing because we don't need to skip bigram lists in ver4 dictionaries.
}
bool addNewEntry(const int terminalId, const int newTargetTerminalId, const int newProbability,
const int timestamp, bool *const outAddedNewEntry);
bool addNewEntry(const int terminalId, const int newTargetTerminalId,
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
bool removeEntry(const int terminalId, const int targetTerminalId);
@ -60,7 +61,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
int *const outTailEntryPos) const;
const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
const int newProbability, const int timestamp) const;
const BigramProperty *const bigramProperty) const;
bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);

View file

@ -85,13 +85,13 @@ bool DynamicPtUpdatingHelper::addUnigramWord(
}
bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos,
const int probability, const int timestamp, bool *const outAddedNewBigram) {
const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
const PtNodeParams sourcePtNodeParams(
mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos));
const PtNodeParams targetPtNodeParams(
mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos));
return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability,
timestamp, outAddedNewBigram);
return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams,
bigramProperty, outAddedNewBigram);
}
// Remove a bigram relation from word0Pos to word1Pos.

View file

@ -22,6 +22,7 @@
namespace latinime {
class BigramProperty;
class BufferWithExtendableBuffer;
class DynamicPtReadingHelper;
class PtNodeReader;
@ -42,8 +43,8 @@ class DynamicPtUpdatingHelper {
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
// Add a bigram relation from word0Pos to word1Pos.
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
const int timestamp, bool *const outAddedNewBigram);
bool addBigramWords(const int word0Pos, const int word1Pos,
const BigramProperty *const bigramProperty, bool *const outAddedNewBigram);
// Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos);

View file

@ -24,6 +24,7 @@
namespace latinime {
class BigramProperty;
class UnigramProperty;
// Interface class used to write PtNode information.
@ -70,7 +71,7 @@ class PtNodeWriter {
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams,
const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp,
const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty,
bool *const outAddedNewBigram) = 0;
virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams,

View file

@ -88,8 +88,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return false;
}
bool addBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1, const int probability, const int timestamp) {
bool addBigramWords(const int *const word0, const int length0,
const BigramProperty *const bigramProperty) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
return false;

View file

@ -223,11 +223,10 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
}
bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry(
const PtNodeParams *const sourcePtNodeParams,
const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp,
bool *const outAddedNewBigram) {
const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam,
const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(),
targetPtNodeParam->getTerminalId(), probability, timestamp, outAddedNewBigram)) {
targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) {
AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId());
return false;

View file

@ -76,7 +76,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams,
const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp,
const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty,
bool *const outAddedNewBigram);
virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams,

View file

@ -209,8 +209,7 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
}
bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
const int *const word1, const int length1, const int probability,
const int timestamp) {
const BigramProperty *const bigramProperty) {
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
return false;
@ -220,9 +219,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
mDictBuffer->getTailPosition());
return false;
}
if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) {
if (length0 > MAX_WORD_LENGTH
|| bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. "
"length0: %d, length1: %d", length0, length1);
"length0: %d, length1: %d", length0, bigramProperty->getTargetCodePoints()->size());
return false;
}
const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0,
@ -230,14 +230,14 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
if (word0Pos == NOT_A_DICT_POS) {
return false;
}
const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1,
false /* forceLowerCaseSearch */);
const int word1Pos = getTerminalPtNodePositionOfWord(
bigramProperty->getTargetCodePoints()->data(),
bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */);
if (word1Pos == NOT_A_DICT_POS) {
return false;
}
bool addedNewBigram = false;
if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, probability, timestamp,
&addedNewBigram)) {
if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, bigramProperty, &addedNewBigram)) {
if (addedNewBigram) {
mBigramCount++;
}

View file

@ -93,8 +93,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
bool addUnigramWord(const int *const word, const int length,
const UnigramProperty *const unigramProperty);
bool addBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1, const int probability, const int timestamp);
bool addBigramWords(const int *const word0, const int length0,
const BigramProperty *const bigramProperty);
bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
const int length1);

View file

@ -580,7 +580,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
// TODO: Add tests for bigrams when the implementation gets ready.
addUnigramWord(binaryDictionary, "aaa", DUMMY_PROBABILITY);
assertTrue(binaryDictionary.isValidWord("aaa"));
addUnigramWord(binaryDictionary, "bbb", Dictionary.NOT_A_PROBABILITY);
@ -590,6 +589,11 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
addUnigramWord(binaryDictionary, "ccc", DUMMY_PROBABILITY);
addUnigramWord(binaryDictionary, "ccc", DUMMY_PROBABILITY);
addUnigramWord(binaryDictionary, "ccc", DUMMY_PROBABILITY);
addUnigramWord(binaryDictionary, "abc", DUMMY_PROBABILITY);
addBigramWords(binaryDictionary, "aaa", "abc", DUMMY_PROBABILITY);
assertTrue(binaryDictionary.isValidBigram("aaa", "abc"));
addBigramWords(binaryDictionary, "aaa", "bbb", Dictionary.NOT_A_PROBABILITY);
assertFalse(binaryDictionary.isValidBigram("aaa", "bbb"));
assertEquals(fromFormatVersion, binaryDictionary.getFormatVersion());
assertTrue(binaryDictionary.migrateTo(toFormatVersion));
@ -600,6 +604,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
assertTrue(binaryDictionary.getFrequency("aaa") < binaryDictionary.getFrequency("ccc"));
addUnigramWord(binaryDictionary, "bbb", Dictionary.NOT_A_PROBABILITY);
assertTrue(binaryDictionary.isValidWord("bbb"));
assertTrue(binaryDictionary.isValidBigram("aaa", "abc"));
assertFalse(binaryDictionary.isValidBigram("aaa", "bbb"));
addBigramWords(binaryDictionary, "aaa", "bbb", Dictionary.NOT_A_PROBABILITY);
assertTrue(binaryDictionary.isValidBigram("aaa", "bbb"));
binaryDictionary.close();
dictFile.delete();
}