Implement addBigramWords() for DynamicPatriciaTrie.

Bug: 6669677
Change-Id: Ia6aa76c212165690191c65fee9dacdc44be5acd5
This commit is contained in:
Keisuke Kuroyanagi 2013-09-17 12:52:21 +09:00
parent 8faa1a4482
commit cd6a0430c7
7 changed files with 130 additions and 36 deletions

View file

@ -62,6 +62,11 @@ public:
return flags | FLAG_ATTRIBUTE_HAS_NEXT;
}
// Returns a copy of flags in which the bits selected by MASK_ATTRIBUTE_PROBABILITY are replaced
// with the corresponding bits of probability. All other flag bits are kept as they are.
static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags,
        const int probability) {
    // Everything in flags except the probability bits.
    const BigramFlags flagsWithoutProbability = flags & (~MASK_ATTRIBUTE_PROBABILITY);
    // Only the part of probability that fits in the probability bits.
    const BigramFlags probabilityBits = probability & MASK_ATTRIBUTE_PROBABILITY;
    return flagsWithoutProbability | probabilityBits;
}
// Returns true if the bigram entry is valid and puts the entry values into out*.
static AK_FORCE_INLINE bool createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize(
const int entryPos, const int targetPos, const int probability, const bool hasNext,

View file

@ -98,8 +98,8 @@ bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPo
return true;
}
bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int probability,
int *const pos) {
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramPos,
const int probability, int *const pos) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos);
if (usesAdditionalBuffer) {
*pos -= mBuffer->getOriginalBufferSize();
@ -113,7 +113,17 @@ bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int prob
// The buffer address can be changed after calling buffer writing methods.
const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos);
BigramListReadWriteUtils::getBigramAddressAndForwardPointer(buffer, flags, pos);
int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer(
buffer, flags, pos);
if (usesAdditionalBuffer && originalBigramPos != NOT_A_VALID_WORD_POS) {
originalBigramPos += mBuffer->getOriginalBufferSize();
}
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramPos) {
// Update this bigram entry.
const BigramListReadWriteUtils::BigramFlags updatedFlags =
BigramListReadWriteUtils::setProbabilityInFlags(flags, probability);
return mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos);
}
if (BigramListReadWriteUtils::hasNext(flags)) {
continue;
}
@ -124,34 +134,36 @@ bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int prob
if (!mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos)) {
return false;
}
if (usesAdditionalBuffer) {
*pos += mBuffer->getOriginalBufferSize();
}
// Then, add a new entry after the last entry.
return writeNewBigramEntry(bigramPos, probability, pos);
} while(BigramListReadWriteUtils::hasNext(flags));
// We return directly from the while loop.
ASSERT(false);
return false;
}
bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramPos, const int probability,
int *const writingPos) {
BigramListReadWriteUtils::BigramFlags newBigramFlags;
uint32_t newBigramOffset;
int newBigramOffsetFieldSize;
if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize(
*pos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags),
BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset,
&newBigramOffsetFieldSize)) {
continue;
}
int newEntryPos = *pos;
if (usesAdditionalBuffer) {
newEntryPos += mBuffer->getOriginalBufferSize();
*writingPos, bigramPos, probability, false /* hasNext */, &newBigramFlags,
&newBigramOffset, &newBigramOffsetFieldSize)) {
return false;
}
// Write bigram flags.
if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */,
&newEntryPos)) {
if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */, writingPos)) {
return false;
}
// Write bigram position offset.
if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize,
&newEntryPos)) {
writingPos)) {
return false;
}
} while(BigramListReadWriteUtils::hasNext(flags));
if (usesAdditionalBuffer) {
*pos += mBuffer->getOriginalBufferSize();
}
return true;
}

View file

@ -48,7 +48,10 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
// positions after bigram lists. This method skips invalid bigram entries.
bool copyAllBigrams(int *const fromPos, int *const toPos);
bool addBigramEntry(const int bigramPos, const int probability, int *const pos);
bool addNewBigramEntryToBigramList(const int bigramPos, const int probability, int *const pos);
bool writeNewBigramEntry(const int bigramPos, const int probability,
int *const writingPos);
// Returns true if targetBigramPos is found, false otherwise.
bool removeBigram(const int bigramListPos, const int targetBigramPos);

View file

@ -69,11 +69,13 @@ void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(c
if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) {
mChildrenPos += mBuffer->getOriginalBufferSize();
}
if (mSiblingPos == NOT_A_DICT_POS && DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
if (mSiblingPos == NOT_A_DICT_POS) {
if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
mBigramLinkedNodePos = mChildrenPos;
} else {
mBigramLinkedNodePos = NOT_A_DICT_POS;
}
}
if (usesAdditionalBuffer) {
pos += mBuffer->getOriginalBufferSize();
}

View file

@ -26,6 +26,8 @@
namespace latinime {
// Passed as the last argument of PatriciaTrieReadingUtils::createAndGetFlags() when this helper
// rebuilds a PtNode's flags.
// NOTE(review): presumably the width, in bytes, of the children position field of a dynamic
// PtNode -- confirm against the node layout.
const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
DynamicPatriciaTrieReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability) {
@ -79,13 +81,44 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
// Adds a bigram relation from the PtNode at word0Pos to the word at word1Pos with the given
// probability.
//
// The PtNode at word0Pos is marked as moved and a copy of it is written at the tail of the
// buffer. The bigram entry is then attached to that copy: when the copy already has a bigram
// list, the entry is handed to mBigramPolicy->addNewBigramEntryToBigramList(); otherwise a new
// entry is written at the position following the copied PtNode and the copy's flags are rewritten
// with hasBigrams set.
//
// Returns false as soon as one of the steps reports failure; otherwise returns the result of the
// last write.
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
const int probability) {
// Receives the code points of the PtNode at word0Pos so that they can be copied to the new node.
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
nodeReader.fetchNodeInfoFromBuffer(word0Pos);
if (nodeReader.isDeleted()) {
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
mMergedNodeCodePoints);
// Move node to add bigram entry.
const int newNodePos = mBuffer->getTailPosition();
if (!markNodeAsMovedAndSetPosition(&nodeReader, newNodePos, newNodePos)) {
return false;
}
// TODO: Implement.
int writingPos = newNodePos;
// Write a new PtNode using original PtNode's info to the tail of the dictionary.
if (!writePtNodeToBufferByCopyingPtNodeInfo(&nodeReader, nodeReader.getParentPos(),
mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(),
&writingPos)) {
return false;
}
// Re-read the node from its new position so that the reader describes the copy.
nodeReader.fetchNodeInfoFromBuffer(newNodePos);
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
// Insert a new bigram entry into the existing bigram list.
int bigramListPos = nodeReader.getBigramsPos();
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
} else {
// The PtNode doesn't have a bigram list.
// First, write a bigram entry at the tail position of the PtNode.
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
return false;
}
// Then, mark the PtNode as having a bigram list in its flags.
const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
PatriciaTrieReadingUtils::createAndGetFlags(nodeReader.isBlacklisted(),
nodeReader.isNotAWord(), nodeReader.getProbability() != NOT_A_PROBABILITY,
nodeReader.getShortcutPos() != NOT_A_DICT_POS, true /* hasBigrams */,
nodeReader.getCodePointCount() > 1, CHILDREN_POSITION_FIELD_SIZE);
// Go back to the position where the moved PtNode was written.
writingPos = newNodePos;
// Write updated flags into the moved PtNode's flags field.
return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags,
&writingPos);
}
}
// Remove a bigram relation from word0Pos to word1Pos.

View file

@ -49,6 +49,8 @@ class DynamicPatriciaTrieWritingHelper {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
static const int CHILDREN_POSITION_FIELD_SIZE;
BufferWithExtendableBuffer *const mBuffer;
DynamicBigramListPolicy *const mBigramPolicy;
DynamicShortcutListPolicy *const mShortcutPolicy;

View file

@ -122,4 +122,41 @@ public class BinaryDictionaryTests extends AndroidTestCase {
assertEquals(probability, binaryDictionary.getFrequency("a"));
assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
}
/**
 * Checks that word pairs registered with addBigramWords() are reported by isValidBigram(), and
 * that pairs which were never registered are not.
 */
public void testAddBigramWords() {
    // TODO: Add a test to check the frequency of the bigram score which uses current value
    // calculated in the native code
    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    } catch (UnsupportedFormatException e) {
        fail("UnsupportedFormatException while writing an initial dictionary : " + e);
    }
    BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    final int unigramProbability = 100;
    final int bigramProbability = 10;
    binaryDictionary.addUnigramWord("aaa", unigramProbability);
    binaryDictionary.addUnigramWord("abb", unigramProbability);
    binaryDictionary.addUnigramWord("bcc", unigramProbability);
    binaryDictionary.addBigramWords("aaa", "abb", bigramProbability);
    binaryDictionary.addBigramWords("aaa", "bcc", bigramProbability);
    binaryDictionary.addBigramWords("abb", "aaa", bigramProbability);
    binaryDictionary.addBigramWords("abb", "bcc", bigramProbability);

    // Every pair that was added must be a valid bigram.
    assertTrue(binaryDictionary.isValidBigram("aaa", "abb"));
    assertTrue(binaryDictionary.isValidBigram("aaa", "bcc"));
    assertTrue(binaryDictionary.isValidBigram("abb", "aaa"));
    assertTrue(binaryDictionary.isValidBigram("abb", "bcc"));

    // Pairs that were never added must not be valid bigrams.
    assertFalse(binaryDictionary.isValidBigram("bcc", "aaa"));
    assertFalse(binaryDictionary.isValidBigram("bcc", "bbc"));
    assertFalse(binaryDictionary.isValidBigram("aaa", "aaa"));

    dictFile.delete();
}
}