Implement addBigramWords() for DynamicPatriciaTrie.
Bug: 6669677 Change-Id: Ia6aa76c212165690191c65fee9dacdc44be5acd5
This commit is contained in:
parent
8faa1a4482
commit
cd6a0430c7
7 changed files with 130 additions and 36 deletions
|
@ -62,6 +62,11 @@ public:
|
|||
return flags | FLAG_ATTRIBUTE_HAS_NEXT;
|
||||
}
|
||||
|
||||
// Returns a copy of flags in which the bits selected by MASK_ATTRIBUTE_PROBABILITY are
// replaced with the corresponding bits of probability; every other bit of flags is kept.
// Bits of probability that fall outside the mask are discarded by the bitwise AND.
static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags,
|
||||
const int probability) {
|
||||
return (flags & (~MASK_ATTRIBUTE_PROBABILITY)) | (probability & MASK_ATTRIBUTE_PROBABILITY);
|
||||
}
|
||||
|
||||
// Returns true if the bigram entry is valid and puts the entry values into out*.
|
||||
static AK_FORCE_INLINE bool createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize(
|
||||
const int entryPos, const int targetPos, const int probability, const bool hasNext,
|
||||
|
|
|
@ -98,8 +98,8 @@ bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPo
|
|||
return true;
|
||||
}
|
||||
|
||||
bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int probability,
|
||||
int *const pos) {
|
||||
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramPos,
|
||||
const int probability, int *const pos) {
|
||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos);
|
||||
if (usesAdditionalBuffer) {
|
||||
*pos -= mBuffer->getOriginalBufferSize();
|
||||
|
@ -113,7 +113,17 @@ bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int prob
|
|||
// The buffer address can be changed after calling buffer writing methods.
|
||||
const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer);
|
||||
flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos);
|
||||
BigramListReadWriteUtils::getBigramAddressAndForwardPointer(buffer, flags, pos);
|
||||
int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer(
|
||||
buffer, flags, pos);
|
||||
if (usesAdditionalBuffer && originalBigramPos != NOT_A_VALID_WORD_POS) {
|
||||
originalBigramPos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramPos) {
|
||||
// Update this bigram entry.
|
||||
const BigramListReadWriteUtils::BigramFlags updatedFlags =
|
||||
BigramListReadWriteUtils::setProbabilityInFlags(flags, probability);
|
||||
return mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos);
|
||||
}
|
||||
if (BigramListReadWriteUtils::hasNext(flags)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -124,34 +134,36 @@ bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int prob
|
|||
if (!mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos)) {
|
||||
return false;
|
||||
}
|
||||
if (usesAdditionalBuffer) {
|
||||
*pos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
// Then, add a new entry after the last entry.
|
||||
return writeNewBigramEntry(bigramPos, probability, pos);
|
||||
} while(BigramListReadWriteUtils::hasNext(flags));
|
||||
// We return directly from the while loop.
|
||||
ASSERT(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramPos, const int probability,
|
||||
int *const writingPos) {
|
||||
BigramListReadWriteUtils::BigramFlags newBigramFlags;
|
||||
uint32_t newBigramOffset;
|
||||
int newBigramOffsetFieldSize;
|
||||
if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize(
|
||||
*pos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags),
|
||||
BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset,
|
||||
&newBigramOffsetFieldSize)) {
|
||||
continue;
|
||||
}
|
||||
int newEntryPos = *pos;
|
||||
if (usesAdditionalBuffer) {
|
||||
newEntryPos += mBuffer->getOriginalBufferSize();
|
||||
*writingPos, bigramPos, probability, false /* hasNext */, &newBigramFlags,
|
||||
&newBigramOffset, &newBigramOffsetFieldSize)) {
|
||||
return false;
|
||||
}
|
||||
// Write bigram flags.
|
||||
if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */,
|
||||
&newEntryPos)) {
|
||||
if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */, writingPos)) {
|
||||
return false;
|
||||
}
|
||||
// Write bigram position offset.
|
||||
if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize,
|
||||
&newEntryPos)) {
|
||||
writingPos)) {
|
||||
return false;
|
||||
}
|
||||
} while(BigramListReadWriteUtils::hasNext(flags));
|
||||
if (usesAdditionalBuffer) {
|
||||
*pos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -48,7 +48,10 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
|
|||
// positions after bigram lists. This method skips invalid bigram entries.
|
||||
bool copyAllBigrams(int *const fromPos, int *const toPos);
|
||||
|
||||
bool addBigramEntry(const int bigramPos, const int probability, int *const pos);
|
||||
bool addNewBigramEntryToBigramList(const int bigramPos, const int probability, int *const pos);
|
||||
|
||||
bool writeNewBigramEntry(const int bigramPos, const int probability,
|
||||
int *const writingPos);
|
||||
|
||||
// Returns whether targetBigramPos was found.
|
||||
bool removeBigram(const int bigramListPos, const int targetBigramPos);
|
||||
|
|
|
@ -69,11 +69,13 @@ void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(c
|
|||
if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) {
|
||||
mChildrenPos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
if (mSiblingPos == NOT_A_DICT_POS && DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
|
||||
if (mSiblingPos == NOT_A_DICT_POS) {
|
||||
if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) {
|
||||
mBigramLinkedNodePos = mChildrenPos;
|
||||
} else {
|
||||
mBigramLinkedNodePos = NOT_A_DICT_POS;
|
||||
}
|
||||
}
|
||||
if (usesAdditionalBuffer) {
|
||||
pos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
|
||||
namespace latinime {
|
||||
|
||||
const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
|
||||
|
||||
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||
DynamicPatriciaTrieReadingHelper *const readingHelper,
|
||||
const int *const wordCodePoints, const int codePointCount, const int probability) {
|
||||
|
@ -79,13 +81,44 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
|||
|
||||
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
|
||||
const int probability) {
|
||||
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
|
||||
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
|
||||
nodeReader.fetchNodeInfoFromBuffer(word0Pos);
|
||||
if (nodeReader.isDeleted()) {
|
||||
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
|
||||
mMergedNodeCodePoints);
|
||||
// Move node to add bigram entry.
|
||||
const int newNodePos = mBuffer->getTailPosition();
|
||||
if (!markNodeAsMovedAndSetPosition(&nodeReader, newNodePos, newNodePos)) {
|
||||
return false;
|
||||
}
|
||||
// TODO: Implement.
|
||||
int writingPos = newNodePos;
|
||||
// Write a new PtNode using original PtNode's info to the tail of the dictionary.
|
||||
if (!writePtNodeToBufferByCopyingPtNodeInfo(&nodeReader, nodeReader.getParentPos(),
|
||||
mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(),
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
nodeReader.fetchNodeInfoFromBuffer(newNodePos);
|
||||
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
|
||||
// Insert a new bigram entry into the existing bigram list.
|
||||
int bigramListPos = nodeReader.getBigramsPos();
|
||||
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
|
||||
} else {
|
||||
// The PtNode doesn't have a bigram list.
|
||||
// First, write a bigram entry at the tail position of the PtNode.
|
||||
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
// Then, mark the PtNode as having a bigram list in its flags.
|
||||
const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
|
||||
PatriciaTrieReadingUtils::createAndGetFlags(nodeReader.isBlacklisted(),
|
||||
nodeReader.isNotAWord(), nodeReader.getProbability() != NOT_A_PROBABILITY,
|
||||
nodeReader.getShortcutPos() != NOT_A_DICT_POS, true /* hasBigrams */,
|
||||
nodeReader.getCodePointCount() > 1, CHILDREN_POSITION_FIELD_SIZE);
|
||||
writingPos = newNodePos;
|
||||
// Write updated flags into the moved PtNode's flags field.
|
||||
return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags,
|
||||
&writingPos);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove a bigram relation from word0Pos to word1Pos.
|
||||
|
|
|
@ -49,6 +49,8 @@ class DynamicPatriciaTrieWritingHelper {
|
|||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
|
||||
|
||||
static const int CHILDREN_POSITION_FIELD_SIZE;
|
||||
|
||||
BufferWithExtendableBuffer *const mBuffer;
|
||||
DynamicBigramListPolicy *const mBigramPolicy;
|
||||
DynamicShortcutListPolicy *const mShortcutPolicy;
|
||||
|
|
|
@ -122,4 +122,41 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(probability, binaryDictionary.getFrequency("a"));
|
||||
assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
|
||||
}
|
||||
|
||||
// Checks that bigrams registered through addBigramWords() on an updatable dictionary are
// reported by isValidBigram(), and that word pairs which were never registered are not.
// Only the presence of each bigram is asserted; bigram probabilities are not checked here.
public void testAddBigramWords() {
|
||||
// TODO: Add a test to check the frequency of the bigram score which uses current value
|
||||
// calculated in the native code
|
||||
File dictFile = null;
|
||||
try {
|
||||
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
} catch (UnsupportedFormatException e) {
|
||||
fail("UnsupportedFormatException while writing an initial dictionary : " + e);
|
||||
}
|
||||
// NOTE(review): dictFile is dereferenced below without a null check; this relies on fail()
// aborting the test when dictionary creation throws -- confirm.
BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||

||||
final int unigramProbability = 100;
|
||||
final int bigramProbability = 10;
|
||||
// Register the three unigrams that the bigrams below refer to.
binaryDictionary.addUnigramWord("aaa", unigramProbability);
|
||||
binaryDictionary.addUnigramWord("abb", unigramProbability);
|
||||
binaryDictionary.addUnigramWord("bcc", unigramProbability);
|
||||
// Two bigrams each for "aaa" and "abb"; none are added with "bcc" as the first word.
binaryDictionary.addBigramWords("aaa", "abb", bigramProbability);
|
||||
binaryDictionary.addBigramWords("aaa", "bcc", bigramProbability);
|
||||
binaryDictionary.addBigramWords("abb", "aaa", bigramProbability);
|
||||
binaryDictionary.addBigramWords("abb", "bcc", bigramProbability);
|
||||

||||
// Every bigram added above must be reported as valid.
assertEquals(true, binaryDictionary.isValidBigram("aaa", "abb"));
|
||||
assertEquals(true, binaryDictionary.isValidBigram("aaa", "bcc"));
|
||||
assertEquals(true, binaryDictionary.isValidBigram("abb", "aaa"));
|
||||
assertEquals(true, binaryDictionary.isValidBigram("abb", "bcc"));
|
||||

||||
// Pairs that were never added must not be reported as valid.
// NOTE(review): "bbc" is not one of the unigrams added above -- presumably an intentional
// unknown-word case; confirm it is not a typo for "bcc".
assertEquals(false, binaryDictionary.isValidBigram("bcc", "aaa"));
|
||||
assertEquals(false, binaryDictionary.isValidBigram("bcc", "bbc"));
|
||||
assertEquals(false, binaryDictionary.isValidBigram("aaa", "aaa"));
|
||||

||||
// Remove the dictionary file created for this test.
dictFile.delete();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue