Use linked list for bigram list.

BinaryDictionaryTests for VERSION4_DEV:
Before
Time: 36.461
After
Time: 33.031

Bug: 14425059

Change-Id: I9ca2714f450f61f713df6ebd34c953dece991cdb
Keisuke Kuroyanagi 2014-07-07 21:09:25 +09:00
parent bb843eb223
commit 804f7450fc
6 changed files with 112 additions and 97 deletions

View File

@ -71,8 +71,14 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
bigramProperty);
// Write an entry.
const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite,
&writingPos)) {
AKLOGE("Cannot write bigram entry. pos: %d.", writingPos);
return false;
}
if (!mBigramDictContent->writeTerminator(writingPos)) {
AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos);
return false;
}
if (outAddedNewEntry) {
@ -84,32 +90,37 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
int tailEntryPos = NOT_A_DICT_POS;
const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos,
&tailEntryPos);
if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) {
// Case 4, 5.
// Add new entry to the bigram list.
if (tailEntryPos == NOT_A_DICT_POS) {
// Case 4. Create new bigram list.
if (!mBigramDictContent->createNewBigramList(terminalId)) {
return false;
}
const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId);
// Copy existing bigram list.
if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) {
return false;
}
}
if (entryPosToUpdate == NOT_A_DICT_POS) {
// Case 4, 5. Add new entry to the bigram list.
const int contentTailPos = mBigramDictContent->getContentTailPos();
// If the tail entry is at the tail of content buffer, the new entry can be written without
// link (Case 5).
const bool canAppendEntry =
contentTailPos == tailEntryPos + mBigramDictContent->getBigramEntrySize();
const int newEntryPos = canAppendEntry ? tailEntryPos : contentTailPos;
int writingPos = newEntryPos;
// Write new entry at the tail position of the bigram content.
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
&newBigramEntry, bigramProperty);
if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite,
&writingPos)) {
AKLOGE("Cannot write bigram entry. pos: %d.", writingPos);
return false;
}
// Update has next flag of the tail entry.
if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) {
if (!mBigramDictContent->writeTerminator(writingPos)) {
AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos);
return false;
}
if (!canAppendEntry) {
// Update link of the current tail entry.
if (!mBigramDictContent->writeLink(newEntryPos, tailEntryPos)) {
AKLOGE("Cannot update bigram entry link. pos: %d, linked entry pos: %d.",
tailEntryPos, newEntryPos);
return false;
}
}
if (outAddedNewEntry) {
*outAddedNewEntry = true;
}
@ -228,14 +239,18 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
if (outTailEntryPos) {
*outTailEntryPos = NOT_A_DICT_POS;
}
bool hasNext = true;
int invalidEntryPos = NOT_A_DICT_POS;
int readingPos = bigramListPos;
while (hasNext) {
const int entryPos = readingPos;
while (true) {
const BigramEntry bigramEntry =
mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
hasNext = bigramEntry.hasNext();
const int entryPos = readingPos - mBigramDictContent->getBigramEntrySize();
if (!bigramEntry.hasNext()) {
if (outTailEntryPos) {
*outTailEntryPos = entryPos;
}
break;
}
if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) {
// Entry with same target is found.
return entryPos;
@ -243,11 +258,6 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
// Invalid entry that can be reused is found.
invalidEntryPos = entryPos;
}
if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) {
if (outTailEntryPos) {
*outTailEntryPos = entryPos;
}
}
}
return invalidEntryPos;
}
@ -269,10 +279,4 @@ const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
}
}
bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) {
const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos);
const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext);
return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos);
}
} // namespace latinime

View File

@ -63,8 +63,6 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
const BigramProperty *const bigramProperty) const;
bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
BigramDictContent *const mBigramDictContent;
const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
const HeaderPolicy *const mHeaderPolicy;

View File

@ -20,6 +20,8 @@
namespace latinime {
// Position value carried by a link entry that marks the end of a bigram list:
// writeTerminator() writes a link to this position, and getBigramEntryAndAdvancePosition()
// treats a link to it as the list terminator.
const int BigramDictContent::INVALID_LINKED_ENTRY_POS = Ver4DictConstants::NOT_A_TERMINAL_ID;
const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
int *const bigramEntryPos) const {
const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer();
@ -34,7 +36,7 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
}
const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition(
Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos);
const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0;
const bool isLink = (bigramFlags & Ver4DictConstants::BIGRAM_IS_LINK_MASK) != 0;
int probability = NOT_A_PROBABILITY;
int timestamp = NOT_A_TIMESTAMP;
int level = 0;
@ -55,81 +57,90 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
const int targetTerminalId =
(encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
if (isLink) {
const int linkedEntryPos = targetTerminalId;
if (linkedEntryPos == INVALID_LINKED_ENTRY_POS) {
// Bigram list terminator is found.
return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
Ver4DictConstants::NOT_A_TERMINAL_ID);
}
*bigramEntryPos = linkedEntryPos;
return getBigramEntryAndAdvancePosition(bigramEntryPos);
}
// hasNext is always true because we should continue to read the next entry until the terminator
// is found.
if (mHasHistoricalInfo) {
const HistoricalInfo historicalInfo(timestamp, level, count);
return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId);
return BigramEntry(true /* hasNext */, probability, &historicalInfo, targetTerminalId);
} else {
return BigramEntry(hasNext, probability, targetTerminalId);
return BigramEntry(true /* hasNext */, probability, targetTerminalId);
}
}
// Writes bigramEntryToWrite as a regular (non-link) entry.
// The entry's fields are unpacked and forwarded to
// writeBigramEntryAttributesAndAdvancePosition() together with entryWritingPos, which is
// handed through unchanged so the attribute writer can move it past the written fields.
// Returns the result of the attribute writer.
bool BigramDictContent::writeBigramEntryAndAdvancePosition(
        const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) {
    const int probability = bigramEntryToWrite->getProbability();
    const int targetTerminalId = bigramEntryToWrite->getTargetTerminalId();
    const int timestamp = bigramEntryToWrite->getHistoricalInfo()->getTimeStamp();
    const int level = bigramEntryToWrite->getHistoricalInfo()->getLevel();
    const int count = bigramEntryToWrite->getHistoricalInfo()->getCount();
    return writeBigramEntryAttributesAndAdvancePosition(false /* isLink */, probability,
            targetTerminalId, timestamp, level, count, entryWritingPos);
}
bool BigramDictContent::writeBigramEntryAttributesAndAdvancePosition(
const bool isLink, const int probability, const int targetTerminalId,
const int timestamp, const int level, const int count, int *const entryWritingPos) {
BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer();
const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext());
const int bigramFlags = isLink ? Ver4DictConstants::BIGRAM_IS_LINK_MASK : 0;
if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags,
Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags);
return false;
}
if (mHasHistoricalInfo) {
const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo();
if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(),
if (!bigramListBuffer->writeUintAndAdvancePosition(timestamp,
Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos,
historicalInfo->getTimeStamp());
timestamp);
return false;
}
if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(),
if (!bigramListBuffer->writeUintAndAdvancePosition(level,
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos,
historicalInfo->getLevel());
level);
return false;
}
if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(),
if (!bigramListBuffer->writeUintAndAdvancePosition(count,
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos,
historicalInfo->getCount());
count);
return false;
}
} else {
if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(),
if (!bigramListBuffer->writeUintAndAdvancePosition(probability,
Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos,
bigramEntryToWrite->getProbability());
probability);
return false;
}
}
const int targetTerminalIdToWrite =
(bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ?
Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID :
bigramEntryToWrite->getTargetTerminalId();
const int targetTerminalIdToWrite = (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) ?
Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : targetTerminalId;
if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite,
Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d",
*entryWritingPos, bigramEntryToWrite->getTargetTerminalId());
*entryWritingPos, targetTerminalId);
return false;
}
return true;
}
bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos,
int *const outTailEntryPos) {
int readingPos = bigramListPos;
int writingPos = toPos;
bool hasNext = true;
while (hasNext) {
const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos);
hasNext = bigramEntry.hasNext();
if (!hasNext) {
*outTailEntryPos = writingPos;
}
if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) {
AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos);
return false;
}
}
return true;
// Writes a link entry at writingPos that refers to linkedEntryPos.
// A link entry reuses the target terminal id field to hold the linked position; the
// remaining attributes are filled with NOT_A_PROBABILITY, NOT_A_TIMESTAMP and zero
// level/count.
// Returns the result of writeBigramEntryAttributesAndAdvancePosition().
bool BigramDictContent::writeLink(const int linkedEntryPos, const int writingPos) {
    // The attribute writer advances the position it is given, so work on a local copy.
    int cursor = writingPos;
    return writeBigramEntryAttributesAndAdvancePosition(true /* isLink */,
            NOT_A_PROBABILITY /* probability */, linkedEntryPos /* targetTerminalId */,
            NOT_A_TIMESTAMP, 0 /* level */, 0 /* count */, &cursor);
}
bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
@ -171,16 +182,15 @@ bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *
bool BigramDictContent::runGCBigramList(const int bigramListPos,
const BigramDictContent *const sourceBigramDictContent, const int toPos,
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
int *const outEntrycount) {
int *const outEntryCount) {
bool hasNext = true;
int readingPos = bigramListPos;
int writingPos = toPos;
int lastEntryPos = NOT_A_DICT_POS;
while (hasNext) {
const BigramEntry originalBigramEntry =
sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
hasNext = originalBigramEntry.hasNext();
if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) {
if (!originalBigramEntry.isValid()) {
continue;
}
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
@ -189,21 +199,17 @@ bool BigramDictContent::runGCBigramList(const int bigramListPos,
// Target word has been removed.
continue;
}
lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS;
const BigramEntry updatedBigramEntry =
originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second);
if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) {
AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos);
return false;
}
*outEntrycount += 1;
*outEntryCount += 1;
}
if (lastEntryPos != NOT_A_DICT_POS) {
// Update has next flag in the last written entry.
const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry(
false /* hasNext */);
if (!writeBigramEntry(&bigramEntry, lastEntryPos)) {
AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos);
if (*outEntryCount > 0) {
if (!writeTerminator(writingPos)) {
AKLOGE("Cannot write terminator to run GC. pos: %d", writingPos);
return false;
}
}

View File

@ -42,6 +42,10 @@ class BigramDictContent : public SparseTableDictContent {
Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
mHasHistoricalInfo(hasHistoricalInfo) {}
// Returns the current tail position of the content buffer.
int getContentTailPos() const {
    const int tailPos = getContentBuffer()->getTailPosition();
    return tailPos;
}
const BigramEntry getBigramEntry(const int bigramEntryPos) const {
int readingPos = bigramEntryPos;
return getBigramEntryAndAdvancePosition(&readingPos);
@ -71,13 +75,18 @@ class BigramDictContent : public SparseTableDictContent {
bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite,
int *const entryWritingPos);
// Writes a bigram list terminator at writingPos.
// Returns the result of writeLink().
bool writeTerminator(const int writingPos) {
// Terminator is a link to the invalid position.
return writeLink(INVALID_LINKED_ENTRY_POS, writingPos);
}
bool writeLink(const int linkedPos, const int writingPos);
// Registers the current tail position of the content buffer as the bigram list position
// for terminalId in the address lookup table. This function only records the position;
// it does not write any entry itself.
// Returns the result of the lookup table's set().
bool createNewBigramList(const int terminalId) {
// The tail position is captured before the lookup table is touched.
const int bigramListPos = getContentBuffer()->getTailPosition();
return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos);
}
bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos);
bool flushToFile(const char *const dictPath) const {
return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION,
Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION,
@ -88,17 +97,6 @@ class BigramDictContent : public SparseTableDictContent {
const BigramDictContent *const originalBigramDictContent,
int *const outBigramEntryCount);
// Returns true when pos equals the current tail position of the content buffer.
bool isContentTailPos(const int pos) const {
    const int tailPos = getContentBuffer()->getTailPosition();
    return tailPos == pos;
}
private:
DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
// Builds the flags value for a bigram entry: BIGRAM_HAS_NEXT_MASK when hasNext is set,
// otherwise 0.
int createAndGetBigramFlags(const bool hasNext) const {
    if (!hasNext) {
        return 0;
    }
    return Ver4DictConstants::BIGRAM_HAS_NEXT_MASK;
}
int getBigramEntrySize() const {
if (mHasHistoricalInfo) {
return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
@ -113,6 +111,15 @@ class BigramDictContent : public SparseTableDictContent {
}
}
private:
DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
static const int INVALID_LINKED_ENTRY_POS;
bool writeBigramEntryAttributesAndAdvancePosition(
const bool isLink, const int probability, const int targetTerminalId,
const int timestamp, const int level, const int count, int *const entryWritingPos);
bool runGCBigramList(const int bigramListPos,
const BigramDictContent *const sourceBigramDictContent, const int toPos,
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,

View File

@ -60,7 +60,7 @@ const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID =
(1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1;
const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1;
const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F;
const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80;
const int Ver4DictConstants::BIGRAM_IS_LINK_MASK = 0x80;
const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1;
const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;

View File

@ -57,8 +57,8 @@ class Ver4DictConstants {
static const int BIGRAM_FLAGS_FIELD_SIZE;
static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
static const int INVALID_BIGRAM_TARGET_TERMINAL_ID;
static const int BIGRAM_IS_LINK_MASK;
static const int BIGRAM_PROBABILITY_MASK;
static const int BIGRAM_HAS_NEXT_MASK;
// Used when bigram list has time stamp.
static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE;