From 16412cca66a2ae7f9655691776049fd093f49e08 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Fri, 29 Nov 2013 16:07:29 +0900 Subject: [PATCH] Extend unigram probability field to support historical info. Bug: 11073222 Change-Id: I9668db89ae5e90271f3d70c86cea458832275bff --- ...namic_patricia_trie_gc_event_listeners.cpp | 3 +- .../v4/content/probability_dict_content.cpp | 119 ++++++++++++++---- .../v4/content/probability_dict_content.h | 22 +++- .../structure/v4/content/probability_entry.h | 78 ++++++++++++ .../structure/v4/ver4_dict_buffers.h | 5 +- .../structure/v4/ver4_dict_constants.cpp | 2 + .../v4/ver4_patricia_trie_node_reader.cpp | 5 +- .../v4/ver4_patricia_trie_node_writer.cpp | 22 ++-- 8 files changed, 208 insertions(+), 48 deletions(-) create mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp index 22d4f7ac8..c582a6e76 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp @@ -127,7 +127,8 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNo ptNodeParams->getHeadPos(), writingPos)); mValidPtNodeCount++; // Writes current PtNode. - return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos); + return mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(ptNodeParams, + 0 /* timestamp */, &writingPos); } bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp index 6019d765e..0222080d1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp @@ -16,55 +16,72 @@ #include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { -int ProbabilityDictContent::getProbability(const int terminalId) const { +void ProbabilityDictContent::getProbabilityEntry(const int terminalId, + ProbabilityEntry *const outProbabilityEntry) const { if (terminalId < 0 || terminalId >= mSize) { - return NOT_A_PROBABILITY; + outProbabilityEntry->setProbability(0 /* flags */, NOT_A_PROBABILITY); + AKLOGE("Terminal id (%d) is not in the probability dict content. mSize: %d", terminalId, + mSize); + return; + } + const BufferWithExtendableBuffer *const buffer = getBuffer(); + int entryPos = getEntryPos(terminalId); + const int flags = buffer->readUintAndAdvancePosition( + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos); + const int probability = buffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, &entryPos); + if (mHasHistoricalInfo) { + const int timestamp = buffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos); + const int level = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); + const int count = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); + outProbabilityEntry->setProbabilityWithHistricalInfo(flags, probability, timestamp, level, + count); + } else { + outProbabilityEntry->setProbability(flags, probability); } - const int probabilityFieldPos = - getEntryPos(terminalId) + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE; - return getBuffer()->readUint(Ver4DictConstants::PROBABILITY_SIZE, probabilityFieldPos); } -bool ProbabilityDictContent::setProbability(const int terminalId, const int probability) { +bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, + const ProbabilityEntry *const probabilityEntry) { if (terminalId < 0) { return false; } + const int entryPos = getEntryPos(terminalId); if (terminalId >= mSize) { + ProbabilityEntry dummyEntry; // Write new entry. int writingPos = getBuffer()->getTailPosition(); - while (writingPos <= getEntryPos(terminalId)) { - const int dummyFlags = 0; - if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags, - Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { - return false; - } - const int dummyProbability = 0; - if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyProbability, - Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { + while (writingPos <= entryPos) { + // Fulfilling with dummy entries until writingPos. + if (!writeEntry(&dummyEntry, writingPos)) { + AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize); return false; } + writingPos += getEntrySize(); mSize++; } } - const int probabilityWritingPos = getEntryPos(terminalId) - + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE; - return getWritableBuffer()->writeUint(probability, - Ver4DictConstants::PROBABILITY_SIZE, probabilityWritingPos); + return writeEntry(probabilityEntry, entryPos); } bool ProbabilityDictContent::flushToFile(const char *const dictDirPath) const { if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { - ProbabilityDictContent probabilityDictContentToWrite; + ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo); + ProbabilityEntry probabilityEntry; for (int i = 0; i < mSize; ++i) { - if (!probabilityDictContentToWrite.setProbability(i, getProbability(i))) { + getProbabilityEntry(i, &probabilityEntry); + if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i); return false; } } @@ -79,10 +96,12 @@ bool ProbabilityDictContent::runGC( const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, const ProbabilityDictContent *const originalProbabilityDictContent) { mSize = 0; + ProbabilityEntry probabilityEntry; for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); it != terminalIdMap->end(); ++it) { - if (!setProbability(it->second, - originalProbabilityDictContent->getProbability(it->first))) { + originalProbabilityDictContent->getProbabilityEntry(it->first, &probabilityEntry); + if (!setProbabilityEntry(it->second, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); return false; } mSize++; @@ -90,9 +109,55 @@ bool ProbabilityDictContent::runGC( return true; } +int ProbabilityDictContent::getEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE; + } else { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE; + } +} + int ProbabilityDictContent::getEntryPos(const int terminalId) const { - return terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE - + Ver4DictConstants::PROBABILITY_SIZE); + return terminalId * getEntrySize(); +} + +bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, + const int entryPos) { + BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); + int writingPos = entryPos; + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { + AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { + AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); + return false; + } + if (mHasHistoricalInfo) { + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getTimeStamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); + return false; + } + } + return true; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h index 0971ee0e6..7e782728d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h @@ -25,18 +25,23 @@ namespace latinime { +class ProbabilityEntry; + class ProbabilityDictContent : public SingleDictContent { public: - ProbabilityDictContent(const char *const dictDirPath, const bool isUpdatable) + ProbabilityDictContent(const char *const dictDirPath, const bool hasHistoricalInfo, + const bool isUpdatable) : SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), - mSize(getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE - + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE)) {} + mHasHistoricalInfo(hasHistoricalInfo), + mSize(getBuffer()->getTailPosition() / getEntrySize()) {} - ProbabilityDictContent() : mSize(0) {} + ProbabilityDictContent(const bool hasHistoricalInfo) + : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} - int getProbability(const int terminalId) const; + void getProbabilityEntry(const int terminalId, + ProbabilityEntry *const outProbabilityEntry) const; - bool setProbability(const int terminalId, const int probability); + bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); bool flushToFile(const char *const dictDirPath) const; @@ -46,8 +51,13 @@ class ProbabilityDictContent : public SingleDictContent { private: DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); + int getEntrySize() const; + int getEntryPos(const int terminalId) const; + bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); + + bool mHasHistoricalInfo; int mSize; }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h new file mode 100644 index 000000000..95e2e2809 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_ENTRY_H +#define LATINIME_PROBABILITY_ENTRY_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ProbabilityEntry { + public: + ProbabilityEntry() + : mFlags(0), mProbability(NOT_A_PROBABILITY), + mTimestamp(Ver4DictConstants::NOT_A_TIME_STAMP), mLevel(0), mCount(0) {} + + void setProbability(const int flags, const int probability) { + mFlags = flags; + mProbability = probability; + mTimestamp = Ver4DictConstants::NOT_A_TIME_STAMP; + mLevel = 0; + mCount = 0; + } + + void setProbabilityWithHistricalInfo(const int flags, const int probability, + const int timestamp, const int level, const int count) { + mFlags = flags; + mProbability = probability; + mTimestamp = timestamp; + mLevel = level; + mCount = count; + } + + int getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + int getTimeStamp() const { + return mTimestamp; + } + + int getLevel() const { + return mLevel; + } + + int getCount() const { + return mCount; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityEntry); + + int mFlags; + int mProbability; + int mTimestamp; + int mLevel; + int mCount; +}; +} // namespace latinime +#endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h index e67bd2edb..8fdbbedfe 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h @@ -126,7 +126,7 @@ class Ver4DictBuffers { BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), // TODO: Quit using header size. mTerminalPositionLookupTable(dictDirPath, isUpdatable, mHeaderSize), - mProbabilityDictContent(dictDirPath, isUpdatable), + mProbabilityDictContent(dictDirPath, false /* hasHistoricalInfo */, isUpdatable), mBigramDictContent(dictDirPath, isUpdatable), mShortcutDictContent(dictDirPath, isUpdatable), mIsUpdatable(isUpdatable) {} @@ -135,7 +135,8 @@ class Ver4DictBuffers { : mDictBuffer(0), mHeaderSize(0), mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mTerminalPositionLookupTable(), mProbabilityDictContent(), + mTerminalPositionLookupTable(), + mProbabilityDictContent(false /* hasHistoricalInfo */), mBigramDictContent(), mShortcutDictContent(), mIsUpdatable(true) {} const MmappedBuffer::MmappedBufferPtr mDictBuffer; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp index 363215124..457f29667 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp @@ -45,6 +45,8 @@ const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index e9fba79d7..9f7b34d5b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -19,6 +19,7 @@ #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" @@ -59,7 +60,9 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce terminalIdFieldPos += mBuffer->getOriginalBufferSize(); } terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); - probability = mProbabilityDictContent->getProbability(terminalId); + ProbabilityEntry probabilityEntry; + mProbabilityDictContent->getProbabilityEntry(terminalId, &probabilityEntry); + probability = probabilityEntry.getProbability(); } int childrenPosFieldPos = pos; if (usesAdditionalBuffer) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 6a418276f..145eeb08f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -19,6 +19,7 @@ #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h" @@ -119,8 +120,12 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( } const int probabilityToWrite = getUpdatedProbability(toBeUpdatedPtNodeParams->getProbability(), newProbability); - return mBuffers->getUpdatableProbabilityDictContent()->setProbability( - toBeUpdatedPtNodeParams->getTerminalId(), probabilityToWrite); + ProbabilityEntry probabilityEntry; + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); + probabilityEntry.setProbability(probabilityEntry.getFlags(), probabilityToWrite); + return mBuffers->getUpdatableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); } bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( @@ -153,8 +158,10 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( // Write probability. const int probabilityToWrite = getUpdatedProbability(NOT_A_PROBABILITY, ptNodeParams->getProbability()); - return mBuffers->getUpdatableProbabilityDictContent()->setProbability(terminalId, - probabilityToWrite); + ProbabilityEntry probabilityEntry; + probabilityEntry.setProbability(0 /* flags */, probabilityToWrite); + return mBuffers->getUpdatableProbabilityDictContent()->setProbabilityEntry(terminalId, + &probabilityEntry); } bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( @@ -258,13 +265,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { return false; } - // Write probability. - if (ptNodeParams->getProbability() != NOT_A_PROBABILITY) { - if (!mBuffers->getUpdatableProbabilityDictContent()->setProbability( - terminalId, ptNodeParams->getProbability())) { - return false; - } - } if (outTerminalId) { *outTerminalId = terminalId; }