From 24af6ed692c7437025e5ebacb41d5ed2c99c3f28 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Wed, 27 Nov 2013 12:14:00 +0900 Subject: [PATCH] Have ver4 support decaying dictionary Bug: 11073222 Change-Id: I7f0002c4743ab3bb1ebaac1bca6e367e6b220010 --- .../bigram/ver4_bigram_list_policy.cpp | 69 ++++++++++++++----- .../bigram/ver4_bigram_list_policy.h | 12 +++- .../v4/content/bigram_dict_content.cpp | 6 +- .../v4/content/bigram_dict_content.h | 14 ++++ .../structure/v4/ver4_patricia_trie_policy.h | 3 +- .../v4/ver4_patricia_trie_writing_helper.cpp | 5 +- .../latin/BinaryDictionaryDecayingTests.java | 69 ++++++++++++++++--- 7 files changed, 142 insertions(+), 36 deletions(-) diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp index 628d3ab38..dc2adb44e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp @@ -20,6 +20,7 @@ #include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -46,10 +47,12 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget if (!mBigramDictContent->createNewBigramList(terminalId)) { return false; } + const int probabilityToWrite = getUpdatedProbability( + NOT_A_PROBABILITY /* originalProbability */, newProbability); // Write an entry. - int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(newProbability, - false /* hasNext */, newTargetTerminalId, &writingPos)) { + const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (!mBigramDictContent->writeBigramEntry(probabilityToWrite, false /* hasNext */, + newTargetTerminalId, writingPos)) { return false; } if (outAddedNewEntry) { @@ -61,19 +64,18 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos); if (entryPosToUpdate != NOT_A_DICT_POS) { // Overwrite existing entry. - int readingPos = entryPosToUpdate; bool hasNext = false; int probability = NOT_A_PROBABILITY; int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - mBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext, - &targetTerminalId, &readingPos); + mBigramDictContent->getBigramEntry(&probability, &hasNext, &targetTerminalId, + entryPosToUpdate); + const int probabilityToWrite = getUpdatedProbability(probability, newProbability); if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID && outAddedNewEntry) { // Reuse invalid entry. *outAddedNewEntry = true; } - int writingPos = entryPosToUpdate; - return mBigramDictContent->writeBigramEntryAndAdvancePosition(newProbability, hasNext, - newTargetTerminalId, &writingPos); + return mBigramDictContent->writeBigramEntry(probabilityToWrite, hasNext, + newTargetTerminalId, entryPosToUpdate); } // Add new entry to the bigram list. @@ -83,7 +85,9 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget } // Write new entry at a head position of the bigram list. int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(newProbability, + const int probabilityToWrite = getUpdatedProbability( + NOT_A_PROBABILITY /* originalProbability */, newProbability); + if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(probabilityToWrite, true /* hasNext */, newTargetTerminalId, &writingPos)) { return false; } @@ -105,20 +109,18 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer // Bigram entry doesn't exist. return false; } - int readingPos = entryPosToUpdate; bool hasNext = false; int probability = NOT_A_PROBABILITY; int originalTargetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - mBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext, - &originalTargetTerminalId, &readingPos); + mBigramDictContent->getBigramEntry(&probability, &hasNext, &originalTargetTerminalId, + entryPosToUpdate); if (targetTerminalId != originalTargetTerminalId) { // Bigram entry doesn't exist. return false; } - int writingPos = entryPosToUpdate; // Remove bigram entry by overwriting target terminal Id. - return mBigramDictContent->writeBigramEntryAndAdvancePosition(probability, hasNext, - Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos); + return mBigramDictContent->writeBigramEntry(probability, hasNext, + Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, entryPosToUpdate); } bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, @@ -143,9 +145,28 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i targetTerminalId); if (targetPtNodePos == NOT_A_DICT_POS) { // Invalidate bigram entry. - int writingPos = entryPos; - return mBigramDictContent->writeBigramEntryAndAdvancePosition(probability, hasNext, - Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos); + if (!mBigramDictContent->writeBigramEntry(probability, hasNext, + Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, entryPos)) { + return false; + } + } else if (mNeedsToDecayWhenUpdating) { + probability = ForgettingCurveUtils::getEncodedProbabilityToSave( + probability, mHeaderPolicy); + if (ForgettingCurveUtils::isValidEncodedProbability(probability)) { + if (!mBigramDictContent->writeBigramEntry(probability, hasNext, targetTerminalId, + entryPos)) { + return false; + } + *outBigramCount += 1; + } else { + // Remove entry. + if (!mBigramDictContent->writeBigramEntry(probability, hasNext, + Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, entryPos)) { + return false; + } + } + } else { + *outBigramCount += 1; } } return true; @@ -192,4 +213,14 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, return invalidEntryPos; } +int Ver4BigramListPolicy::getUpdatedProbability(const int originalProbability, + const int newProbability) const { + if (mNeedsToDecayWhenUpdating) { + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + newProbability); + } else { + return newProbability; + } +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h index 5b7d5b527..ed8bdb84d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h @@ -23,14 +23,18 @@ namespace latinime { class BigramDictContent; +class DictionaryHeaderStructurePolicy; class TerminalPositionLookupTable; class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { public: Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, - const TerminalPositionLookupTable *const terminalPositionLookupTable) + const TerminalPositionLookupTable *const terminalPositionLookupTable, + const DictionaryHeaderStructurePolicy *const headerPolicy, + const bool needsToDecayWhenUpdating) : mBigramDictContent(bigramDictContent), - mTerminalPositionLookupTable(terminalPositionLookupTable) {} + mTerminalPositionLookupTable(terminalPositionLookupTable), + mHeaderPolicy(headerPolicy), mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {} void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const bigramEntryPos) const; @@ -54,8 +58,12 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos) const; + int getUpdatedProbability(const int originalProbability, const int newProbability) const; + BigramDictContent *const mBigramDictContent; const TerminalPositionLookupTable *const mTerminalPositionLookupTable; + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; + const bool mNeedsToDecayWhenUpdating; }; } // namespace latinime #endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index 906687647..4cd96722e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -103,6 +103,7 @@ bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap * return true; } +// Returns whether GC for the bigram list was succeeded or not. bool BigramDictContent::runGCBigramList(const int bigramListPos, const BigramDictContent *const sourceBigramDictContent, const int toPos, const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, @@ -121,9 +122,8 @@ bool BigramDictContent::runGCBigramList(const int bigramListPos, TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->find(targetTerminalId); if (it == terminalIdMap->end()) { - AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", - targetTerminalId, terminalIdMap->size()); - return false; + // Target word has been removed. + continue; } if (!writeBigramEntryAndAdvancePosition(probability, hasNext, it->second, &writingPos)) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h index ec0d756d8..cf380f403 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -38,6 +38,13 @@ class BigramDictContent : public SparseTableDictContent { : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE) {} + void getBigramEntry(int *const outProbability, bool *const outHasNext, + int *const outTargetTerminalId, const int bigramEntryPos) const { + int readingPos = bigramEntryPos; + getBigramEntryAndAdvancePosition(outProbability, outHasNext, outTargetTerminalId, + &readingPos); + } + void getBigramEntryAndAdvancePosition(int *const outProbability, bool *const outHasNext, int *const outTargetTerminalId, int *const bigramEntryPos) const; @@ -50,6 +57,13 @@ class BigramDictContent : public SparseTableDictContent { return addressLookupTable->get(terminalId); } + bool writeBigramEntry(const int probability, const int hasNext, const int targetTerminalId, + const int entryWritingPos) { + int writingPos = entryWritingPos; + return writeBigramEntryAndAdvancePosition(probability, hasNext, targetTerminalId, + &writingPos); + } + bool writeBigramEntryAndAdvancePosition(const int probability, const int hasNext, const int targetTerminalId, int *const entryWritingPos); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 6fe978d0f..3606a2ae9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -43,7 +43,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { false /* usesAdditionalBuffer*/), FormatUtils::VERSION_4), mDictBuffer(mBuffers.get()->getWritableTrieBuffer()), mBigramPolicy(mBuffers.get()->getUpdatableBigramDictContent(), - mBuffers.get()->getTerminalPositionLookupTable()), + mBuffers.get()->getTerminalPositionLookupTable(), &mHeaderPolicy, + mHeaderPolicy.isDecayingDict()), mShortcutPolicy(mBuffers.get()->getShortcutDictContent(), mBuffers.get()->getTerminalPositionLookupTable()), mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index f141d52f5..ea03c72fa 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -83,7 +83,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), mBuffers->getProbabilityDictContent()); Ver4BigramListPolicy bigramPolicy(mBuffers->getUpdatableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable()); + mBuffers->getTerminalPositionLookupTable(), headerPolicy, needsToDecay); Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getShortcutDictContent(), mBuffers->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), @@ -134,7 +134,8 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), buffersToWrite->getProbabilityDictContent()); Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getUpdatableBigramDictContent(), - buffersToWrite->getTerminalPositionLookupTable()); + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy, + false /* needsToDecay */); Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getShortcutDictContent(), buffersToWrite->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 12b1caab1..140ab63b0 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -72,26 +72,63 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } } - private File createEmptyDictionaryAndGetFile(final String filename) throws IOException { - final File file = File.createTempFile(filename, TEST_DICT_FILE_EXTENSION, + private File createEmptyDictionaryAndGetFile(final String dictId, + final int formatVersion) throws IOException { + if (formatVersion == 3) { + return createEmptyVer3DictionaryAndGetFile(dictId); + } else if (formatVersion == 4) { + return createEmptyVer4DictionaryAndGetFile(dictId); + } else { + throw new IOException("Dictionary format version " + formatVersion + + " is not supported."); + } + } + private File createEmptyVer4DictionaryAndGetFile(final String dictId) throws IOException { + final File file = File.createTempFile(dictId, TEST_DICT_FILE_EXTENSION, getContext().getCacheDir()); + file.delete(); + file.mkdir(); Map attributeMap = new HashMap(); attributeMap.put(FormatSpec.FileHeader.SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE, FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); attributeMap.put(FormatSpec.FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE, FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); if (BinaryDictionary.createEmptyDictFile(file.getAbsolutePath(), - 3 /* dictVersion */, attributeMap)) { + 4 /* dictVersion */, attributeMap)) { + return new File(file, FormatSpec.TRIE_FILE_EXTENSION); + } else { + throw new IOException("Empty dictionary " + file.getAbsolutePath() + " " + + FormatSpec.TRIE_FILE_EXTENSION + " cannot be created."); + } + } + + private File createEmptyVer3DictionaryAndGetFile(final String dictId) throws IOException { + final File file = File.createTempFile(dictId, TEST_DICT_FILE_EXTENSION, + getContext().getCacheDir()); + file.delete(); + Map attributeMap = new HashMap(); + attributeMap.put(FormatSpec.FileHeader.SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE, + FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); + attributeMap.put(FormatSpec.FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE, + FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); + if (BinaryDictionary.createEmptyDictFile(file.getAbsolutePath(), 3 /* dictVersion */, + attributeMap)) { return file; } else { - throw new IOException("Empty dictionary cannot be created."); + throw new IOException( + "Empty dictionary " + file.getAbsolutePath() + " cannot be created."); } } public void testAddValidAndInvalidWords() { + testAddValidAndInvalidWords(3 /* formatVersion */); + testAddValidAndInvalidWords(4 /* formatVersion */); + } + + private void testAddValidAndInvalidWords(final int formatVersion) { File dictFile = null; try { - dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); } catch (IOException e) { fail("IOException while writing an initial dictionary : " + e); } @@ -111,7 +148,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); assertTrue(binaryDictionary.isValidWord("b")); - final int unigramProbability = binaryDictionary.getFrequency("a"); binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); assertFalse(binaryDictionary.isValidBigram("a", "b")); binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); @@ -136,9 +172,14 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } public void testDecayingProbability() { + testDecayingProbability(3 /* formatVersion */); + testDecayingProbability(4 /* formatVersion */); + } + + private void testDecayingProbability(final int formatVersion) { File dictFile = null; try { - dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); } catch (IOException e) { fail("IOException while writing an initial dictionary : " + e); } @@ -190,6 +231,11 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } public void testAddManyUnigramsToDecayingDict() { + testAddManyUnigramsToDecayingDict(3 /* formatVersion */); + testAddManyUnigramsToDecayingDict(4 /* formatVersion */); + } + + private void testAddManyUnigramsToDecayingDict(final int formatVersion) { final int unigramCount = 30000; final int unigramTypedCount = 100000; final int codePointSetSize = 50; @@ -198,7 +244,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { File dictFile = null; try { - dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); } catch (IOException e) { fail("IOException while writing an initial dictionary : " + e); } @@ -242,6 +288,11 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } public void testAddManyBigramsToDecayingDict() { + testAddManyBigramsToDecayingDict(3 /* formatVersion */); + testAddManyBigramsToDecayingDict(4 /* formatVersion */); + } + + private void testAddManyBigramsToDecayingDict(final int formatVersion) { final int unigramCount = 5000; final int bigramCount = 30000; final int bigramTypedCount = 100000; @@ -251,7 +302,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { File dictFile = null; try { - dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); } catch (IOException e) { fail("IOException while writing an initial dictionary : " + e); }