From 5f88c1e0f1882e9677fb316587b51c31e49b804c Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Mon, 9 Dec 2013 13:05:44 +0900 Subject: [PATCH] Start to support adding shortcuts. Bug: 11073222 Bug: 11956652 Change-Id: Iea81603a140697594cfea4f4939e82cd1d3963ca --- .../shortcut/ver4_shortcut_list_policy.h | 10 +- .../structure/pt_common/pt_node_writer.h | 4 + .../dynamic_patricia_trie_updating_helper.cpp | 8 ++ .../dynamic_patricia_trie_updating_helper.h | 4 + .../v4/content/shortcut_dict_content.h | 8 ++ .../v4/ver4_patricia_trie_node_writer.cpp | 7 + .../v4/ver4_patricia_trie_node_writer.h | 4 + .../v4/ver4_patricia_trie_policy.cpp | 20 ++- .../utils/buffer_with_extendable_buffer.cpp | 2 +- .../dictionary/utils/sparse_table.cpp | 14 +- .../latin/BinaryDictionaryTests.java | 131 +++++++++++++++++- 11 files changed, 201 insertions(+), 11 deletions(-) diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h index b7bd08531..ae863af57 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h @@ -88,11 +88,13 @@ class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); } // Overwrite existing entry. - int writingPos = entryPos; - if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, - codePointCount, probability, true /* hasNext */, &writingPos)) { + bool hasNext = false; + mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, + 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); + if (!mShortcutDictContent->writeShortcutEntry(codePoints, + codePointCount, probability, hasNext, entryPos)) { AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId, - writingPos); + entryPos); return false; } return true; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h index 9957387bf..c7a36e796 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h @@ -74,6 +74,10 @@ class PtNodeWriter { const DictPositionRelocationMap *const dictPositionRelocationMap, int *const outBigramEntryCount) = 0; + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) = 0; + protected: PtNodeWriter() {}; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp index e88938fcc..f12a8995e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp @@ -105,6 +105,14 @@ bool DynamicPatriciaTrieUpdatingHelper::removeBigramWords(const int word0Pos, co return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams); } +bool DynamicPatriciaTrieUpdatingHelper::addShortcutTarget(const int wordPos, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos)); + return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount, + shortcutProbability); +} + bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, const int probability, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h index 799f07b6a..f02635fe2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h @@ -52,6 +52,10 @@ class DynamicPatriciaTrieUpdatingHelper { // Remove a bigram relation from word0Pos to word1Pos. bool removeBigramWords(const int word0Pos, const int word1Pos); + // Add a shortcut target. + bool addShortcutTarget(const int wordPos, const int *const targetCodePoints, + const int targetCodePointCount, const int shortcutProbability); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h index a4f817e28..670e6eab6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h @@ -38,6 +38,14 @@ class ShortcutDictContent : public SparseTableDictContent { : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, int *const outCodePoint, int *const outCodePointCount, int *const outProbability, bool *const outhasNext, int *const shortcutEntryPos) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index f6ea3b731..92bec30f5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -217,6 +217,13 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( return true; } +bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + return mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), + targetCodePoints, targetCodePointCount, shortcutProbability); +} + bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( const PtNodeParams *const ptNodeParams, int *const outTerminalId, int *const ptNodeWritingPos) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index 31b47c148..4a2a79259 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -83,6 +83,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const DictPositionRelocationMap *const dictPositionRelocationMap, int *const outBigramEntryCount); + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability); + private: DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index d1ba1877c..9c8db3bac 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -155,12 +155,26 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len DynamicPatriciaTrieReadingHelper readingHelper(mDictBuffer, &mNodeReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; - // TODO: Add shortcut. if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord, isBlacklisted, timestamp, &addedNewUnigram)) { if (addedNewUnigram) { mUnigramCount++; } + if (shortcutLength > 0) { + // Add shortcut target. + const int wordPos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (wordPos == NOT_A_DICT_POS) { + AKLOGE("Cannot find terminal PtNode position to add shortcut target."); + return false; + } + if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcutTargetCodePoints, + shortcutLength, shortcutProbability)) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, probability: %d", + wordPos, shortcutLength, shortcutProbability); + return false; + } + } return true; } else { return false; @@ -308,12 +322,12 @@ const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *cons // Fetch shortcut information. std::vector > shortcutTargets; std::vector shortcutProbabilities; - if (ptNodeParams.hasShortcutTargets()) { + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { int shortcutTarget[MAX_WORD_LENGTH]; const ShortcutDictContent *const shortcutDictContent = mBuffers.get()->getShortcutDictContent(); bool hasNext = true; - int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); while (hasNext) { int shortcutTargetLength = 0; int shortcutProbability = NOT_A_PROBABILITY; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp index 4b537da8a..259dae4c6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -43,7 +43,7 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC *pos -= mOriginalBufferSize; } *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( - getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePointCount, pos); + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos); if (readingPosIsInAdditionalBuffer) { *pos += mOriginalBufferSize; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp index 9be35620c..4ad82f9f7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp @@ -44,6 +44,7 @@ bool SparseTable::set(const int id, const uint32_t value) { int tailPos = mIndexTableBuffer->getTailPosition(); while(tailPos < posInIndexTable) { if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { + AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable); return false; } } @@ -51,12 +52,19 @@ bool SparseTable::set(const int id, const uint32_t value) { if (contains(id)) { // The entry is already in the content table. const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); - return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); + if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { + AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value, + getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), + mDataSize); + return false; + } + return true; } // The entry is not in the content table. // Create new entry in the content table. const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { + AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); return false; } // Write a new block that containing the entry to be set. @@ -64,6 +72,8 @@ bool SparseTable::set(const int id, const uint32_t value) { for (int i = 0; i < mBlockSize; ++i) { if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_A_DICT_POS, mDataSize, &writingPos)) { + AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, " + "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); return false; } } @@ -80,7 +90,7 @@ int SparseTable::getPosInIndexTable(const int id) const { int SparseTable::getPosInContentTable(const int id, const int index) const { const int offset = id % mBlockSize; - return (index * mDataSize + offset) * mBlockSize; + return (index * mBlockSize + offset) * mDataSize; } } // namespace latinime diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 03a302b8f..b0ae9240f 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -24,6 +24,7 @@ import android.util.Pair; import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam; import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.UnigramProperty; import java.io.File; @@ -856,7 +857,6 @@ public class BinaryDictionaryTests extends AndroidTestCase { final int unigramProbability = random.nextInt(0xFF); final boolean isNotAWord = random.nextBoolean(); final boolean isBlacklisted = random.nextBoolean(); - // TODO: Add tests for shortcut. // TODO: Add tests for historical info. binaryDictionary.addUnigramWord(word, unigramProbability, null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY, @@ -873,4 +873,133 @@ public class BinaryDictionaryTests extends AndroidTestCase { assertTrue(unigramProperty.mShortcutTargets.isEmpty()); } } + + public void testAddShortcuts() { + testAddShortcuts(4 /* formatVersion */); + } + + private void testAddShortcuts(final int formatVersion) { + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + final int unigramProbability = 100; + final int shortcutProbability = 10; + binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz", + shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, + 0 /* timestamp */); + UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty("aaa"); + assertEquals(1, unigramProperty.mShortcutTargets.size()); + assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord); + assertEquals(shortcutProbability, unigramProperty.mShortcutTargets.get(0).mFrequency); + final int updatedShortcutProbability = 2; + binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz", + updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, + 0 /* timestamp */); + unigramProperty = binaryDictionary.getUnigramProperty("aaa"); + assertEquals(1, unigramProperty.mShortcutTargets.size()); + assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord); + assertEquals(updatedShortcutProbability, + unigramProperty.mShortcutTargets.get(0).mFrequency); + binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy", + shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, + 0 /* timestamp */); + final HashMap shortcutTargets = new HashMap(); + shortcutTargets.put("zzz", updatedShortcutProbability); + shortcutTargets.put("yyy", shortcutProbability); + unigramProperty = binaryDictionary.getUnigramProperty("aaa"); + assertEquals(2, unigramProperty.mShortcutTargets.size()); + for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); + assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency); + shortcutTargets.remove(shortcutTarget.mWord); + } + shortcutTargets.put("zzz", updatedShortcutProbability); + shortcutTargets.put("yyy", shortcutProbability); + binaryDictionary.flushWithGC(); + unigramProperty = binaryDictionary.getUnigramProperty("aaa"); + assertEquals(2, unigramProperty.mShortcutTargets.size()); + for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); + assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency); + shortcutTargets.remove(shortcutTarget.mWord); + } + } + + public void testAddManyShortcuts() { + testAddManyShortcuts(4 /* formatVersion */); + } + + private void testAddManyShortcuts(final int formatVersion) { + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final int UNIGRAM_COUNT = 1000; + final int SHORTCUT_COUNT = 10000; + final int codePointSetSize = 20; + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + + final ArrayList words = new ArrayList(); + final HashMap unigramProbabilities = new HashMap(); + final HashMap> shortcutTargets = + new HashMap>(); + + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + for (int i = 0; i < UNIGRAM_COUNT; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + final int unigramProbability = random.nextInt(0xFF); + addUnigramWord(binaryDictionary, word, unigramProbability); + words.add(word); + unigramProbabilities.put(word, unigramProbability); + if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + } + for (int i = 0; i < SHORTCUT_COUNT; i++) { + final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet); + final int shortcutProbability = random.nextInt(0xF); + final String word = words.get(random.nextInt(words.size())); + final int unigramProbability = unigramProbabilities.get(word); + binaryDictionary.addUnigramWord(word, unigramProbability, shortcutTarget, + shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, + 0 /* timestamp */); + if (shortcutTargets.containsKey(word)) { + final HashMap shortcutTargetsOfWord = shortcutTargets.get(word); + shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability); + } else { + final HashMap shortcutTargetsOfWord = + new HashMap(); + shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability); + shortcutTargets.put(word, shortcutTargetsOfWord); + } + if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + } + + for (final String word : words) { + final UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty(word); + assertEquals((int)unigramProbabilities.get(word), unigramProperty.mProbability); + assertEquals(shortcutTargets.get(word).size(), unigramProperty.mShortcutTargets.size()); + for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + final String targetCodePonts = shortcutTarget.mWord; + assertEquals((int)shortcutTargets.get(word).get(targetCodePonts), + shortcutTarget.mFrequency); + } + } + } }