From 941734695b9eeb59135db737e4b153c45e88247a Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Mon, 3 Feb 2014 22:22:35 +0900 Subject: [PATCH] Implement Ver4PatriciaTriePolicy::getNextWordAndNextToken. Bug: 12810574 Change-Id: Idea44f03c477964f58e65fbf2b55e3fcd77a2934 --- .../inputmethod/latin/BinaryDictionary.java | 2 +- .../pt_common/dynamic_pt_reading_helper.cpp | 8 ++ .../pt_common/dynamic_pt_reading_helper.h | 15 +++ .../v4/ver4_patricia_trie_policy.cpp | 30 +++++- .../structure/v4/ver4_patricia_trie_policy.h | 6 +- .../latin/BinaryDictionaryTests.java | 93 +++++++++++++++++++ 6 files changed, 148 insertions(+), 6 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index 00eb57c9f..bdf89450f 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -357,7 +357,7 @@ public final class BinaryDictionary extends Dictionary { while (len < MAX_WORD_LENGTH && codePoints[len] != 0) { ++len; } - final String word = new String(mOutputCodePoints, 0, len); + final String word = new String(codePoints, 0, len); return new GetNextWordPropertyResult(getWordProperty(word), nextToken); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp index b918e0765..824d442e4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp @@ -28,6 +28,14 @@ const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 10000 const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; +bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode( + const PtNodeParams *const ptNodeParams) { + if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) { + mTerminalPositions->push_back(ptNodeParams->getHeadPos()); + } + return true; +} + // Visits all PtNodes in post-order depth first manner. // For example, visits c -> b -> y -> x -> a for the following dictionary: // a _ b _ c diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h index a69490943..bcc5c7857 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h @@ -59,6 +59,21 @@ class DynamicPtReadingHelper { DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); }; + class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener { + public: + TraversePolicyToGetAllTerminalPtNodePositions(std::vector *const terminalPositions) + : mTerminalPositions(terminalPositions) {} + bool onAscend() { return true; } + bool onDescend(const int ptNodeArrayPos) { return true; } + bool onReadingPtNodeArrayTail() { return true; } + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions); + + std::vector *const mTerminalPositions; + }; + DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer, const PtNodeReader *const ptNodeReader) : mIsError(false), mReadingState(), mBuffer(buffer), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 1c420e070..75d85988c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -392,10 +392,32 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code historicalInfo->getCount(), &bigrams, &shortcuts); } -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, - int *const outCodePoints) { - // TODO: Implement. - return 0; +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(mDictBuffer, &mNodeReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + int unigramProbability = NOT_A_PROBABILITY; + getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, + outCodePoints, &unigramProbability); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 1bcd4ceea..9ba5be0c3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -17,6 +17,8 @@ #ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H #define LATINIME_VER4_PATRICIA_TRIE_POLICY_H +#include + #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" @@ -50,7 +52,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()) {}; + mBigramCount(mHeaderPolicy->getBigramCount()), + mTerminalPtNodePositionsForIteratingWords() {}; AK_FORCE_INLINE int getRootPosition() const { return 0; @@ -134,6 +137,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { Ver4PatriciaTrieWritingHelper mWritingHelper; int mUnigramCount; int mBigramCount; + std::vector mTerminalPtNodePositionsForIteratingWords; }; } // namespace latinime #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index e39b46f94..bab86e546 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -971,6 +971,99 @@ public class BinaryDictionaryTests extends AndroidTestCase { } } + public void testIterateAllWords() { + testIterateAllWords(FormatSpec.VERSION4); + } + + private void testIterateAllWords(final int formatVersion) { + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final int UNIGRAM_COUNT = 1000; + final int BIGRAM_COUNT = 1000; + final int codePointSetSize = 20; + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord"); + assertFalse(invalidWordProperty.isValid()); + + final ArrayList words = new ArrayList(); + final HashMap wordProbabilitiesToCheckLater = + new HashMap(); + final HashMap> bigrams = new HashMap>(); + final HashMap, Integer> bigramProbabilitiesToCheckLater = + new HashMap, Integer>(); + + for (int i = 0; i < UNIGRAM_COUNT; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + final int unigramProbability = random.nextInt(0xFF); + addUnigramWord(binaryDictionary, word, unigramProbability); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + words.add(word); + wordProbabilitiesToCheckLater.put(word, unigramProbability); + } + + for (int i = 0; i < BIGRAM_COUNT; i++) { + final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size()); + final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size()); + if (word0Index == word1Index) { + continue; + } + final String word0 = words.get(word0Index); + final String word1 = words.get(word1Index); + final int bigramProbability = random.nextInt(0xF); + binaryDictionary.addBigramWords(word0, word1, bigramProbability, + BinaryDictionary.NOT_A_VALID_TIMESTAMP); + if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + if (!bigrams.containsKey(word0)) { + final HashSet bigramWord1s = new HashSet(); + bigrams.put(word0, bigramWord1s); + } + bigrams.get(word0).add(word1); + bigramProbabilitiesToCheckLater.put( + new Pair(word0, word1), bigramProbability); + } + + final HashSet wordSet = new HashSet(words); + final HashSet> bigramSet = + new HashSet>(bigramProbabilitiesToCheckLater.keySet()); + int token = 0; + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + final String word0 = wordProperty.mCodePoints; + assertEquals((int)wordProbabilitiesToCheckLater.get(word0), + wordProperty.mProbabilityInfo.mProbability); + wordSet.remove(word0); + final HashSet bigramWord1s = bigrams.get(word0); + for (int j = 0; j < wordProperty.mBigramTargets.size(); j++) { + final String word1 = wordProperty.mBigramTargets.get(j).mWord; + assertTrue(bigramWord1s.contains(word1)); + final int probability = wordProperty.mBigramTargets.get(j).mFrequency; + final Pair bigram = new Pair(word0, word1); + assertEquals((int)bigramProbabilitiesToCheckLater.get(bigram), probability); + bigramSet.remove(bigram); + } + token = result.mNextToken; + } while (token != 0); + assertTrue(wordSet.isEmpty()); + assertTrue(bigramSet.isEmpty()); + } + public void testAddShortcuts() { testAddShortcuts(FormatSpec.VERSION4); }