Merge "Implement Ver4PatriciaTriePolicy::getNextWordAndNextToken."

2014-02-04 07:46:49 +00:00 · 2014-02-04 07:46:49 +00:00 · ffb12e76b8
commit ffb12e76b8
parent 96aee22e7b 941734695b
6 changed files with 148 additions and 6 deletions
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@ -357,7 +357,7 @@ public final class BinaryDictionary extends Dictionary {
        while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
            ++len;
        }
-        final String word = new String(mOutputCodePoints, 0, len);
+        final String word = new String(codePoints, 0, len);
        return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
    }

--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
@ -28,6 +28,14 @@ const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 10000
 const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
 const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;

+bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
+        const PtNodeParams *const ptNodeParams) {
+    if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
+        mTerminalPositions->push_back(ptNodeParams->getHeadPos());
+    }
+    return true;
+}
+
 // Visits all PtNodes in post-order depth first manner.
 // For example, visits c -> b -> y -> x -> a for the following dictionary:
 // a _ b _ c
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
@ -59,6 +59,21 @@ class DynamicPtReadingHelper {
        DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
    };

+    class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
+     public:
+        TraversePolicyToGetAllTerminalPtNodePositions(std::vector<int> *const terminalPositions)
+                : mTerminalPositions(terminalPositions) {}
+        bool onAscend() { return true; }
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+        bool onReadingPtNodeArrayTail() { return true; }
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);
+
+        std::vector<int> *const mTerminalPositions;
+    };
+
    DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer,
            const PtNodeReader *const ptNodeReader)
            : mIsError(false), mReadingState(), mBuffer(buffer),
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@ -392,10 +392,32 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
            historicalInfo->getCount(), &bigrams, &shortcuts);
 }

-int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token,
-        int *const outCodePoints) {
-    // TODO: Implement.
-    return 0;
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
+    if (token == 0) {
+        mTerminalPtNodePositionsForIteratingWords.clear();
+        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+                &mTerminalPtNodePositionsForIteratingWords);
+        DynamicPtReadingHelper readingHelper(mDictBuffer, &mNodeReader);
+        readingHelper.initWithPtNodeArrayPos(getRootPosition());
+        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+    }
+    const int terminalPtNodePositionsVectorSize =
+            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+        AKLOGE("Given token %d is invalid.", token);
+        return 0;
+    }
+    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+    int unigramProbability = NOT_A_PROBABILITY;
+    getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
+            outCodePoints, &unigramProbability);
+    const int nextToken = token + 1;
+    if (nextToken >= terminalPtNodePositionsVectorSize) {
+        // All words have been iterated.
+        mTerminalPtNodePositionsForIteratingWords.clear();
+        return 0;
+    }
+    return nextToken;
 }

 } // namespace latinime
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
@ -17,6 +17,8 @@
 #ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
 #define LATINIME_VER4_PATRICIA_TRIE_POLICY_H

+#include <vector>
+
 #include "defines.h"
 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
 #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
@ -50,7 +52,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
              mWritingHelper(mBuffers.get()),
              mUnigramCount(mHeaderPolicy->getUnigramCount()),
-              mBigramCount(mHeaderPolicy->getBigramCount()) {};
+              mBigramCount(mHeaderPolicy->getBigramCount()),
+              mTerminalPtNodePositionsForIteratingWords() {};

    AK_FORCE_INLINE int getRootPosition() const {
        return 0;
@ -134,6 +137,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
    Ver4PatriciaTrieWritingHelper mWritingHelper;
    int mUnigramCount;
    int mBigramCount;
+    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
 };
 } // namespace latinime
 #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
@ -971,6 +971,99 @@ public class BinaryDictionaryTests extends AndroidTestCase {
        }
    }

+    public void testIterateAllWords() {
+        testIterateAllWords(FormatSpec.VERSION4);
+    }
+
+    private void testIterateAllWords(final int formatVersion) {
+        final long seed = System.currentTimeMillis();
+        final Random random = new Random(seed);
+        final int UNIGRAM_COUNT = 1000;
+        final int BIGRAM_COUNT = 1000;
+        final int codePointSetSize = 20;
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+
+        File dictFile = null;
+        try {
+            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        }
+        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
+                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
+                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+
+        final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
+        assertFalse(invalidWordProperty.isValid());
+
+        final ArrayList<String> words = new ArrayList<String>();
+        final HashMap<String, Integer> wordProbabilitiesToCheckLater =
+                new HashMap<String, Integer>();
+        final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
+        final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
+                new HashMap<Pair<String, String>, Integer>();
+
+        for (int i = 0; i < UNIGRAM_COUNT; i++) {
+            final String word = CodePointUtils.generateWord(random, codePointSet);
+            final int unigramProbability = random.nextInt(0xFF);
+            addUnigramWord(binaryDictionary, word, unigramProbability);
+            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
+                binaryDictionary.flushWithGC();
+            }
+            words.add(word);
+            wordProbabilitiesToCheckLater.put(word, unigramProbability);
+        }
+
+        for (int i = 0; i < BIGRAM_COUNT; i++) {
+            final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
+            final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
+            if (word0Index == word1Index) {
+                continue;
+            }
+            final String word0 = words.get(word0Index);
+            final String word1 = words.get(word1Index);
+            final int bigramProbability = random.nextInt(0xF);
+            binaryDictionary.addBigramWords(word0, word1, bigramProbability,
+                    BinaryDictionary.NOT_A_VALID_TIMESTAMP);
+            if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
+                binaryDictionary.flushWithGC();
+            }
+            if (!bigrams.containsKey(word0)) {
+                final HashSet<String> bigramWord1s = new HashSet<String>();
+                bigrams.put(word0, bigramWord1s);
+            }
+            bigrams.get(word0).add(word1);
+            bigramProbabilitiesToCheckLater.put(
+                    new Pair<String, String>(word0, word1), bigramProbability);
+        }
+
+        final HashSet<String> wordSet = new HashSet<String>(words);
+        final HashSet<Pair<String, String>> bigramSet =
+                new HashSet<Pair<String,String>>(bigramProbabilitiesToCheckLater.keySet());
+        int token = 0;
+        do {
+            final BinaryDictionary.GetNextWordPropertyResult result =
+                    binaryDictionary.getNextWordProperty(token);
+            final WordProperty wordProperty = result.mWordProperty;
+            final String word0 = wordProperty.mCodePoints;
+            assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
+                    wordProperty.mProbabilityInfo.mProbability);
+            wordSet.remove(word0);
+            final HashSet<String> bigramWord1s = bigrams.get(word0);
+            for (int j = 0; j < wordProperty.mBigramTargets.size(); j++) {
+                final String word1 = wordProperty.mBigramTargets.get(j).mWord;
+                assertTrue(bigramWord1s.contains(word1));
+                final int probability = wordProperty.mBigramTargets.get(j).mFrequency;
+                final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
+                assertEquals((int)bigramProbabilitiesToCheckLater.get(bigram), probability);
+                bigramSet.remove(bigram);
+            }
+            token = result.mNextToken;
+        } while (token != 0);
+        assertTrue(wordSet.isEmpty());
+        assertTrue(bigramSet.isEmpty());
+    }
+
    public void testAddShortcuts() {
        testAddShortcuts(FormatSpec.VERSION4);
    }