Implement Ver4PatriciaTriePolicy::getNextWordAndNextToken.

Bug: 12810574
Change-Id: Idea44f03c477964f58e65fbf2b55e3fcd77a2934
main
Keisuke Kuroyanagi 2014-02-03 22:22:35 +09:00
parent 0251c60b44
commit 941734695b
6 changed files with 148 additions and 6 deletions

View File

@ -357,7 +357,7 @@ public final class BinaryDictionary extends Dictionary {
while (len < MAX_WORD_LENGTH && codePoints[len] != 0) { while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
++len; ++len;
} }
final String word = new String(mOutputCodePoints, 0, len); final String word = new String(codePoints, 0, len);
return new GetNextWordPropertyResult(getWordProperty(word), nextToken); return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
} }

View File

@ -28,6 +28,14 @@ const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 10000
const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
// Records the head position of the visited PtNode when it is a terminal that is not marked as
// deleted. Every other PtNode is ignored. Always returns true.
bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
        const PtNodeParams *const ptNodeParams) {
    const bool isLiveTerminal = ptNodeParams->isTerminal() && !ptNodeParams->isDeleted();
    if (!isLiveTerminal) {
        return true;
    }
    mTerminalPositions->push_back(ptNodeParams->getHeadPos());
    return true;
}
// Visits all PtNodes in post-order depth first manner. // Visits all PtNodes in post-order depth first manner.
// For example, visits c -> b -> y -> x -> a for the following dictionary: // For example, visits c -> b -> y -> x -> a for the following dictionary:
// a _ b _ c // a _ b _ c

View File

@ -59,6 +59,21 @@ class DynamicPtReadingHelper {
DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
}; };
// Traversing event listener that gathers PtNode positions into a caller-supplied vector.
// Only onVisitingPtNode() does any work (it is defined out of line); the other callbacks
// unconditionally return true.
class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
 public:
    // terminalPositions: vector that receives the collected positions. Only the pointer is
    // stored, so the vector has to stay alive while this listener is in use.
    // Declared explicit so that a bare std::vector<int> pointer is never silently converted
    // into a listener.
    explicit TraversePolicyToGetAllTerminalPtNodePositions(
            std::vector<int> *const terminalPositions)
            : mTerminalPositions(terminalPositions) {}

    bool onAscend() { return true; }

    bool onDescend(const int ptNodeArrayPos) { return true; }

    bool onReadingPtNodeArrayTail() { return true; }

    bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);

    // Destination for the collected positions; set once in the constructor.
    std::vector<int> *const mTerminalPositions;
};
DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer, DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer,
const PtNodeReader *const ptNodeReader) const PtNodeReader *const ptNodeReader)
: mIsError(false), mReadingState(), mBuffer(buffer), : mIsError(false), mReadingState(), mBuffer(buffer),

View File

@ -392,10 +392,32 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
historicalInfo->getCount(), &bigrams, &shortcuts); historicalInfo->getCount(), &bigrams, &shortcuts);
} }
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
int *const outCodePoints) { if (token == 0) {
// TODO: Implement. mTerminalPtNodePositionsForIteratingWords.clear();
return 0; DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
&mTerminalPtNodePositionsForIteratingWords);
DynamicPtReadingHelper readingHelper(mDictBuffer, &mNodeReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition());
readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
}
const int terminalPtNodePositionsVectorSize =
static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
AKLOGE("Given token %d is invalid.", token);
return 0;
}
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY;
getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
outCodePoints, &unigramProbability);
const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated.
mTerminalPtNodePositionsForIteratingWords.clear();
return 0;
}
return nextToken;
} }
} // namespace latinime } // namespace latinime

View File

@ -17,6 +17,8 @@
#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H #ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H #define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
#include <vector>
#include "defines.h" #include "defines.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
@ -50,7 +52,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
mWritingHelper(mBuffers.get()), mWritingHelper(mBuffers.get()),
mUnigramCount(mHeaderPolicy->getUnigramCount()), mUnigramCount(mHeaderPolicy->getUnigramCount()),
mBigramCount(mHeaderPolicy->getBigramCount()) {}; mBigramCount(mHeaderPolicy->getBigramCount()),
mTerminalPtNodePositionsForIteratingWords() {};
AK_FORCE_INLINE int getRootPosition() const { AK_FORCE_INLINE int getRootPosition() const {
return 0; return 0;
@ -134,6 +137,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
Ver4PatriciaTrieWritingHelper mWritingHelper; Ver4PatriciaTrieWritingHelper mWritingHelper;
int mUnigramCount; int mUnigramCount;
int mBigramCount; int mBigramCount;
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
}; };
} // namespace latinime } // namespace latinime
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H

View File

@ -971,6 +971,99 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
} }
/**
 * Runs the word iteration test against the version 4 dictionary format.
 */
public void testIterateAllWords() {
    testIterateAllWords(FormatSpec.VERSION4);
}

/**
 * Adds random unigrams and bigrams to a newly created updatable dictionary, then iterates the
 * dictionary with {@code getNextWordProperty()} starting at token 0 until the returned next
 * token is 0. Each reported unigram and bigram probability is compared with the value that
 * was added, and at the end every added word and bigram must have been reported.
 *
 * @param formatVersion the format version used to create the empty test dictionary.
 */
private void testIterateAllWords(final int formatVersion) {
    final int unigramCount = 1000;
    final int bigramCount = 1000;
    final int codePointSetSize = 20;
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    // A word that has not been added must not yield a valid property.
    assertFalse(binaryDictionary.getWordProperty("dummyWord").isValid());

    // Unigrams. A word generated more than once keeps the probability of its last addition.
    final ArrayList<String> addedWords = new ArrayList<String>();
    final HashMap<String, Integer> expectedUnigramProbabilities =
            new HashMap<String, Integer>();
    for (int count = 0; count < unigramCount; ++count) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        final int probability = random.nextInt(0xFF);
        addUnigramWord(binaryDictionary, word, probability);
        if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
        addedWords.add(word);
        expectedUnigramProbabilities.put(word, probability);
    }

    // Bigrams between two words picked at different indices of the added word list.
    final HashMap<String, HashSet<String>> expectedBigramTargets =
            new HashMap<String, HashSet<String>>();
    final HashMap<Pair<String, String>, Integer> expectedBigramProbabilities =
            new HashMap<Pair<String, String>, Integer>();
    final int distinctWordCount = expectedUnigramProbabilities.size();
    for (int count = 0; count < bigramCount; ++count) {
        final int firstIndex = random.nextInt(distinctWordCount);
        final int secondIndex = random.nextInt(distinctWordCount);
        if (firstIndex == secondIndex) {
            continue;
        }
        final String word0 = addedWords.get(firstIndex);
        final String word1 = addedWords.get(secondIndex);
        final int probability = random.nextInt(0xF);
        binaryDictionary.addBigramWords(word0, word1, probability,
                BinaryDictionary.NOT_A_VALID_TIMESTAMP);
        if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
        HashSet<String> targets = expectedBigramTargets.get(word0);
        if (targets == null) {
            targets = new HashSet<String>();
            expectedBigramTargets.put(word0, targets);
        }
        targets.add(word1);
        expectedBigramProbabilities.put(new Pair<String, String>(word0, word1), probability);
    }

    // Entries are removed from these sets as the iteration reports them.
    final HashSet<String> remainingWords = new HashSet<String>(addedWords);
    final HashSet<Pair<String, String>> remainingBigrams =
            new HashSet<Pair<String, String>>(expectedBigramProbabilities.keySet());

    int token = 0;
    do {
        final BinaryDictionary.GetNextWordPropertyResult result =
                binaryDictionary.getNextWordProperty(token);
        final WordProperty wordProperty = result.mWordProperty;
        final String word0 = wordProperty.mCodePoints;
        final int expectedProbability = expectedUnigramProbabilities.get(word0);
        assertEquals(expectedProbability, wordProperty.mProbabilityInfo.mProbability);
        remainingWords.remove(word0);

        final HashSet<String> expectedTargets = expectedBigramTargets.get(word0);
        final int bigramTargetCount = wordProperty.mBigramTargets.size();
        for (int j = 0; j < bigramTargetCount; ++j) {
            final String word1 = wordProperty.mBigramTargets.get(j).mWord;
            assertTrue(expectedTargets.contains(word1));
            final int reportedProbability = wordProperty.mBigramTargets.get(j).mFrequency;
            final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
            final int expectedBigramProbability = expectedBigramProbabilities.get(bigram);
            assertEquals(expectedBigramProbability, reportedProbability);
            remainingBigrams.remove(bigram);
        }
        token = result.mNextToken;
    } while (token != 0);

    assertTrue(remainingWords.isEmpty());
    assertTrue(remainingBigrams.isEmpty());
}
public void testAddShortcuts() { public void testAddShortcuts() {
testAddShortcuts(FormatSpec.VERSION4); testAddShortcuts(FormatSpec.VERSION4);
} }