Merge "Implement Ver4PatriciaTriePolicy::getNextWordAndNextToken."
This commit is contained in:
commit
ffb12e76b8
6 changed files with 148 additions and 6 deletions
|
@ -357,7 +357,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
|
while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
|
||||||
++len;
|
++len;
|
||||||
}
|
}
|
||||||
final String word = new String(mOutputCodePoints, 0, len);
|
final String word = new String(codePoints, 0, len);
|
||||||
return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
|
return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,14 @@ const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 10000
|
||||||
const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
|
const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
|
||||||
const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
|
const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
|
||||||
|
|
||||||
|
// Callback for a visited PtNode. When the node reports isTerminal() and not
// isDeleted(), its head position (getHeadPos()) is appended to the vector that
// mTerminalPositions points to. Deleted or non-terminal nodes are ignored.
// Always returns true.
bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
|
||||||
|
const PtNodeParams *const ptNodeParams) {
|
||||||
|
if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
|
||||||
|
mTerminalPositions->push_back(ptNodeParams->getHeadPos());
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Visits all PtNodes in post-order depth first manner.
|
// Visits all PtNodes in post-order depth first manner.
|
||||||
// For example, visits c -> b -> y -> x -> a for the following dictionary:
|
// For example, visits c -> b -> y -> x -> a for the following dictionary:
|
||||||
// a _ b _ c
|
// a _ b _ c
|
||||||
|
|
|
@ -59,6 +59,21 @@ class DynamicPtReadingHelper {
|
||||||
DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
|
DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// TraversingEventListener implementation that records PtNode positions into a
// vector supplied by the caller. The ascend / descend / array-tail callbacks do
// nothing and return true; onVisitingPtNode is only declared here and its
// definition lives outside this class body.
class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
|
||||||
|
public:
|
||||||
|
// Stores the given pointer as-is; the vector itself is not copied.
// NOTE(review): this single-argument constructor is not marked explicit.
TraversePolicyToGetAllTerminalPtNodePositions(std::vector<int> *const terminalPositions)
|
||||||
|
: mTerminalPositions(terminalPositions) {}
|
||||||
|
bool onAscend() { return true; }
|
||||||
|
// ptNodeArrayPos is not used by this listener.
bool onDescend(const int ptNodeArrayPos) { return true; }
|
||||||
|
bool onReadingPtNodeArrayTail() { return true; }
|
||||||
|
bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);
|
||||||
|
|
||||||
|
// Output vector passed in through the constructor; the pointer itself is const.
std::vector<int> *const mTerminalPositions;
|
||||||
|
};
|
||||||
|
|
||||||
DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer,
|
DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer,
|
||||||
const PtNodeReader *const ptNodeReader)
|
const PtNodeReader *const ptNodeReader)
|
||||||
: mIsError(false), mReadingState(), mBuffer(buffer),
|
: mIsError(false), mReadingState(), mBuffer(buffer),
|
||||||
|
|
|
@ -392,10 +392,32 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
||||||
historicalInfo->getCount(), &bigrams, &shortcuts);
|
historicalInfo->getCount(), &bigrams, &shortcuts);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Side-by-side diff rendering: the removed stub (two-line signature,
// "TODO: Implement.", "return 0;") is interleaved with the added implementation.
//
// Added implementation, as written below:
//  - token == 0 clears mTerminalPtNodePositionsForIteratingWords and refills it by
//    traversing all PtNodes starting from getRootPosition().
//  - token is then used as an index into that vector; a negative or out-of-range
//    token is logged with AKLOGE and 0 is returned.
//  - The entry at that index is passed to
//    getCodePointsAndProbabilityAndReturnCodePointCount together with outCodePoints.
//  - Returns token + 1, or 0 (after clearing the vector) once token + 1 reaches the
//    vector size.
// NOTE(review): 0 is returned both for an invalid token and for the end of the
// iteration, so the return value alone does not distinguish the two.
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token,
|
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
||||||
int *const outCodePoints) {
|
// Token 0 starts a fresh iteration: discard any earlier position list and rebuild it.
if (token == 0) {
|
||||||
// TODO: Implement.
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||||
|
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
||||||
|
&mTerminalPtNodePositionsForIteratingWords);
|
||||||
|
DynamicPtReadingHelper readingHelper(mDictBuffer, &mNodeReader);
|
||||||
|
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||||
|
readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
|
||||||
|
}
|
||||||
|
// size() is converted to int so it can be compared with the signed token.
const int terminalPtNodePositionsVectorSize =
|
||||||
|
static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
|
||||||
|
// Also taken for token 0 when the position list is empty (0 >= 0).
if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
|
||||||
|
AKLOGE("Given token %d is invalid.", token);
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
|
// unigramProbability is handed to the call below but not read afterwards here.
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
|
// NOTE(review): the result of this call is discarded (presumably a code point
// count, judging by the name - confirm against the declaration).
getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
|
||||||
|
outCodePoints, &unigramProbability);
|
||||||
|
const int nextToken = token + 1;
|
||||||
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
|
// All words have been iterated.
|
||||||
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return nextToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||||
#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||||
|
@ -50,7 +52,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
|
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
|
||||||
mWritingHelper(mBuffers.get()),
|
mWritingHelper(mBuffers.get()),
|
||||||
mUnigramCount(mHeaderPolicy->getUnigramCount()),
|
mUnigramCount(mHeaderPolicy->getUnigramCount()),
|
||||||
mBigramCount(mHeaderPolicy->getBigramCount()) {};
|
mBigramCount(mHeaderPolicy->getBigramCount()),
|
||||||
|
mTerminalPtNodePositionsForIteratingWords() {};
|
||||||
|
|
||||||
AK_FORCE_INLINE int getRootPosition() const {
|
AK_FORCE_INLINE int getRootPosition() const {
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -134,6 +137,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
Ver4PatriciaTrieWritingHelper mWritingHelper;
|
Ver4PatriciaTrieWritingHelper mWritingHelper;
|
||||||
int mUnigramCount;
|
int mUnigramCount;
|
||||||
int mBigramCount;
|
int mBigramCount;
|
||||||
|
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||||
|
|
|
@ -971,6 +971,99 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test entry point: delegates to the one-argument overload with FormatSpec.VERSION4.
public void testIterateAllWords() {
|
||||||
|
testIterateAllWords(FormatSpec.VERSION4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates an empty dictionary file for the given format version, adds random unigrams
// and bigrams to it, then walks the dictionary with getNextWordProperty(), starting at
// token 0 and stopping when the returned next token is 0. Every reported word and
// bigram target is checked against the probabilities recorded while adding, and at the
// end every added word and bigram must have been reported.
private void testIterateAllWords(final int formatVersion) {
|
||||||
|
// NOTE(review): the seed is only passed to Random below; it is not logged or put
// into any assertion message in this method.
final long seed = System.currentTimeMillis();
|
||||||
|
final Random random = new Random(seed);
|
||||||
|
final int UNIGRAM_COUNT = 1000;
|
||||||
|
final int BIGRAM_COUNT = 1000;
|
||||||
|
final int codePointSetSize = 20;
|
||||||
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||||
|
|
||||||
|
File dictFile = null;
|
||||||
|
try {
|
||||||
|
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
|
// Nothing has been added yet, so looking up a word must give an invalid property.
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
|
||||||
|
assertFalse(invalidWordProperty.isValid());
|
||||||
|
|
||||||
|
final ArrayList<String> words = new ArrayList<String>();
|
||||||
|
final HashMap<String, Integer> wordProbabilitiesToCheckLater =
|
||||||
|
new HashMap<String, Integer>();
|
||||||
|
final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
|
||||||
|
final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
|
||||||
|
new HashMap<Pair<String, String>, Integer>();
|
||||||
|
|
||||||
|
// Add the unigrams, flushing with GC whenever the dictionary says it needs one.
for (int i = 0; i < UNIGRAM_COUNT; i++) {
|
||||||
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
|
final int unigramProbability = random.nextInt(0xFF);
|
||||||
|
addUnigramWord(binaryDictionary, word, unigramProbability);
|
||||||
|
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
}
|
||||||
|
// If generateWord ever repeats a word, the list holds it twice while the map keeps
// only the most recent probability.
words.add(word);
|
||||||
|
wordProbabilitiesToCheckLater.put(word, unigramProbability);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add bigrams between randomly chosen words; iterations that pick the same index
// twice are skipped, so fewer than BIGRAM_COUNT bigrams may be added.
for (int i = 0; i < BIGRAM_COUNT; i++) {
|
||||||
|
// NOTE(review): the indices are bounded by the map size but used to index the
// words list below.
final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
|
||||||
|
final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
|
||||||
|
if (word0Index == word1Index) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final String word0 = words.get(word0Index);
|
||||||
|
final String word1 = words.get(word1Index);
|
||||||
|
final int bigramProbability = random.nextInt(0xF);
|
||||||
|
binaryDictionary.addBigramWords(word0, word1, bigramProbability,
|
||||||
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
|
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
}
|
||||||
|
if (!bigrams.containsKey(word0)) {
|
||||||
|
final HashSet<String> bigramWord1s = new HashSet<String>();
|
||||||
|
bigrams.put(word0, bigramWord1s);
|
||||||
|
}
|
||||||
|
bigrams.get(word0).add(word1);
|
||||||
|
bigramProbabilitiesToCheckLater.put(
|
||||||
|
new Pair<String, String>(word0, word1), bigramProbability);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entries are removed from these sets as the iteration reports them.
final HashSet<String> wordSet = new HashSet<String>(words);
|
||||||
|
final HashSet<Pair<String, String>> bigramSet =
|
||||||
|
new HashSet<Pair<String,String>>(bigramProbabilitiesToCheckLater.keySet());
|
||||||
|
// Token 0 starts the iteration; a returned next token of 0 ends the loop.
int token = 0;
|
||||||
|
do {
|
||||||
|
final BinaryDictionary.GetNextWordPropertyResult result =
|
||||||
|
binaryDictionary.getNextWordProperty(token);
|
||||||
|
final WordProperty wordProperty = result.mWordProperty;
|
||||||
|
final String word0 = wordProperty.mCodePoints;
|
||||||
|
assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
|
||||||
|
wordProperty.mProbabilityInfo.mProbability);
|
||||||
|
wordSet.remove(word0);
|
||||||
|
// null when no bigram was recorded for word0; only dereferenced inside the loop
// over the reported bigram targets.
final HashSet<String> bigramWord1s = bigrams.get(word0);
|
||||||
|
for (int j = 0; j < wordProperty.mBigramTargets.size(); j++) {
|
||||||
|
final String word1 = wordProperty.mBigramTargets.get(j).mWord;
|
||||||
|
assertTrue(bigramWord1s.contains(word1));
|
||||||
|
final int probability = wordProperty.mBigramTargets.get(j).mFrequency;
|
||||||
|
final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
|
||||||
|
assertEquals((int)bigramProbabilitiesToCheckLater.get(bigram), probability);
|
||||||
|
bigramSet.remove(bigram);
|
||||||
|
}
|
||||||
|
token = result.mNextToken;
|
||||||
|
} while (token != 0);
|
||||||
|
// Both sets are empty only if every added word and bigram was reported.
assertTrue(wordSet.isEmpty());
|
||||||
|
assertTrue(bigramSet.isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
// Unchanged context shown in both diff columns: delegates to the one-argument
// overload, passing FormatSpec.VERSION4.
public void testAddShortcuts() {
|
public void testAddShortcuts() {
|
||||||
testAddShortcuts(FormatSpec.VERSION4);
|
testAddShortcuts(FormatSpec.VERSION4);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue