Merge "Implement Ver4PatriciaTriePolicy::getNextWordAndNextToken."
This commit is contained in:
commit
ffb12e76b8
6 changed files with 148 additions and 6 deletions
|
@ -357,7 +357,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
while (len < MAX_WORD_LENGTH && codePoints[len] != 0) {
|
||||
++len;
|
||||
}
|
||||
final String word = new String(mOutputCodePoints, 0, len);
|
||||
final String word = new String(codePoints, 0, len);
|
||||
return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,14 @@ const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 10000
|
|||
const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
|
||||
const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
|
||||
|
||||
bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
|
||||
const PtNodeParams *const ptNodeParams) {
|
||||
if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
|
||||
mTerminalPositions->push_back(ptNodeParams->getHeadPos());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Visits all PtNodes in post-order depth first manner.
|
||||
// For example, visits c -> b -> y -> x -> a for the following dictionary:
|
||||
// a _ b _ c
|
||||
|
|
|
@ -59,6 +59,21 @@ class DynamicPtReadingHelper {
|
|||
DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
|
||||
};
|
||||
|
||||
class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
|
||||
public:
|
||||
TraversePolicyToGetAllTerminalPtNodePositions(std::vector<int> *const terminalPositions)
|
||||
: mTerminalPositions(terminalPositions) {}
|
||||
bool onAscend() { return true; }
|
||||
bool onDescend(const int ptNodeArrayPos) { return true; }
|
||||
bool onReadingPtNodeArrayTail() { return true; }
|
||||
bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);
|
||||
|
||||
std::vector<int> *const mTerminalPositions;
|
||||
};
|
||||
|
||||
DynamicPtReadingHelper(const BufferWithExtendableBuffer *const buffer,
|
||||
const PtNodeReader *const ptNodeReader)
|
||||
: mIsError(false), mReadingState(), mBuffer(buffer),
|
||||
|
|
|
@ -392,10 +392,32 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
|||
historicalInfo->getCount(), &bigrams, &shortcuts);
|
||||
}
|
||||
|
||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token,
|
||||
int *const outCodePoints) {
|
||||
// TODO: Implement.
|
||||
return 0;
|
||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
||||
if (token == 0) {
|
||||
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
||||
&mTerminalPtNodePositionsForIteratingWords);
|
||||
DynamicPtReadingHelper readingHelper(mDictBuffer, &mNodeReader);
|
||||
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||
readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
|
||||
}
|
||||
const int terminalPtNodePositionsVectorSize =
|
||||
static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
|
||||
if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
|
||||
AKLOGE("Given token %d is invalid.", token);
|
||||
return 0;
|
||||
}
|
||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||
int unigramProbability = NOT_A_PROBABILITY;
|
||||
getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
|
||||
outCodePoints, &unigramProbability);
|
||||
const int nextToken = token + 1;
|
||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||
// All words have been iterated.
|
||||
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||
return 0;
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||
#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||
|
@ -50,7 +52,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
|
||||
mWritingHelper(mBuffers.get()),
|
||||
mUnigramCount(mHeaderPolicy->getUnigramCount()),
|
||||
mBigramCount(mHeaderPolicy->getBigramCount()) {};
|
||||
mBigramCount(mHeaderPolicy->getBigramCount()),
|
||||
mTerminalPtNodePositionsForIteratingWords() {};
|
||||
|
||||
AK_FORCE_INLINE int getRootPosition() const {
|
||||
return 0;
|
||||
|
@ -134,6 +137,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
Ver4PatriciaTrieWritingHelper mWritingHelper;
|
||||
int mUnigramCount;
|
||||
int mBigramCount;
|
||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||
|
|
|
@ -971,6 +971,99 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testIterateAllWords() {
|
||||
testIterateAllWords(FormatSpec.VERSION4);
|
||||
}
|
||||
|
||||
private void testIterateAllWords(final int formatVersion) {
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final int UNIGRAM_COUNT = 1000;
|
||||
final int BIGRAM_COUNT = 1000;
|
||||
final int codePointSetSize = 20;
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
|
||||
File dictFile = null;
|
||||
try {
|
||||
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
|
||||
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
|
||||
assertFalse(invalidWordProperty.isValid());
|
||||
|
||||
final ArrayList<String> words = new ArrayList<String>();
|
||||
final HashMap<String, Integer> wordProbabilitiesToCheckLater =
|
||||
new HashMap<String, Integer>();
|
||||
final HashMap<String, HashSet<String>> bigrams = new HashMap<String, HashSet<String>>();
|
||||
final HashMap<Pair<String, String>, Integer> bigramProbabilitiesToCheckLater =
|
||||
new HashMap<Pair<String, String>, Integer>();
|
||||
|
||||
for (int i = 0; i < UNIGRAM_COUNT; i++) {
|
||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||
final int unigramProbability = random.nextInt(0xFF);
|
||||
addUnigramWord(binaryDictionary, word, unigramProbability);
|
||||
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
words.add(word);
|
||||
wordProbabilitiesToCheckLater.put(word, unigramProbability);
|
||||
}
|
||||
|
||||
for (int i = 0; i < BIGRAM_COUNT; i++) {
|
||||
final int word0Index = random.nextInt(wordProbabilitiesToCheckLater.size());
|
||||
final int word1Index = random.nextInt(wordProbabilitiesToCheckLater.size());
|
||||
if (word0Index == word1Index) {
|
||||
continue;
|
||||
}
|
||||
final String word0 = words.get(word0Index);
|
||||
final String word1 = words.get(word1Index);
|
||||
final int bigramProbability = random.nextInt(0xF);
|
||||
binaryDictionary.addBigramWords(word0, word1, bigramProbability,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
if (!bigrams.containsKey(word0)) {
|
||||
final HashSet<String> bigramWord1s = new HashSet<String>();
|
||||
bigrams.put(word0, bigramWord1s);
|
||||
}
|
||||
bigrams.get(word0).add(word1);
|
||||
bigramProbabilitiesToCheckLater.put(
|
||||
new Pair<String, String>(word0, word1), bigramProbability);
|
||||
}
|
||||
|
||||
final HashSet<String> wordSet = new HashSet<String>(words);
|
||||
final HashSet<Pair<String, String>> bigramSet =
|
||||
new HashSet<Pair<String,String>>(bigramProbabilitiesToCheckLater.keySet());
|
||||
int token = 0;
|
||||
do {
|
||||
final BinaryDictionary.GetNextWordPropertyResult result =
|
||||
binaryDictionary.getNextWordProperty(token);
|
||||
final WordProperty wordProperty = result.mWordProperty;
|
||||
final String word0 = wordProperty.mCodePoints;
|
||||
assertEquals((int)wordProbabilitiesToCheckLater.get(word0),
|
||||
wordProperty.mProbabilityInfo.mProbability);
|
||||
wordSet.remove(word0);
|
||||
final HashSet<String> bigramWord1s = bigrams.get(word0);
|
||||
for (int j = 0; j < wordProperty.mBigramTargets.size(); j++) {
|
||||
final String word1 = wordProperty.mBigramTargets.get(j).mWord;
|
||||
assertTrue(bigramWord1s.contains(word1));
|
||||
final int probability = wordProperty.mBigramTargets.get(j).mFrequency;
|
||||
final Pair<String, String> bigram = new Pair<String, String>(word0, word1);
|
||||
assertEquals((int)bigramProbabilitiesToCheckLater.get(bigram), probability);
|
||||
bigramSet.remove(bigram);
|
||||
}
|
||||
token = result.mNextToken;
|
||||
} while (token != 0);
|
||||
assertTrue(wordSet.isEmpty());
|
||||
assertTrue(bigramSet.isEmpty());
|
||||
}
|
||||
|
||||
public void testAddShortcuts() {
|
||||
testAddShortcuts(FormatSpec.VERSION4);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue