Merge "Implement PatriciaTriePolicy::getWordProperty()."

This commit is contained in:
Keisuke Kuroyanagi 2014-02-14 09:08:09 +00:00 committed by Android (Google) Code Review
commit 8fa7a09f1e
3 changed files with 97 additions and 4 deletions

View file

@ -20,6 +20,7 @@
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
@ -303,4 +304,63 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
return siblingPos;
}
// Builds the WordProperty describing the word given by codePoints / codePointCount.
// If the word has no terminal PtNode in this dictionary, an error is logged and a
// default-constructed WordProperty is returned. Otherwise the result carries the code points,
// flags and probability read from the word's PtNode together with its bigram and shortcut
// entries. Timestamp, level and count are always reported as NOT_A_TIMESTAMP, 0 and 0.
const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints,
        const int codePointCount) const {
    const int terminalPos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
            false /* forceLowerCaseSearch */);
    if (terminalPos == NOT_A_DICT_POS) {
        AKLOGE("getWordProperty was called for invalid word.");
        return WordProperty();
    }
    const PtNodeParams nodeParams =
            mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(terminalPos);
    std::vector<int> wordCodePoints(nodeParams.getCodePoints(),
            nodeParams.getCodePoints() + nodeParams.getCodePointCount());

    // Collect one BigramProperty per live entry of the word's bigram list.
    std::vector<WordProperty::BigramProperty> bigramProperties;
    const int bigramsPos = getBigramsPositionOfPtNode(terminalPos);
    int targetWordBuffer[MAX_WORD_LENGTH];
    BinaryDictionaryBigramsIterator bigramIterator(getBigramsStructurePolicy(), bigramsPos);
    while (bigramIterator.hasNext()) {
        // next() loads the current entry and moves the iterator forward.
        bigramIterator.next();
        const int targetWordPos = bigramIterator.getBigramPos();
        if (targetWordPos == NOT_A_DICT_POS) {
            // Deleted entry; nothing to report. This never happens for ver2 dicts.
            continue;
        }
        // The helper requires a probability out-param, but only the code points are used here;
        // the reported probability is the one stored in the bigram entry itself.
        int targetWordProbability = NOT_A_PROBABILITY;
        const int targetWordLength = getCodePointsAndProbabilityAndReturnCodePointCount(
                targetWordPos, MAX_WORD_LENGTH, targetWordBuffer, &targetWordProbability);
        std::vector<int> targetWord(targetWordBuffer, targetWordBuffer + targetWordLength);
        bigramProperties.push_back(WordProperty::BigramProperty(&targetWord,
                bigramIterator.getProbability(), NOT_A_TIMESTAMP /* timestamp */,
                0 /* level */, 0 /* count */));
    }

    // Collect one ShortcutProperty per entry of the word's shortcut list, if it has one.
    std::vector<WordProperty::ShortcutProperty> shortcutProperties;
    int shortcutReadPos = getShortcutPositionOfPtNode(terminalPos);
    if (shortcutReadPos != NOT_A_DICT_POS) {
        int shortcutBuffer[MAX_WORD_LENGTH];
        // The list size itself is not needed; this call only advances past the size field.
        ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot,
                &shortcutReadPos);
        bool moreEntries;
        do {
            // Each read below advances shortcutReadPos, so the order must be flags, then target.
            const ShortcutListReadingUtils::ShortcutFlags entryFlags =
                    ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot,
                            &shortcutReadPos);
            moreEntries = ShortcutListReadingUtils::hasNext(entryFlags);
            const int targetLength = ShortcutListReadingUtils::readShortcutTarget(
                    mDictRoot, MAX_WORD_LENGTH, shortcutBuffer, &shortcutReadPos);
            std::vector<int> target(shortcutBuffer, shortcutBuffer + targetLength);
            shortcutProperties.push_back(WordProperty::ShortcutProperty(&target,
                    ShortcutListReadingUtils::getProbabilityFromFlags(entryFlags)));
        } while (moreEntries);
    }

    return WordProperty(&wordCodePoints, nodeParams.isNotAWord(),
            nodeParams.isBlacklisted(), nodeParams.hasBigrams(),
            nodeParams.hasShortcutTargets(), nodeParams.getProbability(),
            NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */,
            &bigramProperties, &shortcutProperties);
}
} // namespace latinime

View file

@ -128,10 +128,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
}
const WordProperty getWordProperty(const int *const codePoints,
const int codePointCount) const {
// getWordProperty is not supported.
return WordProperty();
}
const int codePointCount) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints) {
// getNextWordAndNextToken is not supported.

View file

@ -39,6 +39,7 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
@ -596,4 +597,39 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
Log.d(TAG, result);
}
}
/**
 * Writes a version 2 dictionary built from the shared word and shortcut fixtures to a file,
 * opens it as a {@link BinaryDictionary} and checks that getWordProperty() reports, for every
 * word, the word itself, the unigram probability and exactly the expected shortcut targets.
 */
public void testVer2DictGetWordProperty() {
    final FormatOptions formatOptions = BinaryDictUtils.VERSION2_OPTIONS;
    final ArrayList<String> words = sWords;
    final HashMap<String, List<String>> shortcuts = sShortcuts;
    final String dictName = "testGetWordProperty";
    final String dictVersion = Long.toString(System.currentTimeMillis());
    final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
            BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
    addUnigrams(words.size(), dict, words, shortcuts);
    addBigrams(dict, words, sEmptyBigrams);
    final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions,
            getContext().getCacheDir());
    // Remove any stale file with the same name before writing the dictionary.
    file.delete();
    timeWritingDictToFile(file, dict, formatOptions);
    final BinaryDictionary binaryDictionary = new BinaryDictionary(file.getAbsolutePath(),
            0 /* offset */, file.length(), true /* useFullEditDistance */,
            Locale.ENGLISH, dictName, false /* isUpdatable */);
    for (final String word : words) {
        final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
        assertEquals(word, wordProperty.mWord);
        assertEquals(UNIGRAM_FREQ, wordProperty.getProbability());
        if (shortcuts.containsKey(word)) {
            // Work on a private copy: the expected targets are ticked off by removing them,
            // and removing from the list held by the shared fixture map would corrupt it for
            // every test that runs afterwards.
            final List<String> remainingTargets = new ArrayList<String>(shortcuts.get(word));
            assertEquals(remainingTargets.size(), wordProperty.mShortcutTargets.size());
            assertTrue(wordProperty.mHasShortcuts);
            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
                assertTrue(remainingTargets.contains(shortcutTarget.mWord));
                assertEquals(UNIGRAM_FREQ, shortcutTarget.getProbability());
                remainingTargets.remove(shortcutTarget.mWord);
            }
            // Every expected target must have been reported by the dictionary.
            assertTrue(remainingTargets.isEmpty());
        }
    }
}
}