// patricia_trie_policy.cpp (527 lines, 27 KiB, C++)
/*
|
|
* Copyright (C) 2013, The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h"
|
|
|
|
#include "defines.h"
|
|
#include "suggest/core/dicnode/dic_node.h"
|
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
|
#include "suggest/core/dictionary/multi_bigram_map.h"
|
|
#include "suggest/core/dictionary/ngram_listener.h"
|
|
#include "suggest/core/session/ngram_context.h"
|
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
|
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
|
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
|
#include "utils/char_utils.h"
|
|
|
|
namespace latinime {
|
|
|
|
void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
|
|
DicNodeVector *const childDicNodes) const {
|
|
if (!dicNode->hasChildren()) {
|
|
return;
|
|
}
|
|
int nextPos = dicNode->getChildrenPtNodeArrayPos();
|
|
if (!isValidPos(nextPos)) {
|
|
AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
|
|
nextPos, mBuffer.size());
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return;
|
|
}
|
|
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
|
|
mBuffer.data(), &nextPos);
|
|
for (int i = 0; i < childCount; i++) {
|
|
if (!isValidPos(nextPos)) {
|
|
AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
|
|
nextPos, mBuffer.size(), i, childCount);
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return;
|
|
}
|
|
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
|
|
}
|
|
}
|
|
|
|
int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
|
|
const int maxCodePointCount, int *const outCodePoints) const {
|
|
return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
|
|
outCodePoints, nullptr /* outUnigramProbability */);
|
|
}
|
|
// This retrieves code points and the probability of the word by its id.
|
|
// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
|
|
// it is possible to check for this with advantageous complexity. For each PtNode array, we search
|
|
// for PtNodes with children and compare the children position with the position we look for.
|
|
// When we shoot the position we look for, it means the word we look for is in the children
|
|
// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
|
|
// PtNode array with the last PtNode's children position still less than what we are searching for,
|
|
// we must descend the last PtNode's children (for example, if the word we are searching for starts
|
|
// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
|
|
// than the position we look for, and we have to descend the z PtNode).
|
|
/* Parameters :
|
|
* wordId: Id of the word we are searching for.
|
|
* outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
|
|
* outUnigramProbability: a pointer to an int to write the probability into.
|
|
* Return value : the code point count, or 0 if the word was not found.
|
|
*/
|
|
// TODO: Split this function to be more readable
|
|
int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
|
int *const outUnigramProbability) const {
|
|
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
|
int pos = getRootPosition();
|
|
int wordPos = 0;
|
|
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
|
if (outUnigramProbability) {
|
|
*outUnigramProbability = NOT_A_PROBABILITY;
|
|
}
|
|
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
|
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
|
|
// time we enter this loop we are one depth level further than last time.
|
|
// The only reason we count PtNodes is because we want to reduce the probability of infinite
|
|
// looping in case there is a bug. Since we know there is an upper bound to the depth we are
|
|
// supposed to traverse, it does not hurt to count iterations.
|
|
for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
|
|
int lastCandidatePtNodePos = 0;
|
|
// Let's loop through PtNodes in this PtNode array searching for either the terminal
|
|
// or one of its ascendants.
|
|
if (!isValidPos(pos)) {
|
|
AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
|
|
pos, mBuffer.size());
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return 0;
|
|
}
|
|
for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
|
|
mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
|
|
const int startPos = pos;
|
|
if (!isValidPos(pos)) {
|
|
AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return 0;
|
|
}
|
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
|
|
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &pos);
|
|
if (ptNodePos == startPos) {
|
|
// We found the position. Copy the rest of the code points in the buffer and return
|
|
// the length.
|
|
outCodePoints[wordPos] = character;
|
|
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
|
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &pos);
|
|
// We count code points in order to avoid infinite loops if the file is broken
|
|
// or if there is some other bug
|
|
int charCount = maxCodePointCount;
|
|
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
|
|
outCodePoints[++wordPos] = nextChar;
|
|
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &pos);
|
|
}
|
|
}
|
|
if (outUnigramProbability) {
|
|
*outUnigramProbability =
|
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
|
|
mBuffer.data(), &pos);
|
|
}
|
|
return ++wordPos;
|
|
}
|
|
// We need to skip past this PtNode, so skip any remaining code points after the
|
|
// first and possibly the probability.
|
|
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
|
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
|
|
codePointTable, &pos);
|
|
}
|
|
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
|
|
}
|
|
// The fact that this PtNode has children is very important. Since we already know
|
|
// that this PtNode does not match, if it has no children we know it is irrelevant
|
|
// to what we are searching for.
|
|
const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
|
|
// We will write in `found' whether we have passed the children position we are
|
|
// searching for. For example if we search for "beer", the children of b are less
|
|
// than the address we are searching for and the children of c are greater. When we
|
|
// come here for c, we realize this is too big, and that we should descend b.
|
|
bool found;
|
|
if (hasChildren) {
|
|
int currentPos = pos;
|
|
// Here comes the tricky part. First, read the children position.
|
|
const int childrenPos = PatriciaTrieReadingUtils
|
|
::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
|
|
¤tPos);
|
|
if (childrenPos > ptNodePos) {
|
|
// If the children pos is greater than the position, it means the previous
|
|
// PtNode, which position is stored in lastCandidatePtNodePos, was the right
|
|
// one.
|
|
found = true;
|
|
} else if (1 >= ptNodeCount) {
|
|
// However if we are on the LAST PtNode of this array, and we have NOT shot the
|
|
// position we should descend THIS PtNode. So we trick the
|
|
// lastCandidatePtNodePos so that we will descend this PtNode, not the previous
|
|
// one.
|
|
lastCandidatePtNodePos = startPos;
|
|
found = true;
|
|
} else {
|
|
// Else, we should continue looking.
|
|
found = false;
|
|
}
|
|
} else {
|
|
// Even if we don't have children here, we could still be on the last PtNode of
|
|
// this array. If this is the case, we should descend the last PtNode that had
|
|
// children, and their position is already in lastCandidatePtNodePos.
|
|
found = (1 >= ptNodeCount);
|
|
}
|
|
|
|
if (found) {
|
|
// Okay, we found the PtNode we should descend. Its position is in
|
|
// the lastCandidatePtNodePos variable, so we just re-read it.
|
|
if (0 != lastCandidatePtNodePos) {
|
|
const PatriciaTrieReadingUtils::NodeFlags lastFlags =
|
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
|
|
mBuffer.data(), &lastCandidatePtNodePos);
|
|
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
|
// We copy all the characters in this PtNode to the buffer
|
|
outCodePoints[wordPos] = lastChar;
|
|
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
|
|
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
|
int charCount = maxCodePointCount;
|
|
while (-1 != nextChar && --charCount > 0) {
|
|
outCodePoints[++wordPos] = nextChar;
|
|
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
|
}
|
|
}
|
|
++wordPos;
|
|
// Now we only need to branch to the children address. Skip the probability if
|
|
// it's there, read pos, and break to resume the search at pos.
|
|
if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
|
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
|
|
&lastCandidatePtNodePos);
|
|
}
|
|
pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
|
|
mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
|
|
break;
|
|
} else {
|
|
// Here is a little tricky part: we come here if we found out that all children
|
|
// addresses in this PtNode are bigger than the address we are searching for.
|
|
// Should we conclude the word is not in the dictionary? No! It could still be
|
|
// one of the remaining PtNodes in this array, so we have to keep looking in
|
|
// this array until we find it (or we realize it's not there either, in which
|
|
// case it's actually not in the dictionary). Pass the end of this PtNode,
|
|
// ready to start the next one.
|
|
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
|
|
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
|
|
mBuffer.data(), flags, &pos);
|
|
}
|
|
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
|
|
mShortcutListPolicy.skipAllShortcuts(&pos);
|
|
}
|
|
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
|
|
if (!mBigramListPolicy.skipAllBigrams(&pos)) {
|
|
AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
|
|
pos);
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// If we did not find it, we should record the last children address for the next
|
|
// iteration.
|
|
if (hasChildren) lastCandidatePtNodePos = startPos;
|
|
// Now skip the end of this PtNode (children pos and the attributes if any) so that
|
|
// our pos is after the end of this PtNode, at the start of the next one.
|
|
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
|
|
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
|
|
mBuffer.data(), flags, &pos);
|
|
}
|
|
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
|
|
mShortcutListPolicy.skipAllShortcuts(&pos);
|
|
}
|
|
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
|
|
if (!mBigramListPolicy.skipAllBigrams(&pos)) {
|
|
AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
|
|
mIsCorrupted = true;
|
|
ASSERT(false);
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
// If we have looked through all the PtNodes and found no match, the ptNodePos is
|
|
// not the position of a terminal in this dictionary.
|
|
return 0;
|
|
}
|
|
|
|
// This function gets the word id of the exact matching word in the
// dictionary. If no match is found, it returns NOT_A_WORD_ID.
|
|
int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
|
|
const bool forceLowerCaseSearch) const {
|
|
DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
|
|
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
|
const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
|
|
wordCodePoints.size(), forceLowerCaseSearch);
|
|
if (readingHelper.isError()) {
|
|
mIsCorrupted = true;
|
|
AKLOGE("Dictionary reading error in getWordId().");
|
|
}
|
|
return getWordIdFromTerminalPtNodePos(ptNodePos);
|
|
}
|
|
|
|
// Computes the attributes of wordId given the preceding words. Prefers the bigram cache
// (multiBigramMap) when available, then a direct bigram lookup, then the plain unigram
// probability.
const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
        const WordIdArrayView prevWordIds, const int wordId,
        MultiBigramMap *const multiBigramMap) const {
    // Unknown word: report empty attributes.
    if (wordId == NOT_A_WORD_ID) {
        return WordAttributes();
    }
    const int terminalPos = getTerminalPtNodePosFromWordId(wordId);
    const PtNodeParams params =
            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPos);
    if (multiBigramMap) {
        // Let the shared bigram cache resolve the in-context probability.
        const int cachedProbability = multiBigramMap->getBigramProbability(
                this /* structurePolicy */, prevWordIds, wordId, params.getProbability());
        return getWordAttributes(cachedProbability, params);
    }
    if (!prevWordIds.empty()) {
        // No cache: try a direct bigram lookup against the previous word.
        const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
        if (bigramProbability != NOT_A_PROBABILITY) {
            return getWordAttributes(bigramProbability, params);
        }
    }
    // Fall back to the unigram probability.
    return getWordAttributes(getProbability(params.getProbability(), NOT_A_PROBABILITY),
            params);
}
|
|
|
|
// Packages a probability and the PtNode's flags into a WordAttributes value.
// Ver2 dictionaries carry no blacklist flag, so isBlacklisted is always false here.
const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
        const PtNodeParams &ptNodeParams) const {
    const bool isBlacklisted = false;
    return WordAttributes(probability, isBlacklisted, ptNodeParams.isNotAWord(),
            ptNodeParams.isPossiblyOffensive());
}
|
|
|
|
int PatriciaTriePolicy::getProbability(const int unigramProbability,
|
|
const int bigramProbability) const {
|
|
// Due to space constraints, the probability for bigrams is approximate - the lower the unigram
|
|
// probability, the worse the precision. The theoritical maximum error in resulting probability
|
|
// is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
|
|
// that sometimes, we'll see some bigrams interverted here, but it can't get too bad.
|
|
if (unigramProbability == NOT_A_PROBABILITY) {
|
|
return NOT_A_PROBABILITY;
|
|
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
|
return ProbabilityUtils::backoff(unigramProbability);
|
|
} else {
|
|
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
|
|
bigramProbability);
|
|
}
|
|
}
|
|
|
|
int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
|
const int wordId) const {
|
|
if (wordId == NOT_A_WORD_ID) {
|
|
return NOT_A_PROBABILITY;
|
|
}
|
|
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
|
const PtNodeParams ptNodeParams =
|
|
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
|
if (ptNodeParams.isNotAWord()) {
|
|
// If this is not a word, it should behave as having no probability outside of the
|
|
// suggestion process (where it should be used for shortcuts).
|
|
return NOT_A_PROBABILITY;
|
|
}
|
|
if (!prevWordIds.empty()) {
|
|
const int bigramsPosition = getBigramsPositionOfPtNode(
|
|
getTerminalPtNodePosFromWordId(prevWordIds[0]));
|
|
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
|
|
while (bigramsIt.hasNext()) {
|
|
bigramsIt.next();
|
|
if (bigramsIt.getBigramPos() == ptNodePos
|
|
&& bigramsIt.getProbability() != NOT_A_PROBABILITY) {
|
|
return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
|
|
}
|
|
}
|
|
return NOT_A_PROBABILITY;
|
|
}
|
|
return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
|
|
}
|
|
|
|
void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
|
|
NgramListener *const listener) const {
|
|
if (prevWordIds.empty()) {
|
|
return;
|
|
}
|
|
const int bigramsPosition = getBigramsPositionOfPtNode(
|
|
getTerminalPtNodePosFromWordId(prevWordIds[0]));
|
|
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
|
|
while (bigramsIt.hasNext()) {
|
|
bigramsIt.next();
|
|
listener->onVisitEntry(bigramsIt.getProbability(),
|
|
getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
|
|
}
|
|
}
|
|
|
|
// Builds an iterator over the shortcut targets attached to wordId's terminal PtNode.
// An absent shortcut list (NOT_A_DICT_POS) yields an empty iterator.
BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
    const int terminalPos = getTerminalPtNodePosFromWordId(wordId);
    return BinaryDictionaryShortcutIterator(&mShortcutListPolicy,
            getShortcutPositionOfPtNode(terminalPos));
}
|
|
|
|
int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
|
|
if (ptNodePos == NOT_A_DICT_POS) {
|
|
return NOT_A_DICT_POS;
|
|
}
|
|
return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
|
|
}
|
|
|
|
int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
|
if (ptNodePos == NOT_A_DICT_POS) {
|
|
return NOT_A_DICT_POS;
|
|
}
|
|
return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
|
|
}
|
|
|
|
int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
|
|
const int ptNodePos, DicNodeVector *childDicNodes) const {
|
|
PatriciaTrieReadingUtils::NodeFlags flags;
|
|
int mergedNodeCodePointCount = 0;
|
|
int mergedNodeCodePoints[MAX_WORD_LENGTH];
|
|
int probability = NOT_A_PROBABILITY;
|
|
int childrenPos = NOT_A_DICT_POS;
|
|
int shortcutPos = NOT_A_DICT_POS;
|
|
int bigramPos = NOT_A_DICT_POS;
|
|
int siblingPos = NOT_A_DICT_POS;
|
|
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
|
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
|
|
&mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
|
|
mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
|
|
&siblingPos);
|
|
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
|
|
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
|
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
|
childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
|
|
CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
|
|
}
|
|
return siblingPos;
|
|
}
|
|
|
|
// Builds the full WordProperty (unigram flags/probability, bigrams, shortcuts) for the
// given word. Returns a default-constructed WordProperty when the word is not in the
// dictionary.
const WordProperty PatriciaTriePolicy::getWordProperty(
        const CodePointArrayView wordCodePoints) const {
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        AKLOGE("getWordProperty was called for invalid word.");
        return WordProperty();
    }
    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
    const PtNodeParams ptNodeParams =
            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    // Fetch bigram information.
    std::vector<NgramProperty> ngrams;
    const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
    int bigramWord1CodePoints[MAX_WORD_LENGTH];
    BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
    while (bigramsIt.hasNext()) {
        // Fetch the next bigram information and forward the iterator.
        bigramsIt.next();
        // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
        if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
            int word1Probability = NOT_A_PROBABILITY;
            // Resolve the target word's code points and unigram probability from its
            // terminal PtNode position.
            const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
                    getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
                    bigramWord1CodePoints, &word1Probability);
            // Combine the target's unigram probability with the stored bigram probability.
            const int probability = getProbability(word1Probability, bigramsIt.getProbability());
            ngrams.emplace_back(
                    NgramContext(wordCodePoints.data(), wordCodePoints.size(),
                            ptNodeParams.representsBeginningOfSentence()),
                    CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
                    probability, HistoricalInfo());
        }
    }
    // Fetch shortcut information.
    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
    if (shortcutPos != NOT_A_DICT_POS) {
        int shortcutTargetCodePoints[MAX_WORD_LENGTH];
        // Advance past the list-size header before reading individual shortcut entries.
        ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
        bool hasNext = true;
        while (hasNext) {
            // Each entry is: flags (carrying a has-next bit and a probability), then the
            // target word's code points.
            const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
                    ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
            hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
            const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
                    mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
            const int shortcutProbability =
                    ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
            shortcuts.emplace_back(
                    CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
                    shortcutProbability);
        }
    }
    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
            ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
            ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
    return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
}
|
|
|
|
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
|
int *const outCodePointCount) {
|
|
*outCodePointCount = 0;
|
|
if (token == 0) {
|
|
// Start iterating the dictionary.
|
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
|
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
|
&mTerminalPtNodePositionsForIteratingWords);
|
|
DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
|
|
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
|
readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
|
|
}
|
|
const int terminalPtNodePositionsVectorSize =
|
|
static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
|
|
if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
|
|
AKLOGE("Given token %d is invalid.", token);
|
|
return 0;
|
|
}
|
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
|
*outCodePointCount = getCodePointsAndReturnCodePointCount(
|
|
getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
|
|
const int nextToken = token + 1;
|
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
|
// All words have been iterated.
|
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
|
return 0;
|
|
}
|
|
return nextToken;
|
|
}
|
|
|
|
int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
|
|
return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
|
|
}
|
|
|
|
int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
|
|
return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
|
|
}
|
|
|
|
bool PatriciaTriePolicy::isValidPos(const int pos) const {
|
|
return pos >= 0 && pos < static_cast<int>(mBuffer.size());
|
|
}
|
|
|
|
} // namespace latinime
|