Use word id to get code points of the word.
Bug: 14425059 Change-Id: I81accffcdf5abe447c33ffc3a8e8315f9a4cde7f
This commit is contained in:
parent
ac983b13a9
commit
94e4cd25a8
9 changed files with 31 additions and 29 deletions
|
@ -67,8 +67,8 @@ Dictionary::NgramListenerForPrediction::NgramListenerForPrediction(
|
||||||
mDictStructurePolicy(dictStructurePolicy) {}
|
mDictStructurePolicy(dictStructurePolicy) {}
|
||||||
|
|
||||||
void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability,
|
void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability,
|
||||||
const int targetPtNodePos) {
|
const int targetWordId) {
|
||||||
if (targetPtNodePos == NOT_A_DICT_POS) {
|
if (targetWordId == NOT_A_WORD_ID) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)
|
if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)
|
||||||
|
@ -78,8 +78,8 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi
|
||||||
int targetWordCodePoints[MAX_WORD_LENGTH];
|
int targetWordCodePoints[MAX_WORD_LENGTH];
|
||||||
int unigramProbability = 0;
|
int unigramProbability = 0;
|
||||||
const int codePointCount = mDictStructurePolicy->
|
const int codePointCount = mDictStructurePolicy->
|
||||||
getCodePointsAndProbabilityAndReturnCodePointCount(targetPtNodePos,
|
getCodePointsAndProbabilityAndReturnCodePointCount(targetWordId, MAX_WORD_LENGTH,
|
||||||
MAX_WORD_LENGTH, targetWordCodePoints, &unigramProbability);
|
targetWordCodePoints, &unigramProbability);
|
||||||
if (codePointCount <= 0) {
|
if (codePointCount <= 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -120,7 +120,7 @@ class Dictionary {
|
||||||
NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo,
|
NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo,
|
||||||
SuggestionResults *const suggestionResults,
|
SuggestionResults *const suggestionResults,
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy);
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy);
|
||||||
virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos);
|
virtual void onVisitEntry(const int ngramProbability, const int targetWordId);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction);
|
||||||
|
|
|
@ -50,7 +50,7 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
DicNodeVector *const childDicNodes) const = 0;
|
DicNodeVector *const childDicNodes) const = 0;
|
||||||
|
|
||||||
virtual int getCodePointsAndProbabilityAndReturnCodePointCount(
|
virtual int getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const = 0;
|
int *const outUnigramProbability) const = 0;
|
||||||
|
|
||||||
virtual int getWordId(const CodePointArrayView wordCodePoints,
|
virtual int getWordId(const CodePointArrayView wordCodePoints,
|
||||||
|
|
|
@ -91,9 +91,10 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const {
|
int *const outUnigramProbability) const {
|
||||||
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
||||||
|
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
||||||
readingHelper.initWithPtNodePos(ptNodePos);
|
readingHelper.initWithPtNodePos(ptNodePos);
|
||||||
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
|
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
maxCodePointCount, outCodePoints, outUnigramProbability);
|
maxCodePointCount, outCodePoints, outUnigramProbability);
|
||||||
|
@ -492,8 +493,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
// Word (unigram) probability
|
// Word (unigram) probability
|
||||||
int word1Probability = NOT_A_PROBABILITY;
|
int word1Probability = NOT_A_PROBABILITY;
|
||||||
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
|
getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
|
||||||
&word1Probability);
|
bigramWord1CodePoints, &word1Probability);
|
||||||
const std::vector<int> word1(bigramWord1CodePoints,
|
const std::vector<int> word1(bigramWord1CodePoints,
|
||||||
bigramWord1CodePoints + codePointCount);
|
bigramWord1CodePoints + codePointCount);
|
||||||
const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
|
const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
|
||||||
|
@ -550,7 +551,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints,
|
||||||
|
&unigramProbability);
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -86,7 +86,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
DicNodeVector *const childDicNodes) const;
|
DicNodeVector *const childDicNodes) const;
|
||||||
|
|
||||||
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const;
|
int *const outUnigramProbability) const;
|
||||||
|
|
||||||
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
||||||
|
|
|
@ -57,7 +57,7 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This retrieves code points and the probability of the word by its terminal position.
|
// This retrieves code points and the probability of the word by its id.
|
||||||
// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
|
// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
|
||||||
// it is possible to check for this with advantageous complexity. For each PtNode array, we search
|
// it is possible to check for this with advantageous complexity. For each PtNode array, we search
|
||||||
// for PtNodes with children and compare the children position with the position we look for.
|
// for PtNodes with children and compare the children position with the position we look for.
|
||||||
|
@ -68,16 +68,16 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
|
||||||
// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
|
// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
|
||||||
// than the position we look for, and we have to descend the z PtNode).
|
// than the position we look for, and we have to descend the z PtNode).
|
||||||
/* Parameters :
|
/* Parameters :
|
||||||
* ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is
|
* wordId: Id of the word we are searching for.
|
||||||
* what is stored as the "bigram position" in each bigram)
|
|
||||||
* outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
|
* outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
|
||||||
* outUnigramProbability: a pointer to an int to write the probability into.
|
* outUnigramProbability: a pointer to an int to write the probability into.
|
||||||
* Return value : the code point count, or 0 if the word was not found.
|
* Return value : the code point count, or 0 if the word was not found.
|
||||||
*/
|
*/
|
||||||
// TODO: Split this function to be more readable
|
// TODO: Split this function to be more readable
|
||||||
int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const {
|
int *const outUnigramProbability) const {
|
||||||
|
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
||||||
int pos = getRootPosition();
|
int pos = getRootPosition();
|
||||||
int wordPos = 0;
|
int wordPos = 0;
|
||||||
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
||||||
|
@ -410,8 +410,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
|
if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
|
||||||
int word1Probability = NOT_A_PROBABILITY;
|
int word1Probability = NOT_A_PROBABILITY;
|
||||||
const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints,
|
getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
|
||||||
&word1Probability);
|
bigramWord1CodePoints, &word1Probability);
|
||||||
const std::vector<int> word1(bigramWord1CodePoints,
|
const std::vector<int> word1(bigramWord1CodePoints,
|
||||||
bigramWord1CodePoints + word1CodePointCount);
|
bigramWord1CodePoints + word1CodePointCount);
|
||||||
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
||||||
|
@ -465,8 +465,9 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
|
||||||
}
|
}
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos,
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints,
|
||||||
|
&unigramProbability);
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -61,7 +61,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
DicNodeVector *const childDicNodes) const;
|
DicNodeVector *const childDicNodes) const;
|
||||||
|
|
||||||
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const;
|
int *const outUnigramProbability) const;
|
||||||
|
|
||||||
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
||||||
|
|
|
@ -81,9 +81,11 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const {
|
int *const outUnigramProbability) const {
|
||||||
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
||||||
|
const int ptNodePos =
|
||||||
|
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
||||||
readingHelper.initWithPtNodePos(ptNodePos);
|
readingHelper.initWithPtNodePos(ptNodePos);
|
||||||
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
|
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
maxCodePointCount, outCodePoints, outUnigramProbability);
|
maxCodePointCount, outCodePoints, outUnigramProbability);
|
||||||
|
@ -488,18 +490,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
// TODO: Support n-gram.
|
// TODO: Support n-gram.
|
||||||
std::vector<BigramProperty> bigrams;
|
std::vector<BigramProperty> bigrams;
|
||||||
const WordIdArrayView prevWordIds = WordIdArrayView::fromObject(&wordId);
|
const WordIdArrayView prevWordIds = WordIdArrayView::fromObject(&wordId);
|
||||||
const TerminalPositionLookupTable *const terminalPositionLookupTable =
|
|
||||||
mBuffers->getTerminalPositionLookupTable();
|
|
||||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||||
for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries(
|
for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries(
|
||||||
prevWordIds)) {
|
prevWordIds)) {
|
||||||
const int word1TerminalPtNodePos =
|
|
||||||
terminalPositionLookupTable->getTerminalPtNodePosition(entry.getWordId());
|
|
||||||
// Word (unigram) probability
|
// Word (unigram) probability
|
||||||
int word1Probability = NOT_A_PROBABILITY;
|
int word1Probability = NOT_A_PROBABILITY;
|
||||||
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
|
entry.getWordId(), MAX_WORD_LENGTH, bigramWord1CodePoints, &word1Probability);
|
||||||
&word1Probability);
|
|
||||||
const std::vector<int> word1(bigramWord1CodePoints,
|
const std::vector<int> word1(bigramWord1CodePoints,
|
||||||
bigramWord1CodePoints + codePointCount);
|
bigramWord1CodePoints + codePointCount);
|
||||||
const ProbabilityEntry probabilityEntry = entry.getProbabilityEntry();
|
const ProbabilityEntry probabilityEntry = entry.getProbabilityEntry();
|
||||||
|
@ -553,9 +550,11 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
|
const PtNodeParams ptNodeParams =
|
||||||
|
mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
ptNodeParams.getTerminalId(), MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -63,7 +63,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
DicNodeVector *const childDicNodes) const;
|
DicNodeVector *const childDicNodes) const;
|
||||||
|
|
||||||
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
int getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints,
|
const int wordId, const int maxCodePointCount, int *const outCodePoints,
|
||||||
int *const outUnigramProbability) const;
|
int *const outUnigramProbability) const;
|
||||||
|
|
||||||
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
|
||||||
|
|
Loading…
Reference in a new issue