Use word id to get code points of the word.

Bug: 14425059
Change-Id: I81accffcdf5abe447c33ffc3a8e8315f9a4cde7f
This commit is contained in:
Keisuke Kuroyanagi 2014-09-03 18:55:31 +09:00
parent ac983b13a9
commit 94e4cd25a8
9 changed files with 31 additions and 29 deletions

View file

@ -67,8 +67,8 @@ Dictionary::NgramListenerForPrediction::NgramListenerForPrediction(
mDictStructurePolicy(dictStructurePolicy) {} mDictStructurePolicy(dictStructurePolicy) {}
void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability, void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability,
const int targetPtNodePos) { const int targetWordId) {
if (targetPtNodePos == NOT_A_DICT_POS) { if (targetWordId == NOT_A_WORD_ID) {
return; return;
} }
if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */) if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)
@ -78,8 +78,8 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi
int targetWordCodePoints[MAX_WORD_LENGTH]; int targetWordCodePoints[MAX_WORD_LENGTH];
int unigramProbability = 0; int unigramProbability = 0;
const int codePointCount = mDictStructurePolicy-> const int codePointCount = mDictStructurePolicy->
getCodePointsAndProbabilityAndReturnCodePointCount(targetPtNodePos, getCodePointsAndProbabilityAndReturnCodePointCount(targetWordId, MAX_WORD_LENGTH,
MAX_WORD_LENGTH, targetWordCodePoints, &unigramProbability); targetWordCodePoints, &unigramProbability);
if (codePointCount <= 0) { if (codePointCount <= 0) {
return; return;
} }

View file

@ -120,7 +120,7 @@ class Dictionary {
NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo, NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo,
SuggestionResults *const suggestionResults, SuggestionResults *const suggestionResults,
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy); const DictionaryStructureWithBufferPolicy *const dictStructurePolicy);
virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos); virtual void onVisitEntry(const int ngramProbability, const int targetWordId);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction); DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction);

View file

@ -50,7 +50,7 @@ class DictionaryStructureWithBufferPolicy {
DicNodeVector *const childDicNodes) const = 0; DicNodeVector *const childDicNodes) const = 0;
virtual int getCodePointsAndProbabilityAndReturnCodePointCount( virtual int getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const = 0; int *const outUnigramProbability) const = 0;
virtual int getWordId(const CodePointArrayView wordCodePoints, virtual int getWordId(const CodePointArrayView wordCodePoints,

View file

@ -91,9 +91,10 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
} }
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const { int *const outUnigramProbability) const {
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
readingHelper.initWithPtNodePos(ptNodePos); readingHelper.initWithPtNodePos(ptNodePos);
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
maxCodePointCount, outCodePoints, outUnigramProbability); maxCodePointCount, outCodePoints, outUnigramProbability);
@ -492,8 +493,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
// Word (unigram) probability // Word (unigram) probability
int word1Probability = NOT_A_PROBABILITY; int word1Probability = NOT_A_PROBABILITY;
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
&word1Probability); bigramWord1CodePoints, &word1Probability);
const std::vector<int> word1(bigramWord1CodePoints, const std::vector<int> word1(bigramWord1CodePoints,
bigramWord1CodePoints + codePointCount); bigramWord1CodePoints + codePointCount);
const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
@ -550,7 +551,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints,
&unigramProbability);
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@ -86,7 +86,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
DicNodeVector *const childDicNodes) const; DicNodeVector *const childDicNodes) const;
int getCodePointsAndProbabilityAndReturnCodePointCount( int getCodePointsAndProbabilityAndReturnCodePointCount(
const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const; int *const outUnigramProbability) const;
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;

View file

@ -57,7 +57,7 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
} }
} }
// This retrieves code points and the probability of the word by its terminal position. // This retrieves code points and the probability of the word by its id.
// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
// it is possible to check for this with advantageous complexity. For each PtNode array, we search // it is possible to check for this with advantageous complexity. For each PtNode array, we search
// for PtNodes with children and compare the children position with the position we look for. // for PtNodes with children and compare the children position with the position we look for.
@ -68,16 +68,16 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
// with a z, it's the last PtNode of the root array, so all children addresses will be smaller // with a z, it's the last PtNode of the root array, so all children addresses will be smaller
// than the position we look for, and we have to descend the z PtNode). // than the position we look for, and we have to descend the z PtNode).
/* Parameters : /* Parameters :
* ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is * wordId: Id of the word we are searching for.
* what is stored as the "bigram position" in each bigram)
* outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
* outUnigramProbability: a pointer to an int to write the probability into. * outUnigramProbability: a pointer to an int to write the probability into.
* Return value : the code point count, or 0 if the word was not found. * Return value : the code point count, or 0 if the word was not found.
*/ */
// TODO: Split this function to be more readable // TODO: Split this function to be more readable
int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const { int *const outUnigramProbability) const {
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
int pos = getRootPosition(); int pos = getRootPosition();
int wordPos = 0; int wordPos = 0;
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
@ -410,8 +410,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
int word1Probability = NOT_A_PROBABILITY; int word1Probability = NOT_A_PROBABILITY;
const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints, getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
&word1Probability); bigramWord1CodePoints, &word1Probability);
const std::vector<int> word1(bigramWord1CodePoints, const std::vector<int> word1(bigramWord1CodePoints,
bigramWord1CodePoints + word1CodePointCount); bigramWord1CodePoints + word1CodePointCount);
const int probability = getProbability(word1Probability, bigramsIt.getProbability()); const int probability = getProbability(word1Probability, bigramsIt.getProbability());
@ -465,8 +465,9 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
} }
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
MAX_WORD_LENGTH, outCodePoints, &unigramProbability); getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints,
&unigramProbability);
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@ -61,7 +61,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
DicNodeVector *const childDicNodes) const; DicNodeVector *const childDicNodes) const;
int getCodePointsAndProbabilityAndReturnCodePointCount( int getCodePointsAndProbabilityAndReturnCodePointCount(
const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const; int *const outUnigramProbability) const;
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;

View file

@ -81,9 +81,11 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
} }
int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const { int *const outUnigramProbability) const {
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
const int ptNodePos =
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
readingHelper.initWithPtNodePos(ptNodePos); readingHelper.initWithPtNodePos(ptNodePos);
const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
maxCodePointCount, outCodePoints, outUnigramProbability); maxCodePointCount, outCodePoints, outUnigramProbability);
@ -488,18 +490,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
// TODO: Support n-gram. // TODO: Support n-gram.
std::vector<BigramProperty> bigrams; std::vector<BigramProperty> bigrams;
const WordIdArrayView prevWordIds = WordIdArrayView::fromObject(&wordId); const WordIdArrayView prevWordIds = WordIdArrayView::fromObject(&wordId);
const TerminalPositionLookupTable *const terminalPositionLookupTable =
mBuffers->getTerminalPositionLookupTable();
int bigramWord1CodePoints[MAX_WORD_LENGTH]; int bigramWord1CodePoints[MAX_WORD_LENGTH];
for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries( for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries(
prevWordIds)) { prevWordIds)) {
const int word1TerminalPtNodePos =
terminalPositionLookupTable->getTerminalPtNodePosition(entry.getWordId());
// Word (unigram) probability // Word (unigram) probability
int word1Probability = NOT_A_PROBABILITY; int word1Probability = NOT_A_PROBABILITY;
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, entry.getWordId(), MAX_WORD_LENGTH, bigramWord1CodePoints, &word1Probability);
&word1Probability);
const std::vector<int> word1(bigramWord1CodePoints, const std::vector<int> word1(bigramWord1CodePoints,
bigramWord1CodePoints + codePointCount); bigramWord1CodePoints + codePointCount);
const ProbabilityEntry probabilityEntry = entry.getProbabilityEntry(); const ProbabilityEntry probabilityEntry = entry.getProbabilityEntry();
@ -553,9 +550,11 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
return 0; return 0;
} }
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
const PtNodeParams ptNodeParams =
mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); ptNodeParams.getTerminalId(), MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@ -63,7 +63,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
DicNodeVector *const childDicNodes) const; DicNodeVector *const childDicNodes) const;
int getCodePointsAndProbabilityAndReturnCodePointCount( int getCodePointsAndProbabilityAndReturnCodePointCount(
const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const; int *const outUnigramProbability) const;
int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;