Make getNextWordAndNextToken output code point count.

Bug: 14425059
Change-Id: Id1c2927c8a1be0b1680206d444dbdf0c9453dceb
This commit is contained in:
Keisuke Kuroyanagi 2014-06-23 19:08:30 +09:00
parent 9bbc3aa02a
commit f7322b166b
10 changed files with 45 additions and 47 deletions

View file

@@ -304,17 +304,18 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
jlong dict, jint token, jintArray outCodePoints) { jlong dict, jint token, jintArray outCodePoints) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return 0; if (!dictionary) return 0;
const jsize outCodePointsLength = env->GetArrayLength(outCodePoints); const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
if (outCodePointsLength != MAX_WORD_LENGTH) { if (codePointBufSize != MAX_WORD_LENGTH) {
AKLOGE("Invalid outCodePointsLength: %d", outCodePointsLength); AKLOGE("Invalid outCodePointsLength: %d", codePointBufSize);
ASSERT(false); ASSERT(false);
return 0; return 0;
} }
int wordCodePoints[outCodePointsLength]; int wordCodePoints[codePointBufSize];
memset(wordCodePoints, 0, sizeof(wordCodePoints)); int wordCodePointCount = 0;
const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints); const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints,
&wordCodePointCount);
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, outCodePointsLength, MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
false /* needsNullTermination */); false /* needsNullTermination */);
return nextToken; return nextToken;
} }
@@ -555,12 +556,13 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
// TODO: Migrate historical information. // TODO: Migrate historical information.
int wordCodePoints[MAX_WORD_LENGTH]; int wordCodePoints[MAX_WORD_LENGTH];
int wordCodePointCount = 0;
int token = 0; int token = 0;
// Add unigrams. // Add unigrams.
do { do {
token = dictionary->getNextWordAndNextToken(token, wordCodePoints); token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
const int wordLength = CharUtils::getCodePointCount(MAX_WORD_LENGTH, wordCodePoints); const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength); wordCodePointCount);
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) { if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy( dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars); std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
@@ -569,8 +571,8 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false; return false;
} }
} }
if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints, wordLength, if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints,
wordProperty.getUnigramProperty())) { wordCodePointCount, wordProperty.getUnigramProperty())) {
LogUtils::logToJava(env, "Cannot add unigram to the new dict."); LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
return false; return false;
} }
@@ -578,9 +580,9 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
// Add bigrams. // Add bigrams.
do { do {
token = dictionary->getNextWordAndNextToken(token, wordCodePoints); token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
const int wordLength = CharUtils::getCodePointCount(MAX_WORD_LENGTH, wordCodePoints); const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength); wordCodePointCount);
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) { if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy( dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars); std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
@@ -589,8 +591,8 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false; return false;
} }
} }
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordLength, const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
false /* isStartOfSentence */); false /* isBeginningOfSentence */);
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) { for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo, if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
&bigramProperty)) { &bigramProperty)) {

View file

@@ -145,10 +145,11 @@ const WordProperty Dictionary::getWordProperty(const int *const codePoints,
codePoints, codePointCount); codePoints, codePointCount);
} }
int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints) { int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount) {
TimeKeeper::setCurrentTime(); TimeKeeper::setCurrentTime();
return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken( return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken(
token, outCodePoints); token, outCodePoints, outCodePointCount);
} }
void Dictionary::logDictionaryInfo(JNIEnv *const env) const { void Dictionary::logDictionaryInfo(JNIEnv *const env) const {

View file

@@ -103,7 +103,8 @@ class Dictionary {
// Method to iterate all words in the dictionary. // Method to iterate all words in the dictionary.
// The returned token has to be used to get the next word. If token is 0, this method newly // The returned token has to be used to get the next word. If token is 0, this method newly
// starts iterating the dictionary. // starts iterating the dictionary.
int getNextWordAndNextToken(const int token, int *const outCodePoints); int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
return mDictionaryStructureWithBufferPolicy.get(); return mDictionaryStructureWithBufferPolicy.get();

View file

@@ -104,7 +104,8 @@ class DictionaryStructureWithBufferPolicy {
// Method to iterate all words in the dictionary. // Method to iterate all words in the dictionary.
// The returned token has to be used to get the next word. If token is 0, this method newly // The returned token has to be used to get the next word. If token is 0, this method newly
// starts iterating the dictionary. // starts iterating the dictionary.
virtual int getNextWordAndNextToken(const int token, int *const outCodePoints) = 0; virtual int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount) = 0;
virtual bool isCorrupted() const = 0; virtual bool isCorrupted() const = 0;

View file

@@ -478,10 +478,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);
} }
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
// TODO: Return code point count like other methods. int *const outCodePointCount) {
// Null termination. *outCodePointCount = 0;
outCodePoints[0] = 0;
if (token == 0) { if (token == 0) {
mTerminalPtNodePositionsForIteratingWords.clear(); mTerminalPtNodePositionsForIteratingWords.clear();
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
@@ -498,13 +497,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
} }
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
if (codePointCount < MAX_WORD_LENGTH) {
// Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH
// code points.
outCodePoints[codePointCount] = 0;
}
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@@ -134,7 +134,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const WordProperty getWordProperty(const int *const codePoints, const WordProperty getWordProperty(const int *const codePoints,
const int codePointCount) const; const int codePointCount) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints); int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
bool isCorrupted() const { bool isCorrupted() const {
return mIsCorrupted; return mIsCorrupted;

View file

@@ -391,7 +391,9 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);
} }
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount) {
*outCodePointCount = 0;
if (token == 0) { if (token == 0) {
// Start iterating the dictionary. // Start iterating the dictionary.
mTerminalPtNodePositionsForIteratingWords.clear(); mTerminalPtNodePositionsForIteratingWords.clear();
@@ -409,8 +411,8 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
} }
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos,
outCodePoints, &unigramProbability); MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@@ -137,7 +137,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const WordProperty getWordProperty(const int *const codePoints, const WordProperty getWordProperty(const int *const codePoints,
const int codePointCount) const; const int codePointCount) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints); int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
bool isCorrupted() const { bool isCorrupted() const {
return mIsCorrupted; return mIsCorrupted;

View file

@@ -489,10 +489,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);
} }
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
// TODO: Return code point count like other methods. int *const outCodePointCount) {
// Null termination. *outCodePointCount = 0;
outCodePoints[0] = 0;
if (token == 0) { if (token == 0) {
mTerminalPtNodePositionsForIteratingWords.clear(); mTerminalPtNodePositionsForIteratingWords.clear();
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
@@ -509,13 +508,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
} }
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
int unigramProbability = NOT_A_PROBABILITY; int unigramProbability = NOT_A_PROBABILITY;
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
if (codePointCount < MAX_WORD_LENGTH) {
// Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH
// code points.
outCodePoints[codePointCount] = 0;
}
const int nextToken = token + 1; const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) { if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated. // All words have been iterated.

View file

@@ -113,7 +113,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const WordProperty getWordProperty(const int *const codePoints, const WordProperty getWordProperty(const int *const codePoints,
const int codePointCount) const; const int codePointCount) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints); int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
bool isCorrupted() const { bool isCorrupted() const {
return mIsCorrupted; return mIsCorrupted;