am f7322b16: Make getNextWordAndNextToken output code point count.
* commit 'f7322b166b88f72b19509d8416700d4ec8ea7753': Make getNextWordAndNextToken output code point count.
This commit is contained in:
commit 179eb125c4
10 changed files with 45 additions and 47 deletions
|
@ -304,17 +304,18 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
|
||||||
jlong dict, jint token, jintArray outCodePoints) {
|
jlong dict, jint token, jintArray outCodePoints) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) return 0;
|
if (!dictionary) return 0;
|
||||||
const jsize outCodePointsLength = env->GetArrayLength(outCodePoints);
|
const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
|
||||||
if (outCodePointsLength != MAX_WORD_LENGTH) {
|
if (codePointBufSize != MAX_WORD_LENGTH) {
|
||||||
AKLOGE("Invalid outCodePointsLength: %d", outCodePointsLength);
|
AKLOGE("Invalid outCodePointsLength: %d", codePointBufSize);
|
||||||
ASSERT(false);
|
ASSERT(false);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
int wordCodePoints[outCodePointsLength];
|
int wordCodePoints[codePointBufSize];
|
||||||
memset(wordCodePoints, 0, sizeof(wordCodePoints));
|
int wordCodePointCount = 0;
|
||||||
const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints);
|
const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints,
|
||||||
|
&wordCodePointCount);
|
||||||
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
||||||
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, outCodePointsLength,
|
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
||||||
|
@ -555,12 +556,13 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
|
|
||||||
// TODO: Migrate historical information.
|
// TODO: Migrate historical information.
|
||||||
int wordCodePoints[MAX_WORD_LENGTH];
|
int wordCodePoints[MAX_WORD_LENGTH];
|
||||||
|
int wordCodePointCount = 0;
|
||||||
int token = 0;
|
int token = 0;
|
||||||
// Add unigrams.
|
// Add unigrams.
|
||||||
do {
|
do {
|
||||||
token = dictionary->getNextWordAndNextToken(token, wordCodePoints);
|
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
||||||
const int wordLength = CharUtils::getCodePointCount(MAX_WORD_LENGTH, wordCodePoints);
|
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength);
|
wordCodePointCount);
|
||||||
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
||||||
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
||||||
|
@ -569,8 +571,8 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints, wordLength,
|
if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints,
|
||||||
wordProperty.getUnigramProperty())) {
|
wordCodePointCount, wordProperty.getUnigramProperty())) {
|
||||||
LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
|
LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -578,9 +580,9 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
|
|
||||||
// Add bigrams.
|
// Add bigrams.
|
||||||
do {
|
do {
|
||||||
token = dictionary->getNextWordAndNextToken(token, wordCodePoints);
|
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
||||||
const int wordLength = CharUtils::getCodePointCount(MAX_WORD_LENGTH, wordCodePoints);
|
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength);
|
wordCodePointCount);
|
||||||
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
||||||
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
||||||
|
@ -589,8 +591,8 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordLength,
|
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
|
||||||
false /* isStartOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
||||||
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
||||||
&bigramProperty)) {
|
&bigramProperty)) {
|
||||||
|
|
|
@ -145,10 +145,11 @@ const WordProperty Dictionary::getWordProperty(const int *const codePoints,
|
||||||
codePoints, codePointCount);
|
codePoints, codePointCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount) {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken(
|
return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken(
|
||||||
token, outCodePoints);
|
token, outCodePoints, outCodePointCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
|
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
|
||||||
|
|
|
@ -103,7 +103,8 @@ class Dictionary {
|
||||||
// Method to iterate all words in the dictionary.
|
// Method to iterate all words in the dictionary.
|
||||||
// The returned token has to be used to get the next word. If token is 0, this method newly
|
// The returned token has to be used to get the next word. If token is 0, this method newly
|
||||||
// starts iterating the dictionary.
|
// starts iterating the dictionary.
|
||||||
int getNextWordAndNextToken(const int token, int *const outCodePoints);
|
int getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount);
|
||||||
|
|
||||||
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
|
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
|
||||||
return mDictionaryStructureWithBufferPolicy.get();
|
return mDictionaryStructureWithBufferPolicy.get();
|
||||||
|
|
|
@ -104,7 +104,8 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
// Method to iterate all words in the dictionary.
|
// Method to iterate all words in the dictionary.
|
||||||
// The returned token has to be used to get the next word. If token is 0, this method newly
|
// The returned token has to be used to get the next word. If token is 0, this method newly
|
||||||
// starts iterating the dictionary.
|
// starts iterating the dictionary.
|
||||||
virtual int getNextWordAndNextToken(const int token, int *const outCodePoints) = 0;
|
virtual int getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount) = 0;
|
||||||
|
|
||||||
virtual bool isCorrupted() const = 0;
|
virtual bool isCorrupted() const = 0;
|
||||||
|
|
||||||
|
|
|
@ -478,10 +478,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
||||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
// TODO: Return code point count like other methods.
|
int *const outCodePointCount) {
|
||||||
// Null termination.
|
*outCodePointCount = 0;
|
||||||
outCodePoints[0] = 0;
|
|
||||||
if (token == 0) {
|
if (token == 0) {
|
||||||
mTerminalPtNodePositionsForIteratingWords.clear();
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||||
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
||||||
|
@ -498,13 +497,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
|
||||||
}
|
}
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
||||||
if (codePointCount < MAX_WORD_LENGTH) {
|
|
||||||
// Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH
|
|
||||||
// code points.
|
|
||||||
outCodePoints[codePointCount] = 0;
|
|
||||||
}
|
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -134,7 +134,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
const WordProperty getWordProperty(const int *const codePoints,
|
const WordProperty getWordProperty(const int *const codePoints,
|
||||||
const int codePointCount) const;
|
const int codePointCount) const;
|
||||||
|
|
||||||
int getNextWordAndNextToken(const int token, int *const outCodePoints);
|
int getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount);
|
||||||
|
|
||||||
bool isCorrupted() const {
|
bool isCorrupted() const {
|
||||||
return mIsCorrupted;
|
return mIsCorrupted;
|
||||||
|
|
|
@ -391,7 +391,9 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
|
||||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount) {
|
||||||
|
*outCodePointCount = 0;
|
||||||
if (token == 0) {
|
if (token == 0) {
|
||||||
// Start iterating the dictionary.
|
// Start iterating the dictionary.
|
||||||
mTerminalPtNodePositionsForIteratingWords.clear();
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||||
|
@ -409,8 +411,8 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
|
||||||
}
|
}
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos,
|
||||||
outCodePoints, &unigramProbability);
|
MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -137,7 +137,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
const WordProperty getWordProperty(const int *const codePoints,
|
const WordProperty getWordProperty(const int *const codePoints,
|
||||||
const int codePointCount) const;
|
const int codePointCount) const;
|
||||||
|
|
||||||
int getNextWordAndNextToken(const int token, int *const outCodePoints);
|
int getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount);
|
||||||
|
|
||||||
bool isCorrupted() const {
|
bool isCorrupted() const {
|
||||||
return mIsCorrupted;
|
return mIsCorrupted;
|
||||||
|
|
|
@ -489,10 +489,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
||||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
|
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
// TODO: Return code point count like other methods.
|
int *const outCodePointCount) {
|
||||||
// Null termination.
|
*outCodePointCount = 0;
|
||||||
outCodePoints[0] = 0;
|
|
||||||
if (token == 0) {
|
if (token == 0) {
|
||||||
mTerminalPtNodePositionsForIteratingWords.clear();
|
mTerminalPtNodePositionsForIteratingWords.clear();
|
||||||
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
|
||||||
|
@ -509,13 +508,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
|
||||||
}
|
}
|
||||||
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
|
||||||
int unigramProbability = NOT_A_PROBABILITY;
|
int unigramProbability = NOT_A_PROBABILITY;
|
||||||
const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
*outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
|
||||||
if (codePointCount < MAX_WORD_LENGTH) {
|
|
||||||
// Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH
|
|
||||||
// code points.
|
|
||||||
outCodePoints[codePointCount] = 0;
|
|
||||||
}
|
|
||||||
const int nextToken = token + 1;
|
const int nextToken = token + 1;
|
||||||
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
if (nextToken >= terminalPtNodePositionsVectorSize) {
|
||||||
// All words have been iterated.
|
// All words have been iterated.
|
||||||
|
|
|
@ -113,7 +113,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
const WordProperty getWordProperty(const int *const codePoints,
|
const WordProperty getWordProperty(const int *const codePoints,
|
||||||
const int codePointCount) const;
|
const int codePointCount) const;
|
||||||
|
|
||||||
int getNextWordAndNextToken(const int token, int *const outCodePoints);
|
int getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
int *const outCodePointCount);
|
||||||
|
|
||||||
bool isCorrupted() const {
|
bool isCorrupted() const {
|
||||||
return mIsCorrupted;
|
return mIsCorrupted;
|
||||||
|
|
Loading…
Reference in a new issue