124 lines
4.8 KiB
C++
124 lines
4.8 KiB
C++
|
/*
|
||
|
* Copyright (C) 2014 The Android Open Source Project
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
#include "dictionary/property/ngram_context.h"
|
||
|
|
||
|
#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
|
||
|
#include "utils/char_utils.h"
|
||
|
|
||
|
namespace latinime {
|
||
|
|
||
|
// Creates a context that carries no previous-word information.
NgramContext::NgramContext() : mPrevWordCount(0) {
    // isValid() inspects slot 0 regardless of mPrevWordCount, so the per-word counts and
    // beginning-of-sentence flags must hold defined values even for an empty context.
    // The other constructors that take raw word data reset the slots the same way.
    clear();
}
|
||
|
|
||
|
// Copy constructor: duplicates the previous-word code points, their lengths and the
// beginning-of-sentence flags for every word held by ngramContext.
NgramContext::NgramContext(const NgramContext &ngramContext)
        : mPrevWordCount(ngramContext.mPrevWordCount) {
    // Reset every slot first. Only the first mPrevWordCount slots are copied below, and
    // isValid() reads slot 0 even when mPrevWordCount is 0, so the remaining slots must not be
    // left with indeterminate values.
    clear();
    for (size_t i = 0; i < mPrevWordCount; ++i) {
        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
        // Copy only the code points that are in use for this word.
        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
    }
}
|
||
|
|
||
|
// Builds a context from parallel arrays describing up to prevWordCount previous words.
// prevWordCodePoints[i] holds the code points of the i-th word, prevWordCodePointCount[i] its
// length and isBeginningOfSentence[i] its beginning-of-sentence flag. The number of words kept
// is capped at the capacity of mPrevWordCodePoints.
NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
        const size_t prevWordCount)
        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
    // Start from empty slots; words with an unusable length simply stay empty.
    clear();
    for (size_t wordIndex = 0; wordIndex < mPrevWordCount; ++wordIndex) {
        const int length = prevWordCodePointCount[wordIndex];
        const bool lengthInRange = length >= 0 && length <= MAX_WORD_LENGTH;
        if (lengthInRange) {
            memmove(mPrevWordCodePoints[wordIndex], prevWordCodePoints[wordIndex],
                    sizeof(mPrevWordCodePoints[wordIndex][0]) * length);
            mPrevWordCodePointCount[wordIndex] = length;
            mIsBeginningOfSentence[wordIndex] = isBeginningOfSentence[wordIndex];
        }
    }
}
|
||
|
|
||
|
// Builds a context holding exactly one previous word.
// prevWordCodePoints points at prevWordCodePointCount code points; isBeginningOfSentence is the
// flag stored for that word. If the pointer is null or the count is outside
// [0, MAX_WORD_LENGTH], the single slot is left empty (count 0, flag false).
NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
        const bool isBeginningOfSentence) : mPrevWordCount(1) {
    clear();
    // A negative count must be rejected as well as an oversized one: it would turn into a huge
    // size_t in the memmove below and be stored as a negative length. This matches the range
    // check performed by the array-based constructor.
    if (prevWordCodePointCount < 0 || prevWordCodePointCount > MAX_WORD_LENGTH
            || !prevWordCodePoints) {
        return;
    }
    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
    mPrevWordCodePointCount[0] = prevWordCodePointCount;
    mIsBeginningOfSentence[0] = isBeginningOfSentence;
}
|
||
|
|
||
|
bool NgramContext::isValid() const {
|
||
|
if (mPrevWordCodePointCount[0] > 0) {
|
||
|
return true;
|
||
|
}
|
||
|
if (mIsBeginningOfSentence[0]) {
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Returns a view over the code points of the n-th previous word. n is 1-based; an empty view
// is returned when n is 0 or exceeds the number of stored words.
const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
    const bool inRange = n >= 1 && n <= mPrevWordCount;
    if (!inRange) {
        return CodePointArrayView();
    }
    const size_t slot = n - 1;
    return CodePointArrayView(mPrevWordCodePoints[slot], mPrevWordCodePointCount[slot]);
}
|
||
|
|
||
|
bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
|
||
|
if (n <= 0 || n > mPrevWordCount) {
|
||
|
return false;
|
||
|
}
|
||
|
return mIsBeginningOfSentence[n - 1];
|
||
|
}
|
||
|
|
||
|
/* static */ int NgramContext::getWordId(
|
||
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||
|
const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
|
||
|
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
||
|
return NOT_A_WORD_ID;
|
||
|
}
|
||
|
int codePoints[MAX_WORD_LENGTH];
|
||
|
int codePointCount = wordCodePointCount;
|
||
|
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
|
||
|
if (isBeginningOfSentence) {
|
||
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
|
||
|
MAX_WORD_LENGTH);
|
||
|
if (codePointCount <= 0) {
|
||
|
return NOT_A_WORD_ID;
|
||
|
}
|
||
|
}
|
||
|
const CodePointArrayView codePointArrayView(codePoints, codePointCount);
|
||
|
const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
|
||
|
false /* forceLowerCaseSearch */);
|
||
|
if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
|
||
|
// Return the id when when the word was found or doesn't try lower case search.
|
||
|
return wordId;
|
||
|
}
|
||
|
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
||
|
// auto-capitalized words like "The [current_word]".
|
||
|
return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
|
||
|
}
|
||
|
|
||
|
void NgramContext::clear() {
|
||
|
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
||
|
mPrevWordCodePointCount[i] = 0;
|
||
|
mIsBeginningOfSentence[i] = false;
|
||
|
}
|
||
|
}
|
||
|
} // namespace latinime
|