am 75cb258e: Merge "Support Beginning-of-Sentence in native code"
* commit '75cb258ee40a97654ae364f00e1803d0bce35da8': Support Beginning-of-Sentence in native code
main
commit
dd02535fb8
|
@ -20,11 +20,11 @@
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
// TODO: Support n-gram.
|
// TODO: Support n-gram.
|
||||||
// TODO: Support beginning of sentence.
|
|
||||||
// This class does not take ownership of any code point buffers.
|
// This class does not take ownership of any code point buffers.
|
||||||
class PrevWordsInfo {
|
class PrevWordsInfo {
|
||||||
public:
|
public:
|
||||||
|
@ -52,8 +52,7 @@ class PrevWordsInfo {
|
||||||
|
|
||||||
void getPrevWordsTerminalPtNodePos(
|
void getPrevWordsTerminalPtNodePos(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
int *const outPrevWordsTerminalPtNodePos,
|
int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
|
||||||
const bool tryLowerCaseSearch) const {
|
|
||||||
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
||||||
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
|
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
|
||||||
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
|
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
|
||||||
|
@ -63,17 +62,11 @@ class PrevWordsInfo {
|
||||||
|
|
||||||
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
|
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
|
||||||
int pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0],
|
const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
|
||||||
mPrevWordCodePointCount[0], false /* forceLowerCaseSearch */);
|
dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
|
||||||
// getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
|
mIsBeginningOfSentence[0]);
|
||||||
// dictionary or has no bigrams
|
return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
|
||||||
if (NOT_A_DICT_POS == pos) {
|
bigramListPos);
|
||||||
// If no bigrams for this exact word, search again in lower case.
|
|
||||||
pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0],
|
|
||||||
mPrevWordCodePointCount[0], true /* forceLowerCaseSearch */);
|
|
||||||
}
|
|
||||||
return BinaryDictionaryBigramsIterator(
|
|
||||||
dictStructurePolicy->getBigramsStructurePolicy(), pos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// n is 1-indexed.
|
// n is 1-indexed.
|
||||||
|
@ -102,8 +95,18 @@ class PrevWordsInfo {
|
||||||
if (!dictStructurePolicy || !wordCodePoints) {
|
if (!dictStructurePolicy || !wordCodePoints) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
}
|
}
|
||||||
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
|
int codePointCount = wordCodePointCount;
|
||||||
|
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
|
||||||
|
if (isBeginningOfSentence) {
|
||||||
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
||||||
|
codePointCount, MAX_WORD_LENGTH);
|
||||||
|
if (codePointCount <= 0) {
|
||||||
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
}
|
||||||
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
||||||
wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */);
|
codePoints, codePointCount, false /* forceLowerCaseSearch */);
|
||||||
if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
|
if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
|
||||||
// Return the position when the word was found or doesn't try lower case
|
// Return the position when the word was found or doesn't try lower case
|
||||||
// search.
|
// search.
|
||||||
|
@ -112,7 +115,33 @@ class PrevWordsInfo {
|
||||||
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
||||||
// auto-capitalized words like "The [current_word]".
|
// auto-capitalized words like "The [current_word]".
|
||||||
return dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
return dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
||||||
wordCodePoints, wordCodePointCount, true /* forceLowerCaseSearch */);
|
codePoints, codePointCount, true /* forceLowerCaseSearch */);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int getBigramListPositionForWordWithTryingLowerCaseSearch(
|
||||||
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||||||
|
const bool isBeginningOfSentence) {
|
||||||
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
|
int codePointCount = wordCodePointCount;
|
||||||
|
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
|
||||||
|
if (isBeginningOfSentence) {
|
||||||
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
||||||
|
codePointCount, MAX_WORD_LENGTH);
|
||||||
|
if (codePointCount <= 0) {
|
||||||
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
||||||
|
codePointCount, false /* forceLowerCaseSearch */);
|
||||||
|
// getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
|
||||||
|
// dictionary or has no bigrams
|
||||||
|
if (NOT_A_DICT_POS == pos) {
|
||||||
|
// If no bigrams for this exact word, search again in lower case.
|
||||||
|
pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
||||||
|
codePointCount, true /* forceLowerCaseSearch */);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int getBigramListPositionForWord(
|
static int getBigramListPositionForWord(
|
||||||
|
|
|
@ -181,9 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
|
||||||
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
|
||||||
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||||
bool addedNewUnigram = false;
|
bool addedNewUnigram = false;
|
||||||
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length,
|
int codePointsToAdd[MAX_WORD_LENGTH];
|
||||||
|
int codePointCountToAdd = length;
|
||||||
|
memmove(codePointsToAdd, word, sizeof(int) * length);
|
||||||
|
if (unigramProperty->representsBeginningOfSentence()) {
|
||||||
|
codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
|
||||||
|
codePointCountToAdd, MAX_WORD_LENGTH);
|
||||||
|
}
|
||||||
|
if (codePointCountToAdd <= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
|
||||||
unigramProperty, &addedNewUnigram)) {
|
unigramProperty, &addedNewUnigram)) {
|
||||||
if (addedNewUnigram) {
|
if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
|
||||||
mUnigramCount++;
|
mUnigramCount++;
|
||||||
}
|
}
|
||||||
if (unigramProperty->getShortcuts().size() > 0) {
|
if (unigramProperty->getShortcuts().size() > 0) {
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#define LATINIME_CHAR_UTILS_H
|
#define LATINIME_CHAR_UTILS_H
|
||||||
|
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
@ -93,6 +94,19 @@ class CharUtils {
|
||||||
static unsigned short latin_tolower(const unsigned short c);
|
static unsigned short latin_tolower(const unsigned short c);
|
||||||
static const std::vector<int> EMPTY_STRING;
|
static const std::vector<int> EMPTY_STRING;
|
||||||
|
|
||||||
|
// Returns updated code point count. Returns 0 when the code points cannot be marked as a
|
||||||
|
// Beginning-of-Sentence.
|
||||||
|
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
|
||||||
|
const int codePointCount, const int maxCodePoint) {
|
||||||
|
if (codePointCount >= maxCodePoint) {
|
||||||
|
// the code points cannot be marked as a Beginning-of-Sentence.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
|
||||||
|
codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
|
||||||
|
return codePointCount + 1;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue