Support Beginning-of-Sentence in native code

Bug: 14119293
Change-Id: I0f382e33a19bf481823b23405d454de61ec835ff
main
Keisuke Kuroyanagi 2014-05-23 17:23:08 +09:00
parent 6600340af5
commit 96990ca773
3 changed files with 71 additions and 18 deletions

View File

@ -20,11 +20,11 @@
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h"
namespace latinime { namespace latinime {
// TODO: Support n-gram. // TODO: Support n-gram.
// TODO: Support beginning of sentence.
// This class does not take ownership of any code point buffers. // This class does not take ownership of any code point buffers.
class PrevWordsInfo { class PrevWordsInfo {
public: public:
@ -52,8 +52,7 @@ class PrevWordsInfo {
void getPrevWordsTerminalPtNodePos( void getPrevWordsTerminalPtNodePos(
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
int *const outPrevWordsTerminalPtNodePos, int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
const bool tryLowerCaseSearch) const {
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
mPrevWordCodePoints[i], mPrevWordCodePointCount[i], mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
@ -63,17 +62,11 @@ class PrevWordsInfo {
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction( BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const { const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
int pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0], const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
mPrevWordCodePointCount[0], false /* forceLowerCaseSearch */); dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
// getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the mIsBeginningOfSentence[0]);
// dictionary or has no bigrams return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
if (NOT_A_DICT_POS == pos) { bigramListPos);
// If no bigrams for this exact word, search again in lower case.
pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0],
mPrevWordCodePointCount[0], true /* forceLowerCaseSearch */);
}
return BinaryDictionaryBigramsIterator(
dictStructurePolicy->getBigramsStructurePolicy(), pos);
} }
// n is 1-indexed. // n is 1-indexed.
@ -102,8 +95,18 @@ class PrevWordsInfo {
if (!dictStructurePolicy || !wordCodePoints) { if (!dictStructurePolicy || !wordCodePoints) {
return NOT_A_DICT_POS; return NOT_A_DICT_POS;
} }
int codePoints[MAX_WORD_LENGTH];
int codePointCount = wordCodePointCount;
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
if (isBeginningOfSentence) {
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
codePointCount, MAX_WORD_LENGTH);
if (codePointCount <= 0) {
return NOT_A_DICT_POS;
}
}
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */); codePoints, codePointCount, false /* forceLowerCaseSearch */);
if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
// Return the position when the word was found or when lower case search is not // Return the position when the word was found or when lower case search is not
// requested. // requested.
@ -112,7 +115,33 @@ class PrevWordsInfo {
// Check bigrams for lower-cased previous word if original was not found. Useful for // Check bigrams for lower-cased previous word if original was not found. Useful for
// auto-capitalized words like "The [current_word]". // auto-capitalized words like "The [current_word]".
return dictStructurePolicy->getTerminalPtNodePositionOfWord( return dictStructurePolicy->getTerminalPtNodePositionOfWord(
wordCodePoints, wordCodePointCount, true /* forceLowerCaseSearch */); codePoints, codePointCount, true /* forceLowerCaseSearch */);
}
// Returns the position of the bigram list for the given word, retrying with a lower-cased
// search when the exact-case lookup yields NOT_A_DICT_POS.
//
// wordCodePoints / wordCodePointCount describe the word to look up. The word is copied
// into a local buffer so that the caller's code points are never modified. When
// isBeginningOfSentence is true, the Beginning-of-Sentence marker is prepended to the
// copy before the lookup.
//
// Returns NOT_A_DICT_POS when the arguments are invalid, when the marker cannot be
// attached, or when neither lookup finds a bigram list.
static int getBigramListPositionForWordWithTryingLowerCaseSearch(
        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
        const int *const wordCodePoints, const int wordCodePointCount,
        const bool isBeginningOfSentence) {
    if (!dictStructurePolicy) {
        return NOT_A_DICT_POS;
    }
    // The word is copied into a fixed-size stack buffer below; reject counts that would
    // overflow it. A negative count would also turn into a huge size_t in memmove.
    if (wordCodePointCount < 0 || wordCodePointCount > MAX_WORD_LENGTH) {
        return NOT_A_DICT_POS;
    }
    int codePoints[MAX_WORD_LENGTH];
    int codePointCount = wordCodePointCount;
    if (codePointCount > 0) {
        if (!wordCodePoints) {
            return NOT_A_DICT_POS;
        }
        memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
    }
    // An empty word is still allowed to reach this point so that a lookup consisting of
    // only the Beginning-of-Sentence marker keeps working.
    if (isBeginningOfSentence) {
        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
                codePointCount, MAX_WORD_LENGTH);
        if (codePointCount <= 0) {
            // No room left in the buffer for the marker.
            return NOT_A_DICT_POS;
        }
    }
    int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
            codePointCount, false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
    // dictionary or has no bigrams
    if (NOT_A_DICT_POS == pos) {
        // If no bigrams for this exact word, search again in lower case.
        pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
                codePointCount, true /* forceLowerCaseSearch */);
    }
    return pos;
}
static int getBigramListPositionForWord( static int getBigramListPositionForWord(

View File

@ -181,9 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition()); readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false; bool addedNewUnigram = false;
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, int codePointsToAdd[MAX_WORD_LENGTH];
int codePointCountToAdd = length;
memmove(codePointsToAdd, word, sizeof(int) * length);
if (unigramProperty->representsBeginningOfSentence()) {
codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
codePointCountToAdd, MAX_WORD_LENGTH);
}
if (codePointCountToAdd <= 0) {
return false;
}
if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
unigramProperty, &addedNewUnigram)) { unigramProperty, &addedNewUnigram)) {
if (addedNewUnigram) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
mUnigramCount++; mUnigramCount++;
} }
if (unigramProperty->getShortcuts().size() > 0) { if (unigramProperty->getShortcuts().size() > 0) {

View File

@ -18,6 +18,7 @@
#define LATINIME_CHAR_UTILS_H #define LATINIME_CHAR_UTILS_H
#include <cctype> #include <cctype>
#include <cstring>
#include <vector> #include <vector>
#include "defines.h" #include "defines.h"
@ -93,6 +94,19 @@ class CharUtils {
static unsigned short latin_tolower(const unsigned short c); static unsigned short latin_tolower(const unsigned short c);
static const std::vector<int> EMPTY_STRING; static const std::vector<int> EMPTY_STRING;
// Prepends the Beginning-of-Sentence marker to codePoints in place.
//
// codePoints     buffer holding the word; it is shifted right by one element.
// codePointCount number of code points currently stored in the buffer.
// maxCodePoint   upper bound on the code point count after the marker is attached.
//
// Returns the updated code point count (codePointCount + 1). Returns 0 when the code
// points cannot be marked as a Beginning-of-Sentence: the buffer is null, the count is
// negative, or there is no room left for the marker.
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
        const int codePointCount, const int maxCodePoint) {
    // A negative count would be converted to a huge size_t by the memmove below, so it
    // is rejected together with the "buffer already full" case.
    if (!codePoints || codePointCount < 0 || codePointCount >= maxCodePoint) {
        // the code points cannot be marked as a Beginning-of-Sentence.
        return 0;
    }
    // Source and destination overlap, hence memmove rather than memcpy.
    memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
    codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
    return codePointCount + 1;
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);