Merge "Native side reads character table"
commit
5c6db929e4
|
@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
|
||||
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
|
||||
|
||||
// Constructs header information using an attribute map.
|
||||
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
||||
|
@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
|
||||
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
|
||||
|
||||
// Copy header information
|
||||
HeaderPolicy(const HeaderPolicy *const headerPolicy)
|
||||
|
@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
mForgettingCurveDurationToLevelDown(
|
||||
headerPolicy->mForgettingCurveDurationToLevelDown),
|
||||
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
|
||||
mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
|
||||
mMaxBigramCount(headerPolicy->mMaxBigramCount),
|
||||
mCodePointTable(headerPolicy->mCodePointTable) {}
|
||||
|
||||
// Temporary dummy header.
|
||||
HeaderPolicy()
|
||||
|
@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
|
||||
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
|
||||
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
|
||||
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
|
||||
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
|
||||
mCodePointTable(nullptr) {}
|
||||
|
||||
~HeaderPolicy() {}
|
||||
|
||||
|
@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
switch (mDictFormatVersion) {
|
||||
case FormatUtils::VERSION_2:
|
||||
return FormatUtils::VERSION_2;
|
||||
case FormatUtils::VERSION_201:
|
||||
return FormatUtils::VERSION_201;
|
||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
|
||||
case FormatUtils::VERSION_4:
|
||||
|
@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
return mDictFormatVersion >= FormatUtils::VERSION_4;
|
||||
}
|
||||
|
||||
// Accessor for the header's code point table.
// Returns the raw pointer held in mCodePointTable without copying; the caller does not own it.
// The result may be nullptr: the dummy-header constructor initializes the member to nullptr, and
// HeaderReadWriteUtils::readCodePointTable() returns nullptr when the attribute is absent.
// NOTE(review): the copy constructor copies this raw pointer from the source HeaderPolicy, so
// the table may alias storage owned by that other object -- confirm the lifetime is sufficient.
const int *getCodePointTable() const {
|
||||
return mCodePointTable;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
|
||||
|
||||
|
@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
const int mForgettingCurveDurationToLevelDown;
|
||||
const int mMaxUnigramCount;
|
||||
const int mMaxBigramCount;
|
||||
const int *const mCodePointTable;
|
||||
|
||||
const std::vector<int> readLocale() const;
|
||||
float readMultipleWordCostMultiplier() const;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
#include <cctype>
|
||||
#include <cstdio>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "defines.h"
|
||||
|
@ -34,12 +35,13 @@ namespace latinime {
|
|||
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
|
||||
|
||||
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
|
||||
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
|
||||
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
|
||||
|
||||
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
|
||||
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
|
||||
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
|
||||
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
|
||||
const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
|
||||
|
||||
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
|
||||
|
||||
|
@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
|
|||
return;
|
||||
}
|
||||
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
|
||||
int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
|
||||
std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
|
||||
while (pos < headerSize) {
|
||||
// The values in the header don't use the code point table for their encoding.
|
||||
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
||||
MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
|
||||
MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
|
||||
std::vector<int> key;
|
||||
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
|
||||
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
||||
MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
|
||||
MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
|
||||
std::vector<int> value;
|
||||
value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
|
||||
value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
|
||||
headerAttributes->insert(AttributeMap::value_type(key, value));
|
||||
}
|
||||
}
|
||||
|
||||
// Looks up the CODE_POINT_TABLE_KEY attribute in headerAttributes.
// Returns nullptr when the key is not present. Otherwise returns the data() pointer of the value
// vector stored in the map; no copy is made.
// The returned pointer aliases map-owned storage, so it is only valid while that map entry is
// alive and its vector is not modified. No length is returned with the pointer, so a caller
// cannot bounds-check indices into the table from this result alone.
// For a present-but-empty value, data() is not guaranteed to be nullptr.
/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
|
||||
AttributeMap *const headerAttributes) {
|
||||
// Build the lookup key from CODE_POINT_TABLE_KEY.
AttributeMap::key_type keyVector;
|
||||
insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
|
||||
AttributeMap::const_iterator it = headerAttributes->find(keyVector);
|
||||
if (it == headerAttributes->end()) {
|
||||
// The header carries no code point table.
return nullptr;
|
||||
}
|
||||
return it->second.data();
|
||||
}
|
||||
|
||||
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
|
||||
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
|
||||
int *const writingPos) {
|
||||
|
@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
|
|||
}
|
||||
switch (version) {
|
||||
case FormatUtils::VERSION_2:
|
||||
// Version 2 dictionary writing is not supported.
|
||||
case FormatUtils::VERSION_201:
|
||||
// Version 2 or 201 dictionary writing is not supported.
|
||||
return false;
|
||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||
case FormatUtils::VERSION_4:
|
||||
|
|
|
@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
|
|||
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
||||
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
|
||||
|
||||
static const int *readCodePointTable(
|
||||
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
|
||||
|
||||
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
|
||||
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
|
||||
|
||||
|
@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
|
|||
static const int HEADER_FLAG_SIZE;
|
||||
static const int HEADER_SIZE_FIELD_SIZE;
|
||||
|
||||
static const char *const CODE_POINT_TABLE_KEY;
|
||||
|
||||
// Value for the "flags" field. It's unused at the moment.
|
||||
static const DictionaryFlags NO_FLAGS;
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
|
||||
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
|
||||
|
@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
|||
const int parentPos =
|
||||
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
||||
int codePoints[MAX_WORD_LENGTH];
|
||||
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
|
||||
const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
|
||||
int terminalIdFieldPos = NOT_A_DICT_POS;
|
||||
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
int probability = NOT_A_PROBABILITY;
|
||||
|
@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
|||
// The destination position is stored at the same place as the parent position.
|
||||
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
|
||||
} else {
|
||||
return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
|
||||
return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
|
||||
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
|
||||
newSiblingNodePos);
|
||||
}
|
||||
|
|
|
@ -114,7 +114,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
|||
mmappedBuffer->getReadOnlyByteArrayView());
|
||||
switch (formatVersion) {
|
||||
case FormatUtils::VERSION_2:
|
||||
AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
|
||||
case FormatUtils::VERSION_201:
|
||||
AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
|
||||
break;
|
||||
case FormatUtils::VERSION_4: {
|
||||
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
|
||||
|
@ -175,6 +176,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
|||
}
|
||||
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
|
||||
case FormatUtils::VERSION_2:
|
||||
case FormatUtils::VERSION_201:
|
||||
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
|
||||
new PatriciaTriePolicy(std::move(mmappedBuffer)));
|
||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||
|
|
|
@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
|||
}
|
||||
|
||||
// Reads one code point from buffer at *pos by delegating to
// ByteArrayUtils::readCodePointAndAdvancePosition, which also advances *pos.
// This is a diff view: both the variant without codePointTable and the variant that forwards
// codePointTable to the ByteArrayUtils call are shown below.
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
||||
int *const pos) {
|
||||
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
|
||||
const int *const codePointTable, int *const pos) {
|
||||
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||
}
|
||||
|
||||
// Returns the number of read characters.
|
||||
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
|
||||
const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
|
||||
const NodeFlags flags, const int maxLength, const int *const codePointTable,
|
||||
int *const outBuffer, int *const pos) {
|
||||
int length = 0;
|
||||
if (hasMultipleChars(flags)) {
|
||||
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
|
||||
pos);
|
||||
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
|
||||
outBuffer, pos);
|
||||
} else {
|
||||
const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
|
||||
const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||
if (codePoint == NOT_A_CODE_POINT) {
|
||||
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
|
||||
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
|
||||
|
@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
|||
|
||||
// Returns the number of skipped characters.
|
||||
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
||||
const int maxLength, int *const pos) {
|
||||
const int maxLength, const int *const codePointTable, int *const pos) {
|
||||
if (hasMultipleChars(flags)) {
|
||||
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
|
||||
} else {
|
||||
if (maxLength > 0) {
|
||||
getCodePointAndAdvancePosition(buffer, pos);
|
||||
getCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
|
@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
|||
|
||||
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
||||
const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
|
||||
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
|
||||
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
|
||||
int *const outBigramPos, int *const outSiblingPos) {
|
||||
|
@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
|||
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
|
||||
*outFlags = flags;
|
||||
*outCodePointCount = getCharsAndAdvancePosition(
|
||||
dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
|
||||
dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
|
||||
*outProbability = isTerminal(flags) ?
|
||||
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
|
||||
*outChildrenPos = hasChildrenInFlags(flags) ?
|
||||
|
|
|
@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
|
|||
|
||||
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
||||
|
||||
static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
||||
static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
||||
const int *const codePointTable, int *const pos);
|
||||
|
||||
// Returns the number of read characters.
|
||||
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
|
||||
const int maxLength, int *const outBuffer, int *const pos);
|
||||
const int maxLength, const int *const codePointTable, int *const outBuffer,
|
||||
int *const pos);
|
||||
|
||||
// Returns the number of skipped characters.
|
||||
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
||||
const int maxLength, int *const pos);
|
||||
const int maxLength, const int *const codePointTable, int *const pos);
|
||||
|
||||
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
||||
|
||||
|
@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
|
|||
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
||||
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
|
||||
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
|
||||
int *const outBigramPos, int *const outSiblingPos);
|
||||
const int *const codePointTable, NodeFlags *const outFlags,
|
||||
int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
|
||||
int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
|
||||
int *const outSiblingPos);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
|
||||
|
|
|
@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
|
|||
|
||||
// Reads a shortcut target string from buffer at *pos into outWord (at most maxLength code
// points) by delegating to ByteArrayUtils::readStringAndAdvancePosition, and returns that
// function's result.
// This is a diff view: both the call without a codePointTable argument and the call that
// passes nullptr for it are shown below. In the latter, no code point table is applied to
// shortcut targets (see the TODO).
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
|
||||
const int maxLength, int *const outWord, int *const pos) {
|
||||
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos);
|
||||
// TODO: Use codePointTable for shortcuts.
|
||||
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
|
||||
nullptr /* codePointTable */, outWord, pos);
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
|||
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
||||
int pos = getRootPosition();
|
||||
int wordPos = 0;
|
||||
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
||||
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
||||
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
|
||||
// time we enter this loop we are one depth level further than last time.
|
||||
|
@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
|||
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
|
||||
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &pos);
|
||||
mBuffer.data(), codePointTable, &pos);
|
||||
if (ptNodePos == startPos) {
|
||||
// We found the position. Copy the rest of the code points in the buffer and return
|
||||
// the length.
|
||||
outCodePoints[wordPos] = character;
|
||||
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
||||
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &pos);
|
||||
mBuffer.data(), codePointTable, &pos);
|
||||
// We count code points in order to avoid infinite loops if the file is broken
|
||||
// or if there is some other bug
|
||||
int charCount = maxCodePointCount;
|
||||
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
|
||||
outCodePoints[++wordPos] = nextChar;
|
||||
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &pos);
|
||||
mBuffer.data(), codePointTable, &pos);
|
||||
}
|
||||
}
|
||||
*outUnigramProbability =
|
||||
|
@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
|||
// first and possibly the probability.
|
||||
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
||||
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
|
||||
&pos);
|
||||
codePointTable, &pos);
|
||||
}
|
||||
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
||||
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
|
||||
|
@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
|||
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
|
||||
mBuffer.data(), &lastCandidatePtNodePos);
|
||||
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &lastCandidatePtNodePos);
|
||||
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||
// We copy all the characters in this PtNode to the buffer
|
||||
outCodePoints[wordPos] = lastChar;
|
||||
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
|
||||
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &lastCandidatePtNodePos);
|
||||
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||
int charCount = maxCodePointCount;
|
||||
while (-1 != nextChar && --charCount > 0) {
|
||||
outCodePoints[++wordPos] = nextChar;
|
||||
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||
mBuffer.data(), &lastCandidatePtNodePos);
|
||||
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||
}
|
||||
}
|
||||
++wordPos;
|
||||
|
@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
|||
int shortcutPos = NOT_A_DICT_POS;
|
||||
int bigramPos = NOT_A_DICT_POS;
|
||||
int siblingPos = NOT_A_DICT_POS;
|
||||
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
||||
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
|
||||
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||
&mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
|
||||
mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
|
||||
&siblingPos);
|
||||
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
||||
|
|
|
@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
// Constructs the policy from a memory-mapped dictionary buffer.
// mmappedBuffer is taken by value and moved into mMmappedBuffer; every later initializer must
// therefore read the buffer through the member, not through the (moved-from) parameter.
// The header policy is built from the start of the mapped data, mBuffer is the view that
// remains after skipping mHeaderPolicy.getSize() bytes, and the readers are built on mBuffer.
// This is a diff view: both the pre-change and post-change initializer lines are shown.
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
||||
: mMmappedBuffer(std::move(mmappedBuffer)),
|
||||
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
|
||||
FormatUtils::VERSION_2),
|
||||
// Detect the format through the member: the parameter was moved from above, so
// dereferencing it here would be a null dereference for a move-only smart pointer.
FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())),
|
||||
mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
|
||||
mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
|
||||
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy),
|
||||
// The node reader receives the header's code point table, which may be nullptr.
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
|
||||
mHeaderPolicy.getCodePointTable()),
|
||||
mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
|
||||
mIsCorrupted(false) {}
|
||||
|
||||
|
|
|
@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
|
|||
int bigramPos = NOT_A_DICT_POS;
|
||||
int siblingPos = NOT_A_DICT_POS;
|
||||
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
|
||||
mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
|
||||
&childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||
mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||
if (mergedNodeCodePointCount <= 0) {
|
||||
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
|
||||
ASSERT(false);
|
||||
|
|
|
@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
|
|||
public:
|
||||
// Stores the buffer view and the given policy pointers in the corresponding members; nothing
// is copied beyond the view and the pointers themselves, and the pointees are not owned here.
// This is a diff view: both the three-argument form and the form that additionally takes
// codePointTable (stored in mCodePointTable) are shown below.
// codePointTable may be nullptr; PatriciaTriePolicy passes HeaderPolicy::getCodePointTable().
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
|
||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
|
||||
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {}
|
||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||
const int *const codePointTable)
|
||||
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy),
|
||||
mCodePointTable(codePointTable) {}
|
||||
|
||||
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
|
||||
|
||||
|
@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
|
|||
const ReadOnlyByteArrayView mBuffer;
|
||||
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
|
||||
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
|
||||
const int *const mCodePointTable;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
||||
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
|
||||
|
@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
|||
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
||||
int codePoints[MAX_WORD_LENGTH];
|
||||
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
|
||||
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
|
||||
int terminalIdFieldPos = NOT_A_DICT_POS;
|
||||
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
int probability = NOT_A_PROBABILITY;
|
||||
|
|
|
@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
|
|||
if (readingPosIsInAdditionalBuffer) {
|
||||
*pos -= mOriginalBuffer.size();
|
||||
}
|
||||
// Code point table is not used for dynamic format.
|
||||
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
|
||||
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
|
||||
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
|
||||
nullptr /* codePointTable */, outCodePoints, pos);
|
||||
if (readingPosIsInAdditionalBuffer) {
|
||||
*pos += mOriginalBuffer.size();
|
||||
}
|
||||
|
|
|
@ -147,11 +147,18 @@ class ByteArrayUtils {
|
|||
*/
|
||||
// Reads the code point at pos without modifying the caller's position: the position is copied
// into a local and the advancing reader is invoked on that copy.
// This is a diff view: both the call without a codePointTable argument and the call passing
// nullptr for it (no code point table applied) are shown below.
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
|
||||
int p = pos;
|
||||
return readCodePointAndAdvancePosition(buffer, &p);
|
||||
return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
||||
const uint8_t *const buffer, int *const pos) {
|
||||
const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
|
||||
/*
|
||||
* codePointTable is an array to convert the most frequent characters in this dictionary to
|
||||
* 1 byte code points. It is only made of the original code points of the most frequent
|
||||
* characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
|
||||
* The original code points are restored by picking the code points at the indices of the
|
||||
* codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
|
||||
*/
|
||||
const uint8_t firstByte = readUint8(buffer, *pos);
|
||||
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
||||
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
||||
|
@ -162,6 +169,9 @@ class ByteArrayUtils {
|
|||
}
|
||||
} else {
|
||||
*pos += 1;
|
||||
if (codePointTable) {
|
||||
return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
|
||||
}
|
||||
return firstByte;
|
||||
}
|
||||
}
|
||||
|
@ -173,12 +183,13 @@ class ByteArrayUtils {
|
|||
*/
|
||||
// Returns the length of the string.
|
||||
// Reads code points from buffer starting at *pos and stores them in outBuffer until either
// NOT_A_CODE_POINT is read or maxLength code points have been stored; returns the number stored.
// outBuffer must have room for at least maxLength ints.
// The read that ends the loop still advances *pos: on return, *pos is past the code point that
// stopped the loop, including the case where the loop stops because maxLength was reached.
// This is a diff view: both the variant without codePointTable and the variant that forwards
// codePointTable to each readCodePointAndAdvancePosition call are shown below.
static int readStringAndAdvancePosition(const uint8_t *const buffer,
|
||||
const int maxLength, int *const outBuffer, int *const pos) {
|
||||
const int maxLength, const int *const codePointTable, int *const outBuffer,
|
||||
int *const pos) {
|
||||
int length = 0;
|
||||
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||
int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||
outBuffer[length++] = codePoint;
|
||||
|
||||
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||
codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
@ -187,9 +198,9 @@ class ByteArrayUtils {
|
|||
static int advancePositionToBehindString(
|
||||
const uint8_t *const buffer, const int maxLength, int *const pos) {
|
||||
int length = 0;
|
||||
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||
int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
|
||||
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
||||
codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
|
||||
length++;
|
||||
}
|
||||
return length;
|
||||
|
|
|
@ -29,6 +29,8 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
|
|||
switch (formatVersion) {
|
||||
case VERSION_2:
|
||||
return VERSION_2;
|
||||
case VERSION_201:
|
||||
return VERSION_201;
|
||||
case VERSION_4_ONLY_FOR_TESTING:
|
||||
return VERSION_4_ONLY_FOR_TESTING;
|
||||
case VERSION_4:
|
||||
|
|
|
@ -32,6 +32,7 @@ class FormatUtils {
|
|||
enum FORMAT_VERSION {
|
||||
// These MUST have the same values as the relevant constants in FormatSpec.java.
|
||||
VERSION_2 = 2,
|
||||
VERSION_201 = 201,
|
||||
VERSION_4_ONLY_FOR_TESTING = 399,
|
||||
VERSION_4 = 402,
|
||||
VERSION_4_DEV = 403,
|
||||
|
|
|
@ -23,6 +23,19 @@
|
|||
namespace latinime {
|
||||
namespace {
|
||||
|
||||
// Checks ByteArrayUtils::readCodePointAndAdvancePosition with a non-null code point table:
// the one-byte values 0x20 and 0x21 must be translated to table entries 0 and 1, while the
// following three bytes (0x00 0x01 0x00) must be returned as the code point 0x100 untranslated.
// The three reads share one position variable, so the test also relies on pos being advanced.
TEST(ByteArrayUtilsTest, TestReadCodePointTable) {
|
||||
const int codePointTable[] = { 0x6f, 0x6b };
|
||||
const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u };
|
||||
int pos = 0;
|
||||
// Expect the first entry of codePointTable
|
||||
EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||
// Expect the second entry of codePointTable
|
||||
EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||
// Expect the original code point from buffer[2] to buffer[4], 0x100
|
||||
// It isn't picked from the codePointTable, since it exceeds the range of the codePointTable.
|
||||
EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||
}
|
||||
|
||||
TEST(ByteArrayUtilsTest, TestReadInt) {
|
||||
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
|
||||
|
||||
|
@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) {
|
|||
|
||||
int pos = 0;
|
||||
int codePointArray[3];
|
||||
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH,
|
||||
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr,
|
||||
codePointArray, &pos));
|
||||
EXPECT_EQ(0x10FF00, codePointArray[0]);
|
||||
EXPECT_EQ(0x20, codePointArray[1]);
|
||||
|
|
Loading…
Reference in New Issue