Merge "Native side reads character table"

main
Akifumi Yoshimoto 2014-09-19 09:58:40 +00:00 committed by Android (Google) Code Review
commit 5c6db929e4
18 changed files with 133 additions and 56 deletions

View File

@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Copy header information
HeaderPolicy(const HeaderPolicy *const headerPolicy)
@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mForgettingCurveDurationToLevelDown(
headerPolicy->mForgettingCurveDurationToLevelDown),
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
mMaxBigramCount(headerPolicy->mMaxBigramCount),
mCodePointTable(headerPolicy->mCodePointTable) {}
// Temporary dummy header.
HeaderPolicy()
@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
mCodePointTable(nullptr) {}
~HeaderPolicy() {}
@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
switch (mDictFormatVersion) {
case FormatUtils::VERSION_2:
return FormatUtils::VERSION_2;
case FormatUtils::VERSION_201:
return FormatUtils::VERSION_201;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
case FormatUtils::VERSION_4:
@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mDictFormatVersion >= FormatUtils::VERSION_4;
}
const int *getCodePointTable() const {
return mCodePointTable;
}
private:
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const int mForgettingCurveDurationToLevelDown;
const int mMaxUnigramCount;
const int mMaxBigramCount;
const int *const mCodePointTable;
const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const;

View File

@ -18,6 +18,7 @@
#include <cctype>
#include <cstdio>
#include <memory>
#include <vector>
#include "defines.h"
@ -34,12 +35,13 @@ namespace latinime {
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
return;
}
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
while (pos < headerSize) {
// The values in the header don't use the code point table for their encoding.
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
std::vector<int> key;
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
std::vector<int> value;
value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
headerAttributes->insert(AttributeMap::value_type(key, value));
}
}
/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
AttributeMap *const headerAttributes) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
AttributeMap::const_iterator it = headerAttributes->find(keyVector);
if (it == headerAttributes->end()) {
return nullptr;
}
return it->second.data();
}
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
int *const writingPos) {
@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
}
switch (version) {
case FormatUtils::VERSION_2:
// Version 2 dictionary writing is not supported.
case FormatUtils::VERSION_201:
// Version 2 or 201 dictionary writing is not supported.
return false;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4:

View File

@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
static const int *readCodePointTable(
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
static const int HEADER_FLAG_SIZE;
static const int HEADER_SIZE_FIELD_SIZE;
static const char *const CODE_POINT_TABLE_KEY;
// Value for the "flags" field. It's unused at the moment.
static const DictionaryFlags NO_FLAGS;

View File

@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;
@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else {
return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
newSiblingNodePos);
}

View File

@ -114,7 +114,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
mmappedBuffer->getReadOnlyByteArrayView());
switch (formatVersion) {
case FormatUtils::VERSION_2:
AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
case FormatUtils::VERSION_201:
AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
break;
case FormatUtils::VERSION_4: {
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
@ -175,6 +176,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
}
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
case FormatUtils::VERSION_2:
case FormatUtils::VERSION_201:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer)));
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:

View File

@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
}
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
int *const pos) {
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
const int *const codePointTable, int *const pos) {
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
// Returns the number of read characters.
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
const NodeFlags flags, const int maxLength, const int *const codePointTable,
int *const outBuffer, int *const pos) {
int length = 0;
if (hasMultipleChars(flags)) {
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
pos);
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
outBuffer, pos);
} else {
const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
if (codePoint == NOT_A_CODE_POINT) {
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
// Returns the number of skipped characters.
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const pos) {
const int maxLength, const int *const codePointTable, int *const pos) {
if (hasMultipleChars(flags)) {
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
} else {
if (maxLength > 0) {
getCodePointAndAdvancePosition(buffer, pos);
getCodePointAndAdvancePosition(buffer, codePointTable, pos);
return 1;
} else {
return 0;
@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos) {
@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
*outFlags = flags;
*outCodePointCount = getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
*outProbability = isTerminal(flags) ?
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
*outChildrenPos = hasChildrenInFlags(flags) ?

View File

@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
const int *const codePointTable, int *const pos);
// Returns the number of read characters.
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const outBuffer, int *const pos);
const int maxLength, const int *const codePointTable, int *const outBuffer,
int *const pos);
// Returns the number of skipped characters.
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const pos);
const int maxLength, const int *const codePointTable, int *const pos);
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos);
const int *const codePointTable, NodeFlags *const outFlags,
int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
int *const outSiblingPos);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);

View File

@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
const int maxLength, int *const outWord, int *const pos) {
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos);
// TODO: Use codePointTable for shortcuts.
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
nullptr /* codePointTable */, outWord, pos);
}
} // namespace latinime

View File

@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
int pos = getRootPosition();
int wordPos = 0;
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
// time we enter this loop we are one depth level further than last time.
@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos);
mBuffer.data(), codePointTable, &pos);
if (ptNodePos == startPos) {
// We found the position. Copy the rest of the code points in the buffer and return
// the length.
outCodePoints[wordPos] = character;
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos);
mBuffer.data(), codePointTable, &pos);
// We count code points in order to avoid infinite loops if the file is broken
// or if there is some other bug
int charCount = maxCodePointCount;
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos);
mBuffer.data(), codePointTable, &pos);
}
}
*outUnigramProbability =
@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// first and possibly the probability.
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
&pos);
codePointTable, &pos);
}
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos);
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos);
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
// We copy all the characters in this PtNode to the buffer
outCodePoints[wordPos] = lastChar;
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos);
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
int charCount = maxCodePointCount;
while (-1 != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos);
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
}
}
++wordPos;
@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
&mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
&siblingPos);
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;

View File

@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
: mMmappedBuffer(std::move(mmappedBuffer)),
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
FormatUtils::VERSION_2),
FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())),
mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy),
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
mHeaderPolicy.getCodePointTable()),
mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
mIsCorrupted(false) {}

View File

@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
&childrenPos, &shortcutPos, &bigramPos, &siblingPos);
mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
if (mergedNodeCodePointCount <= 0) {
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
ASSERT(false);

View File

@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
public:
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {}
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const int *const codePointTable)
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy),
mCodePointTable(codePointTable) {}
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
const ReadOnlyByteArrayView mBuffer;
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
const int *const mCodePointTable;
};
} // namespace latinime
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */

View File

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;

View File

@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
if (readingPosIsInAdditionalBuffer) {
*pos -= mOriginalBuffer.size();
}
// Code point table is not used for dynamic format.
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
nullptr /* codePointTable */, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBuffer.size();
}

View File

@ -147,11 +147,18 @@ class ByteArrayUtils {
*/
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
int p = pos;
return readCodePointAndAdvancePosition(buffer, &p);
return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
}
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
const uint8_t *const buffer, int *const pos) {
const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
/*
* codePointTable is an array to convert the most frequent characters in this dictionary to
* 1 byte code points. It is only made of the original code points of the most frequent
* characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
* The original code points are restored by picking the code points at the indices of the
* codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
*/
const uint8_t firstByte = readUint8(buffer, *pos);
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
@ -162,6 +169,9 @@ class ByteArrayUtils {
}
} else {
*pos += 1;
if (codePointTable) {
return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
}
return firstByte;
}
}
@ -173,12 +183,13 @@ class ByteArrayUtils {
*/
// Returns the length of the string.
static int readStringAndAdvancePosition(const uint8_t *const buffer,
const int maxLength, int *const outBuffer, int *const pos) {
const int maxLength, const int *const codePointTable, int *const outBuffer,
int *const pos) {
int length = 0;
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
outBuffer[length++] = codePoint;
codePoint = readCodePointAndAdvancePosition(buffer, pos);
codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
return length;
}
@ -187,9 +198,9 @@ class ByteArrayUtils {
static int advancePositionToBehindString(
const uint8_t *const buffer, const int maxLength, int *const pos) {
int length = 0;
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
codePoint = readCodePointAndAdvancePosition(buffer, pos);
codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
length++;
}
return length;

View File

@ -29,6 +29,8 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
switch (formatVersion) {
case VERSION_2:
return VERSION_2;
case VERSION_201:
return VERSION_201;
case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING;
case VERSION_4:

View File

@ -32,6 +32,7 @@ class FormatUtils {
enum FORMAT_VERSION {
// These MUST have the same values as the relevant constants in FormatSpec.java.
VERSION_2 = 2,
VERSION_201 = 201,
VERSION_4_ONLY_FOR_TESTING = 399,
VERSION_4 = 402,
VERSION_4_DEV = 403,

View File

@ -23,6 +23,19 @@
namespace latinime {
namespace {
TEST(ByteArrayUtilsTest, TestReadCodePointTable) {
const int codePointTable[] = { 0x6f, 0x6b };
const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u };
int pos = 0;
// Expect the first entry of codePointTable
EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
// Expect the second entry of codePointTable
EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
// Expect the original code point from buffer[2] to buffer[4], 0x100
// It isn't picked from the codePointTable, since it exceeds the range of the codePointTable.
EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
}
TEST(ByteArrayUtilsTest, TestReadInt) {
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) {
int pos = 0;
int codePointArray[3];
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH,
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr,
codePointArray, &pos));
EXPECT_EQ(0x10FF00, codePointArray[0]);
EXPECT_EQ(0x20, codePointArray[1]);