Native side reads character table

Bug:17097992
Change-Id: Ibcfc67833a6754fe6a2d82a3e3023b33886b9ea2
main
Akifumi Yoshimoto 2014-09-10 11:38:36 +09:00
parent eddfe51b38
commit fb2bde5a68
18 changed files with 133 additions and 56 deletions

View File

@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Constructs header information using an attribute map. // Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Copy header information // Copy header information
HeaderPolicy(const HeaderPolicy *const headerPolicy) HeaderPolicy(const HeaderPolicy *const headerPolicy)
@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mForgettingCurveDurationToLevelDown( mForgettingCurveDurationToLevelDown(
headerPolicy->mForgettingCurveDurationToLevelDown), headerPolicy->mForgettingCurveDurationToLevelDown),
mMaxUnigramCount(headerPolicy->mMaxUnigramCount), mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
mMaxBigramCount(headerPolicy->mMaxBigramCount) {} mMaxBigramCount(headerPolicy->mMaxBigramCount),
mCodePointTable(headerPolicy->mCodePointTable) {}
// Temporary dummy header. // Temporary dummy header.
HeaderPolicy() HeaderPolicy()
@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0), mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {} mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
mCodePointTable(nullptr) {}
~HeaderPolicy() {} ~HeaderPolicy() {}
@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
switch (mDictFormatVersion) { switch (mDictFormatVersion) {
case FormatUtils::VERSION_2: case FormatUtils::VERSION_2:
return FormatUtils::VERSION_2; return FormatUtils::VERSION_2;
case FormatUtils::VERSION_201:
return FormatUtils::VERSION_201;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING; return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
case FormatUtils::VERSION_4: case FormatUtils::VERSION_4:
@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mDictFormatVersion >= FormatUtils::VERSION_4; return mDictFormatVersion >= FormatUtils::VERSION_4;
} }
// Returns the code point table pointer held by this header policy.
// The pointer may be nullptr: the dummy-header constructor stores nullptr, and
// HeaderReadWriteUtils::readCodePointTable() returns nullptr when the header has no
// "codePointTable" attribute. Callers must be prepared for the no-table case.
// NOTE(review): the pointer is not owned by the caller; it appears to alias storage held
// elsewhere (attribute map / source HeaderPolicy) -- confirm the lifetime before caching it.
const int *getCodePointTable() const {
return mCodePointTable;
}
private: private:
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const int mForgettingCurveDurationToLevelDown; const int mForgettingCurveDurationToLevelDown;
const int mMaxUnigramCount; const int mMaxUnigramCount;
const int mMaxBigramCount; const int mMaxBigramCount;
const int *const mCodePointTable;
const std::vector<int> readLocale() const; const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const; float readMultipleWordCostMultiplier() const;

View File

@ -18,6 +18,7 @@
#include <cctype> #include <cctype>
#include <cstdio> #include <cstdio>
#include <memory>
#include <vector> #include <vector>
#include "defines.h" #include "defines.h"
@ -34,12 +35,13 @@ namespace latinime {
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
return; return;
} }
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
while (pos < headerSize) { while (pos < headerSize) {
// The values in the header don't use the code point table for their encoding.
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
std::vector<int> key; std::vector<int> key;
key.insert(key.end(), keyBuffer, keyBuffer + keyLength); key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
std::vector<int> value; std::vector<int> value;
value.insert(value.end(), valueBuffer, valueBuffer + valueLength); value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
headerAttributes->insert(AttributeMap::value_type(key, value)); headerAttributes->insert(AttributeMap::value_type(key, value));
} }
} }
// Looks up the CODE_POINT_TABLE_KEY attribute in |headerAttributes| and exposes its value as
// a raw int array.
//
// Returns nullptr when the attribute is absent. Otherwise the returned pointer refers
// directly to the storage of the vector kept inside the map, so it is only usable while that
// map entry stays alive and unmodified; no copy is made here.
/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
        AttributeMap *const headerAttributes) {
    // Attribute keys are stored as code point vectors, so convert the C string key first.
    AttributeMap::key_type tableKey;
    insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &tableKey);
    const AttributeMap::const_iterator entry = headerAttributes->find(tableKey);
    return (entry != headerAttributes->end()) ? entry->second.data() : nullptr;
}
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( /* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
int *const writingPos) { int *const writingPos) {
@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
} }
switch (version) { switch (version) {
case FormatUtils::VERSION_2: case FormatUtils::VERSION_2:
// Version 2 dictionary writing is not supported. case FormatUtils::VERSION_201:
// Version 2 or 201 dictionary writing is not supported.
return false; return false;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4: case FormatUtils::VERSION_4:

View File

@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
static const int *readCodePointTable(
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
const FormatUtils::FORMAT_VERSION version, int *const writingPos); const FormatUtils::FORMAT_VERSION version, int *const writingPos);
@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
static const int HEADER_FLAG_SIZE; static const int HEADER_FLAG_SIZE;
static const int HEADER_SIZE_FIELD_SIZE; static const int HEADER_SIZE_FIELD_SIZE;
static const char *const CODE_POINT_TABLE_KEY;
// Value for the "flags" field. It's unused at the moment. // Value for the "flags" field. It's unused at the moment.
static const DictionaryFlags NO_FLAGS; static const DictionaryFlags NO_FLAGS;

View File

@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" #include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos = const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH]; int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS; int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY; int probability = NOT_A_PROBABILITY;
@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position. // The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else { } else {
return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
newSiblingNodePos); newSiblingNodePos);
} }

View File

@ -114,7 +114,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
mmappedBuffer->getReadOnlyByteArrayView()); mmappedBuffer->getReadOnlyByteArrayView());
switch (formatVersion) { switch (formatVersion) {
case FormatUtils::VERSION_2: case FormatUtils::VERSION_2:
AKLOGE("Given path is a directory but the format is version 2. path: %s", path); case FormatUtils::VERSION_201:
AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
break; break;
case FormatUtils::VERSION_4: { case FormatUtils::VERSION_4: {
return newPolicyForV4Dict<backward::v402::Ver4DictConstants, return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
@ -175,6 +176,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
} }
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
case FormatUtils::VERSION_2: case FormatUtils::VERSION_2:
case FormatUtils::VERSION_201:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer))); new PatriciaTriePolicy(std::move(mmappedBuffer)));
case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_ONLY_FOR_TESTING:

View File

@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
} }
// Thin wrapper over ByteArrayUtils::readCodePointAndAdvancePosition(): forwards |buffer|,
// |codePointTable| and |pos| unchanged and returns whatever that call returns.
// |codePointTable| may be nullptr (callers in this change pass nullptr when no table is used).
// NOTE(review): per the callee's name, |*pos| is presumably advanced past the code point
// that was read -- confirm in ByteArrayUtils.
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
int *const pos) { const int *const codePointTable, int *const pos) {
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
} }
// Returns the number of read characters. // Returns the number of read characters.
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { const NodeFlags flags, const int maxLength, const int *const codePointTable,
int *const outBuffer, int *const pos) {
int length = 0; int length = 0;
if (hasMultipleChars(flags)) { if (hasMultipleChars(flags)) {
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
pos); outBuffer, pos);
} else { } else {
const int codePoint = getCodePointAndAdvancePosition(buffer, pos); const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
if (codePoint == NOT_A_CODE_POINT) { if (codePoint == NOT_A_CODE_POINT) {
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
// Returns the number of skipped characters. // Returns the number of skipped characters.
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const pos) { const int maxLength, const int *const codePointTable, int *const pos) {
if (hasMultipleChars(flags)) { if (hasMultipleChars(flags)) {
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
} else { } else {
if (maxLength > 0) { if (maxLength > 0) {
getCodePointAndAdvancePosition(buffer, pos); getCodePointAndAdvancePosition(buffer, codePointTable, pos);
return 1; return 1;
} else { } else {
return 0; return 0;
@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy, const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos) { int *const outBigramPos, int *const outSiblingPos) {
@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
*outFlags = flags; *outFlags = flags;
*outCodePointCount = getCharsAndAdvancePosition( *outCodePointCount = getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos); dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
*outProbability = isTerminal(flags) ? *outProbability = isTerminal(flags) ?
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
*outChildrenPos = hasChildrenInFlags(flags) ? *outChildrenPos = hasChildrenInFlags(flags) ?

View File

@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos); static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
const int *const codePointTable, int *const pos);
// Returns the number of read characters. // Returns the number of read characters.
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const outBuffer, int *const pos); const int maxLength, const int *const codePointTable, int *const outBuffer,
int *const pos);
// Returns the number of skipped characters. // Returns the number of skipped characters.
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
const int maxLength, int *const pos); const int maxLength, const int *const codePointTable, int *const pos);
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy, const DictionaryBigramsStructurePolicy *const bigramPolicy,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, const int *const codePointTable, NodeFlags *const outFlags,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
int *const outBigramPos, int *const outSiblingPos); int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
int *const outSiblingPos);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);

View File

@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
// Reads a shortcut target from |buffer| into |outWord| by delegating to
// ByteArrayUtils::readStringAndAdvancePosition() and returns that call's result.
// The code point table argument is deliberately nullptr here: shortcut targets are read
// without a table for now (see the TODO below).
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, /* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
const int maxLength, int *const outWord, int *const pos) { const int maxLength, int *const outWord, int *const pos) {
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos); // TODO: Use codePointTable for shortcuts.
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
nullptr /* codePointTable */, outWord, pos);
} }
} // namespace latinime } // namespace latinime

View File

@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
int pos = getRootPosition(); int pos = getRootPosition();
int wordPos = 0; int wordPos = 0;
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
// only traverse PtNodes that are actually a part of the terminal we are searching, so each // only traverse PtNodes that are actually a part of the terminal we are searching, so each
// time we enter this loop we are one depth level further than last time. // time we enter this loop we are one depth level further than last time.
@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const PatriciaTrieReadingUtils::NodeFlags flags = const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos); mBuffer.data(), codePointTable, &pos);
if (ptNodePos == startPos) { if (ptNodePos == startPos) {
// We found the position. Copy the rest of the code points in the buffer and return // We found the position. Copy the rest of the code points in the buffer and return
// the length. // the length.
outCodePoints[wordPos] = character; outCodePoints[wordPos] = character;
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos); mBuffer.data(), codePointTable, &pos);
// We count code points in order to avoid infinite loops if the file is broken // We count code points in order to avoid infinite loops if the file is broken
// or if there is some other bug // or if there is some other bug
int charCount = maxCodePointCount; int charCount = maxCodePointCount;
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar; outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &pos); mBuffer.data(), codePointTable, &pos);
} }
} }
*outUnigramProbability = *outUnigramProbability =
@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// first and possibly the probability. // first and possibly the probability.
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
&pos); codePointTable, &pos);
} }
if (PatriciaTrieReadingUtils::isTerminal(flags)) { if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos); mBuffer.data(), &lastCandidatePtNodePos);
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos); mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
// We copy all the characters in this PtNode to the buffer // We copy all the characters in this PtNode to the buffer
outCodePoints[wordPos] = lastChar; outCodePoints[wordPos] = lastChar;
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos); mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
int charCount = maxCodePointCount; int charCount = maxCodePointCount;
while (-1 != nextChar && --charCount > 0) { while (-1 != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar; outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos); mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
} }
} }
++wordPos; ++wordPos;
@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
int shortcutPos = NOT_A_DICT_POS; int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS;
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
&siblingPos);
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information. // Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID; const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;

View File

@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
// Builds the policy for a static patricia trie dictionary backed by |mmappedBuffer|
// (the factory in this change constructs it for both VERSION_2 and VERSION_201).
//
// Ownership of the mapped buffer is moved into mMmappedBuffer by the first initializer,
// which leaves the |mmappedBuffer| parameter empty. Every later initializer therefore has
// to read through the mMmappedBuffer member; dereferencing the moved-from parameter (as the
// format detection call previously did) is a null dereference.
// NOTE(review): this relies on mMmappedBuffer being declared before mHeaderPolicy so that
// it is initialized first -- confirm against the member declaration order in the class.
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
        : mMmappedBuffer(std::move(mmappedBuffer)),
          // The format version is detected from the buffer rather than hard-coded, so the
          // header is parsed with the version the file actually declares.
          mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
                  FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())),
          // The trie body starts right after the header.
          mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
          mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
          // The node reader decodes PtNode characters through the header's code point table
          // (nullptr when the header carries no table).
          mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
                  mHeaderPolicy.getCodePointTable()),
          mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
          mIsCorrupted(false) {}

View File

@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
int bigramPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS;
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy, PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&childrenPos, &shortcutPos, &bigramPos, &siblingPos); &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
if (mergedNodeCodePointCount <= 0) { if (mergedNodeCodePointCount <= 0) {
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
ASSERT(false); ASSERT(false);

View File

@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
public: public:
// Stores the buffer view, the bigram/shortcut policy pointers and the code point table
// pointer for later use when reading PtNodes; the constructor does no other work.
// |codePointTable| is kept as a raw pointer and forwarded to
// PatriciaTrieReadingUtils::readPtNodeInfo(); it may be nullptr when the dictionary header
// has no table.
// NOTE(review): none of the pointers appear to be owned by this reader -- the pointees
// presumably must outlive it; confirm against the owning policy class.
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
const DictionaryBigramsStructurePolicy *const bigramPolicy, const DictionaryBigramsStructurePolicy *const bigramPolicy,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy) const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {} const int *const codePointTable)
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy),
mCodePointTable(codePointTable) {}
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
const ReadOnlyByteArrayView mBuffer; const ReadOnlyByteArrayView mBuffer;
const DictionaryBigramsStructurePolicy *const mBigramPolicy; const DictionaryBigramsStructurePolicy *const mBigramPolicy;
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy; const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
const int *const mCodePointTable;
}; };
} // namespace latinime } // namespace latinime
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ #endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */

View File

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH]; int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS; int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY; int probability = NOT_A_PROBABILITY;

View File

@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
if (readingPosIsInAdditionalBuffer) { if (readingPosIsInAdditionalBuffer) {
*pos -= mOriginalBuffer.size(); *pos -= mOriginalBuffer.size();
} }
// Code point table is not used for dynamic format.
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos); getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
nullptr /* codePointTable */, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) { if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBuffer.size(); *pos += mOriginalBuffer.size();
} }

View File

@ -147,11 +147,18 @@ class ByteArrayUtils {
*/ */
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
// Reads the code point stored at pos. A local copy of pos is advanced, so the caller's
// position is left untouched.
int p = pos; int p = pos;
// No code point table is supplied here (nullptr), so a one-byte value is returned as stored
// rather than being looked up in a table.
return readCodePointAndAdvancePosition(buffer, &p); return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
} }
static AK_FORCE_INLINE int readCodePointAndAdvancePosition( static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
const uint8_t *const buffer, int *const pos) { const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
/*
* codePointTable is an array to convert the most frequent characters in this dictionary to
* 1 byte code points. It is only made of the original code points of the most frequent
* characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
* The original code points are restored by picking the code points at the indices of the
* codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
*/
const uint8_t firstByte = readUint8(buffer, *pos); const uint8_t firstByte = readUint8(buffer, *pos);
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
if (firstByte == CHARACTER_ARRAY_TERMINATOR) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
@ -162,6 +169,9 @@ class ByteArrayUtils {
} }
} else { } else {
*pos += 1; *pos += 1;
if (codePointTable) {
return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
}
return firstByte; return firstByte;
} }
} }
@ -173,12 +183,13 @@ class ByteArrayUtils {
*/ */
// Returns the length of the string. // Returns the length of the string.
static int readStringAndAdvancePosition(const uint8_t *const buffer, static int readStringAndAdvancePosition(const uint8_t *const buffer,
const int maxLength, int *const outBuffer, int *const pos) { const int maxLength, const int *const codePointTable, int *const outBuffer,
int *const pos) {
// Reads code points from buffer starting at *pos into outBuffer and returns how many were
// stored. codePointTable is forwarded unchanged to every readCodePointAndAdvancePosition
// call; callers in this change pass nullptr when no table applies.
int length = 0; int length = 0;
int codePoint = readCodePointAndAdvancePosition(buffer, pos); int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
// Stops when the reader returns NOT_A_CODE_POINT or once maxLength code points are stored.
while (NOT_A_CODE_POINT != codePoint && length < maxLength) { while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
outBuffer[length++] = codePoint; outBuffer[length++] = codePoint;
// NOTE(review): the next code point is read before the length check, so when the loop
// ends because length reached maxLength, one code point beyond the stored ones has
// already been consumed and is discarded -- confirm callers expect *pos there.
codePoint = readCodePointAndAdvancePosition(buffer, pos); codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
} }
return length; return length;
} }
@ -187,9 +198,9 @@ class ByteArrayUtils {
static int advancePositionToBehindString( static int advancePositionToBehindString(
const uint8_t *const buffer, const int maxLength, int *const pos) { const uint8_t *const buffer, const int maxLength, int *const pos) {
int length = 0; int length = 0;
int codePoint = readCodePointAndAdvancePosition(buffer, pos); int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) { while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
codePoint = readCodePointAndAdvancePosition(buffer, pos); codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
length++; length++;
} }
return length; return length;

View File

@ -29,6 +29,8 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
switch (formatVersion) { switch (formatVersion) {
case VERSION_2: case VERSION_2:
return VERSION_2; return VERSION_2;
case VERSION_201:
return VERSION_201;
case VERSION_4_ONLY_FOR_TESTING: case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING; return VERSION_4_ONLY_FOR_TESTING;
case VERSION_4: case VERSION_4:

View File

@ -32,6 +32,7 @@ class FormatUtils {
enum FORMAT_VERSION { enum FORMAT_VERSION {
// These MUST have the same values as the relevant constants in FormatSpec.java. // These MUST have the same values as the relevant constants in FormatSpec.java.
VERSION_2 = 2, VERSION_2 = 2,
VERSION_201 = 201,
VERSION_4_ONLY_FOR_TESTING = 399, VERSION_4_ONLY_FOR_TESTING = 399,
VERSION_4 = 402, VERSION_4 = 402,
VERSION_4_DEV = 403, VERSION_4_DEV = 403,

View File

@ -23,6 +23,19 @@
namespace latinime { namespace latinime {
namespace { namespace {
TEST(ByteArrayUtilsTest, TestReadCodePointTable) {
// Checks that one-byte values are translated through the code point table, while a code point
// stored in several bytes is returned as encoded in the buffer.
const int codePointTable[] = { 0x6f, 0x6b };
// 0x20 and 0x21 are one-byte values that select table entries 0 and 1; the remaining three
// bytes (0x00 0x01 0x00) hold the code point 0x100.
const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u };
int pos = 0;
// Expect the first entry of codePointTable
EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
// Expect the second entry of codePointTable
EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
// Expect the original code point from buffer[2] to buffer[4], 0x100
// It isn't picked from the codePointTable: its first byte (0x00) is below the one-byte
// character range (0x20 - 0xFF), so the table lookup is not applied.
EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
}
TEST(ByteArrayUtilsTest, TestReadInt) { TEST(ByteArrayUtilsTest, TestReadInt) {
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu }; const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) {
int pos = 0; int pos = 0;
int codePointArray[3]; int codePointArray[3];
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr,
codePointArray, &pos)); codePointArray, &pos));
EXPECT_EQ(0x10FF00, codePointArray[0]); EXPECT_EQ(0x10FF00, codePointArray[0]);
EXPECT_EQ(0x20, codePointArray[1]); EXPECT_EQ(0x20, codePointArray[1]);