Merge "Native side reads character table"
commit
5c6db929e4
|
@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||||
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
|
||||||
|
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
|
||||||
|
|
||||||
// Constructs header information using an attribute map.
|
// Constructs header information using an attribute map.
|
||||||
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
||||||
|
@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||||
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
|
||||||
|
mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
|
||||||
|
|
||||||
// Copy header information
|
// Copy header information
|
||||||
HeaderPolicy(const HeaderPolicy *const headerPolicy)
|
HeaderPolicy(const HeaderPolicy *const headerPolicy)
|
||||||
|
@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mForgettingCurveDurationToLevelDown(
|
mForgettingCurveDurationToLevelDown(
|
||||||
headerPolicy->mForgettingCurveDurationToLevelDown),
|
headerPolicy->mForgettingCurveDurationToLevelDown),
|
||||||
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
|
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
|
||||||
mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
|
mMaxBigramCount(headerPolicy->mMaxBigramCount),
|
||||||
|
mCodePointTable(headerPolicy->mCodePointTable) {}
|
||||||
|
|
||||||
// Temporary dummy header.
|
// Temporary dummy header.
|
||||||
HeaderPolicy()
|
HeaderPolicy()
|
||||||
|
@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
|
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
|
||||||
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
|
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
|
||||||
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
|
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
|
||||||
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
|
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
|
||||||
|
mCodePointTable(nullptr) {}
|
||||||
|
|
||||||
~HeaderPolicy() {}
|
~HeaderPolicy() {}
|
||||||
|
|
||||||
|
@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
switch (mDictFormatVersion) {
|
switch (mDictFormatVersion) {
|
||||||
case FormatUtils::VERSION_2:
|
case FormatUtils::VERSION_2:
|
||||||
return FormatUtils::VERSION_2;
|
return FormatUtils::VERSION_2;
|
||||||
|
case FormatUtils::VERSION_201:
|
||||||
|
return FormatUtils::VERSION_201;
|
||||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||||
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
|
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
|
||||||
case FormatUtils::VERSION_4:
|
case FormatUtils::VERSION_4:
|
||||||
|
@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
return mDictFormatVersion >= FormatUtils::VERSION_4;
|
return mDictFormatVersion >= FormatUtils::VERSION_4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the code point table pointer stored in mCodePointTable when this header policy was
// constructed. The result may be nullptr: the temporary dummy header initializes the member
// to nullptr, and readCodePointTable() yields nullptr when the header has no such attribute.
const int *getCodePointTable() const {
|
||||||
|
return mCodePointTable;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
|
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
|
||||||
|
|
||||||
|
@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
const int mForgettingCurveDurationToLevelDown;
|
const int mForgettingCurveDurationToLevelDown;
|
||||||
const int mMaxUnigramCount;
|
const int mMaxUnigramCount;
|
||||||
const int mMaxBigramCount;
|
const int mMaxBigramCount;
|
||||||
|
const int *const mCodePointTable;
|
||||||
|
|
||||||
const std::vector<int> readLocale() const;
|
const std::vector<int> readLocale() const;
|
||||||
float readMultipleWordCostMultiplier() const;
|
float readMultipleWordCostMultiplier() const;
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <memory>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
@ -34,12 +35,13 @@ namespace latinime {
|
||||||
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
|
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
|
||||||
|
|
||||||
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
|
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
|
||||||
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
|
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
|
||||||
|
|
||||||
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
|
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
|
||||||
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
|
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
|
||||||
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
|
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
|
||||||
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
|
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
|
||||||
|
const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
|
||||||
|
|
||||||
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
|
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
|
||||||
|
|
||||||
|
@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
|
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
|
||||||
int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
|
std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
|
||||||
while (pos < headerSize) {
|
while (pos < headerSize) {
|
||||||
|
// The values in the header don't use the code point table for their encoding.
|
||||||
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
||||||
MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
|
MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
|
||||||
std::vector<int> key;
|
std::vector<int> key;
|
||||||
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
|
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
|
||||||
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
|
||||||
MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
|
MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
|
||||||
std::vector<int> value;
|
std::vector<int> value;
|
||||||
value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
|
value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
|
||||||
headerAttributes->insert(AttributeMap::value_type(key, value));
|
headerAttributes->insert(AttributeMap::value_type(key, value));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up the CODE_POINT_TABLE_KEY attribute in headerAttributes.
// Returns nullptr when the attribute is absent; otherwise returns it->second.data(), i.e. a
// pointer into storage owned by the map entry itself. No copy is made, so the pointer is only
// usable while that entry stays in the map unmodified.
/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
|
||||||
|
AttributeMap *const headerAttributes) {
|
||||||
|
AttributeMap::key_type keyVector;
|
||||||
|
// Build the map key used for the lookup from the CODE_POINT_TABLE_KEY string.
insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
|
||||||
|
AttributeMap::const_iterator it = headerAttributes->find(keyVector);
|
||||||
|
if (it == headerAttributes->end()) {
|
||||||
|
// No code point table attribute in this header.
return nullptr;
|
||||||
|
}
|
||||||
|
return it->second.data();
|
||||||
|
}
|
||||||
|
|
||||||
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
|
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
|
||||||
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
|
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
|
||||||
int *const writingPos) {
|
int *const writingPos) {
|
||||||
|
@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
|
||||||
}
|
}
|
||||||
switch (version) {
|
switch (version) {
|
||||||
case FormatUtils::VERSION_2:
|
case FormatUtils::VERSION_2:
|
||||||
// Version 2 dictionary writing is not supported.
|
case FormatUtils::VERSION_201:
|
||||||
|
// Version 2 or 201 dictionary writing is not supported.
|
||||||
return false;
|
return false;
|
||||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||||
case FormatUtils::VERSION_4:
|
case FormatUtils::VERSION_4:
|
||||||
|
|
|
@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
|
||||||
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
||||||
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
|
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
|
||||||
|
|
||||||
|
static const int *readCodePointTable(
|
||||||
|
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
|
||||||
|
|
||||||
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
|
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
|
||||||
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
|
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
|
||||||
|
|
||||||
|
@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
|
||||||
static const int HEADER_FLAG_SIZE;
|
static const int HEADER_FLAG_SIZE;
|
||||||
static const int HEADER_SIZE_FIELD_SIZE;
|
static const int HEADER_SIZE_FIELD_SIZE;
|
||||||
|
|
||||||
|
static const char *const CODE_POINT_TABLE_KEY;
|
||||||
|
|
||||||
// Value for the "flags" field. It's unused at the moment.
|
// Value for the "flags" field. It's unused at the moment.
|
||||||
static const DictionaryFlags NO_FLAGS;
|
static const DictionaryFlags NO_FLAGS;
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
|
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
|
||||||
|
|
||||||
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
|
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
|
||||||
|
@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
||||||
const int parentPos =
|
const int parentPos =
|
||||||
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
||||||
int codePoints[MAX_WORD_LENGTH];
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||||
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
|
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
|
||||||
int terminalIdFieldPos = NOT_A_DICT_POS;
|
int terminalIdFieldPos = NOT_A_DICT_POS;
|
||||||
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
int probability = NOT_A_PROBABILITY;
|
int probability = NOT_A_PROBABILITY;
|
||||||
|
@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
||||||
// The destination position is stored at the same place as the parent position.
|
// The destination position is stored at the same place as the parent position.
|
||||||
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
|
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
|
||||||
} else {
|
} else {
|
||||||
return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
|
return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
|
||||||
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
|
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
|
||||||
newSiblingNodePos);
|
newSiblingNodePos);
|
||||||
}
|
}
|
||||||
|
|
|
@ -114,7 +114,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
||||||
mmappedBuffer->getReadOnlyByteArrayView());
|
mmappedBuffer->getReadOnlyByteArrayView());
|
||||||
switch (formatVersion) {
|
switch (formatVersion) {
|
||||||
case FormatUtils::VERSION_2:
|
case FormatUtils::VERSION_2:
|
||||||
AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
|
case FormatUtils::VERSION_201:
|
||||||
|
AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
|
||||||
break;
|
break;
|
||||||
case FormatUtils::VERSION_4: {
|
case FormatUtils::VERSION_4: {
|
||||||
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
|
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
|
||||||
|
@ -175,6 +176,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
||||||
}
|
}
|
||||||
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
|
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
|
||||||
case FormatUtils::VERSION_2:
|
case FormatUtils::VERSION_2:
|
||||||
|
case FormatUtils::VERSION_201:
|
||||||
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
|
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
|
||||||
new PatriciaTriePolicy(std::move(mmappedBuffer)));
|
new PatriciaTriePolicy(std::move(mmappedBuffer)));
|
||||||
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
|
||||||
|
|
|
@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
// Thin wrapper: forwards buffer, codePointTable and pos unchanged to
// ByteArrayUtils::readCodePointAndAdvancePosition and returns its result.
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
||||||
int *const pos) {
|
const int *const codePointTable, int *const pos) {
|
||||||
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
|
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the number of read characters.
|
// Returns the number of read characters.
|
||||||
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
|
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
|
||||||
const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
|
const NodeFlags flags, const int maxLength, const int *const codePointTable,
|
||||||
|
int *const outBuffer, int *const pos) {
|
||||||
int length = 0;
|
int length = 0;
|
||||||
if (hasMultipleChars(flags)) {
|
if (hasMultipleChars(flags)) {
|
||||||
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
|
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
|
||||||
pos);
|
outBuffer, pos);
|
||||||
} else {
|
} else {
|
||||||
const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
|
const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||||
if (codePoint == NOT_A_CODE_POINT) {
|
if (codePoint == NOT_A_CODE_POINT) {
|
||||||
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
|
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
|
||||||
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
|
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
|
||||||
|
@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||||
|
|
||||||
// Returns the number of skipped characters.
|
// Returns the number of skipped characters.
|
||||||
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
||||||
const int maxLength, int *const pos) {
|
const int maxLength, const int *const codePointTable, int *const pos) {
|
||||||
if (hasMultipleChars(flags)) {
|
if (hasMultipleChars(flags)) {
|
||||||
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
|
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
|
||||||
} else {
|
} else {
|
||||||
if (maxLength > 0) {
|
if (maxLength > 0) {
|
||||||
getCodePointAndAdvancePosition(buffer, pos);
|
getCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||||
return 1;
|
return 1;
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||||
|
|
||||||
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
||||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
|
||||||
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
|
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
|
||||||
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
|
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
|
||||||
int *const outBigramPos, int *const outSiblingPos) {
|
int *const outBigramPos, int *const outSiblingPos) {
|
||||||
|
@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||||
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
|
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
|
||||||
*outFlags = flags;
|
*outFlags = flags;
|
||||||
*outCodePointCount = getCharsAndAdvancePosition(
|
*outCodePointCount = getCharsAndAdvancePosition(
|
||||||
dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
|
dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
|
||||||
*outProbability = isTerminal(flags) ?
|
*outProbability = isTerminal(flags) ?
|
||||||
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
|
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
|
||||||
*outChildrenPos = hasChildrenInFlags(flags) ?
|
*outChildrenPos = hasChildrenInFlags(flags) ?
|
||||||
|
|
|
@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
|
||||||
|
|
||||||
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
||||||
|
|
||||||
static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
const int *const codePointTable, int *const pos);
|
||||||
|
|
||||||
// Returns the number of read characters.
|
// Returns the number of read characters.
|
||||||
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
|
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
|
||||||
const int maxLength, int *const outBuffer, int *const pos);
|
const int maxLength, const int *const codePointTable, int *const outBuffer,
|
||||||
|
int *const pos);
|
||||||
|
|
||||||
// Returns the number of skipped characters.
|
// Returns the number of skipped characters.
|
||||||
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
||||||
const int maxLength, int *const pos);
|
const int maxLength, const int *const codePointTable, int *const pos);
|
||||||
|
|
||||||
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
|
||||||
|
|
||||||
|
@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
|
||||||
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
|
||||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
||||||
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
|
const int *const codePointTable, NodeFlags *const outFlags,
|
||||||
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
|
int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
|
||||||
int *const outBigramPos, int *const outSiblingPos);
|
int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
|
||||||
|
int *const outSiblingPos);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
|
||||||
|
|
|
@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
|
||||||
|
|
||||||
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
|
// Reads a shortcut target from buffer at *pos into outWord by delegating to
// ByteArrayUtils::readStringAndAdvancePosition, and returns that call's result.
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
|
||||||
const int maxLength, int *const outWord, int *const pos) {
|
const int maxLength, int *const outWord, int *const pos) {
|
||||||
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos);
|
// TODO: Use codePointTable for shortcuts.
|
||||||
|
// nullptr is passed as the code point table here; presumably that means the string is read
// without table translation -- confirm against ByteArrayUtils.
return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
|
||||||
|
nullptr /* codePointTable */, outWord, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
|
||||||
int pos = getRootPosition();
|
int pos = getRootPosition();
|
||||||
int wordPos = 0;
|
int wordPos = 0;
|
||||||
|
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
||||||
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
|
||||||
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
|
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
|
||||||
// time we enter this loop we are one depth level further than last time.
|
// time we enter this loop we are one depth level further than last time.
|
||||||
|
@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const PatriciaTrieReadingUtils::NodeFlags flags =
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||||
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
|
||||||
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &pos);
|
mBuffer.data(), codePointTable, &pos);
|
||||||
if (ptNodePos == startPos) {
|
if (ptNodePos == startPos) {
|
||||||
// We found the position. Copy the rest of the code points in the buffer and return
|
// We found the position. Copy the rest of the code points in the buffer and return
|
||||||
// the length.
|
// the length.
|
||||||
outCodePoints[wordPos] = character;
|
outCodePoints[wordPos] = character;
|
||||||
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
||||||
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &pos);
|
mBuffer.data(), codePointTable, &pos);
|
||||||
// We count code points in order to avoid infinite loops if the file is broken
|
// We count code points in order to avoid infinite loops if the file is broken
|
||||||
// or if there is some other bug
|
// or if there is some other bug
|
||||||
int charCount = maxCodePointCount;
|
int charCount = maxCodePointCount;
|
||||||
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
|
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
|
||||||
outCodePoints[++wordPos] = nextChar;
|
outCodePoints[++wordPos] = nextChar;
|
||||||
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &pos);
|
mBuffer.data(), codePointTable, &pos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*outUnigramProbability =
|
*outUnigramProbability =
|
||||||
|
@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
// first and possibly the probability.
|
// first and possibly the probability.
|
||||||
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
|
||||||
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
|
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
|
||||||
&pos);
|
codePointTable, &pos);
|
||||||
}
|
}
|
||||||
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
||||||
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
|
||||||
|
@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
|
||||||
mBuffer.data(), &lastCandidatePtNodePos);
|
mBuffer.data(), &lastCandidatePtNodePos);
|
||||||
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &lastCandidatePtNodePos);
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||||
// We copy all the characters in this PtNode to the buffer
|
// We copy all the characters in this PtNode to the buffer
|
||||||
outCodePoints[wordPos] = lastChar;
|
outCodePoints[wordPos] = lastChar;
|
||||||
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
|
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
|
||||||
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &lastCandidatePtNodePos);
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||||
int charCount = maxCodePointCount;
|
int charCount = maxCodePointCount;
|
||||||
while (-1 != nextChar && --charCount > 0) {
|
while (-1 != nextChar && --charCount > 0) {
|
||||||
outCodePoints[++wordPos] = nextChar;
|
outCodePoints[++wordPos] = nextChar;
|
||||||
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
|
||||||
mBuffer.data(), &lastCandidatePtNodePos);
|
mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
++wordPos;
|
++wordPos;
|
||||||
|
@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
||||||
int shortcutPos = NOT_A_DICT_POS;
|
int shortcutPos = NOT_A_DICT_POS;
|
||||||
int bigramPos = NOT_A_DICT_POS;
|
int bigramPos = NOT_A_DICT_POS;
|
||||||
int siblingPos = NOT_A_DICT_POS;
|
int siblingPos = NOT_A_DICT_POS;
|
||||||
|
const int *const codePointTable = mHeaderPolicy.getCodePointTable();
|
||||||
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
|
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
|
||||||
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
&mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
|
||||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
|
||||||
|
&siblingPos);
|
||||||
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
|
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
|
||||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||||
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
||||||
|
|
|
@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
||||||
: mMmappedBuffer(std::move(mmappedBuffer)),
|
: mMmappedBuffer(std::move(mmappedBuffer)),
|
||||||
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
|
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
|
||||||
FormatUtils::VERSION_2),
|
FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())),
|
||||||
mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
|
mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
|
||||||
mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
|
mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
|
||||||
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy),
|
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
|
||||||
|
mHeaderPolicy.getCodePointTable()),
|
||||||
mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
|
mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
|
||||||
mIsCorrupted(false) {}
|
mIsCorrupted(false) {}
|
||||||
|
|
||||||
|
|
|
@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
|
||||||
int bigramPos = NOT_A_DICT_POS;
|
int bigramPos = NOT_A_DICT_POS;
|
||||||
int siblingPos = NOT_A_DICT_POS;
|
int siblingPos = NOT_A_DICT_POS;
|
||||||
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
|
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
|
||||||
mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
|
mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||||
&childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||||
if (mergedNodeCodePointCount <= 0) {
|
if (mergedNodeCodePointCount <= 0) {
|
||||||
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
|
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
|
||||||
ASSERT(false);
|
ASSERT(false);
|
||||||
|
|
|
@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
|
||||||
public:
|
public:
|
||||||
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
|
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
|
||||||
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
const DictionaryBigramsStructurePolicy *const bigramPolicy,
|
||||||
const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
|
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
|
||||||
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {}
|
const int *const codePointTable)
|
||||||
|
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy),
|
||||||
|
mCodePointTable(codePointTable) {}
|
||||||
|
|
||||||
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
|
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
|
||||||
|
|
||||||
|
@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
|
||||||
const ReadOnlyByteArrayView mBuffer;
|
const ReadOnlyByteArrayView mBuffer;
|
||||||
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
|
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
|
||||||
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
|
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
|
||||||
|
const int *const mCodePointTable;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
|
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
||||||
|
|
||||||
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
|
||||||
|
@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
||||||
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
|
||||||
int codePoints[MAX_WORD_LENGTH];
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||||
dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
|
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
|
||||||
int terminalIdFieldPos = NOT_A_DICT_POS;
|
int terminalIdFieldPos = NOT_A_DICT_POS;
|
||||||
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
int probability = NOT_A_PROBABILITY;
|
int probability = NOT_A_PROBABILITY;
|
||||||
|
|
|
@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
|
||||||
if (readingPosIsInAdditionalBuffer) {
|
if (readingPosIsInAdditionalBuffer) {
|
||||||
*pos -= mOriginalBuffer.size();
|
*pos -= mOriginalBuffer.size();
|
||||||
}
|
}
|
||||||
|
// Code point table is not used for dynamic format.
|
||||||
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
|
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
|
||||||
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
|
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
|
||||||
|
nullptr /* codePointTable */, outCodePoints, pos);
|
||||||
if (readingPosIsInAdditionalBuffer) {
|
if (readingPosIsInAdditionalBuffer) {
|
||||||
*pos += mOriginalBuffer.size();
|
*pos += mOriginalBuffer.size();
|
||||||
}
|
}
|
||||||
|
|
|
@ -147,11 +147,18 @@ class ByteArrayUtils {
|
||||||
*/
|
*/
|
||||||
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
|
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
|
||||||
int p = pos;
|
int p = pos;
|
||||||
return readCodePointAndAdvancePosition(buffer, &p);
|
return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
|
||||||
|
/*
|
||||||
|
* codePointTable is an array to convert the most frequent characters in this dictionary to
|
||||||
|
* 1 byte code points. It is only made of the original code points of the most frequent
|
||||||
|
* characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
|
||||||
|
* The original code points are restored by picking the code points at the indices of the
|
||||||
|
* codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
|
||||||
|
*/
|
||||||
const uint8_t firstByte = readUint8(buffer, *pos);
|
const uint8_t firstByte = readUint8(buffer, *pos);
|
||||||
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
||||||
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
||||||
|
@ -162,6 +169,9 @@ class ByteArrayUtils {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
*pos += 1;
|
*pos += 1;
|
||||||
|
if (codePointTable) {
|
||||||
|
return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
|
||||||
|
}
|
||||||
return firstByte;
|
return firstByte;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -173,12 +183,13 @@ class ByteArrayUtils {
|
||||||
*/
|
*/
|
||||||
// Returns the length of the string.
|
// Returns the length of the string.
|
||||||
static int readStringAndAdvancePosition(const uint8_t *const buffer,
|
static int readStringAndAdvancePosition(const uint8_t *const buffer,
|
||||||
const int maxLength, int *const outBuffer, int *const pos) {
|
const int maxLength, const int *const codePointTable, int *const outBuffer,
|
||||||
|
int *const pos) {
|
||||||
int length = 0;
|
int length = 0;
|
||||||
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||||
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||||
outBuffer[length++] = codePoint;
|
outBuffer[length++] = codePoint;
|
||||||
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
|
||||||
}
|
}
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
@ -187,9 +198,9 @@ class ByteArrayUtils {
|
||||||
static int advancePositionToBehindString(
|
static int advancePositionToBehindString(
|
||||||
const uint8_t *const buffer, const int maxLength, int *const pos) {
|
const uint8_t *const buffer, const int maxLength, int *const pos) {
|
||||||
int length = 0;
|
int length = 0;
|
||||||
int codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
|
||||||
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
|
||||||
codePoint = readCodePointAndAdvancePosition(buffer, pos);
|
codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
return length;
|
return length;
|
||||||
|
|
|
@ -29,6 +29,8 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
|
||||||
switch (formatVersion) {
|
switch (formatVersion) {
|
||||||
case VERSION_2:
|
case VERSION_2:
|
||||||
return VERSION_2;
|
return VERSION_2;
|
||||||
|
case VERSION_201:
|
||||||
|
return VERSION_201;
|
||||||
case VERSION_4_ONLY_FOR_TESTING:
|
case VERSION_4_ONLY_FOR_TESTING:
|
||||||
return VERSION_4_ONLY_FOR_TESTING;
|
return VERSION_4_ONLY_FOR_TESTING;
|
||||||
case VERSION_4:
|
case VERSION_4:
|
||||||
|
|
|
@ -32,6 +32,7 @@ class FormatUtils {
|
||||||
enum FORMAT_VERSION {
|
enum FORMAT_VERSION {
|
||||||
// These MUST have the same values as the relevant constants in FormatSpec.java.
|
// These MUST have the same values as the relevant constants in FormatSpec.java.
|
||||||
VERSION_2 = 2,
|
VERSION_2 = 2,
|
||||||
|
VERSION_201 = 201,
|
||||||
VERSION_4_ONLY_FOR_TESTING = 399,
|
VERSION_4_ONLY_FOR_TESTING = 399,
|
||||||
VERSION_4 = 402,
|
VERSION_4 = 402,
|
||||||
VERSION_4_DEV = 403,
|
VERSION_4_DEV = 403,
|
||||||
|
|
|
@ -23,6 +23,19 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
TEST(ByteArrayUtilsTest, TestReadCodePointTable) {
|
||||||
|
const int codePointTable[] = { 0x6f, 0x6b };
|
||||||
|
const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u };
|
||||||
|
int pos = 0;
|
||||||
|
// Expect the first entry of codePointTable
|
||||||
|
EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||||
|
// Expect the second entry of codePointTable
|
||||||
|
EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||||
|
// Expect the original code point from buffer[2] to buffer[4], 0x100
|
||||||
|
// It isn't picked from the codePointTable, since it exceeds the range of the codePointTable.
|
||||||
|
EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
|
||||||
|
}
|
||||||
|
|
||||||
TEST(ByteArrayUtilsTest, TestReadInt) {
|
TEST(ByteArrayUtilsTest, TestReadInt) {
|
||||||
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
|
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
|
||||||
|
|
||||||
|
@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) {
|
||||||
|
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
int codePointArray[3];
|
int codePointArray[3];
|
||||||
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH,
|
EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr,
|
||||||
codePointArray, &pos));
|
codePointArray, &pos));
|
||||||
EXPECT_EQ(0x10FF00, codePointArray[0]);
|
EXPECT_EQ(0x10FF00, codePointArray[0]);
|
||||||
EXPECT_EQ(0x20, codePointArray[1]);
|
EXPECT_EQ(0x20, codePointArray[1]);
|
||||||
|
|
Loading…
Reference in New Issue