From fb2bde5a688d93aa946e3dd923aa1e99588777fc Mon Sep 17 00:00:00 2001 From: Akifumi Yoshimoto Date: Wed, 10 Sep 2014 11:38:36 +0900 Subject: [PATCH] Native side reads character table Bug:17097992 Change-Id: Ibcfc67833a6754fe6a2d82a3e3023b33886b9ea2 --- .../dictionary/header/header_policy.h | 19 ++++++++++--- .../header/header_read_write_utils.cpp | 27 ++++++++++++++----- .../header/header_read_write_utils.h | 5 ++++ .../v402/ver4_patricia_trie_node_reader.cpp | 7 ++--- ...y_structure_with_buffer_policy_factory.cpp | 4 ++- .../pt_common/patricia_trie_reading_utils.cpp | 21 ++++++++------- .../pt_common/patricia_trie_reading_utils.h | 15 ++++++----- .../shortcut/shortcut_list_reading_utils.cpp | 4 ++- .../structure/v2/patricia_trie_policy.cpp | 21 ++++++++------- .../structure/v2/patricia_trie_policy.h | 5 ++-- .../v2/ver2_patricia_trie_node_reader.cpp | 4 +-- .../v2/ver2_patricia_trie_node_reader.h | 7 +++-- .../v4/ver4_patricia_trie_node_reader.cpp | 3 ++- .../utils/buffer_with_extendable_buffer.cpp | 4 ++- .../dictionary/utils/byte_array_utils.h | 25 ++++++++++++----- .../dictionary/utils/format_utils.cpp | 2 ++ .../dictionary/utils/format_utils.h | 1 + .../utils/byte_array_utils_test.cpp | 15 ++++++++++- 18 files changed, 133 insertions(+), 56 deletions(-) diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 87cf0cd3b..daf40d4f9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} + &mAttributeMap, MAX_BIGRAM_COUNT_KEY, 
DEFAULT_MAX_BIGRAM_COUNT)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, @@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} + &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information HeaderPolicy(const HeaderPolicy *const headerPolicy) @@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mForgettingCurveDurationToLevelDown( headerPolicy->mForgettingCurveDurationToLevelDown), mMaxUnigramCount(headerPolicy->mMaxUnigramCount), - mMaxBigramCount(headerPolicy->mMaxBigramCount) {} + mMaxBigramCount(headerPolicy->mMaxBigramCount), + mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. 
HeaderPolicy() @@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0), - mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {} + mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0), + mCodePointTable(nullptr) {} ~HeaderPolicy() {} @@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { switch (mDictFormatVersion) { case FormatUtils::VERSION_2: return FormatUtils::VERSION_2; + case FormatUtils::VERSION_201: + return FormatUtils::VERSION_201; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: @@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mDictFormatVersion >= FormatUtils::VERSION_4; } + const int *getCodePointTable() const { + return mCodePointTable; + } + private: DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); @@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const int mForgettingCurveDurationToLevelDown; const int mMaxUnigramCount; const int mMaxBigramCount; + const int *const mCodePointTable; const std::vector readLocale() const; float readMultipleWordCostMultiplier() const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index d2c3d2fe0..41a8b13b8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "defines.h" @@ -34,12 +35,13 @@ namespace latinime { const int 
HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; -const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048; const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; +const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable"; const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; @@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; return; } int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; - int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; + std::unique_ptr valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]); while (pos < headerSize) { + // The values in the header don't use the code point table for their encoding. 
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); + MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos); std::vector key; key.insert(key.end(), keyBuffer, keyBuffer + keyLength); const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); + MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos); std::vector value; - value.insert(value.end(), valueBuffer, valueBuffer + valueLength); + value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength); headerAttributes->insert(AttributeMap::value_type(key, value)); } } +/* static */ const int *HeaderReadWriteUtils::readCodePointTable( + AttributeMap *const headerAttributes) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector); + AttributeMap::const_iterator it = headerAttributes->find(keyVector); + if (it == headerAttributes->end()) { + return nullptr; + } + return it->second.data(); +} + /* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, int *const writingPos) { @@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; } switch (version) { case FormatUtils::VERSION_2: - // Version 2 dictionary writing is not supported. + case FormatUtils::VERSION_201: + // Version 2 or 201 dictionary writing is not supported. 
return false; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index 1ab2eec69..5dd91b26c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -46,6 +46,9 @@ class HeaderReadWriteUtils { static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + static const int *readCodePointTable( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, int *const writingPos); @@ -101,6 +104,8 @@ class HeaderReadWriteUtils { static const int HEADER_FLAG_SIZE; static const int HEADER_SIZE_FIELD_SIZE; + static const char *const CODE_POINT_TABLE_KEY; + // Value for the "flags" field. It's unused at the moment. 
static const DictionaryFlags NO_FLAGS; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp index 82399f190..5c639b19c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp @@ -23,6 +23,7 @@ #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" @@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce const int parentPos = DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); int codePoints[MAX_WORD_LENGTH]; - const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); int terminalIdFieldPos = NOT_A_DICT_POS; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int probability = NOT_A_PROBABILITY; @@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce // The destination position is stored at the same place as the parent position. 
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); } else { - return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, newSiblingNodePos); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index 9fa93efc9..372c9e36f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -114,7 +114,8 @@ templategetReadOnlyByteArrayView()); switch (formatVersion) { case FormatUtils::VERSION_2: - AKLOGE("Given path is a directory but the format is version 2. path: %s", path); + case FormatUtils::VERSION_201: + AKLOGE("Given path is a directory but the format is version 2 or 201. 
path: %s", path); break; case FormatUtils::VERSION_4: { return newPolicyForV4DictgetReadOnlyByteArrayView())) { case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp index e64a13cc4..6a498b2f4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp @@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; } /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, - int *const pos) { - return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); + const int *const codePointTable, int *const pos) { + return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); } // Returns the number of read characters. 
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, - const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { + const NodeFlags flags, const int maxLength, const int *const codePointTable, + int *const outBuffer, int *const pos) { int length = 0; if (hasMultipleChars(flags)) { - length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, - pos); + length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, + outBuffer, pos); } else { - const int codePoint = getCodePointAndAdvancePosition(buffer, pos); + const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); if (codePoint == NOT_A_CODE_POINT) { // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR @@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; // Returns the number of skipped characters. 
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const pos) { + const int maxLength, const int *const codePointTable, int *const pos) { if (hasMultipleChars(flags)) { return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); } else { if (maxLength > 0) { - getCodePointAndAdvancePosition(buffer, pos); + getCodePointAndAdvancePosition(buffer, codePointTable, pos); return 1; } else { return 0; @@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const DictionaryBigramsStructurePolicy *const bigramPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, int *const outSiblingPos) { @@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); *outFlags = flags; *outCodePointCount = getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos); + dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); *outProbability = isTerminal(flags) ? readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; *outChildrenPos = hasChildrenInFlags(flags) ? 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h index c3f09c3b1..a69ec4435 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h @@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils { static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); - static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos); + static int getCodePointAndAdvancePosition(const uint8_t *const buffer, + const int *const codePointTable, int *const pos); // Returns the number of read characters. static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const outBuffer, int *const pos); + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos); // Returns the number of skipped characters. 
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const pos); + const int maxLength, const int *const codePointTable, int *const pos); static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); @@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils { static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const DictionaryBigramsStructurePolicy *const bigramPolicy, - NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, - int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, - int *const outBigramPos, int *const outSiblingPos); + const int *const codePointTable, NodeFlags *const outFlags, + int *const outCodePointCount, int *const outCodePoint, int *const outProbability, + int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, + int *const outSiblingPos); private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp index 7cb7dff9a..40b872055 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; /* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength, int *const outWord, int *const pos) { - return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos); + // TODO: Use codePointTable for shortcuts. 
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, + nullptr /* codePointTable */, outWord, pos); } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 64b767dac..6e7dba9ff 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); int pos = getRootPosition(); int wordPos = 0; + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will // only traverse PtNodes that are actually a part of the terminal we are searching, so each // time we enter this loop we are one depth level further than last time. @@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), &pos); + mBuffer.data(), codePointTable, &pos); if (ptNodePos == startPos) { // We found the position. Copy the rest of the code points in the buffer and return // the length. 
outCodePoints[wordPos] = character; if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), &pos); + mBuffer.data(), codePointTable, &pos); // We count code points in order to avoid infinite loops if the file is broken // or if there is some other bug int charCount = maxCodePointCount; while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { outCodePoints[++wordPos] = nextChar; nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), &pos); + mBuffer.data(), codePointTable, &pos); } } *outUnigramProbability = @@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( // first and possibly the probability. if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, - &pos); + codePointTable, &pos); } if (PatriciaTrieReadingUtils::isTerminal(flags)) { PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); @@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( mBuffer.data(), &lastCandidatePtNodePos); const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), &lastCandidatePtNodePos); + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); // We copy all the characters in this PtNode to the buffer outCodePoints[wordPos] = lastChar; if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), &lastCandidatePtNodePos); + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); int charCount = maxCodePointCount; while (-1 != nextChar && --charCount > 0) { outCodePoints[++wordPos] = nextChar; nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), 
&lastCandidatePtNodePos); + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); } } ++wordPos; @@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod int shortcutPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS; + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, - &mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, - &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount, + mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, + &siblingPos); // Skip PtNodes don't start with Unicode code point because they represent non-word information. if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 70e8d847e..5f179513f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) : mMmappedBuffer(std::move(mmappedBuffer)), mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), - FormatUtils::VERSION_2), + FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), - mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy), + 
mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, + mHeaderPolicy.getCodePointTable()), mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp index 74cdf7929..dc0ed96d0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo int bigramPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS; PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy, - mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, - &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, + &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); if (mergedNodeCodePointCount <= 0) { AKLOGE("Empty PtNode is not allowed. 
Code point count: %d", mergedNodeCodePointCount); ASSERT(false); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h index 0f6769dc8..24ec5bcca 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader { public: Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, const DictionaryBigramsStructurePolicy *const bigramPolicy, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy) - : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {} + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const int *const codePointTable) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy), + mCodePointTable(codePointTable) {} virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; @@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader { const ReadOnlyByteArrayView mBuffer; const DictionaryBigramsStructurePolicy *const mBigramPolicy; const DictionaryShortcutsStructurePolicy *const mShortuctPolicy; + const int *const mCodePointTable; }; } // namespace latinime #endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 731092efd..d795239fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -16,6 +16,7 @@ #include 
"suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" @@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); int codePoints[MAX_WORD_LENGTH]; const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); int terminalIdFieldPos = NOT_A_DICT_POS; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int probability = NOT_A_PROBABILITY; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp index ecbe7922c..da2c30cd6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC if (readingPosIsInAdditionalBuffer) { *pos -= mOriginalBuffer.size(); } + // Code point table is not used for dynamic format. 
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( - getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos); + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, + nullptr /* codePointTable */, outCodePoints, pos); if (readingPosIsInAdditionalBuffer) { *pos += mOriginalBuffer.size(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h index 4b3c98988..abb979050 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -147,11 +147,18 @@ class ByteArrayUtils { */ static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { int p = pos; - return readCodePointAndAdvancePosition(buffer, &p); + return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); } static AK_FORCE_INLINE int readCodePointAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { + const uint8_t *const buffer, const int *const codePointTable, int *const pos) { + /* + * codePointTable is an array to convert the most frequent characters in this dictionary to + * 1 byte code points. It is only made of the original code points of the most frequent + * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. + * The original code points are restored by picking the code points at the indices of the + * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. 
+ */ const uint8_t firstByte = readUint8(buffer, *pos); if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { @@ -162,6 +169,9 @@ class ByteArrayUtils { } } else { *pos += 1; + if (codePointTable) { + return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; + } return firstByte; } } @@ -173,12 +183,13 @@ class ByteArrayUtils { */ // Returns the length of the string. static int readStringAndAdvancePosition(const uint8_t *const buffer, - const int maxLength, int *const outBuffer, int *const pos) { + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos) { int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, pos); + int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { outBuffer[length++] = codePoint; - codePoint = readCodePointAndAdvancePosition(buffer, pos); + codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); } return length; } @@ -187,9 +198,9 @@ class ByteArrayUtils { static int advancePositionToBehindString( const uint8_t *const buffer, const int maxLength, int *const pos) { int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, pos); + int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { - codePoint = readCodePointAndAdvancePosition(buffer, pos); + codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); length++; } return length; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index e6e7167c2..0cffe569d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -29,6 +29,8 @@ const size_t 
FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; switch (formatVersion) { case VERSION_2: return VERSION_2; + case VERSION_201: + return VERSION_201; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; case VERSION_4: diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 51ad9877c..96310086b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -32,6 +32,7 @@ class FormatUtils { enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. VERSION_2 = 2, + VERSION_201 = 201, VERSION_4_ONLY_FOR_TESTING = 399, VERSION_4 = 402, VERSION_4_DEV = 403, diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp index a1c310d8a..c201e0d00 100644 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp +++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp @@ -23,6 +23,19 @@ namespace latinime { namespace { +TEST(ByteArrayUtilsTest, TestReadCodePointTable) { + const int codePointTable[] = { 0x6f, 0x6b }; + const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u }; + int pos = 0; + // Expect the first entry of codePointTable + EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the second entry of codePointTable + EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the original code point from buffer[2] to buffer[4], 0x100 + // It isn't picked from the codePointTable, since it exceeds the range of the codePointTable. 
+ EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); +} + TEST(ByteArrayUtilsTest, TestReadInt) { const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu }; @@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) { int pos = 0; int codePointArray[3]; - EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, + EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr, codePointArray, &pos)); EXPECT_EQ(0x10FF00, codePointArray[0]); EXPECT_EQ(0x20, codePointArray[1]);