Merge "Add dictionary header writing methods."
commit
561a83118c
|
@ -250,7 +250,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
||||||
}
|
}
|
||||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||||
&mBigramListPolicy, &mShortcutListPolicy);
|
&mBigramListPolicy, &mShortcutListPolicy);
|
||||||
writingHelper.writeToDictFile(filePath, mBuffer->getBuffer(), mHeaderPolicy.getSize());
|
writingHelper.writeToDictFile(filePath, &mHeaderPolicy);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
||||||
|
|
|
@ -33,7 +33,7 @@ class DicNodeVector;
|
||||||
class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer)
|
DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer)
|
||||||
: mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()),
|
: mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()),
|
||||||
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
|
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
|
||||||
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
|
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
|
||||||
mShortcutListPolicy(&mBufferWithExtendableBuffer),
|
mShortcutListPolicy(&mBufferWithExtendableBuffer),
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
|
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
|
||||||
|
|
||||||
|
@ -137,7 +138,11 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
|
||||||
}
|
}
|
||||||
|
|
||||||
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
|
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
|
||||||
const uint8_t *const headerBuf, const int headerSize) {
|
const HeaderPolicy *const headerPolicy) {
|
||||||
|
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||||
|
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
const int tmpFileNameBufSize = strlen(fileName)
|
const int tmpFileNameBufSize = strlen(fileName)
|
||||||
+ strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1;
|
+ strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1;
|
||||||
char tmpFileName[tmpFileNameBufSize];
|
char tmpFileName[tmpFileNameBufSize];
|
||||||
|
@ -148,7 +153,8 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Write header.
|
// Write header.
|
||||||
if (fwrite(headerBuf, headerSize, 1, file) < 1) {
|
if (fwrite(headerBuffer.getBuffer(true /* usesAdditionalBuffer */),
|
||||||
|
headerBuffer.getTailPosition(), 1, file) < 1) {
|
||||||
fclose(file);
|
fclose(file);
|
||||||
remove(tmpFileName);
|
remove(tmpFileName);
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -28,6 +28,7 @@ class DynamicBigramListPolicy;
|
||||||
class DynamicPatriciaTrieNodeReader;
|
class DynamicPatriciaTrieNodeReader;
|
||||||
class DynamicPatriciaTrieReadingHelper;
|
class DynamicPatriciaTrieReadingHelper;
|
||||||
class DynamicShortcutListPolicy;
|
class DynamicShortcutListPolicy;
|
||||||
|
class HeaderPolicy;
|
||||||
|
|
||||||
class DynamicPatriciaTrieWritingHelper {
|
class DynamicPatriciaTrieWritingHelper {
|
||||||
public:
|
public:
|
||||||
|
@ -48,8 +49,9 @@ class DynamicPatriciaTrieWritingHelper {
|
||||||
// Remove a bigram relation from word0Pos to word1Pos.
|
// Remove a bigram relation from word0Pos to word1Pos.
|
||||||
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
||||||
|
|
||||||
void writeToDictFile(const char *const fileName, const uint8_t *const headerBuf,
|
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy);
|
||||||
const int headerSize);
|
|
||||||
|
void writeToDictFileWithGC(const char *const fileName, const HeaderPolicy *const headerPolicy);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ctime>
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -36,7 +38,7 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
|
||||||
}
|
}
|
||||||
std::vector<int> keyCodePointVector;
|
std::vector<int> keyCodePointVector;
|
||||||
insertCharactersIntoVector(key, &keyCodePointVector);
|
insertCharactersIntoVector(key, &keyCodePointVector);
|
||||||
HeaderReadingUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector);
|
HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector);
|
||||||
if (it == mAttributeMap.end()) {
|
if (it == mAttributeMap.end()) {
|
||||||
// The key was not found.
|
// The key was not found.
|
||||||
outValue[0] = '?';
|
outValue[0] = '?';
|
||||||
|
@ -85,7 +87,7 @@ int HeaderPolicy::readLastUpdatedTime() const {
|
||||||
bool HeaderPolicy::getAttributeValueAsInt(const char *const key, int *const outValue) const {
|
bool HeaderPolicy::getAttributeValueAsInt(const char *const key, int *const outValue) const {
|
||||||
std::vector<int> keyVector;
|
std::vector<int> keyVector;
|
||||||
insertCharactersIntoVector(key, &keyVector);
|
insertCharactersIntoVector(key, &keyVector);
|
||||||
HeaderReadingUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyVector);
|
HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyVector);
|
||||||
if (it == mAttributeMap.end()) {
|
if (it == mAttributeMap.end()) {
|
||||||
// The key was not found.
|
// The key was not found.
|
||||||
return false;
|
return false;
|
||||||
|
@ -94,10 +96,56 @@ bool HeaderPolicy::getAttributeValueAsInt(const char *const key, int *const outV
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ HeaderReadingUtils::AttributeMap HeaderPolicy::createAttributeMapAndReadAllAttributes(
|
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||||
const uint8_t *const dictBuf) {
|
const bool updatesLastUpdatedTime) const {
|
||||||
HeaderReadingUtils::AttributeMap attributeMap;
|
int writingPos = 0;
|
||||||
HeaderReadingUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
|
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
|
||||||
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!HeaderReadWriteUtils::writeDictionaryFlags(bufferToWrite, mDictionaryFlags,
|
||||||
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Temporarily writes a dummy header size.
|
||||||
|
int headerSizeFieldPos = writingPos;
|
||||||
|
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, 0 /* size */,
|
||||||
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (updatesLastUpdatedTime) {
|
||||||
|
// Set current time as a last updated time.
|
||||||
|
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
|
||||||
|
std::vector<int> updatedTimekey;
|
||||||
|
insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey);
|
||||||
|
const time_t currentTime = time(NULL);
|
||||||
|
std::vector<int> updatedTimeValue;
|
||||||
|
char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
|
||||||
|
snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%ld", currentTime);
|
||||||
|
insertCharactersIntoVector(charBuf, &updatedTimeValue);
|
||||||
|
attributeMapTowrite[updatedTimekey] = updatedTimeValue;
|
||||||
|
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
|
||||||
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
|
||||||
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Writes an actual header size.
|
||||||
|
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
|
||||||
|
&headerSizeFieldPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ HeaderReadWriteUtils::AttributeMap
|
||||||
|
HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
|
||||||
|
HeaderReadWriteUtils::AttributeMap attributeMap;
|
||||||
|
HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
|
||||||
return attributeMap;
|
return attributeMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,14 +23,17 @@
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/policy/dictionary_header_structure_policy.h"
|
#include "suggest/core/policy/dictionary_header_structure_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/header/header_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/header/header_reading_utils.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/utils/format_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
public:
|
public:
|
||||||
explicit HeaderPolicy(const uint8_t *const dictBuf)
|
explicit HeaderPolicy(const uint8_t *const dictBuf, const int dictSize)
|
||||||
: mDictBuf(dictBuf), mDictionaryFlags(HeaderReadingUtils::getFlags(dictBuf)),
|
: mDictBuf(dictBuf),
|
||||||
mSize(HeaderReadingUtils::getHeaderSize(dictBuf)),
|
mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)),
|
||||||
|
mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
|
||||||
|
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
|
||||||
mAttributeMap(createAttributeMapAndReadAllAttributes(mDictBuf)),
|
mAttributeMap(createAttributeMapAndReadAllAttributes(mDictBuf)),
|
||||||
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
||||||
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
||||||
|
@ -43,16 +46,15 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE bool supportsDynamicUpdate() const {
|
AK_FORCE_INLINE bool supportsDynamicUpdate() const {
|
||||||
return HeaderReadingUtils::supportsDynamicUpdate(mDictionaryFlags);
|
return HeaderReadWriteUtils::supportsDynamicUpdate(mDictionaryFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
|
AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
|
||||||
return HeaderReadingUtils::requiresGermanUmlautProcessing(mDictionaryFlags);
|
return HeaderReadWriteUtils::requiresGermanUmlautProcessing(mDictionaryFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const {
|
AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const {
|
||||||
return HeaderReadingUtils::requiresFrenchLigatureProcessing(
|
return HeaderReadWriteUtils::requiresFrenchLigatureProcessing(mDictionaryFlags);
|
||||||
mDictionaryFlags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
|
AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
|
||||||
|
@ -70,6 +72,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
void readHeaderValueOrQuestionMark(const char *const key,
|
void readHeaderValueOrQuestionMark(const char *const key,
|
||||||
int *outValue, int outValueSize) const;
|
int *outValue, int outValueSize) const;
|
||||||
|
|
||||||
|
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||||
|
const bool updatesLastUpdatedTime) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
|
||||||
|
|
||||||
|
@ -80,9 +85,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
||||||
|
|
||||||
const uint8_t *const mDictBuf;
|
const uint8_t *const mDictBuf;
|
||||||
const HeaderReadingUtils::DictionaryFlags mDictionaryFlags;
|
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
|
||||||
|
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
|
||||||
const int mSize;
|
const int mSize;
|
||||||
HeaderReadingUtils::AttributeMap mAttributeMap;
|
HeaderReadWriteUtils::AttributeMap mAttributeMap;
|
||||||
const float mMultiWordCostMultiplier;
|
const float mMultiWordCostMultiplier;
|
||||||
const bool mUsesForgettingCurve;
|
const bool mUsesForgettingCurve;
|
||||||
const int mLastUpdatedTime;
|
const int mLastUpdatedTime;
|
||||||
|
@ -95,7 +101,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
|
|
||||||
bool getAttributeValueAsInt(const char *const key, int *const outValue) const;
|
bool getAttributeValueAsInt(const char *const key, int *const outValue) const;
|
||||||
|
|
||||||
static HeaderReadingUtils::AttributeMap createAttributeMapAndReadAllAttributes(
|
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
|
||||||
const uint8_t *const dictBuf);
|
const uint8_t *const dictBuf);
|
||||||
|
|
||||||
static int parseIntAttributeValue(const std::vector<int> *const attributeValue);
|
static int parseIntAttributeValue(const std::vector<int> *const attributeValue);
|
||||||
|
|
|
@ -19,43 +19,44 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
|
||||||
#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
|
#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
const int HeaderReadingUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
|
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
|
||||||
const int HeaderReadingUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
|
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
|
||||||
|
|
||||||
const int HeaderReadingUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
|
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
|
||||||
const int HeaderReadingUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
|
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
|
||||||
const int HeaderReadingUtils::HEADER_FLAG_SIZE = 2;
|
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
|
||||||
const int HeaderReadingUtils::HEADER_SIZE_FIELD_SIZE = 4;
|
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
|
||||||
|
|
||||||
const HeaderReadingUtils::DictionaryFlags HeaderReadingUtils::NO_FLAGS = 0;
|
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
|
||||||
// Flags for special processing
|
// Flags for special processing
|
||||||
// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or
|
// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or
|
||||||
// something very bad (like, the apocalypse) will happen. Please update both at the same time.
|
// something very bad (like, the apocalypse) will happen. Please update both at the same time.
|
||||||
const HeaderReadingUtils::DictionaryFlags
|
const HeaderReadWriteUtils::DictionaryFlags
|
||||||
HeaderReadingUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
|
HeaderReadWriteUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
|
||||||
const HeaderReadingUtils::DictionaryFlags
|
const HeaderReadWriteUtils::DictionaryFlags
|
||||||
HeaderReadingUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2;
|
HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2;
|
||||||
const HeaderReadingUtils::DictionaryFlags
|
const HeaderReadWriteUtils::DictionaryFlags
|
||||||
HeaderReadingUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
||||||
|
|
||||||
/* static */ int HeaderReadingUtils::getHeaderSize(const uint8_t *const dictBuf) {
|
/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
|
||||||
// See the format of the header in the comment in
|
// See the format of the header in the comment in
|
||||||
// BinaryDictionaryFormatUtils::detectFormatVersion()
|
// BinaryDictionaryFormatUtils::detectFormatVersion()
|
||||||
return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
|
return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
|
||||||
+ HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
|
+ HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ HeaderReadingUtils::DictionaryFlags
|
/* static */ HeaderReadWriteUtils::DictionaryFlags
|
||||||
HeaderReadingUtils::getFlags(const uint8_t *const dictBuf) {
|
HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
|
||||||
return ByteArrayUtils::readUint16(dictBuf,
|
return ByteArrayUtils::readUint16(dictBuf,
|
||||||
HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
|
HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void HeaderReadingUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
||||||
AttributeMap *const headerAttributes) {
|
AttributeMap *const headerAttributes) {
|
||||||
const int headerSize = getHeaderSize(dictBuf);
|
const int headerSize = getHeaderSize(dictBuf);
|
||||||
int pos = getHeaderOptionsPosition();
|
int pos = getHeaderOptionsPosition();
|
||||||
|
@ -78,4 +79,53 @@ const HeaderReadingUtils::DictionaryFlags
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
|
||||||
|
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
|
||||||
|
int *const writingPos) {
|
||||||
|
if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
|
||||||
|
writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
switch (version) {
|
||||||
|
case FormatUtils::VERSION_2:
|
||||||
|
// Version 2 dictionary writing is not supported.
|
||||||
|
return false;
|
||||||
|
case FormatUtils::VERSION_3:
|
||||||
|
return buffer->writeUintAndAdvancePosition(3 /* data */,
|
||||||
|
HEADER_DICTIONARY_VERSION_SIZE, writingPos);
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
|
||||||
|
BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
|
||||||
|
int *const writingPos) {
|
||||||
|
return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
|
||||||
|
BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
|
||||||
|
return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
|
||||||
|
BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
|
||||||
|
int *const writingPos) {
|
||||||
|
for (AttributeMap::const_iterator it = headerAttributes->begin();
|
||||||
|
it != headerAttributes->end(); ++it) {
|
||||||
|
// Write a key.
|
||||||
|
if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
|
||||||
|
true /* writesTerminator */, writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Write a value.
|
||||||
|
if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
|
||||||
|
true /* writesTerminator */, writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -22,10 +22,14 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/utils/format_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class HeaderReadingUtils {
|
class BufferWithExtendableBuffer;
|
||||||
|
|
||||||
|
// TODO: Change this file name to header_read_write_utils.h.
|
||||||
|
class HeaderReadWriteUtils {
|
||||||
public:
|
public:
|
||||||
typedef uint16_t DictionaryFlags;
|
typedef uint16_t DictionaryFlags;
|
||||||
typedef std::map<std::vector<int>, std::vector<int> > AttributeMap;
|
typedef std::map<std::vector<int>, std::vector<int> > AttributeMap;
|
||||||
|
@ -54,8 +58,20 @@ class HeaderReadingUtils {
|
||||||
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
|
||||||
AttributeMap *const headerAttributes);
|
AttributeMap *const headerAttributes);
|
||||||
|
|
||||||
|
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
|
||||||
|
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
|
||||||
|
|
||||||
|
static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer,
|
||||||
|
const DictionaryFlags flags, int *const writingPos);
|
||||||
|
|
||||||
|
static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer,
|
||||||
|
const int size, int *const writingPos);
|
||||||
|
|
||||||
|
static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
|
||||||
|
const AttributeMap *const headerAttributes, int *const writingPos);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadingUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
|
||||||
|
|
||||||
static const int MAX_ATTRIBUTE_KEY_LENGTH;
|
static const int MAX_ATTRIBUTE_KEY_LENGTH;
|
||||||
static const int MAX_ATTRIBUTE_VALUE_LENGTH;
|
static const int MAX_ATTRIBUTE_VALUE_LENGTH;
|
||||||
|
|
|
@ -34,7 +34,7 @@ class DicNodeVector;
|
||||||
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
PatriciaTriePolicy(const MmappedBuffer *const buffer)
|
PatriciaTriePolicy(const MmappedBuffer *const buffer)
|
||||||
: mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()),
|
: mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()),
|
||||||
mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()),
|
mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()),
|
||||||
mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {}
|
mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {}
|
||||||
|
|
||||||
|
|
|
@ -20,20 +20,10 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
/**
|
const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE;
|
||||||
* Dictionary size
|
|
||||||
*/
|
|
||||||
// Any file smaller than this is not a dictionary.
|
|
||||||
const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 4;
|
|
||||||
|
|
||||||
/**
|
// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12
|
||||||
* Format versions
|
const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
|
||||||
*/
|
|
||||||
// 32 bit magic number is stored at the beginning of the dictionary header to reject unsupported
|
|
||||||
// or obsolete dictionary formats.
|
|
||||||
const uint32_t FormatUtils::HEADER_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
|
|
||||||
// Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12
|
|
||||||
const int FormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12;
|
|
||||||
|
|
||||||
/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion(
|
/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion(
|
||||||
const uint8_t *const dict, const int dictSize) {
|
const uint8_t *const dict, const int dictSize) {
|
||||||
|
@ -45,16 +35,10 @@ const int FormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12;
|
||||||
}
|
}
|
||||||
const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0);
|
const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0);
|
||||||
switch (magicNumber) {
|
switch (magicNumber) {
|
||||||
case HEADER_VERSION_2_MAGIC_NUMBER:
|
case MAGIC_NUMBER:
|
||||||
// Version 2 header are at least 12 bytes long.
|
|
||||||
// If this header has the version 2 magic number but is less than 12 bytes long,
|
|
||||||
// then it's an unknown format and we need to avoid confidently reading the next bytes.
|
|
||||||
if (dictSize < HEADER_VERSION_2_MINIMUM_SIZE) {
|
|
||||||
return UNKNOWN_VERSION;
|
|
||||||
}
|
|
||||||
// Version 2 header is as follows:
|
// Version 2 header is as follows:
|
||||||
// Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
|
// Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
|
||||||
// Version number (2 bytes)
|
// Dictionary format version number (2 bytes)
|
||||||
// Options (2 bytes)
|
// Options (2 bytes)
|
||||||
// Header size (4 bytes) : integer, big endian
|
// Header size (4 bytes) : integer, big endian
|
||||||
if (ByteArrayUtils::readUint16(dict, 4) == 2) {
|
if (ByteArrayUtils::readUint16(dict, 4) == 2) {
|
||||||
|
|
|
@ -34,14 +34,16 @@ class FormatUtils {
|
||||||
UNKNOWN_VERSION
|
UNKNOWN_VERSION
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// 32 bit magic number is stored at the beginning of the dictionary header to reject
|
||||||
|
// unsupported or obsolete dictionary formats.
|
||||||
|
static const uint32_t MAGIC_NUMBER;
|
||||||
|
|
||||||
static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
|
static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils);
|
||||||
|
|
||||||
static const int DICTIONARY_MINIMUM_SIZE;
|
static const int DICTIONARY_MINIMUM_SIZE;
|
||||||
static const uint32_t HEADER_VERSION_2_MAGIC_NUMBER;
|
|
||||||
static const int HEADER_VERSION_2_MINIMUM_SIZE;
|
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_FORMAT_UTILS_H */
|
#endif /* LATINIME_FORMAT_UTILS_H */
|
||||||
|
|
|
@ -358,8 +358,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
assertEquals(-1, binaryDictionary.getFrequency("aaa"));
|
assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("aaa"));
|
||||||
assertEquals(-1, binaryDictionary.getFrequency("abcd"));
|
assertEquals(Dictionary.NOT_A_PROBABILITY, binaryDictionary.getFrequency("abcd"));
|
||||||
|
|
||||||
binaryDictionary.addUnigramWord("aaa", probability);
|
binaryDictionary.addUnigramWord("aaa", probability);
|
||||||
binaryDictionary.addUnigramWord("abcd", probability);
|
binaryDictionary.addUnigramWord("abcd", probability);
|
||||||
|
|
Loading…
Reference in New Issue