Add a variable-length header region to the binary format.

Also bump up the format version to 2.

Bug: 5686638
Change-Id: I3aafdd7e42c422202122998ec093280051aa8e07
main
Jean Chalard 2012-02-27 19:48:47 +09:00
parent 168d2c094b
commit 46a1eec4d8
5 changed files with 50 additions and 19 deletions

View File

@ -28,7 +28,7 @@ namespace latinime {
BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
Dictionary *parentDictionary) Dictionary *parentDictionary)
: DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
if (DEBUG_DICT) { if (DEBUG_DICT) {

View File

@ -17,6 +17,7 @@
#ifndef LATINIME_BINARY_FORMAT_H #ifndef LATINIME_BINARY_FORMAT_H
#define LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H
#include <limits>
#include "unigram_dictionary.h" #include "unigram_dictionary.h"
namespace latinime { namespace latinime {
@ -29,10 +30,18 @@ class BinaryFormat {
public: public:
const static int UNKNOWN_FORMAT = -1; const static int UNKNOWN_FORMAT = -1;
const static int FORMAT_VERSION_1 = 1; // Originally, format version 1 had a 16-bit magic number, then the version number `01'
const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; // then options that must be 0. Hence the first 32 bits of the format are always as follows
// and it's okay to consider them a magic number as a whole.
const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
// The versions of Latin IME that only handle format version 1 only test for the magic
// number, so we had to change it so that version 2 files would be rejected by older
// implementations. On this occasion, we made the magic number 32 bits long.
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
static int detectFormat(const uint8_t* const dict); static int detectFormat(const uint8_t* const dict);
static unsigned int getHeaderSize(const uint8_t* const dict);
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
@ -55,9 +64,37 @@ class BinaryFormat {
}; };
inline int BinaryFormat::detectFormat(const uint8_t* const dict) { inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian // The magic number is stored big-endian.
if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
switch (magicNumber) {
case FORMAT_VERSION_1_MAGIC_NUMBER:
// Format 1 header is exactly 5 bytes long and looks like:
// Magic number (2 bytes) 0x78 0xB1
// Version number (1 byte) 0x01
// Options (2 bytes) must be 0x00 0x00
return 1;
case FORMAT_VERSION_2_MAGIC_NUMBER:
// Format 2 header is as follows:
// Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
// Version number (2 bytes) 0x00 0x02
// Options (2 bytes) must be 0x00 0x00
// Header size (4 bytes) : integer, big endian
return (dict[4] << 8) + dict[5];
default:
return UNKNOWN_FORMAT; return UNKNOWN_FORMAT;
}
}
inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
switch (detectFormat(dict)) {
case 1:
return FORMAT_VERSION_1_HEADER_SIZE;
case 2:
// See the format of the header in the comment in detectFormat() above
return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
default:
return std::numeric_limits<unsigned int>::max();
}
} }
inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {

View File

@ -162,9 +162,6 @@ static inline void prof_out(void) {
#define FLAG_BIGRAM_FREQ 0x7F #define FLAG_BIGRAM_FREQ 0x7F
#define DICTIONARY_VERSION_MIN 200 #define DICTIONARY_VERSION_MIN 200
// TODO: remove this constant when the switch to the new dict format is over
#define DICTIONARY_HEADER_SIZE 2
#define NEW_DICTIONARY_HEADER_SIZE 5
#define NOT_VALID_WORD -99 #define NOT_VALID_WORD -99
#define NOT_A_CHARACTER -1 #define NOT_A_CHARACTER -1
#define NOT_A_DISTANCE -1 #define NOT_A_DISTANCE -1

View File

@ -19,6 +19,7 @@
#define LOG_TAG "LatinIME: dictionary.cpp" #define LOG_TAG "LatinIME: dictionary.cpp"
#include "binary_format.h"
#include "dictionary.h" #include "dictionary.h"
namespace latinime { namespace latinime {
@ -41,10 +42,11 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
mWordsPriorityQueuePool = new WordsPriorityQueuePool( mWordsPriorityQueuePool = new WordsPriorityQueuePool(
maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength);
mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier, const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict);
maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier,
mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
IS_LATEST_DICT_VERSION, hasBigram(), this); mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, maxAlternatives,
IS_LATEST_DICT_VERSION, true /* hasBigram */, this);
} }
Dictionary::~Dictionary() { Dictionary::~Dictionary() {
@ -54,10 +56,6 @@ Dictionary::~Dictionary() {
delete mBigramDictionary; delete mBigramDictionary;
} }
bool Dictionary::hasBigram() {
return ((mDict[1] & 0xFF) == 1);
}
bool Dictionary::isValidWord(unsigned short *word, int length) { bool Dictionary::isValidWord(unsigned short *word, int length) {
return mUnigramDictionary->isValidWord(word, length); return mUnigramDictionary->isValidWord(word, length);
} }

View File

@ -38,8 +38,7 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion) const bool isLatestDictVersion)
: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
// TODO : remove this variable. // TODO : remove this variable.