am 47e6cf36: am 19560508: Move flags belonging to BinaryFormat to the right place.

* commit '47e6cf3695ab15b042d85f0d8b55d1eb58e14223':
  Move flags belonging to BinaryFormat to the right place.
main
Jean Chalard 2012-07-31 08:32:02 -07:00 committed by Android Git Automerger
commit 9abcf50691
5 changed files with 80 additions and 81 deletions

View File

@ -126,7 +126,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
// codesSize == 0 means we are trying to find bigram predictions. // codesSize == 0 means we are trying to find bigram predictions.
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) { if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
// Due to space constraints, the frequency for bigrams is approximate - the lower the // Due to space constraints, the frequency for bigrams is approximate - the lower the
// unigram frequency, the worse the precision. The theoretical maximum error in // unigram frequency, the worse the precision. The theoretical maximum error in
// resulting frequency is 8 - although in practice it's never bigger than 3 or 4 // resulting frequency is 8 - although in practice it's never bigger than 3 or 4
@ -139,7 +139,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
++bigramCount; ++bigramCount;
} }
} }
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
return bigramCount; return bigramCount;
} }
@ -154,8 +154,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
if (NOT_VALID_WORD == pos) return 0; if (NOT_VALID_WORD == pos) return 0;
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0; if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) { if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
BinaryFormat::getCharCodeAndForwardPointer(root, &pos); BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
} else { } else {
pos = BinaryFormat::skipOtherCharacters(root, pos); pos = BinaryFormat::skipOtherCharacters(root, pos);
@ -182,12 +182,12 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
int bigramFlags; int bigramFlags;
do { do {
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
&pos); &pos);
(*map)[bigramPos] = frequency; (*map)[bigramPos] = frequency;
setInFilter(filter, bigramPos); setInFilter(filter, bigramPos);
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
} }
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const { bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
@ -223,7 +223,7 @@ bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const in
if (bigramPos == nextWordPos) { if (bigramPos == nextWordPos) {
return true; return true;
} }
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
return false; return false;
} }

View File

@ -18,13 +18,47 @@
#define LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H
#include <limits> #include <limits>
#include <map>
#include "bloom_filter.h" #include "bloom_filter.h"
#include "char_utils.h" #include "char_utils.h"
#include "unigram_dictionary.h"
namespace latinime { namespace latinime {
class BinaryFormat { class BinaryFormat {
public:
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
// Flag for single/multiple char group
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
// Flag for terminal groups
static const int FLAG_IS_TERMINAL = 0x10;
// Flag for shortcut targets presence
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
// Flag for sign of offset. If this flag is set, the offset value must be negated.
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
// Mask and flags for attribute address type selection.
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
@ -174,13 +208,13 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const in
static inline int attributeAddressSize(const uint8_t flags) { static inline int attributeAddressSize(const uint8_t flags) {
static const int ATTRIBUTE_ADDRESS_SHIFT = 4; static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
/* Note: this is a value-dependent optimization of what may probably be /* Note: this is a value-dependent optimization of what may probably be
more readably written this way: more readably written this way:
switch (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { switch (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3; case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
default: return 0; default: return 0;
} }
*/ */
@ -189,7 +223,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
int currentPos = pos; int currentPos = pos;
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos); uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags); currentPos += attributeAddressSize(flags);
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos); flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
} }
@ -199,7 +233,7 @@ static inline int skipExistingBigrams(const uint8_t *const dict, const int pos)
static inline int childrenAddressSize(const uint8_t flags) { static inline int childrenAddressSize(const uint8_t flags) {
static const int CHILDREN_ADDRESS_SHIFT = 6; static const int CHILDREN_ADDRESS_SHIFT = 6;
return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
/* See the note in attributeAddressSize. The same applies here */ /* See the note in attributeAddressSize. The same applies here */
} }
@ -212,12 +246,12 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
} }
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
} }
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
const int pos) { const int pos) {
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos); return pos + shortcutByteSize(dict, pos);
} else { } else {
return pos; return pos;
@ -226,7 +260,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
const int pos) { const int pos) {
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { if (FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos); return skipExistingBigrams(dict, pos);
} else { } else {
return pos; return pos;
@ -253,15 +287,15 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
const int pos) { const int pos) {
int offset = 0; int offset = 0;
switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { switch (MASK_GROUP_ADDRESS_TYPE & flags) {
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
offset = dict[pos]; offset = dict[pos];
break; break;
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
offset = dict[pos] << 8; offset = dict[pos] << 8;
offset += dict[pos + 1]; offset += dict[pos + 1];
break; break;
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
offset = dict[pos] << 16; offset = dict[pos] << 16;
offset += dict[pos + 1] << 8; offset += dict[pos + 1] << 8;
offset += dict[pos + 2]; offset += dict[pos + 2];
@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const u
} }
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
!= (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
} }
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
const uint8_t flags, int *pos) { const uint8_t flags, int *pos) {
int offset = 0; int offset = 0;
const int origin = *pos; const int origin = *pos;
switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
offset = dict[origin]; offset = dict[origin];
*pos = origin + 1; *pos = origin + 1;
break; break;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
offset = dict[origin] << 8; offset = dict[origin] << 8;
offset += dict[origin + 1]; offset += dict[origin + 1];
*pos = origin + 2; *pos = origin + 2;
break; break;
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
offset = dict[origin] << 16; offset = dict[origin] << 16;
offset += dict[origin + 1] << 8; offset += dict[origin + 1] << 8;
offset += dict[origin + 2]; offset += dict[origin + 2];
*pos = origin + 3; *pos = origin + 3;
break; break;
} }
if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
return origin - offset; return origin - offset;
} else { } else {
return origin + offset; return origin + offset;
@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// char within a node, so either we found our match in this node, or there is // char within a node, so either we found our match in this node, or there is
// no match and we can return NOT_VALID_WORD. So we will check that all the characters // no match and we can return NOT_VALID_WORD. So we will check that all the characters
// in this character group indeed do match. // in this character group indeed do match.
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { if (FLAG_HAS_MULTIPLE_CHARS & flags) {
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
while (NOT_A_CHARACTER != character) { while (NOT_A_CHARACTER != character) {
++wordPos; ++wordPos;
@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// If we don't match the length AND don't have children, then a word in the // If we don't match the length AND don't have children, then a word in the
// dictionary fully matches a prefix of the searched word but not the full word. // dictionary fully matches a prefix of the searched word but not the full word.
++wordPos; ++wordPos;
if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { if (FLAG_IS_TERMINAL & flags) {
if (wordPos == length) { if (wordPos == length) {
return charGroupPos; return charGroupPos;
} }
pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
} }
if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
return NOT_VALID_WORD; return NOT_VALID_WORD;
} }
// We have children and we are still shorter than the word we are searching for, so // We have children and we are still shorter than the word we are searching for, so
@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
break; break;
} else { } else {
// This chargroup does not match, so skip the remaining part and go to the next. // This chargroup does not match, so skip the remaining part and go to the next.
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = BinaryFormat::skipOtherCharacters(root, pos); pos = BinaryFormat::skipOtherCharacters(root, pos);
} }
pos = BinaryFormat::skipFrequency(flags, pos); pos = BinaryFormat::skipFrequency(flags, pos);
@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
// We found the address. Copy the rest of the word in the buffer and return // We found the address. Copy the rest of the word in the buffer and return
// the length. // the length.
outWord[wordPos] = character; outWord[wordPos] = character;
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { if (FLAG_HAS_MULTIPLE_CHARS & flags) {
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
// We count chars in order to avoid infinite loops if the file is broken or // We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug // if there is some other bug
@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
} }
// We need to skip past this char group, so skip any remaining chars after the // We need to skip past this char group, so skip any remaining chars after the
// first and possibly the frequency. // first and possibly the frequency.
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = skipOtherCharacters(root, pos); pos = skipOtherCharacters(root, pos);
} }
pos = skipFrequency(flags, pos); pos = skipFrequency(flags, pos);
@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
// The fact that this group has children is very important. Since we already know // The fact that this group has children is very important. Since we already know
// that this group does not match, if it has no children we know it is irrelevant // that this group does not match, if it has no children we know it is irrelevant
// to what we are searching for. // to what we are searching for.
const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
(UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); (MASK_GROUP_ADDRESS_TYPE & flags));
// We will write in `found' whether we have passed the children address we are // We will write in `found' whether we have passed the children address we are
// searching for. For example if we search for "beer", the children of b are less // searching for. For example if we search for "beer", the children of b are less
// than the address we are searching for and the children of c are greater. When we // than the address we are searching for and the children of c are greater. When we
@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
// We copy all the characters in this group to the buffer // We copy all the characters in this group to the buffer
outWord[wordPos] = lastChar; outWord[wordPos] = lastChar;
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
int32_t nextChar = int32_t nextChar =
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
int charCount = maxDepth; int charCount = maxDepth;

View File

@ -17,7 +17,7 @@
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H #ifndef LATINIME_TERMINAL_ATTRIBUTES_H
#define LATINIME_TERMINAL_ATTRIBUTES_H #define LATINIME_TERMINAL_ATTRIBUTES_H
#include "unigram_dictionary.h" #include "binary_format.h"
namespace latinime { namespace latinime {
@ -36,7 +36,7 @@ class TerminalAttributes {
public: public:
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict), ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
mPos(pos) { mPos(pos) {
mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS)); mHasNextShortcutTarget = (0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS));
} }
inline bool hasNextShortcutTarget() const { inline bool hasNextShortcutTarget() const {
@ -49,7 +49,7 @@ class TerminalAttributes {
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) { inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
mHasNextShortcutTarget = mHasNextShortcutTarget =
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT); 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
unsigned int i; unsigned int i;
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) { for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos); const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);

View File

@ -707,7 +707,7 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
const uint8_t *const root, const int startPos, const uint8_t *const root, const int startPos,
const uint16_t *const inWord, const int startInputIndex, const uint16_t *const inWord, const int startInputIndex,
int32_t *outNewWord, int *outInputIndex, int *outPos) { int32_t *outNewWord, int *outInputIndex, int *outPos) {
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
int pos = startPos; int pos = startPos;
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
int32_t baseChar = toBaseLowerCase(character); int32_t baseChar = toBaseLowerCase(character);
@ -780,7 +780,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord
// into inputIndex if there is a match. // into inputIndex if there is a match.
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
inputIndex, newWord, &inputIndex, &pos); inputIndex, newWord, &inputIndex, &pos);
if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
} }
@ -823,7 +823,7 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
if (hasMultipleChars) { if (hasMultipleChars) {
pos = BinaryFormat::skipOtherCharacters(root, pos); pos = BinaryFormat::skipOtherCharacters(root, pos);
} else { } else {
@ -871,8 +871,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
bool needsToInvokeOnTerminal = false; bool needsToInvokeOnTerminal = false;

View File

@ -32,39 +32,6 @@ class UnigramDictionary {
typedef struct { int first; int second; int replacement; } digraph_t; typedef struct { int first; int second; int replacement; } digraph_t;
public: public:
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
// Flag for single/multiple char group
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
// Flag for terminal groups
static const int FLAG_IS_TERMINAL = 0x10;
// Flag for shortcut targets presence
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
// Flag for sign of offset. If this flag is set, the offset value must be negated.
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
// Mask and flags for attribute address type selection.
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
// Error tolerances // Error tolerances
static const int DEFAULT_MAX_ERRORS = 2; static const int DEFAULT_MAX_ERRORS = 2;
static const int MAX_ERRORS_FOR_TWO_WORDS = 1; static const int MAX_ERRORS_FOR_TWO_WORDS = 1;