Move flags belonging to BinaryFormat to the right place.
These masks and flags are constants that are an integral part of the format. They belong in BinaryFormat and have nothing to do in UnigramDictionary. This needs I6751dda4 to not break the build. Bug: 6429243 Change-Id: Ic1c842b3245f7fdc25aa8d1459c5bb07b262e265 main
parent
8ec8c5feda
commit
195605084e
|
@ -126,7 +126,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
|
||||||
|
|
||||||
// codesSize == 0 means we are trying to find bigram predictions.
|
// codesSize == 0 means we are trying to find bigram predictions.
|
||||||
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
|
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
|
||||||
const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
||||||
// Due to space constraints, the frequency for bigrams is approximate - the lower the
|
// Due to space constraints, the frequency for bigrams is approximate - the lower the
|
||||||
// unigram frequency, the worse the precision. The theoretical maximum error in
|
// unigram frequency, the worse the precision. The theoretical maximum error in
|
||||||
// resulting frequency is 8 - although in practice it's never bigger than 3 or 4
|
// resulting frequency is 8 - although in practice it's never bigger than 3 or 4
|
||||||
|
@ -139,7 +139,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
|
||||||
++bigramCount;
|
++bigramCount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||||
return bigramCount;
|
return bigramCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -154,8 +154,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
|
||||||
|
|
||||||
if (NOT_VALID_WORD == pos) return 0;
|
if (NOT_VALID_WORD == pos) return 0;
|
||||||
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0;
|
if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
|
||||||
if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) {
|
if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
|
||||||
BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
} else {
|
} else {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
|
@ -182,12 +182,12 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
|
||||||
int bigramFlags;
|
int bigramFlags;
|
||||||
do {
|
do {
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||||
&pos);
|
&pos);
|
||||||
(*map)[bigramPos] = frequency;
|
(*map)[bigramPos] = frequency;
|
||||||
setInFilter(filter, bigramPos);
|
setInFilter(filter, bigramPos);
|
||||||
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
} while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
|
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
|
||||||
|
@ -223,7 +223,7 @@ bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const in
|
||||||
if (bigramPos == nextWordPos) {
|
if (bigramPos == nextWordPos) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,13 +18,47 @@
|
||||||
#define LATINIME_BINARY_FORMAT_H
|
#define LATINIME_BINARY_FORMAT_H
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#include <map>
|
||||||
#include "bloom_filter.h"
|
#include "bloom_filter.h"
|
||||||
#include "char_utils.h"
|
#include "char_utils.h"
|
||||||
#include "unigram_dictionary.h"
|
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class BinaryFormat {
|
class BinaryFormat {
|
||||||
|
public:
|
||||||
|
// Mask and flags for children address type selection.
|
||||||
|
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
||||||
|
|
||||||
|
// Flag for single/multiple char group
|
||||||
|
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
||||||
|
|
||||||
|
// Flag for terminal groups
|
||||||
|
static const int FLAG_IS_TERMINAL = 0x10;
|
||||||
|
|
||||||
|
// Flag for shortcut targets presence
|
||||||
|
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
|
// Flag for bigram presence
|
||||||
|
static const int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
|
|
||||||
|
// Attribute (bigram/shortcut) related flags:
|
||||||
|
// Flag for presence of more attributes
|
||||||
|
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
||||||
|
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
||||||
|
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||||
|
|
||||||
|
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
||||||
|
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
||||||
|
|
||||||
|
// Mask and flags for attribute address type selection.
|
||||||
|
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||||
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
@ -174,13 +208,13 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const in
|
||||||
|
|
||||||
static inline int attributeAddressSize(const uint8_t flags) {
|
static inline int attributeAddressSize(const uint8_t flags) {
|
||||||
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
|
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
|
||||||
return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
||||||
/* Note: this is a value-dependent optimization of what may probably be
|
/* Note: this is a value-dependent optimization of what may probably be
|
||||||
more readably written this way:
|
more readably written this way:
|
||||||
switch (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
switch (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
|
||||||
default: return 0;
|
default: return 0;
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
@ -189,7 +223,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
|
||||||
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
|
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
|
||||||
int currentPos = pos;
|
int currentPos = pos;
|
||||||
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
|
while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
|
||||||
currentPos += attributeAddressSize(flags);
|
currentPos += attributeAddressSize(flags);
|
||||||
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
}
|
}
|
||||||
|
@ -199,7 +233,7 @@ static inline int skipExistingBigrams(const uint8_t *const dict, const int pos)
|
||||||
|
|
||||||
static inline int childrenAddressSize(const uint8_t flags) {
|
static inline int childrenAddressSize(const uint8_t flags) {
|
||||||
static const int CHILDREN_ADDRESS_SHIFT = 6;
|
static const int CHILDREN_ADDRESS_SHIFT = 6;
|
||||||
return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
|
return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
|
||||||
/* See the note in attributeAddressSize. The same applies here */
|
/* See the note in attributeAddressSize. The same applies here */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -212,12 +246,12 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
||||||
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
||||||
return pos + shortcutByteSize(dict, pos);
|
return pos + shortcutByteSize(dict, pos);
|
||||||
} else {
|
} else {
|
||||||
return pos;
|
return pos;
|
||||||
|
@ -226,7 +260,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
|
||||||
|
|
||||||
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
if (FLAG_HAS_BIGRAMS & flags) {
|
||||||
return skipExistingBigrams(dict, pos);
|
return skipExistingBigrams(dict, pos);
|
||||||
} else {
|
} else {
|
||||||
return pos;
|
return pos;
|
||||||
|
@ -253,15 +287,15 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
|
||||||
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
|
switch (MASK_GROUP_ADDRESS_TYPE & flags) {
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
|
case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
|
||||||
offset = dict[pos];
|
offset = dict[pos];
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
|
case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
|
||||||
offset = dict[pos] << 8;
|
offset = dict[pos] << 8;
|
||||||
offset += dict[pos + 1];
|
offset += dict[pos + 1];
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
|
case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
|
||||||
offset = dict[pos] << 16;
|
offset = dict[pos] << 16;
|
||||||
offset += dict[pos + 1] << 8;
|
offset += dict[pos + 1] << 8;
|
||||||
offset += dict[pos + 2];
|
offset += dict[pos + 2];
|
||||||
|
@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const u
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
|
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
|
||||||
return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
|
||||||
!= (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
|
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
|
||||||
const uint8_t flags, int *pos) {
|
const uint8_t flags, int *pos) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
const int origin = *pos;
|
const int origin = *pos;
|
||||||
switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||||
offset = dict[origin];
|
offset = dict[origin];
|
||||||
*pos = origin + 1;
|
*pos = origin + 1;
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||||
offset = dict[origin] << 8;
|
offset = dict[origin] << 8;
|
||||||
offset += dict[origin + 1];
|
offset += dict[origin + 1];
|
||||||
*pos = origin + 2;
|
*pos = origin + 2;
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||||
offset = dict[origin] << 16;
|
offset = dict[origin] << 16;
|
||||||
offset += dict[origin + 1] << 8;
|
offset += dict[origin + 1] << 8;
|
||||||
offset += dict[origin + 2];
|
offset += dict[origin + 2];
|
||||||
*pos = origin + 3;
|
*pos = origin + 3;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
|
if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
|
||||||
return origin - offset;
|
return origin - offset;
|
||||||
} else {
|
} else {
|
||||||
return origin + offset;
|
return origin + offset;
|
||||||
|
@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
// char within a node, so either we found our match in this node, or there is
|
// char within a node, so either we found our match in this node, or there is
|
||||||
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
||||||
// in this character group indeed does match.
|
// in this character group indeed does match.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
while (NOT_A_CHARACTER != character) {
|
while (NOT_A_CHARACTER != character) {
|
||||||
++wordPos;
|
++wordPos;
|
||||||
|
@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
// If we don't match the length AND don't have children, then a word in the
|
// If we don't match the length AND don't have children, then a word in the
|
||||||
// dictionary fully matches a prefix of the searched word but not the full word.
|
// dictionary fully matches a prefix of the searched word but not the full word.
|
||||||
++wordPos;
|
++wordPos;
|
||||||
if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
|
if (FLAG_IS_TERMINAL & flags) {
|
||||||
if (wordPos == length) {
|
if (wordPos == length) {
|
||||||
return charGroupPos;
|
return charGroupPos;
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
|
pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
|
||||||
}
|
}
|
||||||
if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
|
||||||
== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
|
|
||||||
return NOT_VALID_WORD;
|
return NOT_VALID_WORD;
|
||||||
}
|
}
|
||||||
// We have children and we are still shorter than the word we are searching for, so
|
// We have children and we are still shorter than the word we are searching for, so
|
||||||
|
@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// This chargroup does not match, so skip the remaining part and go to the next.
|
// This chargroup does not match, so skip the remaining part and go to the next.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipFrequency(flags, pos);
|
||||||
|
@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
// We found the address. Copy the rest of the word in the buffer and return
|
// We found the address. Copy the rest of the word in the buffer and return
|
||||||
// the length.
|
// the length.
|
||||||
outWord[wordPos] = character;
|
outWord[wordPos] = character;
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
|
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
|
||||||
// We count chars in order to avoid infinite loops if the file is broken or
|
// We count chars in order to avoid infinite loops if the file is broken or
|
||||||
// if there is some other bug
|
// if there is some other bug
|
||||||
|
@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
}
|
}
|
||||||
// We need to skip past this char group, so skip any remaining chars after the
|
// We need to skip past this char group, so skip any remaining chars after the
|
||||||
// first and possibly the frequency.
|
// first and possibly the frequency.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = skipOtherCharacters(root, pos);
|
pos = skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = skipFrequency(flags, pos);
|
pos = skipFrequency(flags, pos);
|
||||||
|
@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
// The fact that this group has children is very important. Since we already know
|
// The fact that this group has children is very important. Since we already know
|
||||||
// that this group does not match, if it has no children we know it is irrelevant
|
// that this group does not match, if it has no children we know it is irrelevant
|
||||||
// to what we are searching for.
|
// to what we are searching for.
|
||||||
const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
|
const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
|
||||||
(UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
|
(MASK_GROUP_ADDRESS_TYPE & flags));
|
||||||
// We will write in `found' whether we have passed the children address we are
|
// We will write in `found' whether we have passed the children address we are
|
||||||
// searching for. For example if we search for "beer", the children of b are less
|
// searching for. For example if we search for "beer", the children of b are less
|
||||||
// than the address we are searching for and the children of c are greater. When we
|
// than the address we are searching for and the children of c are greater. When we
|
||||||
|
@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
||||||
// We copy all the characters in this group to the buffer
|
// We copy all the characters in this group to the buffer
|
||||||
outWord[wordPos] = lastChar;
|
outWord[wordPos] = lastChar;
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
|
||||||
int32_t nextChar =
|
int32_t nextChar =
|
||||||
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
||||||
int charCount = maxDepth;
|
int charCount = maxDepth;
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
|
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
|
||||||
#define LATINIME_TERMINAL_ATTRIBUTES_H
|
#define LATINIME_TERMINAL_ATTRIBUTES_H
|
||||||
|
|
||||||
#include "unigram_dictionary.h"
|
#include "binary_format.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class TerminalAttributes {
|
||||||
public:
|
public:
|
||||||
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
|
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
|
||||||
mPos(pos) {
|
mPos(pos) {
|
||||||
mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS));
|
mHasNextShortcutTarget = (0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool hasNextShortcutTarget() const {
|
inline bool hasNextShortcutTarget() const {
|
||||||
|
@ -49,7 +49,7 @@ class TerminalAttributes {
|
||||||
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
|
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
|
||||||
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
||||||
mHasNextShortcutTarget =
|
mHasNextShortcutTarget =
|
||||||
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
|
0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
||||||
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
||||||
|
|
|
@ -707,7 +707,7 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
|
||||||
const uint8_t *const root, const int startPos,
|
const uint8_t *const root, const int startPos,
|
||||||
const uint16_t *const inWord, const int startInputIndex,
|
const uint16_t *const inWord, const int startInputIndex,
|
||||||
int32_t *outNewWord, int *outInputIndex, int *outPos) {
|
int32_t *outNewWord, int *outInputIndex, int *outPos) {
|
||||||
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
int pos = startPos;
|
int pos = startPos;
|
||||||
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
int32_t baseChar = toBaseLowerCase(character);
|
int32_t baseChar = toBaseLowerCase(character);
|
||||||
|
@ -780,7 +780,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord
|
||||||
// into inputIndex if there is a match.
|
// into inputIndex if there is a match.
|
||||||
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
||||||
inputIndex, newWord, &inputIndex, &pos);
|
inputIndex, newWord, &inputIndex, &pos);
|
||||||
if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
|
if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
|
||||||
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
||||||
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
|
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
|
||||||
}
|
}
|
||||||
|
@ -823,7 +823,7 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
if (hasMultipleChars) {
|
if (hasMultipleChars) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
} else {
|
} else {
|
||||||
|
@ -871,8 +871,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
||||||
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
|
||||||
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
|
const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
||||||
|
|
||||||
bool needsToInvokeOnTerminal = false;
|
bool needsToInvokeOnTerminal = false;
|
||||||
|
|
||||||
|
|
|
@ -32,39 +32,6 @@ class UnigramDictionary {
|
||||||
typedef struct { int first; int second; int replacement; } digraph_t;
|
typedef struct { int first; int second; int replacement; } digraph_t;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Mask and flags for children address type selection.
|
|
||||||
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
|
||||||
|
|
||||||
// Flag for single/multiple char group
|
|
||||||
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
|
||||||
|
|
||||||
// Flag for terminal groups
|
|
||||||
static const int FLAG_IS_TERMINAL = 0x10;
|
|
||||||
|
|
||||||
// Flag for shortcut targets presence
|
|
||||||
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
|
||||||
// Flag for bigram presence
|
|
||||||
static const int FLAG_HAS_BIGRAMS = 0x04;
|
|
||||||
|
|
||||||
// Attribute (bigram/shortcut) related flags:
|
|
||||||
// Flag for presence of more attributes
|
|
||||||
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
|
||||||
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
|
||||||
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
|
||||||
|
|
||||||
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
|
||||||
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
|
||||||
|
|
||||||
// Mask and flags for attribute address type selection.
|
|
||||||
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
|
||||||
|
|
||||||
// Error tolerances
|
// Error tolerances
|
||||||
static const int DEFAULT_MAX_ERRORS = 2;
|
static const int DEFAULT_MAX_ERRORS = 2;
|
||||||
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
|
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
|
||||||
|
|
Loading…
Reference in New Issue