am 47e6cf36: am 19560508: Move flags belonging to BinaryFormat to the right place.
* commit '47e6cf3695ab15b042d85f0d8b55d1eb58e14223': Move flags belonging to BinaryFormat to the right place.
main
commit
9abcf50691
|
@ -126,7 +126,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
|
||||||
|
|
||||||
// codesSize == 0 means we are trying to find bigram predictions.
|
// codesSize == 0 means we are trying to find bigram predictions.
|
||||||
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
|
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
|
||||||
const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
||||||
// Due to space constraints, the frequency for bigrams is approximate - the lower the
|
// Due to space constraints, the frequency for bigrams is approximate - the lower the
|
||||||
// unigram frequency, the worse the precision. The theoritical maximum error in
|
// unigram frequency, the worse the precision. The theoritical maximum error in
|
||||||
// resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
|
// resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
|
||||||
|
@ -139,7 +139,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
|
||||||
++bigramCount;
|
++bigramCount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||||
return bigramCount;
|
return bigramCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -154,8 +154,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
|
||||||
|
|
||||||
if (NOT_VALID_WORD == pos) return 0;
|
if (NOT_VALID_WORD == pos) return 0;
|
||||||
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0;
|
if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
|
||||||
if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) {
|
if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
|
||||||
BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
} else {
|
} else {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
|
@ -182,12 +182,12 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
|
||||||
int bigramFlags;
|
int bigramFlags;
|
||||||
do {
|
do {
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||||
&pos);
|
&pos);
|
||||||
(*map)[bigramPos] = frequency;
|
(*map)[bigramPos] = frequency;
|
||||||
setInFilter(filter, bigramPos);
|
setInFilter(filter, bigramPos);
|
||||||
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
} while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
|
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
|
||||||
|
@ -223,7 +223,7 @@ bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const in
|
||||||
if (bigramPos == nextWordPos) {
|
if (bigramPos == nextWordPos) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,13 +18,47 @@
|
||||||
#define LATINIME_BINARY_FORMAT_H
|
#define LATINIME_BINARY_FORMAT_H
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#include <map>
|
||||||
#include "bloom_filter.h"
|
#include "bloom_filter.h"
|
||||||
#include "char_utils.h"
|
#include "char_utils.h"
|
||||||
#include "unigram_dictionary.h"
|
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class BinaryFormat {
|
class BinaryFormat {
|
||||||
|
public:
|
||||||
|
// Mask and flags for children address type selection.
|
||||||
|
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
||||||
|
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
||||||
|
|
||||||
|
// Flag for single/multiple char group
|
||||||
|
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
||||||
|
|
||||||
|
// Flag for terminal groups
|
||||||
|
static const int FLAG_IS_TERMINAL = 0x10;
|
||||||
|
|
||||||
|
// Flag for shortcut targets presence
|
||||||
|
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
|
// Flag for bigram presence
|
||||||
|
static const int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
|
|
||||||
|
// Attribute (bigram/shortcut) related flags:
|
||||||
|
// Flag for presence of more attributes
|
||||||
|
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
||||||
|
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
||||||
|
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||||
|
|
||||||
|
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
||||||
|
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
||||||
|
|
||||||
|
// Mask and flags for attribute address type selection.
|
||||||
|
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||||
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||||
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
@ -174,13 +208,13 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const in
|
||||||
|
|
||||||
static inline int attributeAddressSize(const uint8_t flags) {
|
static inline int attributeAddressSize(const uint8_t flags) {
|
||||||
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
|
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
|
||||||
return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
||||||
/* Note: this is a value-dependant optimization of what may probably be
|
/* Note: this is a value-dependant optimization of what may probably be
|
||||||
more readably written this way:
|
more readably written this way:
|
||||||
switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
|
||||||
default: return 0;
|
default: return 0;
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
@ -189,7 +223,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
|
||||||
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
|
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
|
||||||
int currentPos = pos;
|
int currentPos = pos;
|
||||||
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
|
while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
|
||||||
currentPos += attributeAddressSize(flags);
|
currentPos += attributeAddressSize(flags);
|
||||||
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
}
|
}
|
||||||
|
@ -199,7 +233,7 @@ static inline int skipExistingBigrams(const uint8_t *const dict, const int pos)
|
||||||
|
|
||||||
static inline int childrenAddressSize(const uint8_t flags) {
|
static inline int childrenAddressSize(const uint8_t flags) {
|
||||||
static const int CHILDREN_ADDRESS_SHIFT = 6;
|
static const int CHILDREN_ADDRESS_SHIFT = 6;
|
||||||
return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
|
return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
|
||||||
/* See the note in attributeAddressSize. The same applies here */
|
/* See the note in attributeAddressSize. The same applies here */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -212,12 +246,12 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
||||||
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
||||||
return pos + shortcutByteSize(dict, pos);
|
return pos + shortcutByteSize(dict, pos);
|
||||||
} else {
|
} else {
|
||||||
return pos;
|
return pos;
|
||||||
|
@ -226,7 +260,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
|
||||||
|
|
||||||
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
if (FLAG_HAS_BIGRAMS & flags) {
|
||||||
return skipExistingBigrams(dict, pos);
|
return skipExistingBigrams(dict, pos);
|
||||||
} else {
|
} else {
|
||||||
return pos;
|
return pos;
|
||||||
|
@ -253,15 +287,15 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
|
||||||
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
|
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
|
switch (MASK_GROUP_ADDRESS_TYPE & flags) {
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
|
case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
|
||||||
offset = dict[pos];
|
offset = dict[pos];
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
|
case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
|
||||||
offset = dict[pos] << 8;
|
offset = dict[pos] << 8;
|
||||||
offset += dict[pos + 1];
|
offset += dict[pos + 1];
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
|
case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
|
||||||
offset = dict[pos] << 16;
|
offset = dict[pos] << 16;
|
||||||
offset += dict[pos + 1] << 8;
|
offset += dict[pos + 1] << 8;
|
||||||
offset += dict[pos + 2];
|
offset += dict[pos + 2];
|
||||||
|
@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const u
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
|
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
|
||||||
return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
|
||||||
!= (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
|
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
|
||||||
const uint8_t flags, int *pos) {
|
const uint8_t flags, int *pos) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
const int origin = *pos;
|
const int origin = *pos;
|
||||||
switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||||
offset = dict[origin];
|
offset = dict[origin];
|
||||||
*pos = origin + 1;
|
*pos = origin + 1;
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||||
offset = dict[origin] << 8;
|
offset = dict[origin] << 8;
|
||||||
offset += dict[origin + 1];
|
offset += dict[origin + 1];
|
||||||
*pos = origin + 2;
|
*pos = origin + 2;
|
||||||
break;
|
break;
|
||||||
case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||||
offset = dict[origin] << 16;
|
offset = dict[origin] << 16;
|
||||||
offset += dict[origin + 1] << 8;
|
offset += dict[origin + 1] << 8;
|
||||||
offset += dict[origin + 2];
|
offset += dict[origin + 2];
|
||||||
*pos = origin + 3;
|
*pos = origin + 3;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
|
if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
|
||||||
return origin - offset;
|
return origin - offset;
|
||||||
} else {
|
} else {
|
||||||
return origin + offset;
|
return origin + offset;
|
||||||
|
@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
// char within a node, so either we found our match in this node, or there is
|
// char within a node, so either we found our match in this node, or there is
|
||||||
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
||||||
// in this character group indeed does match.
|
// in this character group indeed does match.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
while (NOT_A_CHARACTER != character) {
|
while (NOT_A_CHARACTER != character) {
|
||||||
++wordPos;
|
++wordPos;
|
||||||
|
@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
// If we don't match the length AND don't have children, then a word in the
|
// If we don't match the length AND don't have children, then a word in the
|
||||||
// dictionary fully matches a prefix of the searched word but not the full word.
|
// dictionary fully matches a prefix of the searched word but not the full word.
|
||||||
++wordPos;
|
++wordPos;
|
||||||
if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
|
if (FLAG_IS_TERMINAL & flags) {
|
||||||
if (wordPos == length) {
|
if (wordPos == length) {
|
||||||
return charGroupPos;
|
return charGroupPos;
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
|
pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
|
||||||
}
|
}
|
||||||
if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
|
||||||
== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
|
|
||||||
return NOT_VALID_WORD;
|
return NOT_VALID_WORD;
|
||||||
}
|
}
|
||||||
// We have children and we are still shorter than the word we are searching for, so
|
// We have children and we are still shorter than the word we are searching for, so
|
||||||
|
@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// This chargroup does not match, so skip the remaining part and go to the next.
|
// This chargroup does not match, so skip the remaining part and go to the next.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipFrequency(flags, pos);
|
||||||
|
@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
// We found the address. Copy the rest of the word in the buffer and return
|
// We found the address. Copy the rest of the word in the buffer and return
|
||||||
// the length.
|
// the length.
|
||||||
outWord[wordPos] = character;
|
outWord[wordPos] = character;
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
|
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
|
||||||
// We count chars in order to avoid infinite loops if the file is broken or
|
// We count chars in order to avoid infinite loops if the file is broken or
|
||||||
// if there is some other bug
|
// if there is some other bug
|
||||||
|
@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
}
|
}
|
||||||
// We need to skip past this char group, so skip any remaining chars after the
|
// We need to skip past this char group, so skip any remaining chars after the
|
||||||
// first and possibly the frequency.
|
// first and possibly the frequency.
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
pos = skipOtherCharacters(root, pos);
|
pos = skipOtherCharacters(root, pos);
|
||||||
}
|
}
|
||||||
pos = skipFrequency(flags, pos);
|
pos = skipFrequency(flags, pos);
|
||||||
|
@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
// The fact that this group has children is very important. Since we already know
|
// The fact that this group has children is very important. Since we already know
|
||||||
// that this group does not match, if it has no children we know it is irrelevant
|
// that this group does not match, if it has no children we know it is irrelevant
|
||||||
// to what we are searching for.
|
// to what we are searching for.
|
||||||
const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
|
const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
|
||||||
(UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
|
(MASK_GROUP_ADDRESS_TYPE & flags));
|
||||||
// We will write in `found' whether we have passed the children address we are
|
// We will write in `found' whether we have passed the children address we are
|
||||||
// searching for. For example if we search for "beer", the children of b are less
|
// searching for. For example if we search for "beer", the children of b are less
|
||||||
// than the address we are searching for and the children of c are greater. When we
|
// than the address we are searching for and the children of c are greater. When we
|
||||||
|
@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
|
||||||
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
||||||
// We copy all the characters in this group to the buffer
|
// We copy all the characters in this group to the buffer
|
||||||
outWord[wordPos] = lastChar;
|
outWord[wordPos] = lastChar;
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
|
if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
|
||||||
int32_t nextChar =
|
int32_t nextChar =
|
||||||
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
|
||||||
int charCount = maxDepth;
|
int charCount = maxDepth;
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
|
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
|
||||||
#define LATINIME_TERMINAL_ATTRIBUTES_H
|
#define LATINIME_TERMINAL_ATTRIBUTES_H
|
||||||
|
|
||||||
#include "unigram_dictionary.h"
|
#include "binary_format.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class TerminalAttributes {
|
||||||
public:
|
public:
|
||||||
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
|
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
|
||||||
mPos(pos) {
|
mPos(pos) {
|
||||||
mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS));
|
mHasNextShortcutTarget = (0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool hasNextShortcutTarget() const {
|
inline bool hasNextShortcutTarget() const {
|
||||||
|
@ -49,7 +49,7 @@ class TerminalAttributes {
|
||||||
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
|
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
|
||||||
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
||||||
mHasNextShortcutTarget =
|
mHasNextShortcutTarget =
|
||||||
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
|
0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
||||||
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
||||||
|
|
|
@ -707,7 +707,7 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
|
||||||
const uint8_t *const root, const int startPos,
|
const uint8_t *const root, const int startPos,
|
||||||
const uint16_t *const inWord, const int startInputIndex,
|
const uint16_t *const inWord, const int startInputIndex,
|
||||||
int32_t *outNewWord, int *outInputIndex, int *outPos) {
|
int32_t *outNewWord, int *outInputIndex, int *outPos) {
|
||||||
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
int pos = startPos;
|
int pos = startPos;
|
||||||
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
int32_t baseChar = toBaseLowerCase(character);
|
int32_t baseChar = toBaseLowerCase(character);
|
||||||
|
@ -780,7 +780,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord
|
||||||
// into inputIndex if there is a match.
|
// into inputIndex if there is a match.
|
||||||
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
|
||||||
inputIndex, newWord, &inputIndex, &pos);
|
inputIndex, newWord, &inputIndex, &pos);
|
||||||
if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
|
if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
|
||||||
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
|
||||||
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
|
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
|
||||||
}
|
}
|
||||||
|
@ -823,7 +823,7 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
if (hasMultipleChars) {
|
if (hasMultipleChars) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
} else {
|
} else {
|
||||||
|
@ -871,8 +871,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
|
||||||
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
|
||||||
const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
|
const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
||||||
|
|
||||||
bool needsToInvokeOnTerminal = false;
|
bool needsToInvokeOnTerminal = false;
|
||||||
|
|
||||||
|
|
|
@ -32,39 +32,6 @@ class UnigramDictionary {
|
||||||
typedef struct { int first; int second; int replacement; } digraph_t;
|
typedef struct { int first; int second; int replacement; } digraph_t;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Mask and flags for children address type selection.
|
|
||||||
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
|
||||||
|
|
||||||
// Flag for single/multiple char group
|
|
||||||
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
|
||||||
|
|
||||||
// Flag for terminal groups
|
|
||||||
static const int FLAG_IS_TERMINAL = 0x10;
|
|
||||||
|
|
||||||
// Flag for shortcut targets presence
|
|
||||||
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
|
||||||
// Flag for bigram presence
|
|
||||||
static const int FLAG_HAS_BIGRAMS = 0x04;
|
|
||||||
|
|
||||||
// Attribute (bigram/shortcut) related flags:
|
|
||||||
// Flag for presence of more attributes
|
|
||||||
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
|
||||||
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
|
||||||
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
|
||||||
|
|
||||||
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
|
||||||
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
|
||||||
|
|
||||||
// Mask and flags for attribute address type selection.
|
|
||||||
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
|
||||||
|
|
||||||
// Error tolerances
|
// Error tolerances
|
||||||
static const int DEFAULT_MAX_ERRORS = 2;
|
static const int DEFAULT_MAX_ERRORS = 2;
|
||||||
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
|
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
|
||||||
|
|
Loading…
Reference in New Issue