parent
e2ac3932e3
commit
de8a9a8227
|
@ -29,8 +29,6 @@ class BigramDictionary {
|
||||||
BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions);
|
BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions);
|
||||||
int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
|
int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
|
||||||
unsigned short *outWords, int *frequencies, int *outputTypes) const;
|
unsigned short *outWords, int *frequencies, int *outputTypes) const;
|
||||||
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
|
|
||||||
const bool forceLowerCaseSearch) const;
|
|
||||||
void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
|
void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
|
||||||
std::map<int, int> *map, uint8_t *filter) const;
|
std::map<int, int> *map, uint8_t *filter) const;
|
||||||
bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
|
bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
|
||||||
|
@ -45,6 +43,8 @@ class BigramDictionary {
|
||||||
bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; }
|
bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; }
|
||||||
bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; }
|
bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; }
|
||||||
bool checkFirstCharacter(unsigned short *word, int *inputCodes) const;
|
bool checkFirstCharacter(unsigned short *word, int *inputCodes) const;
|
||||||
|
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
|
||||||
|
const bool forceLowerCaseSearch) const;
|
||||||
|
|
||||||
const unsigned char *DICT;
|
const unsigned char *DICT;
|
||||||
const int MAX_WORD_LENGTH;
|
const int MAX_WORD_LENGTH;
|
||||||
|
|
|
@ -61,13 +61,6 @@ class BinaryFormat {
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||||
|
|
||||||
private:
|
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
|
||||||
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
|
||||||
const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
|
|
||||||
const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
|
|
||||||
|
|
||||||
public:
|
|
||||||
const static int UNKNOWN_FORMAT = -1;
|
const static int UNKNOWN_FORMAT = -1;
|
||||||
// Originally, format version 1 had a 16-bit magic number, then the version number `01'
|
// Originally, format version 1 had a 16-bit magic number, then the version number `01'
|
||||||
// then options that must be 0. Hence the first 32-bits of the format are always as follow
|
// then options that must be 0. Hence the first 32-bits of the format are always as follow
|
||||||
|
@ -94,7 +87,6 @@ class BinaryFormat {
|
||||||
static int skipFrequency(const uint8_t flags, const int pos);
|
static int skipFrequency(const uint8_t flags, const int pos);
|
||||||
static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
|
static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
|
static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
|
|
||||||
static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
|
static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
|
||||||
const int pos);
|
const int pos);
|
||||||
static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
|
static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
|
@ -118,6 +110,13 @@ class BinaryFormat {
|
||||||
REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
|
REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
|
||||||
};
|
};
|
||||||
const static unsigned int NO_FLAGS = 0;
|
const static unsigned int NO_FLAGS = 0;
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||||
|
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
|
||||||
|
const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
|
||||||
|
static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
};
|
};
|
||||||
|
|
||||||
inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
|
inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
|
||||||
|
|
|
@ -889,7 +889,7 @@ static int compare_pair_capital(const void *a, const void *b) {
|
||||||
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
|
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned short latin_tolower(unsigned short c) {
|
unsigned short latin_tolower(const unsigned short c) {
|
||||||
struct LatinCapitalSmallPair *p =
|
struct LatinCapitalSmallPair *p =
|
||||||
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
|
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
|
||||||
sizeof(SORTED_CHAR_MAP) / sizeof(SORTED_CHAR_MAP[0]),
|
sizeof(SORTED_CHAR_MAP) / sizeof(SORTED_CHAR_MAP[0]),
|
||||||
|
|
|
@ -17,21 +17,23 @@
|
||||||
#ifndef LATINIME_CHAR_UTILS_H
|
#ifndef LATINIME_CHAR_UTILS_H
|
||||||
#define LATINIME_CHAR_UTILS_H
|
#define LATINIME_CHAR_UTILS_H
|
||||||
|
|
||||||
|
#include <cctype>
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
inline static int isAsciiUpper(unsigned short c) {
|
inline static bool isAsciiUpper(unsigned short c) {
|
||||||
return c >= 'A' && c <= 'Z';
|
return isupper(static_cast<int>(c)) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static unsigned short toAsciiLower(unsigned short c) {
|
inline static unsigned short toAsciiLower(unsigned short c) {
|
||||||
return c - 'A' + 'a';
|
return c - 'A' + 'a';
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static int isAscii(unsigned short c) {
|
inline static bool isAscii(unsigned short c) {
|
||||||
return c <= 127;
|
return isascii(static_cast<int>(c)) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned short latin_tolower(unsigned short c);
|
unsigned short latin_tolower(const unsigned short c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
||||||
|
|
Loading…
Reference in New Issue