Merge "New dict format, step 1"
This commit is contained in:
commit
3f4385511b
2 changed files with 23 additions and 16 deletions
|
@ -16,8 +16,6 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <fcntl.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
||||||
|
@ -34,10 +32,12 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
|
||||||
{ 'o', 'e' },
|
{ 'o', 'e' },
|
||||||
{ 'u', 'e' } };
|
{ 'u', 'e' } };
|
||||||
|
|
||||||
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
|
// TODO: check the header
|
||||||
|
UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
|
||||||
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
||||||
const bool isLatestDictVersion)
|
const bool isLatestDictVersion)
|
||||||
: DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
|
: DICT_ROOT(streamStart),
|
||||||
|
MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
|
||||||
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
||||||
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
|
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
|
||||||
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
|
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
|
||||||
|
@ -363,7 +363,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
|
||||||
}
|
}
|
||||||
int rootPosition = ROOT_POS;
|
int rootPosition = ROOT_POS;
|
||||||
// Get the number of child of root, then increment the position
|
// Get the number of child of root, then increment the position
|
||||||
int childCount = Dictionary::getCount(DICT, &rootPosition);
|
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
|
|
||||||
mStackChildCount[0] = childCount;
|
mStackChildCount[0] = childCount;
|
||||||
|
@ -562,7 +562,7 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength,
|
||||||
const int excessivePos, const int transposedPos,int *nextLetters,
|
const int excessivePos, const int transposedPos,int *nextLetters,
|
||||||
const int nextLettersSize) {
|
const int nextLettersSize) {
|
||||||
int initialPosition = initialPos;
|
int initialPosition = initialPos;
|
||||||
const int count = Dictionary::getCount(DICT, &initialPosition);
|
const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
|
||||||
getWordsRec(count, initialPosition, 0,
|
getWordsRec(count, initialPosition, 0,
|
||||||
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
|
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
|
||||||
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
|
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
|
||||||
|
@ -770,8 +770,8 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
|
|
||||||
if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
|
if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
|
||||||
|
|
||||||
*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
|
*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
|
||||||
&childPosition, &terminal, &freq);
|
&c, &childPosition, &terminal, &freq);
|
||||||
|
|
||||||
const bool needsToTraverseChildrenNodes = childPosition != 0;
|
const bool needsToTraverseChildrenNodes = childPosition != 0;
|
||||||
|
|
||||||
|
@ -829,7 +829,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
*newTraverseAllNodes = true;
|
*newTraverseAllNodes = true;
|
||||||
}
|
}
|
||||||
// get the count of nodes and increment childAddress.
|
// get the count of nodes and increment childAddress.
|
||||||
*newCount = Dictionary::getCount(DICT, &childPosition);
|
*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
|
||||||
*newChildPosition = childPosition;
|
*newChildPosition = childPosition;
|
||||||
if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
|
if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
|
||||||
return needsToTraverseChildrenNodes;
|
return needsToTraverseChildrenNodes;
|
||||||
|
@ -838,7 +838,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
|
inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
|
||||||
unsigned short *word) {
|
unsigned short *word) {
|
||||||
int pos = ROOT_POS;
|
int pos = ROOT_POS;
|
||||||
int count = Dictionary::getCount(DICT, &pos);
|
int count = Dictionary::getCount(DICT_ROOT, &pos);
|
||||||
int maxFreq = 0;
|
int maxFreq = 0;
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
|
unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
|
||||||
|
@ -894,8 +894,8 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
|
||||||
const int inputIndex = startInputIndex + depth;
|
const int inputIndex = startInputIndex + depth;
|
||||||
const int *currentChars = getInputCharsAt(inputIndex);
|
const int *currentChars = getInputCharsAt(inputIndex);
|
||||||
unsigned short c;
|
unsigned short c;
|
||||||
*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
|
*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
|
||||||
newChildPosition, newTerminal, newFreq);
|
&c, newChildPosition, newTerminal, newFreq);
|
||||||
const unsigned int inputC = currentChars[0];
|
const unsigned int inputC = currentChars[0];
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
assert(inputC <= U_SHORT_MAX);
|
assert(inputC <= U_SHORT_MAX);
|
||||||
|
@ -912,7 +912,7 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (hasChild) {
|
if (hasChild) {
|
||||||
*newCount = Dictionary::getCount(DICT, newChildPosition);
|
*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -17,9 +17,14 @@
|
||||||
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
|
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
|
||||||
#define LATINIME_UNIGRAM_DICTIONARY_H
|
#define LATINIME_UNIGRAM_DICTIONARY_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "proximity_info.h"
|
#include "proximity_info.h"
|
||||||
|
|
||||||
|
#ifndef NULL
|
||||||
|
#define NULL 0
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class UnigramDictionary {
|
class UnigramDictionary {
|
||||||
|
@ -31,8 +36,9 @@ class UnigramDictionary {
|
||||||
} ProximityType;
|
} ProximityType;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
|
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
|
||||||
int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
|
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
||||||
|
const bool isLatestDictVersion);
|
||||||
int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
|
int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
|
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
|
||||||
unsigned short *outWords, int *frequencies);
|
unsigned short *outWords, int *frequencies);
|
||||||
|
@ -104,7 +110,8 @@ private:
|
||||||
inline const int* getInputCharsAt(const int index) const {
|
inline const int* getInputCharsAt(const int index) const {
|
||||||
return mInputCodes + (index * MAX_PROXIMITY_CHARS);
|
return mInputCodes + (index * MAX_PROXIMITY_CHARS);
|
||||||
}
|
}
|
||||||
const unsigned char *DICT;
|
|
||||||
|
const uint8_t* const DICT_ROOT;
|
||||||
const int MAX_WORD_LENGTH;
|
const int MAX_WORD_LENGTH;
|
||||||
const int MAX_WORDS;
|
const int MAX_WORDS;
|
||||||
const int MAX_PROXIMITY_CHARS;
|
const int MAX_PROXIMITY_CHARS;
|
||||||
|
|
Loading…
Reference in a new issue