LatinIME/native/src/unigram_dictionary.h

179 lines
8.4 KiB
C
Raw Normal View History

/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H
#include <stdint.h>
#include "defines.h"
#include "proximity_info.h"
#ifndef NULL
#define NULL 0
#endif
namespace latinime {
class UnigramDictionary {
public:
#ifdef NEW_DICTIONARY_FORMAT
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
// Flag for single/multiple char group
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
// Flag for terminal groups
static const int FLAG_IS_TERMINAL = 0x10;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
// Flag for sign of offset. If this flag is set, the offset value must be negated.
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
// Mask and flags for attribute address type selection.
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
#endif // NEW_DICTIONARY_FORMAT
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion);
#ifndef NEW_DICTIONARY_FORMAT
bool isValidWord(unsigned short *word, int length);
#else // NEW_DICTIONARY_FORMAT
bool isValidWord(const uint16_t* const inWord, const int length) const;
#endif // NEW_DICTIONARY_FORMAT
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
unsigned short *outWords, int *frequencies);
~UnigramDictionary();
private:
void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize,
unsigned short *outWords, int *frequencies);
bool isDigraph(const int* codes, const int i, const int codesSize) const;
void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize,
unsigned short *outWords, int *frequencies);
void getSuggestionCandidates(const int skipPos, const int excessivePos,
const int transposedPos, int *nextLetters, const int nextLettersSize,
const int maxDepth);
bool addWord(unsigned short *word, int length, int frequency);
bool getSplitTwoWordsSuggestion(const int inputLength,
const int firstWordStartPos, const int firstWordLength,
const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
const int excessivePos, const int transposedPos, const int freq,
const bool sameLength) const;
void onTerminal(unsigned short int* word, const int depth,
const uint8_t* const root, const uint8_t flags, const int pos,
const int inputIndex, const int matchWeight, const int skipPos,
const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
int *nextLetters, const int nextLettersSize);
bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth);
// Process a node by considering proximity, missing and excessive character
bool processCurrentNode(const int initialPos, const int initialDepth,
const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex,
const int initialDiffs, const int skipPos, const int excessivePos,
const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount,
int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex,
int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex);
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
unsigned short *word);
#ifndef NEW_DICTIONARY_FORMAT
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
// Keep getWordsOld for comparing performance between getWords and getWordsOld
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
// Process a node by considering missing space
bool processCurrentNodeForExactMatch(const int firstChildPos,
const int startInputIndex, const int depth, unsigned short *word,
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
#else // NEW_DICTIONARY_FORMAT
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
short unsigned int* outWord);
#endif // NEW_DICTIONARY_FORMAT
const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;
const int MAX_WORDS;
const int MAX_PROXIMITY_CHARS;
const bool IS_LATEST_DICT_VERSION;
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
const int ROOT_POS;
const unsigned int BYTES_IN_ONE_CHAR;
const int MAX_UMLAUT_SEARCH_DEPTH;
// Flags for special processing
// Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
// or something very bad (like, the apocalypse) will happen.
// Please update both at the same time.
enum {
REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
};
static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
int *mFrequencies;
unsigned short *mOutputChars;
const ProximityInfo *mProximityInfo;
int mInputLength;
// MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
int mMaxEditDistance;
int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
int mNextLettersFrequency[NEXT_LETTERS_SIZE];
};
} // namespace latinime
#endif // LATINIME_UNIGRAM_DICTIONARY_H