2010-12-01 12:22:15 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2010 The Android Open Source Project
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
|
|
|
|
#define LATINIME_UNIGRAM_DICTIONARY_H
|
|
|
|
|
2011-06-16 11:55:16 +00:00
|
|
|
#include <stdint.h>
|
2010-12-02 05:53:24 +00:00
|
|
|
#include "defines.h"
|
2011-02-22 08:28:55 +00:00
|
|
|
#include "proximity_info.h"
|
2010-12-01 12:22:15 +00:00
|
|
|
|
2011-06-16 11:55:16 +00:00
|
|
|
// Fallback: define NULL as plain 0 only if nothing included earlier has already defined it.
#ifndef NULL
|
|
|
|
#define NULL 0
|
|
|
|
#endif
|
|
|
|
|
2010-12-02 05:53:24 +00:00
|
|
|
namespace latinime {
|
2010-12-01 12:22:15 +00:00
|
|
|
|
|
|
|
class UnigramDictionary {
|
2011-01-27 05:20:22 +00:00
|
|
|
|
2010-12-01 12:22:15 +00:00
|
|
|
public:
|
2011-06-28 11:45:05 +00:00
|
|
|
#ifdef NEW_DICTIONARY_FORMAT
|
|
|
|
|
|
|
|
// Mask and flags for children address type selection.
|
|
|
|
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
|
|
|
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
|
|
|
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
|
|
|
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
|
|
|
static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
|
|
|
|
|
|
|
// Flag for single/multiple char group
|
|
|
|
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
|
|
|
|
|
|
|
// Flag for terminal groups
|
|
|
|
static const int FLAG_IS_TERMINAL = 0x10;
|
|
|
|
|
|
|
|
// Flag for bigram presence
|
|
|
|
static const int FLAG_HAS_BIGRAMS = 0x04;
|
|
|
|
|
|
|
|
// Attribute (bigram/shortcut) related flags:
|
|
|
|
// Flag for presence of more attributes
|
|
|
|
static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
|
|
|
// Flag for sign of offset. If this flag is set, the offset value must be negated.
|
|
|
|
static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
|
|
|
|
|
|
|
// Mask for attribute frequency, stored on 4 bits inside the flags byte.
|
|
|
|
static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
|
|
|
|
|
|
|
|
// Mask and flags for attribute address type selection.
|
|
|
|
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
|
|
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
|
|
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
|
|
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
|
|
|
#endif // NEW_DICTIONARY_FORMAT
|
|
|
|
|
2011-06-16 11:55:16 +00:00
|
|
|
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
|
|
|
|
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
|
|
|
const bool isLatestDictVersion);
|
2011-06-28 11:45:05 +00:00
|
|
|
#ifndef NEW_DICTIONARY_FORMAT
|
2011-06-16 13:33:41 +00:00
|
|
|
bool isValidWord(unsigned short *word, int length);
|
2011-06-28 11:45:05 +00:00
|
|
|
#else // NEW_DICTIONARY_FORMAT
|
|
|
|
bool isValidWord(const uint16_t* const inWord, const int length) const;
|
|
|
|
#endif // NEW_DICTIONARY_FORMAT
|
2011-06-17 03:45:17 +00:00
|
|
|
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
|
2011-07-13 01:32:02 +00:00
|
|
|
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
2011-02-25 08:56:53 +00:00
|
|
|
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
|
|
|
|
unsigned short *outWords, int *frequencies);
|
2010-12-01 12:22:15 +00:00
|
|
|
~UnigramDictionary();
|
|
|
|
|
|
|
|
private:
|
2011-07-13 01:32:02 +00:00
|
|
|
void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
2011-02-25 08:56:53 +00:00
|
|
|
const int *ycoordinates, const int *codes, const int codesSize,
|
|
|
|
unsigned short *outWords, int *frequencies);
|
|
|
|
bool isDigraph(const int* codes, const int i, const int codesSize) const;
|
2011-07-13 01:32:02 +00:00
|
|
|
void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
|
2011-02-25 08:56:53 +00:00
|
|
|
const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
|
|
|
|
const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
|
2011-03-05 06:50:19 +00:00
|
|
|
const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
|
2011-07-13 01:32:02 +00:00
|
|
|
void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
|
|
|
const int *ycoordinates, const int *codes, const int codesSize,
|
|
|
|
unsigned short *outWords, int *frequencies);
|
2010-12-13 05:42:35 +00:00
|
|
|
void getSuggestionCandidates(const int skipPos, const int excessivePos,
|
2010-12-09 13:08:33 +00:00
|
|
|
const int transposedPos, int *nextLetters, const int nextLettersSize,
|
|
|
|
const int maxDepth);
|
2010-12-01 12:22:15 +00:00
|
|
|
bool addWord(unsigned short *word, int length, int frequency);
|
2011-03-04 14:06:45 +00:00
|
|
|
bool getSplitTwoWordsSuggestion(const int inputLength,
|
|
|
|
const int firstWordStartPos, const int firstWordLength,
|
2011-05-18 06:31:04 +00:00
|
|
|
const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
|
2010-12-08 08:05:39 +00:00
|
|
|
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
|
2011-03-04 14:06:45 +00:00
|
|
|
bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
|
2011-01-26 18:23:39 +00:00
|
|
|
int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
|
2011-03-03 01:22:10 +00:00
|
|
|
const int excessivePos, const int transposedPos, const int freq,
|
|
|
|
const bool sameLength) const;
|
2011-06-17 06:36:26 +00:00
|
|
|
void onTerminal(unsigned short int* word, const int depth,
|
2011-06-30 08:02:23 +00:00
|
|
|
const uint8_t* const root, const uint8_t flags, const int pos,
|
2011-06-17 06:36:26 +00:00
|
|
|
const int inputIndex, const int matchWeight, const int skipPos,
|
|
|
|
const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
|
|
|
|
int *nextLetters, const int nextLettersSize);
|
2010-12-03 07:39:16 +00:00
|
|
|
bool needsToSkipCurrentNode(const unsigned short c,
|
2010-12-03 10:38:08 +00:00
|
|
|
const int inputIndex, const int skipPos, const int depth);
|
2010-12-08 08:05:39 +00:00
|
|
|
// Process a node by considering proximity, missing and excessive character
|
2011-06-30 10:23:16 +00:00
|
|
|
bool processCurrentNode(const int initialPos, const int initialDepth,
|
|
|
|
const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex,
|
|
|
|
const int initialDiffs, const int skipPos, const int excessivePos,
|
|
|
|
const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount,
|
|
|
|
int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex,
|
|
|
|
int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex);
|
2011-06-30 11:14:38 +00:00
|
|
|
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
|
|
|
|
unsigned short *word);
|
2011-06-28 11:45:05 +00:00
|
|
|
#ifndef NEW_DICTIONARY_FORMAT
|
2011-06-30 08:15:32 +00:00
|
|
|
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
|
|
|
|
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
|
|
|
|
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
|
|
|
|
const int nextLettersSize);
|
|
|
|
// Keep getWordsOld for comparing performance between getWords and getWordsOld
|
|
|
|
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
|
|
|
const int excessivePos, const int transposedPos, int *nextLetters,
|
|
|
|
const int nextLettersSize);
|
2010-12-08 08:05:39 +00:00
|
|
|
// Process a node by considering missing space
|
2010-12-09 10:21:51 +00:00
|
|
|
bool processCurrentNodeForExactMatch(const int firstChildPos,
|
|
|
|
const int startInputIndex, const int depth, unsigned short *word,
|
|
|
|
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
|
2011-06-28 11:45:05 +00:00
|
|
|
#else // NEW_DICTIONARY_FORMAT
|
|
|
|
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
|
|
|
|
short unsigned int* outWord);
|
|
|
|
#endif // NEW_DICTIONARY_FORMAT
|
2011-06-16 11:55:16 +00:00
|
|
|
|
|
|
|
const uint8_t* const DICT_ROOT;
|
2010-12-01 12:22:15 +00:00
|
|
|
const int MAX_WORD_LENGTH;
|
2011-01-07 06:01:51 +00:00
|
|
|
const int MAX_WORDS;
|
2010-12-08 08:05:39 +00:00
|
|
|
const int MAX_PROXIMITY_CHARS;
|
2010-12-02 05:53:24 +00:00
|
|
|
const bool IS_LATEST_DICT_VERSION;
|
2010-12-02 09:11:54 +00:00
|
|
|
const int TYPED_LETTER_MULTIPLIER;
|
|
|
|
const int FULL_WORD_MULTIPLIER;
|
2011-01-07 06:01:51 +00:00
|
|
|
const int ROOT_POS;
|
2011-02-25 08:56:53 +00:00
|
|
|
const unsigned int BYTES_IN_ONE_CHAR;
|
2011-03-05 06:50:19 +00:00
|
|
|
const int MAX_UMLAUT_SEARCH_DEPTH;
|
2011-02-25 08:56:53 +00:00
|
|
|
|
|
|
|
// Flags for special processing
|
|
|
|
// Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
|
|
|
|
// or something very bad (like, the apocalypse) will happen.
|
|
|
|
// Please update both at the same time.
|
|
|
|
enum {
|
|
|
|
REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
|
|
|
|
};
|
|
|
|
static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
|
2010-12-01 12:22:15 +00:00
|
|
|
|
|
|
|
int *mFrequencies;
|
|
|
|
unsigned short *mOutputChars;
|
2011-07-13 01:32:02 +00:00
|
|
|
const ProximityInfo *mProximityInfo;
|
2010-12-01 12:22:15 +00:00
|
|
|
int mInputLength;
|
2010-12-02 11:19:59 +00:00
|
|
|
// MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
|
|
|
|
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
|
2010-12-01 12:22:15 +00:00
|
|
|
int mMaxEditDistance;
|
2010-12-07 04:08:39 +00:00
|
|
|
|
|
|
|
int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
|
2011-06-16 13:51:11 +00:00
|
|
|
int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
|
2011-02-10 11:53:58 +00:00
|
|
|
int mNextLettersFrequency[NEXT_LETTERS_SIZE];
|
2010-12-01 12:22:15 +00:00
|
|
|
};
|
2011-06-18 04:09:55 +00:00
|
|
|
} // namespace latinime
|
2010-12-01 12:22:15 +00:00
|
|
|
|
|
|
|
#endif // LATINIME_UNIGRAM_DICTIONARY_H
|