// LatinIME/native/src/unigram_dictionary.h

/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H

#include <stdint.h>
#include "defines.h"
#include "proximity_info.h"

#ifndef NULL
#define NULL 0
#endif

namespace latinime {

// Declares the unigram dictionary of the latinime native code. The public interface offers
// word validation, a bigram position lookup and suggestion generation. Every member function
// except getInputCharsAt() is only declared in this header; the definitions live elsewhere.
// NOTE(review): remarks below marked "presumably" are inferred from names and signatures
// only -- confirm them against the implementation before relying on them.
class UnigramDictionary {

    // Declared before any access specifier, so this type is private to the class.
    typedef enum {                             // Used as a return value for character comparison
        SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR,  // Same char, possibly with different case or accent
        NEAR_PROXIMITY_CHAR,                   // It is a char located nearby on the keyboard
        UNRELATED_CHAR                         // It is an unrelated char
    } ProximityType;

public:
    // Constructs a dictionary from the bytes starting at streamStart.
    // Presumably the remaining arguments seed the identically named const members declared
    // below (note the parameter spelling "typedLetterMultipler") -- TODO confirm.
    UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
            int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
            const bool isLatestDictVersion);
    // Presumably reports whether the `length` characters at `word` form a dictionary word.
    // Not const-qualified, so it cannot be called on a const UnigramDictionary.
    bool isValidWord(unsigned short *word, int length);
    // Const lookup returning an int position; the meaning of `pos` and `offset` and of the
    // returned value is not visible from this header.
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    // Public entry point for suggestion generation. Inputs are read-only (proximity info,
    // x/y coordinate arrays, `codesSize` input codes, processing flags); outWords and
    // frequencies are the only writable buffers and presumably receive the results.
    // The meaning of the int return value is not stated here -- see the implementation.
    int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, const int flags,
            unsigned short *outWords, int *frequencies);
    ~UnigramDictionary();

private:
    // --- Suggestion generation helpers (presumably driven by getSuggestions) ---
    void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize,
            unsigned short *outWords, int *frequencies);
    // Digraph handling; presumably related to GERMAN_UMLAUT_DIGRAPHS and the
    // REQUIRES_GERMAN_UMLAUT_PROCESSING flag declared below.
    bool isDigraph(const int* codes, const int i, const int codesSize) const;
    void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
        const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
        const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
    void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords,
            int *frequencies);
    // skipPos / excessivePos / transposedPos recur throughout the helpers below; presumably
    // they select which input position is treated as skipped, excessive or transposed.
    void getSuggestionCandidates(const int skipPos, const int excessivePos,
            const int transposedPos, int *nextLetters, const int nextLettersSize,
            const int maxDepth);
    // --- Dictionary format / low-level reading helpers ---
    void getVersionNumber();
    bool checkIfDictVersionIsLatest();
    // Both take `pos` by pointer, so they can update the caller's position
    // (presumably advancing it past the value read).
    int getAddress(int *pos);
    int getFreq(int *pos);
    bool sameAsTyped(const unsigned short *word, int length) const;
    bool addWord(unsigned short *word, int length, int frequency);
    void addWordAlternatesSpellings(const uint8_t* const root, int pos, int depth, int finalFreq);
    // --- Dictionary traversal ---
    void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
            const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
            const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
            const int nextLettersSize);
    // --- Two-word (space correction) suggestions ---
    bool getSplitTwoWordsSuggestion(const int inputLength,
            const int firstWordStartPos, const int firstWordLength,
            const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
    bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
    bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
    // Keep getWordsOld for comparing performance between getWords and getWordsOld
    void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
            const int excessivePos, const int transposedPos, int *nextLetters,
            const int nextLettersSize);
    // --- Scoring and per-node processing ---
    int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
            const int excessivePos, const int transposedPos, const int freq,
            const bool sameLength) const;
    void onTerminal(unsigned short int* word, const int depth,
            const uint8_t* const root, const uint8_t flags, int pos,
            const int inputIndex, const int matchWeight, const int skipPos,
            const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
            int *nextLetters, const int nextLettersSize);
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
    // Classifies character c against currentChars and returns one of the ProximityType values.
    ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
            const int skipPos, const int excessivePos, const int transposedPos);
    // Process a node by considering proximity, missing and excessive character
    // All pointer parameters from newCount onwards are non-const and presumably carry the
    // state for the next traversal step back to the caller.
    bool processCurrentNode(const int pos, const int depth,
            const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
            const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
            int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
            bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
            int *nextSiblingPosition, int *nextOutputIndex);
    int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
    // Process a node by considering missing space
    bool processCurrentNodeForExactMatch(const int firstChildPos,
            const int startInputIndex, const int depth, unsigned short *word,
            int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
    bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
    // Returns a pointer to the first code of the group belonging to input position `index`.
    // mInputCodes is addressed as consecutive groups of MAX_PROXIMITY_CHARS ints, one group
    // per input position. No bounds or null check is performed here.
    inline const int* getInputCharsAt(const int index) const {
        return mInputCodes + (index * MAX_PROXIMITY_CHARS);
    }

    // Per-instance constants. Being const, they can only be set in the constructor's
    // initializer list, and C++ initializes members in declaration order -- do not reorder
    // these without checking the constructor for dependencies between them.
    const uint8_t* const DICT_ROOT;
    const int MAX_WORD_LENGTH;
    const int MAX_WORDS;
    const int MAX_PROXIMITY_CHARS;
    const bool IS_LATEST_DICT_VERSION;
    const int TYPED_LETTER_MULTIPLIER;
    const int FULL_WORD_MULTIPLIER;
    const int ROOT_POS;
    const unsigned int BYTES_IN_ONE_CHAR;
    const int MAX_UMLAUT_SEARCH_DEPTH;

    // Flags for special processing
    // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
    // or something very bad (like, the apocalypse) will happen.
    // Please update both at the same time.
    enum {
        REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
    };
    // Table of (first, second) code pairs; declared with unknown bound here, so both its
    // contents and its size are defined outside this header.
    static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];

    // Mutable working state. The raw pointers below have the same types as the caller-supplied
    // buffers of getSuggestions(); presumably they alias those buffers and are not owned by
    // this object -- TODO confirm against the destructor.
    int *mFrequencies;
    unsigned short *mOutputChars;
    const int *mInputCodes;
    int mInputLength;
    // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
    unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
    int mMaxEditDistance;

    // Fixed-size arrays with one slot per index below MAX_WORD_LENGTH_INTERNAL; presumably
    // they hold per-depth traversal state (an explicit stack) -- confirm in the implementation.
    int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
    bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
    int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
    int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
    int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
    int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
    int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
    int mNextLettersFrequency[NEXT_LETTERS_SIZE];
};

// ----------------------------------------------------------------------------

}; // namespace latinime

#endif // LATINIME_UNIGRAM_DICTIONARY_H