LatinIME/native/src/unigram_dictionary.h

/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H

#include <stdint.h>
#include "correction.h"
#include "correction_state.h"
#include "defines.h"
#include "proximity_info.h"

// Fallback: define NULL as 0 only when nothing included above has already defined it.
#ifndef NULL
#define NULL 0
#endif

namespace latinime {

// Declares the unigram dictionary: the bit masks/flags used to decode flag bytes, the public
// lookup entry points, the private search helpers and the per-instance working state.
// This header contains declarations only; every member function is defined out of line.
class UnigramDictionary {

public:

    // Mask and flags for children address type selection.
    // The mask keeps the two highest bits (0xC0) of an 8-bit flags value; the four
    // FLAG_GROUP_ADDRESS_TYPE_* constants are the four possible values of those two bits.
    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;

    // Flag for single/multiple char group
    static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;

    // Flag for terminal groups
    static const int FLAG_IS_TERMINAL = 0x10;

    // Flag for bigram presence
    static const int FLAG_HAS_BIGRAMS = 0x04;

    // Attribute (bigram/shortcut) related flags:
    // NOTE(review): 0x80 and 0x40 below reuse bit positions covered by MASK_GROUP_ADDRESS_TYPE,
    // so the attribute flags presumably apply to a different flags byte than the group flags
    // above -- confirm against the reading code in the implementation.
    // Flag for presence of more attributes
    static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
    // Flag for sign of offset. If this flag is set, the offset value must be negated.
    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;

    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
    // 0x0F selects the four lowest bits.
    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;

    // Mask and flags for attribute address type selection.
    // The mask keeps bits 4-5 (0x30). Only the three non-zero values are named here; no
    // constant is declared for the value 0x00 under this mask.
    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;

    // Constructor. streamStart points at read-only dictionary bytes (const uint8_t*).
    // NOTE(review): the remaining arguments presumably initialise the like-named const members
    // declared further down (TYPED_LETTER_MULTIPLIER, FULL_WORD_MULTIPLIER, MAX_WORD_LENGTH,
    // MAX_WORDS, MAX_PROXIMITY_CHARS, IS_LATEST_DICT_VERSION) -- confirm in the definition.
    UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
            int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
            const bool isLatestDictVersion);
    // Const query over a word given as `length` 16-bit units in inWord.
    // NOTE(review): presumably returns true when the word is present in the dictionary -- confirm.
    bool isValidWord(const uint16_t* const inWord, const int length) const;
    // Const query; the meaning of the returned int and of pos/offset is defined by the
    // implementation, which is not visible from this header.
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    // Public suggestion entry point. Non-const because it uses the mutable members below.
    // NOTE(review): outWords and frequencies are non-const pointers, so they look like
    // caller-provided output buffers; their required sizes are not stated here -- TODO confirm.
    int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, const int flags,
            unsigned short *outWords, int *frequencies);
    // Virtual destructor, so instances may be destroyed through a base-typed pointer.
    virtual ~UnigramDictionary();

private:

    // Private helpers. Their behaviour is defined in the implementation file; only the
    // signatures are visible here.
    void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize,
            unsigned short *outWords, int *frequencies);
    bool isDigraph(const int* codes, const int i, const int codesSize) const;
    // The "Rec" suffix and the currentDepth parameter suggest a recursive helper
    // (presumably bounded by MAX_UMLAUT_SEARCH_DEPTH -- TODO confirm).
    void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
        const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
        const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
    void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize,
            unsigned short *outWords, int *frequencies);
    void getSuggestionCandidates();
    bool addWord(unsigned short *word, int length, int frequency);
    void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
    void getMissingSpaceWords(
            const int inputLength, const int missingSpacePos, Correction *correction);
    void getMistypedSpaceWords(
            const int inputLength, const int spaceProximityPos, Correction *correction);
    void onTerminal(const int freq, Correction *correction);
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
    // Process a node by considering proximity, missing and excessive character
    // newCount, newChildPosition and nextSiblingPosition are non-const int pointers
    // (presumably out-parameters filled in for the caller -- confirm in the definition).
    bool processCurrentNode(const int initialPos,
            Correction *correction, int *newCount,
            int *newChildPosition, int *nextSiblingPosition);
    int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
            unsigned short *word);
    int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
            short unsigned int* outWord);

    // Per-instance constants. All of these are const non-static members, so they can only be
    // set in the constructor's initializer list and are initialised in the order declared here.
    const uint8_t* const DICT_ROOT;
    const int MAX_WORD_LENGTH;
    const int MAX_WORDS;
    const int MAX_PROXIMITY_CHARS;
    const bool IS_LATEST_DICT_VERSION;
    const int TYPED_LETTER_MULTIPLIER;
    const int FULL_WORD_MULTIPLIER;
    const int ROOT_POS;
    const unsigned int BYTES_IN_ONE_CHAR;
    const int MAX_UMLAUT_SEARCH_DEPTH;

    // Flags for special processing
    // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
    // or something very bad (like, the apocalypse) will happen.
    // Please update both at the same time.
    enum {
        REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
    };
    // Table of (first, second) code pairs. Declared here with an unspecified bound; its
    // definition and contents live outside this header.
    static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];

    // Mutable working state. All pointers are raw; this header does not show who owns or
    // frees what they point to.
    int *mFrequencies;
    unsigned short *mOutputChars;
    ProximityInfo *mProximityInfo;
    Correction *mCorrection;
    int mInputLength;
    // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
    // (MAX_WORD_LENGTH_INTERNAL is not declared in this file; presumably it comes from defines.h.)
    unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];

    // Fixed-size int arrays with MAX_WORD_LENGTH_INTERNAL entries each, marked for removal.
    int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
};
} // namespace latinime

#endif // LATINIME_UNIGRAM_DICTIONARY_H