* commit '4096fd08025f2dbafabc2a558a9e15d1561d6234': Add DigraphUtils class
This commit is contained in:
commit
8ab65b8b4e
5 changed files with 153 additions and 29 deletions
|
@ -58,6 +58,7 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
correction.cpp \
|
correction.cpp \
|
||||||
dictionary.cpp \
|
dictionary.cpp \
|
||||||
dic_traverse_wrapper.cpp \
|
dic_traverse_wrapper.cpp \
|
||||||
|
digraph_utils.cpp \
|
||||||
proximity_info.cpp \
|
proximity_info.cpp \
|
||||||
proximity_info_params.cpp \
|
proximity_info_params.cpp \
|
||||||
proximity_info_state.cpp \
|
proximity_info_state.cpp \
|
||||||
|
|
93
native/jni/src/digraph_utils.cpp
Normal file
93
native/jni/src/digraph_utils.cpp
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "binary_format.h"
|
||||||
|
#include "defines.h"
|
||||||
|
#include "digraph_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
// Digraph tables. Every entry is { first, second, compositeGlyph }: the pair of code points
// on the left corresponds to the single composite code point on the right.

// Table selected when the dictionary flags carry REQUIRES_GERMAN_UMLAUT_PROCESSING.
const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] = {
    { 'a', 'e', 0x00E4 },  // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS
    { 'o', 'e', 0x00F6 },  // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS
    { 'u', 'e', 0x00FC }   // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS
};

// Table selected when the dictionary flags carry REQUIRES_FRENCH_LIGATURES_PROCESSING.
const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] = {
    { 'a', 'e', 0x00E6 },  // U+00E6 : LATIN SMALL LETTER AE
    { 'o', 'e', 0x0153 }   // U+0153 : LATIN SMALL LIGATURE OE
};
|
||||||
|
|
||||||
|
/* static */ bool DigraphUtils::hasDigraphForCodePoint(
|
||||||
|
const int dictFlags, const int compositeGlyphCodePoint) {
|
||||||
|
if (DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retrieves the set of all digraphs associated with the given dictionary.
|
||||||
|
// Returns the size of the digraph array, or 0 if none exist.
|
||||||
|
/* static */ int DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(
|
||||||
|
const int dictFlags, const DigraphUtils::digraph_t **digraphs) {
|
||||||
|
if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & dictFlags) {
|
||||||
|
*digraphs = DigraphUtils::GERMAN_UMLAUT_DIGRAPHS;
|
||||||
|
return NELEMS(DigraphUtils::GERMAN_UMLAUT_DIGRAPHS);
|
||||||
|
}
|
||||||
|
if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & dictFlags) {
|
||||||
|
*digraphs = DigraphUtils::FRENCH_LIGATURES_DIGRAPHS;
|
||||||
|
return NELEMS(DigraphUtils::FRENCH_LIGATURES_DIGRAPHS);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index
|
||||||
|
// (which specifies the first or second codepoint in the digraph).
|
||||||
|
/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int dictFlags,
|
||||||
|
const int compositeGlyphCodePoint, const DigraphCodePointIndex digraphCodePointIndex) {
|
||||||
|
if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) {
|
||||||
|
return NOT_A_CODE_POINT;
|
||||||
|
}
|
||||||
|
const DigraphUtils::digraph_t *digraph =
|
||||||
|
DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint);
|
||||||
|
if (!digraph) {
|
||||||
|
return NOT_A_CODE_POINT;
|
||||||
|
}
|
||||||
|
if (digraphCodePointIndex == FIRST_DIGRAPH_CODEPOINT) {
|
||||||
|
return digraph->first;
|
||||||
|
} else if (digraphCodePointIndex == SECOND_DIGRAPH_CODEPOINT) {
|
||||||
|
return digraph->second;
|
||||||
|
}
|
||||||
|
ASSERT(false);
|
||||||
|
return NOT_A_CODE_POINT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
|
||||||
|
* dictFlags: the dictionary flags needed to determine which digraphs are supported.
|
||||||
|
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
|
||||||
|
*/
|
||||||
|
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
|
||||||
|
const int dictFlags, const int compositeGlyphCodePoint) {
|
||||||
|
const DigraphUtils::digraph_t *digraphs = 0;
|
||||||
|
const int digraphsSize =
|
||||||
|
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(dictFlags, &digraphs);
|
||||||
|
for (int i = 0; i < digraphsSize; i++) {
|
||||||
|
if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) {
|
||||||
|
return &digraphs[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace latinime
|
47
native/jni/src/digraph_utils.h
Normal file
47
native/jni/src/digraph_utils.h
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DIGRAPH_UTILS_H
|
||||||
|
#define DIGRAPH_UTILS_H
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class DigraphUtils {
|
||||||
|
public:
|
||||||
|
typedef enum {
|
||||||
|
NOT_A_DIGRAPH_INDEX,
|
||||||
|
FIRST_DIGRAPH_CODEPOINT,
|
||||||
|
SECOND_DIGRAPH_CODEPOINT
|
||||||
|
} DigraphCodePointIndex;
|
||||||
|
|
||||||
|
typedef struct { int first; int second; int compositeGlyph; } digraph_t;
|
||||||
|
|
||||||
|
static bool hasDigraphForCodePoint(const int dictFlags, const int compositeGlyphCodePoint);
|
||||||
|
static int getAllDigraphsForDictionaryAndReturnSize(
|
||||||
|
const int dictFlags, const digraph_t **digraphs);
|
||||||
|
static int getDigraphCodePointForIndex(const int dictFlags, const int compositeGlyphCodePoint,
|
||||||
|
const DigraphCodePointIndex digraphCodePointIndex);
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
|
||||||
|
static const digraph_t *getDigraphForCodePoint(
|
||||||
|
const int dictFlags, const int compositeGlyphCodePoint);
|
||||||
|
|
||||||
|
static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
|
||||||
|
static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif // DIGRAPH_UTILS_H
|
|
@ -22,6 +22,7 @@
|
||||||
#include "char_utils.h"
|
#include "char_utils.h"
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "dictionary.h"
|
#include "dictionary.h"
|
||||||
|
#include "digraph_utils.h"
|
||||||
#include "proximity_info.h"
|
#include "proximity_info.h"
|
||||||
#include "terminal_attributes.h"
|
#include "terminal_attributes.h"
|
||||||
#include "unigram_dictionary.h"
|
#include "unigram_dictionary.h"
|
||||||
|
@ -30,15 +31,6 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
|
|
||||||
{ { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS
|
|
||||||
{ 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS
|
|
||||||
{ 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS
|
|
||||||
|
|
||||||
const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[] =
|
|
||||||
{ { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE
|
|
||||||
{ 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE
|
|
||||||
|
|
||||||
// TODO: check the header
|
// TODO: check the header
|
||||||
UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int flags)
|
UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int flags)
|
||||||
: DICT_ROOT(streamStart), ROOT_POS(0),
|
: DICT_ROOT(streamStart), ROOT_POS(0),
|
||||||
|
@ -58,7 +50,7 @@ static void addWord(int *word, int length, int probability, WordsPriorityQueue *
|
||||||
|
|
||||||
// Return the replacement code point for a digraph, or 0 if none.
|
// Return the replacement code point for a digraph, or 0 if none.
|
||||||
int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int inputSize,
|
int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int inputSize,
|
||||||
const digraph_t *const digraphs, const unsigned int digraphsSize) const {
|
const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
|
||||||
|
|
||||||
// There can't be a digraph if we don't have at least 2 characters to examine
|
// There can't be a digraph if we don't have at least 2 characters to examine
|
||||||
if (i + 2 > inputSize) return false;
|
if (i + 2 > inputSize) return false;
|
||||||
|
@ -74,7 +66,7 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons
|
||||||
|
|
||||||
// It's an interesting digraph if the second char matches too.
|
// It's an interesting digraph if the second char matches too.
|
||||||
if (digraphs[lastDigraphIndex].second == codes[i + 1]) {
|
if (digraphs[lastDigraphIndex].second == codes[i + 1]) {
|
||||||
return digraphs[lastDigraphIndex].replacement;
|
return digraphs[lastDigraphIndex].compositeGlyph;
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -93,7 +85,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
|
||||||
const bool useFullEditDistance, const int *codesSrc,
|
const bool useFullEditDistance, const int *codesSrc,
|
||||||
const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
|
const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
|
||||||
WordsPriorityQueuePool *queuePool,
|
WordsPriorityQueuePool *queuePool,
|
||||||
const digraph_t *const digraphs, const unsigned int digraphsSize) const {
|
const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const {
|
||||||
ASSERT(sizeof(codesDest[0]) == sizeof(codesSrc[0]));
|
ASSERT(sizeof(codesDest[0]) == sizeof(codesSrc[0]));
|
||||||
ASSERT(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0]));
|
ASSERT(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0]));
|
||||||
ASSERT(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0]));
|
ASSERT(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0]));
|
||||||
|
@ -169,7 +161,10 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x
|
||||||
queuePool.clearAll();
|
queuePool.clearAll();
|
||||||
Correction masterCorrection;
|
Correction masterCorrection;
|
||||||
masterCorrection.resetCorrection();
|
masterCorrection.resetCorrection();
|
||||||
if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & FLAGS)
|
const DigraphUtils::digraph_t *digraphs = 0;
|
||||||
|
const int digraphsSize =
|
||||||
|
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(FLAGS, &digraphs);
|
||||||
|
if (digraphsSize > 0)
|
||||||
{ // Incrementally tune the word and try all possibilities
|
{ // Incrementally tune the word and try all possibilities
|
||||||
int codesBuffer[sizeof(*inputCodePoints) * inputSize];
|
int codesBuffer[sizeof(*inputCodePoints) * inputSize];
|
||||||
int xCoordinatesBuffer[inputSize];
|
int xCoordinatesBuffer[inputSize];
|
||||||
|
@ -177,15 +172,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x
|
||||||
getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
|
getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
|
||||||
xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter,
|
xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter,
|
||||||
useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection,
|
useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection,
|
||||||
&queuePool, GERMAN_UMLAUT_DIGRAPHS, NELEMS(GERMAN_UMLAUT_DIGRAPHS));
|
&queuePool, digraphs, digraphsSize);
|
||||||
} else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) {
|
|
||||||
int codesBuffer[sizeof(*inputCodePoints) * inputSize];
|
|
||||||
int xCoordinatesBuffer[inputSize];
|
|
||||||
int yCoordinatesBuffer[inputSize];
|
|
||||||
getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
|
|
||||||
xCoordinatesBuffer, yCoordinatesBuffer, inputSize, bigramMap, bigramFilter,
|
|
||||||
useFullEditDistance, inputCodePoints, inputSize, 0, codesBuffer, &masterCorrection,
|
|
||||||
&queuePool, FRENCH_LIGATURES_DIGRAPHS, NELEMS(FRENCH_LIGATURES_DIGRAPHS));
|
|
||||||
} else { // Normal processing
|
} else { // Normal processing
|
||||||
getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize,
|
getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, inputCodePoints, inputSize,
|
||||||
bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool);
|
bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool);
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "digraph_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -29,8 +30,6 @@ class TerminalAttributes;
|
||||||
class WordsPriorityQueuePool;
|
class WordsPriorityQueuePool;
|
||||||
|
|
||||||
class UnigramDictionary {
|
class UnigramDictionary {
|
||||||
typedef struct { int first; int second; int replacement; } digraph_t;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Error tolerances
|
// Error tolerances
|
||||||
static const int DEFAULT_MAX_ERRORS = 2;
|
static const int DEFAULT_MAX_ERRORS = 2;
|
||||||
|
@ -57,13 +56,13 @@ class UnigramDictionary {
|
||||||
const bool useFullEditDistance, Correction *correction,
|
const bool useFullEditDistance, Correction *correction,
|
||||||
WordsPriorityQueuePool *queuePool) const;
|
WordsPriorityQueuePool *queuePool) const;
|
||||||
int getDigraphReplacement(const int *codes, const int i, const int inputSize,
|
int getDigraphReplacement(const int *codes, const int i, const int inputSize,
|
||||||
const digraph_t *const digraphs, const unsigned int digraphsSize) const;
|
const DigraphUtils::digraph_t *const digraphs, const unsigned int digraphsSize) const;
|
||||||
void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer,
|
const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer,
|
||||||
int *yCoordinatesBuffer, const int codesBufferSize, const std::map<int, int> *bigramMap,
|
int *yCoordinatesBuffer, const int codesBufferSize, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const bool useFullEditDistance, const int *codesSrc,
|
const uint8_t *bigramFilter, const bool useFullEditDistance, const int *codesSrc,
|
||||||
const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
|
const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
|
||||||
WordsPriorityQueuePool *queuePool, const digraph_t *const digraphs,
|
WordsPriorityQueuePool *queuePool, const DigraphUtils::digraph_t *const digraphs,
|
||||||
const unsigned int digraphsSize) const;
|
const unsigned int digraphsSize) const;
|
||||||
void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const int inputSize,
|
const int *ycoordinates, const int *codes, const int inputSize,
|
||||||
|
@ -111,9 +110,6 @@ class UnigramDictionary {
|
||||||
const int ROOT_POS;
|
const int ROOT_POS;
|
||||||
const int MAX_DIGRAPH_SEARCH_DEPTH;
|
const int MAX_DIGRAPH_SEARCH_DEPTH;
|
||||||
const int FLAGS;
|
const int FLAGS;
|
||||||
|
|
||||||
static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
|
|
||||||
static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
|
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_UNIGRAM_DICTIONARY_H
|
#endif // LATINIME_UNIGRAM_DICTIONARY_H
|
||||||
|
|
Loading…
Reference in a new issue