Merge "Add mechanism to handle digraphs in DicNode"
This commit is contained in:
commit
b6e1777d4b
4 changed files with 114 additions and 21 deletions
|
@ -27,39 +27,47 @@ const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] =
|
||||||
// Digraph table for French ligatures. Each entry is { first, second, compositeGlyph }.
const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] = {
    { 'a', 'e', 0x00E6 },  // U+00E6 : LATIN SMALL LETTER AE
    { 'o', 'e', 0x0153 }   // U+0153 : LATIN SMALL LIGATURE OE
};
|
||||||
|
// Digraph types that getDigraphForCodePoint() tries, in array order.
const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = {
    DIGRAPH_TYPE_GERMAN_UMLAUT,
    DIGRAPH_TYPE_FRENCH_LIGATURES
};
|
||||||
|
|
||||||
/* static */ bool DigraphUtils::hasDigraphForCodePoint(
|
/* static */ bool DigraphUtils::hasDigraphForCodePoint(
|
||||||
const int dictFlags, const int compositeGlyphCodePoint) {
|
const int dictFlags, const int compositeGlyphCodePoint) {
|
||||||
if (DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint)) {
|
const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(dictFlags);
|
||||||
|
if (DigraphUtils::getDigraphForDigraphTypeAndCodePoint(digraphType, compositeGlyphCodePoint)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieves the set of all digraphs associated with the given dictionary.
|
// Returns the digraph type associated with the given dictionary.
|
||||||
// Returns the size of the digraph array, or 0 if none exist.
|
/* static */ DigraphUtils::DigraphType DigraphUtils::getDigraphTypeForDictionary(
|
||||||
/* static */ int DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(
|
const int dictFlags) {
|
||||||
const int dictFlags, const DigraphUtils::digraph_t **digraphs) {
|
|
||||||
if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & dictFlags) {
|
if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & dictFlags) {
|
||||||
*digraphs = DigraphUtils::GERMAN_UMLAUT_DIGRAPHS;
|
return DIGRAPH_TYPE_GERMAN_UMLAUT;
|
||||||
return NELEMS(DigraphUtils::GERMAN_UMLAUT_DIGRAPHS);
|
|
||||||
}
|
}
|
||||||
if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & dictFlags) {
|
if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & dictFlags) {
|
||||||
*digraphs = DigraphUtils::FRENCH_LIGATURES_DIGRAPHS;
|
return DIGRAPH_TYPE_FRENCH_LIGATURES;
|
||||||
return NELEMS(DigraphUtils::FRENCH_LIGATURES_DIGRAPHS);
|
|
||||||
}
|
}
|
||||||
return 0;
|
return DIGRAPH_TYPE_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retrieves the set of all digraphs associated with the given dictionary flags.
|
||||||
|
// Returns the size of the digraph array, or 0 if none exist.
|
||||||
|
/* static */ int DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(
|
||||||
|
const int dictFlags, const DigraphUtils::digraph_t **const digraphs) {
|
||||||
|
const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(dictFlags);
|
||||||
|
return getAllDigraphsForDigraphTypeAndReturnSize(digraphType, digraphs);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index
|
// Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index
|
||||||
// (which specifies the first or second codepoint in the digraph).
|
// (which specifies the first or second codepoint in the digraph).
|
||||||
/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int dictFlags,
|
/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
|
||||||
const int compositeGlyphCodePoint, const DigraphCodePointIndex digraphCodePointIndex) {
|
const DigraphCodePointIndex digraphCodePointIndex) {
|
||||||
if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) {
|
if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) {
|
||||||
return NOT_A_CODE_POINT;
|
return NOT_A_CODE_POINT;
|
||||||
}
|
}
|
||||||
const DigraphUtils::digraph_t *digraph =
|
const DigraphUtils::digraph_t *const digraph =
|
||||||
DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint);
|
DigraphUtils::getDigraphForCodePoint(compositeGlyphCodePoint);
|
||||||
if (!digraph) {
|
if (!digraph) {
|
||||||
return NOT_A_CODE_POINT;
|
return NOT_A_CODE_POINT;
|
||||||
}
|
}
|
||||||
|
@ -72,16 +80,48 @@ const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] =
|
||||||
return NOT_A_CODE_POINT;
|
return NOT_A_CODE_POINT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retrieves the set of all digraphs associated with the given digraph type.
|
||||||
|
// Returns the size of the digraph array, or 0 if none exist.
|
||||||
|
/* static */ int DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(
|
||||||
|
const DigraphUtils::DigraphType digraphType,
|
||||||
|
const DigraphUtils::digraph_t **const digraphs) {
|
||||||
|
if (digraphType == DigraphUtils::DIGRAPH_TYPE_GERMAN_UMLAUT) {
|
||||||
|
*digraphs = GERMAN_UMLAUT_DIGRAPHS;
|
||||||
|
return NELEMS(GERMAN_UMLAUT_DIGRAPHS);
|
||||||
|
}
|
||||||
|
if (digraphType == DIGRAPH_TYPE_FRENCH_LIGATURES) {
|
||||||
|
*digraphs = FRENCH_LIGATURES_DIGRAPHS;
|
||||||
|
return NELEMS(FRENCH_LIGATURES_DIGRAPHS);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
|
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
|
||||||
* dictFlags: the dictionary flags needed to determine which digraphs are supported.
|
|
||||||
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
|
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
|
||||||
*/
|
*/
|
||||||
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
|
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
|
||||||
const int dictFlags, const int compositeGlyphCodePoint) {
|
const int compositeGlyphCodePoint) {
|
||||||
|
for (size_t i = 0; i < NELEMS(USED_DIGRAPH_TYPES); i++) {
|
||||||
|
const DigraphUtils::digraph_t *const digraph = getDigraphForDigraphTypeAndCodePoint(
|
||||||
|
USED_DIGRAPH_TYPES[i], compositeGlyphCodePoint);
|
||||||
|
if (digraph) {
|
||||||
|
return digraph;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
|
||||||
|
* digraphType: the type of digraphs supported.
|
||||||
|
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
|
||||||
|
*/
|
||||||
|
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
|
||||||
|
const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
|
||||||
const DigraphUtils::digraph_t *digraphs = 0;
|
const DigraphUtils::digraph_t *digraphs = 0;
|
||||||
const int digraphsSize =
|
const int digraphsSize =
|
||||||
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(dictFlags, &digraphs);
|
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(digraphType, &digraphs);
|
||||||
for (int i = 0; i < digraphsSize; i++) {
|
for (int i = 0; i < digraphsSize; i++) {
|
||||||
if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) {
|
if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) {
|
||||||
return &digraphs[i];
|
return &digraphs[i];
|
||||||
|
|
|
@ -27,21 +27,34 @@ class DigraphUtils {
|
||||||
SECOND_DIGRAPH_CODEPOINT
|
SECOND_DIGRAPH_CODEPOINT
|
||||||
} DigraphCodePointIndex;
|
} DigraphCodePointIndex;
|
||||||
|
|
||||||
|
// Selects which digraph table applies. DIGRAPH_TYPE_NONE is the value used when no table does.
typedef enum {
    DIGRAPH_TYPE_NONE,
    DIGRAPH_TYPE_GERMAN_UMLAUT,
    DIGRAPH_TYPE_FRENCH_LIGATURES
} DigraphType;
|
||||||
|
|
||||||
// One digraph table entry: the two code points of the digraph and its composite glyph code point.
typedef struct {
    int first;
    int second;
    int compositeGlyph;
} digraph_t;
|
||||||
|
|
||||||
static bool hasDigraphForCodePoint(const int dictFlags, const int compositeGlyphCodePoint);
|
static bool hasDigraphForCodePoint(const int dictFlags, const int compositeGlyphCodePoint);
|
||||||
static int getAllDigraphsForDictionaryAndReturnSize(
|
static int getAllDigraphsForDictionaryAndReturnSize(
|
||||||
const int dictFlags, const digraph_t **digraphs);
|
const int dictFlags, const digraph_t **const digraphs);
|
||||||
static int getDigraphCodePointForIndex(const int dictFlags, const int compositeGlyphCodePoint,
|
static int getDigraphCodePointForIndex(const int dictFlags, const int compositeGlyphCodePoint,
|
||||||
const DigraphCodePointIndex digraphCodePointIndex);
|
const DigraphCodePointIndex digraphCodePointIndex);
|
||||||
|
static int getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
|
||||||
|
const DigraphCodePointIndex digraphCodePointIndex);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
|
||||||
static const digraph_t *getDigraphForCodePoint(
|
static DigraphType getDigraphTypeForDictionary(const int dictFlags);
|
||||||
const int dictFlags, const int compositeGlyphCodePoint);
|
static int getAllDigraphsForDigraphTypeAndReturnSize(
|
||||||
|
const DigraphType digraphType, const digraph_t **const digraphs);
|
||||||
|
static const digraph_t *getDigraphForCodePoint(const int compositeGlyphCodePoint);
|
||||||
|
static const digraph_t *getDigraphForDigraphTypeAndCodePoint(
|
||||||
|
const DigraphType digraphType, const int compositeGlyphCodePoint);
|
||||||
|
|
||||||
static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
|
static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
|
||||||
static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
|
static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
|
||||||
|
static const DigraphType USED_DIGRAPH_TYPES[];
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // DIGRAPH_UTILS_H
|
#endif // DIGRAPH_UTILS_H
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include "dic_node_profiler.h"
|
#include "dic_node_profiler.h"
|
||||||
#include "dic_node_properties.h"
|
#include "dic_node_properties.h"
|
||||||
#include "dic_node_release_listener.h"
|
#include "dic_node_release_listener.h"
|
||||||
|
#include "digraph_utils.h"
|
||||||
|
|
||||||
#if DEBUG_DICT
|
#if DEBUG_DICT
|
||||||
#define LOGI_SHOW_ADD_COST_PROP \
|
#define LOGI_SHOW_ADD_COST_PROP \
|
||||||
|
@ -399,8 +400,15 @@ class DicNode {
|
||||||
// TODO: Remove //
|
// TODO: Remove //
|
||||||
//////////////////////
|
//////////////////////
|
||||||
// TODO: Remove once touch path is merged into ProximityInfoState
|
// TODO: Remove once touch path is merged into ProximityInfoState
|
||||||
|
// Note: Returned codepoint may be a digraph codepoint if the node is in a composite glyph.
|
||||||
int getNodeCodePoint() const {
|
int getNodeCodePoint() const {
|
||||||
return mDicNodeProperties.getNodeCodePoint();
|
const int codePoint = mDicNodeProperties.getNodeCodePoint();
|
||||||
|
const DigraphUtils::DigraphCodePointIndex digraphIndex =
|
||||||
|
mDicNodeState.mDicNodeStateScoring.getDigraphIndex();
|
||||||
|
if (digraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
|
||||||
|
return codePoint;
|
||||||
|
}
|
||||||
|
return DigraphUtils::getDigraphCodePointForIndex(codePoint, digraphIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
|
@ -452,6 +460,15 @@ class DicNode {
|
||||||
mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
|
mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isInDigraph() const {
|
||||||
|
return mDicNodeState.mDicNodeStateScoring.getDigraphIndex()
|
||||||
|
!= DigraphUtils::NOT_A_DIGRAPH_INDEX;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to the scoring state, which owns the digraph index and moves it to its next value.
void advanceDigraphIndex() {
    mDicNodeState.mDicNodeStateScoring.advanceDigraphIndex();
}
|
||||||
|
|
||||||
uint8_t getFlags() const {
|
uint8_t getFlags() const {
|
||||||
return mDicNodeProperties.getFlags();
|
return mDicNodeProperties.getFlags();
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "digraph_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -27,6 +28,7 @@ class DicNodeStateScoring {
|
||||||
public:
|
public:
|
||||||
AK_FORCE_INLINE DicNodeStateScoring()
|
AK_FORCE_INLINE DicNodeStateScoring()
|
||||||
: mDoubleLetterLevel(NOT_A_DOUBLE_LETTER),
|
: mDoubleLetterLevel(NOT_A_DOUBLE_LETTER),
|
||||||
|
mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX),
|
||||||
mEditCorrectionCount(0), mProximityCorrectionCount(0),
|
mEditCorrectionCount(0), mProximityCorrectionCount(0),
|
||||||
mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
|
mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
|
||||||
mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) {
|
mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) {
|
||||||
|
@ -43,6 +45,7 @@ class DicNodeStateScoring {
|
||||||
mTotalPrevWordsLanguageCost = 0.0f;
|
mTotalPrevWordsLanguageCost = 0.0f;
|
||||||
mRawLength = 0.0f;
|
mRawLength = 0.0f;
|
||||||
mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
|
mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
|
||||||
|
mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) {
|
AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) {
|
||||||
|
@ -54,6 +57,7 @@ class DicNodeStateScoring {
|
||||||
mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost;
|
mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost;
|
||||||
mRawLength = scoring->mRawLength;
|
mRawLength = scoring->mRawLength;
|
||||||
mDoubleLetterLevel = scoring->mDoubleLetterLevel;
|
mDoubleLetterLevel = scoring->mDoubleLetterLevel;
|
||||||
|
mDigraphIndex = scoring->mDigraphIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
|
void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
|
||||||
|
@ -126,6 +130,24 @@ class DicNodeStateScoring {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the current digraph index (NOT_A_DIGRAPH_INDEX after construction or reset).
DigraphUtils::DigraphCodePointIndex getDigraphIndex() const {
    return mDigraphIndex;
}
|
||||||
|
|
||||||
|
void advanceDigraphIndex() {
|
||||||
|
switch(mDigraphIndex) {
|
||||||
|
case DigraphUtils::NOT_A_DIGRAPH_INDEX:
|
||||||
|
mDigraphIndex = DigraphUtils::FIRST_DIGRAPH_CODEPOINT;
|
||||||
|
break;
|
||||||
|
case DigraphUtils::FIRST_DIGRAPH_CODEPOINT:
|
||||||
|
mDigraphIndex = DigraphUtils::SECOND_DIGRAPH_CODEPOINT;
|
||||||
|
break;
|
||||||
|
case DigraphUtils::SECOND_DIGRAPH_CODEPOINT:
|
||||||
|
mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
float getTotalPrevWordsLanguageCost() const {
|
float getTotalPrevWordsLanguageCost() const {
|
||||||
return mTotalPrevWordsLanguageCost;
|
return mTotalPrevWordsLanguageCost;
|
||||||
}
|
}
|
||||||
|
@ -135,6 +157,7 @@ class DicNodeStateScoring {
|
||||||
// Use a default copy constructor and an assign operator because shallow copies are ok
|
// Use a default copy constructor and an assign operator because shallow copies are ok
|
||||||
// for this class
|
// for this class
|
||||||
DoubleLetterLevel mDoubleLetterLevel;
|
DoubleLetterLevel mDoubleLetterLevel;
|
||||||
|
DigraphUtils::DigraphCodePointIndex mDigraphIndex;
|
||||||
|
|
||||||
int16_t mEditCorrectionCount;
|
int16_t mEditCorrectionCount;
|
||||||
int16_t mProximityCorrectionCount;
|
int16_t mProximityCorrectionCount;
|
||||||
|
|
Loading…
Reference in a new issue