Merge "Add mechanism to handle digraphs in DicNode"

This commit is contained in:
Tom Ouyang 2013-04-10 22:52:24 +00:00 committed by Android (Google) Code Review
commit b6e1777d4b
4 changed files with 114 additions and 21 deletions

View file

@ -27,39 +27,47 @@ const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] =
// Digraph table for French ligatures. Each entry is { first, second, compositeGlyph }:
// the two code points of the digraph followed by the single composite glyph they stand for.
const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] = {
    { 'a', 'e', 0x00E6 },  // U+00E6 : LATIN SMALL LETTER AE
    { 'o', 'e', 0x0153 }   // U+0153 : LATIN SMALL LIGATURE OE
};
// Digraph types that getDigraphForCodePoint() searches, in this order, when no dictionary
// flags are available to select a single type.
const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = {
    DigraphUtils::DIGRAPH_TYPE_GERMAN_UMLAUT,
    DigraphUtils::DIGRAPH_TYPE_FRENCH_LIGATURES
};
/* static */ bool DigraphUtils::hasDigraphForCodePoint( /* static */ bool DigraphUtils::hasDigraphForCodePoint(
const int dictFlags, const int compositeGlyphCodePoint) { const int dictFlags, const int compositeGlyphCodePoint) {
if (DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint)) { const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(dictFlags);
if (DigraphUtils::getDigraphForDigraphTypeAndCodePoint(digraphType, compositeGlyphCodePoint)) {
return true; return true;
} }
return false; return false;
} }
// Retrieves the set of all digraphs associated with the given dictionary. // Returns the digraph type associated with the given dictionary.
// Returns the size of the digraph array, or 0 if none exist. /* static */ DigraphUtils::DigraphType DigraphUtils::getDigraphTypeForDictionary(
/* static */ int DigraphUtils::getAllDigraphsForDictionaryAndReturnSize( const int dictFlags) {
const int dictFlags, const DigraphUtils::digraph_t **digraphs) {
if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & dictFlags) { if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & dictFlags) {
*digraphs = DigraphUtils::GERMAN_UMLAUT_DIGRAPHS; return DIGRAPH_TYPE_GERMAN_UMLAUT;
return NELEMS(DigraphUtils::GERMAN_UMLAUT_DIGRAPHS);
} }
if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & dictFlags) { if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & dictFlags) {
*digraphs = DigraphUtils::FRENCH_LIGATURES_DIGRAPHS; return DIGRAPH_TYPE_FRENCH_LIGATURES;
return NELEMS(DigraphUtils::FRENCH_LIGATURES_DIGRAPHS);
} }
return 0; return DIGRAPH_TYPE_NONE;
}
// Retrieves the set of all digraphs associated with the given dictionary flags.
// Returns the size of the digraph array, or 0 if none exist.
// dictFlags: dictionary flags (not a DigraphType); they are first mapped to a digraph type.
// digraphs: out-parameter that receives a pointer to the matching digraph table.
/* static */ int DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(
        const int dictFlags, const DigraphUtils::digraph_t **const digraphs) {
    const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(dictFlags);
    return getAllDigraphsForDigraphTypeAndReturnSize(digraphType, digraphs);
}
// Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index // Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index
// (which specifies the first or second codepoint in the digraph). // (which specifies the first or second codepoint in the digraph).
/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int dictFlags, /* static */ int DigraphUtils::getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
const int compositeGlyphCodePoint, const DigraphCodePointIndex digraphCodePointIndex) { const DigraphCodePointIndex digraphCodePointIndex) {
if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) { if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) {
return NOT_A_CODE_POINT; return NOT_A_CODE_POINT;
} }
const DigraphUtils::digraph_t *digraph = const DigraphUtils::digraph_t *const digraph =
DigraphUtils::getDigraphForCodePoint(dictFlags, compositeGlyphCodePoint); DigraphUtils::getDigraphForCodePoint(compositeGlyphCodePoint);
if (!digraph) { if (!digraph) {
return NOT_A_CODE_POINT; return NOT_A_CODE_POINT;
} }
@ -72,16 +80,48 @@ const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] =
return NOT_A_CODE_POINT; return NOT_A_CODE_POINT;
} }
// Looks up the digraph table that belongs to a digraph type.
// digraphType: the type whose table is requested.
// digraphs: out-parameter; set to the table only when one exists for digraphType and left
//           untouched otherwise.
// Returns the number of entries in the table, or 0 if the type has no table.
// Callers that already hold a DigraphType must use this function: the dictFlags-based
// getAllDigraphsForDictionaryAndReturnSize() would treat the enum value as flag bits.
/* static */ int DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(
        const DigraphUtils::DigraphType digraphType,
        const DigraphUtils::digraph_t **const digraphs) {
    switch (digraphType) {
    case DigraphUtils::DIGRAPH_TYPE_GERMAN_UMLAUT:
        *digraphs = GERMAN_UMLAUT_DIGRAPHS;
        return NELEMS(GERMAN_UMLAUT_DIGRAPHS);
    case DigraphUtils::DIGRAPH_TYPE_FRENCH_LIGATURES:
        *digraphs = FRENCH_LIGATURES_DIGRAPHS;
        return NELEMS(FRENCH_LIGATURES_DIGRAPHS);
    default:
        // No table for this type; *digraphs is intentionally not written.
        return 0;
    }
}
/** /**
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists. * Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
* dictFlags: the dictionary flags needed to determine which digraphs are supported.
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint. * compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
*/ */
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint( /* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
const int dictFlags, const int compositeGlyphCodePoint) { const int compositeGlyphCodePoint) {
for (size_t i = 0; i < NELEMS(USED_DIGRAPH_TYPES); i++) {
const DigraphUtils::digraph_t *const digraph = getDigraphForDigraphTypeAndCodePoint(
USED_DIGRAPH_TYPES[i], compositeGlyphCodePoint);
if (digraph) {
return digraph;
}
}
return 0;
}
/**
* Returns the digraph for the input composite glyph codepoint, or 0 if none exists.
* digraphType: the type of digraphs supported.
* compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
*/
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
const DigraphUtils::digraph_t *digraphs = 0; const DigraphUtils::digraph_t *digraphs = 0;
const int digraphsSize = const int digraphsSize =
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(dictFlags, &digraphs); DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(digraphType, &digraphs);
for (int i = 0; i < digraphsSize; i++) { for (int i = 0; i < digraphsSize; i++) {
if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) { if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) {
return &digraphs[i]; return &digraphs[i];

View file

@ -27,21 +27,34 @@ class DigraphUtils {
SECOND_DIGRAPH_CODEPOINT SECOND_DIGRAPH_CODEPOINT
} DigraphCodePointIndex; } DigraphCodePointIndex;
// Identifies which digraph table applies; DIGRAPH_TYPE_NONE means no digraph table.
typedef enum {
    DIGRAPH_TYPE_NONE = 0,
    DIGRAPH_TYPE_GERMAN_UMLAUT = 1,
    DIGRAPH_TYPE_FRENCH_LIGATURES = 2
} DigraphType;
// One digraph table entry: the two code points of the digraph and the composite glyph
// code point they correspond to.
typedef struct {
    int first;           // first code point of the digraph
    int second;          // second code point of the digraph
    int compositeGlyph;  // code point of the single composite glyph
} digraph_t;
static bool hasDigraphForCodePoint(const int dictFlags, const int compositeGlyphCodePoint); static bool hasDigraphForCodePoint(const int dictFlags, const int compositeGlyphCodePoint);
static int getAllDigraphsForDictionaryAndReturnSize( static int getAllDigraphsForDictionaryAndReturnSize(
const int dictFlags, const digraph_t **digraphs); const int dictFlags, const digraph_t **const digraphs);
static int getDigraphCodePointForIndex(const int dictFlags, const int compositeGlyphCodePoint, static int getDigraphCodePointForIndex(const int dictFlags, const int compositeGlyphCodePoint,
const DigraphCodePointIndex digraphCodePointIndex); const DigraphCodePointIndex digraphCodePointIndex);
static int getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
const DigraphCodePointIndex digraphCodePointIndex);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
static const digraph_t *getDigraphForCodePoint( static DigraphType getDigraphTypeForDictionary(const int dictFlags);
const int dictFlags, const int compositeGlyphCodePoint); static int getAllDigraphsForDigraphTypeAndReturnSize(
const DigraphType digraphType, const digraph_t **const digraphs);
static const digraph_t *getDigraphForCodePoint(const int compositeGlyphCodePoint);
static const digraph_t *getDigraphForDigraphTypeAndCodePoint(
const DigraphType digraphType, const int compositeGlyphCodePoint);
static const digraph_t GERMAN_UMLAUT_DIGRAPHS[]; static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
static const digraph_t FRENCH_LIGATURES_DIGRAPHS[]; static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
static const DigraphType USED_DIGRAPH_TYPES[];
}; };
} // namespace latinime } // namespace latinime
#endif // DIGRAPH_UTILS_H #endif // DIGRAPH_UTILS_H

View file

@ -23,6 +23,7 @@
#include "dic_node_profiler.h" #include "dic_node_profiler.h"
#include "dic_node_properties.h" #include "dic_node_properties.h"
#include "dic_node_release_listener.h" #include "dic_node_release_listener.h"
#include "digraph_utils.h"
#if DEBUG_DICT #if DEBUG_DICT
#define LOGI_SHOW_ADD_COST_PROP \ #define LOGI_SHOW_ADD_COST_PROP \
@ -399,8 +400,15 @@ class DicNode {
// TODO: Remove // // TODO: Remove //
////////////////////// //////////////////////
// TODO: Remove once touch path is merged into ProximityInfoState // TODO: Remove once touch path is merged into ProximityInfoState
// Note: Returned codepoint may be a digraph codepoint if the node is in a composite glyph.
int getNodeCodePoint() const { int getNodeCodePoint() const {
return mDicNodeProperties.getNodeCodePoint(); const int codePoint = mDicNodeProperties.getNodeCodePoint();
const DigraphUtils::DigraphCodePointIndex digraphIndex =
mDicNodeState.mDicNodeStateScoring.getDigraphIndex();
if (digraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
return codePoint;
}
return DigraphUtils::getDigraphCodePointForIndex(codePoint, digraphIndex);
} }
//////////////////////////////// ////////////////////////////////
@ -452,6 +460,15 @@ class DicNode {
mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel); mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
} }
bool isInDigraph() const {
return mDicNodeState.mDicNodeStateScoring.getDigraphIndex()
!= DigraphUtils::NOT_A_DIGRAPH_INDEX;
}
// Advances this node's digraph index by delegating to the scoring state.
void advanceDigraphIndex() {
mDicNodeState.mDicNodeStateScoring.advanceDigraphIndex();
}
uint8_t getFlags() const { uint8_t getFlags() const {
return mDicNodeProperties.getFlags(); return mDicNodeProperties.getFlags();
} }

View file

@ -20,6 +20,7 @@
#include <stdint.h> #include <stdint.h>
#include "defines.h" #include "defines.h"
#include "digraph_utils.h"
namespace latinime { namespace latinime {
@ -27,6 +28,7 @@ class DicNodeStateScoring {
public: public:
AK_FORCE_INLINE DicNodeStateScoring() AK_FORCE_INLINE DicNodeStateScoring()
: mDoubleLetterLevel(NOT_A_DOUBLE_LETTER), : mDoubleLetterLevel(NOT_A_DOUBLE_LETTER),
mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX),
mEditCorrectionCount(0), mProximityCorrectionCount(0), mEditCorrectionCount(0), mProximityCorrectionCount(0),
mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f), mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) { mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) {
@ -43,6 +45,7 @@ class DicNodeStateScoring {
mTotalPrevWordsLanguageCost = 0.0f; mTotalPrevWordsLanguageCost = 0.0f;
mRawLength = 0.0f; mRawLength = 0.0f;
mDoubleLetterLevel = NOT_A_DOUBLE_LETTER; mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
} }
AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) { AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) {
@ -54,6 +57,7 @@ class DicNodeStateScoring {
mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost; mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost;
mRawLength = scoring->mRawLength; mRawLength = scoring->mRawLength;
mDoubleLetterLevel = scoring->mDoubleLetterLevel; mDoubleLetterLevel = scoring->mDoubleLetterLevel;
mDigraphIndex = scoring->mDigraphIndex;
} }
void addCost(const float spatialCost, const float languageCost, const bool doNormalization, void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
@ -126,6 +130,24 @@ class DicNodeStateScoring {
} }
} }
// Returns the current digraph index; NOT_A_DIGRAPH_INDEX is the value set by the
// constructor and by reset.
DigraphUtils::DigraphCodePointIndex getDigraphIndex() const {
return mDigraphIndex;
}
// Steps the digraph index through its cycle:
// NOT_A_DIGRAPH_INDEX -> FIRST_DIGRAPH_CODEPOINT -> SECOND_DIGRAPH_CODEPOINT
// -> NOT_A_DIGRAPH_INDEX. Any other value is left unchanged.
void advanceDigraphIndex() {
    if (mDigraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
        mDigraphIndex = DigraphUtils::FIRST_DIGRAPH_CODEPOINT;
    } else if (mDigraphIndex == DigraphUtils::FIRST_DIGRAPH_CODEPOINT) {
        mDigraphIndex = DigraphUtils::SECOND_DIGRAPH_CODEPOINT;
    } else if (mDigraphIndex == DigraphUtils::SECOND_DIGRAPH_CODEPOINT) {
        mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
    }
}
float getTotalPrevWordsLanguageCost() const { float getTotalPrevWordsLanguageCost() const {
return mTotalPrevWordsLanguageCost; return mTotalPrevWordsLanguageCost;
} }
@ -135,6 +157,7 @@ class DicNodeStateScoring {
// Use a default copy constructor and an assign operator because shallow copies are ok // Use a default copy constructor and an assign operator because shallow copies are ok
// for this class // for this class
DoubleLetterLevel mDoubleLetterLevel; DoubleLetterLevel mDoubleLetterLevel;
DigraphUtils::DigraphCodePointIndex mDigraphIndex;
int16_t mEditCorrectionCount; int16_t mEditCorrectionCount;
int16_t mProximityCorrectionCount; int16_t mProximityCorrectionCount;