From cc3e5c7c1036ef70c1f4d488e8091ac5926ac258 Mon Sep 17 00:00:00 2001 From: Amith Yamasani <> Date: Tue, 31 Mar 2009 10:51:17 -0700 Subject: [PATCH] AI 143659: am: CL 143472 Reduce dictionary size. Changed the tree structure to have variable length nodes to save an average of 21% on the dictionary size. Created a shortened English dictionary for Dream - 50K words. Added a shortened Spanish dictionary for Dream - 32K words. Original author: yamasani Merged from: //branches/cupcake/... Automated import of CL 143659 --- dictionary/src/dictionary.cpp | 19 +++++++++++++------ dictionary/src/dictionary.h | 23 ++++++++++++++++------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/dictionary/src/dictionary.cpp b/dictionary/src/dictionary.cpp index 6aecb6374..b37f4c926 100644 --- a/dictionary/src/dictionary.cpp +++ b/dictionary/src/dictionary.cpp @@ -85,10 +85,14 @@ int Dictionary::getAddress(int *pos) { int address = 0; - address += (mDict[*pos] & 0x7F) << 16; - address += (mDict[*pos + 1] & 0xFF) << 8; - address += (mDict[*pos + 2] & 0xFF); - *pos += 3; + if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) { + *pos += 1; + } else { + address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16; + address += (mDict[*pos + 1] & 0xFF) << 8; + address += (mDict[*pos + 2] & 0xFF); + *pos += 3; + } return address; } @@ -193,7 +197,8 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s unsigned short lowerC = toLowerCase(c, depth); bool terminal = getTerminal(&pos); int childrenAddress = getAddress(&pos); - int freq = getFreq(&pos); + int freq = 1; + if (terminal) freq = getFreq(&pos); // If we are only doing completions, no need to look at the typed characters. if (completion) { mWord[depth] = c; @@ -266,7 +271,9 @@ Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length } } } - getFreq(&pos); + if (terminal) { + getFreq(&pos); + } // There could be two instances of each alphabet - upper and lower case. So continue // looking ... } diff --git a/dictionary/src/dictionary.h b/dictionary/src/dictionary.h index 8574e0736..b13e97795 100644 --- a/dictionary/src/dictionary.h +++ b/dictionary/src/dictionary.h @@ -19,35 +19,44 @@ namespace latinime { +// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words +#define ADDRESS_MASK 0x3FFFFF + +// The bit that decides if an address follows in the next 22 bits +#define FLAG_ADDRESS_MASK 0x40 +// The bit that decides if this is a terminal node for a word. The node could still have children, +// if the word has other endings. +#define FLAG_TERMINAL_MASK 0x80 + class Dictionary { public: Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier); - int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies, + int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxWords, int maxAlternatives); bool isValidWord(unsigned short *word, int length); void setAsset(void *asset) { mAsset = asset; } void *getAsset() { return mAsset; } ~Dictionary(); - + private: int getAddress(int *pos); - bool getTerminal(int *pos) { return (mDict[*pos] & 0x80) > 0; } + bool getTerminal(int *pos) { return (mDict[*pos] & FLAG_TERMINAL_MASK) > 0; } int getFreq(int *pos) { return mDict[(*pos)++] & 0xFF; } int getCount(int *pos) { return mDict[(*pos)++] & 0xFF; } unsigned short getChar(int *pos); int wideStrLen(unsigned short *str); - + bool sameAsTyped(unsigned short *word, int length); bool addWord(unsigned short *word, int length, int frequency); unsigned short toLowerCase(unsigned short c, int depth); - void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency, + void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency, int inputIndex); bool isValidWordRec(int pos, unsigned short *word, int offset, int length); unsigned char *mDict; void *mAsset; - + int *mFrequencies; int mMaxWords; int mMaxWordLength; @@ -57,7 +66,7 @@ private: int mInputLength; int mMaxAlternatives; unsigned short mWord[128]; - + int mFullWordMultiplier; int mTypedLetterMultiplier; };