am 20a6dea1
: Add a flag for bigram presence in the header
* commit '20a6dea1cabfd8822824f7dca828d898e5b91cbc': Add a flag for bigram presence in the header
This commit is contained in:
commit
7edb7a288b
2 changed files with 34 additions and 4 deletions
|
@ -131,6 +131,7 @@ public class BinaryDictInputOutput {
|
|||
// These options need to be the same numeric values as the one in the native reading code.
|
||||
private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
|
||||
private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
|
||||
private static final int CONTAINS_BIGRAMS_FLAG = 0x8;
|
||||
|
||||
// TODO: Make this value adaptive to content data, store it in the header, and
|
||||
// use it in the reading code.
|
||||
|
@ -752,9 +753,12 @@ public class BinaryDictInputOutput {
|
|||
/**
|
||||
* Makes the 2-byte value for options flags.
|
||||
*/
|
||||
private static final int makeOptionsValue(final DictionaryOptions options) {
|
||||
private static final int makeOptionsValue(final FusionDictionary dictionary) {
|
||||
final DictionaryOptions options = dictionary.mOptions;
|
||||
final boolean hasBigrams = dictionary.hasBigrams();
|
||||
return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0)
|
||||
+ (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0);
|
||||
+ (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0)
|
||||
+ (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -970,7 +974,7 @@ public class BinaryDictInputOutput {
|
|||
headerBuffer.write((byte) (0xFF & version));
|
||||
}
|
||||
// Options flags
|
||||
final int options = makeOptionsValue(dict.mOptions);
|
||||
final int options = makeOptionsValue(dict);
|
||||
headerBuffer.write((byte) (0xFF & (options >> 8)));
|
||||
headerBuffer.write((byte) (0xFF & options));
|
||||
if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
|
||||
|
|
|
@ -563,7 +563,7 @@ public class FusionDictionary implements Iterable<Word> {
|
|||
* Recursively count the number of nodes in a given branch of the trie.
|
||||
*
|
||||
* @param node the node to count.
|
||||
* @result the number of nodes in this branch.
|
||||
* @return the number of nodes in this branch.
|
||||
*/
|
||||
public static int countNodes(final Node node) {
|
||||
int size = 1;
|
||||
|
@ -575,6 +575,32 @@ public class FusionDictionary implements Iterable<Word> {
|
|||
return size;
|
||||
}
|
||||
|
||||
// Recursively find out whether there are any bigrams.
|
||||
// This can be pretty expensive especially if there aren't any (we return as soon
|
||||
// as we find one, so it's much cheaper if there are bigrams)
|
||||
private static boolean hasBigramsInternal(final Node node) {
|
||||
if (null == node) return false;
|
||||
for (int i = node.mData.size() - 1; i >= 0; --i) {
|
||||
CharGroup group = node.mData.get(i);
|
||||
if (null != group.mBigrams) return true;
|
||||
if (hasBigramsInternal(group.mChildren)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds out whether there are any bigrams in this dictionary.
|
||||
*
|
||||
* @return true if there is any bigram, false otherwise.
|
||||
*/
|
||||
// TODO: this is expensive especially for large dictionaries without any bigram.
|
||||
// The up side is, this is always accurate and correct and uses no memory. We should
|
||||
// find a more efficient way of doing this, without compromising too much on memory
|
||||
// and ease of use.
|
||||
public boolean hasBigrams() {
|
||||
return hasBigramsInternal(mRoot);
|
||||
}
|
||||
|
||||
// Historically, the tails of the words were going to be merged to save space.
|
||||
// However, that would prevent the code from searching for a specific address in log(n)
|
||||
// time so this was abandoned.
|
||||
|
|
Loading…
Reference in a new issue