parent
84c1bbd76d
commit
d36245fad2
|
@ -16,10 +16,11 @@
|
||||||
|
|
||||||
package com.android.inputmethod.latin.makedict;
|
package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput;
|
import com.android.inputmethod.latin.Constants;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -124,4 +125,69 @@ public class BinaryDictIOUtils {
|
||||||
readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
|
readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
|
||||||
header.mFormatOptions);
|
header.mFormatOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the address of the last CharGroup of the exact matching word in the dictionary.
|
||||||
|
* If no match is found, returns NOT_VALID_WORD.
|
||||||
|
*
|
||||||
|
* @param buffer the buffer to read.
|
||||||
|
* @param word the word we search for.
|
||||||
|
* @return the address of the terminal node.
|
||||||
|
* @throws IOException
|
||||||
|
* @throws UnsupportedFormatException
|
||||||
|
*/
|
||||||
|
public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
|
||||||
|
final String word) throws IOException, UnsupportedFormatException {
|
||||||
|
if (word == null) return FormatSpec.NOT_VALID_WORD;
|
||||||
|
if (buffer.position() != 0) buffer.position(0);
|
||||||
|
|
||||||
|
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
|
||||||
|
int wordPos = 0;
|
||||||
|
final int wordLen = word.codePointCount(0, word.length());
|
||||||
|
for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
|
||||||
|
if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
|
||||||
|
int groupOffset = buffer.position() - header.mHeaderSize;
|
||||||
|
final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
|
||||||
|
groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount);
|
||||||
|
|
||||||
|
for (int i = 0; i < charGroupCount; ++i) {
|
||||||
|
final int charGroupPos = buffer.position();
|
||||||
|
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
|
||||||
|
buffer.position(), header.mFormatOptions);
|
||||||
|
boolean same = true;
|
||||||
|
for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
|
||||||
|
p < currentInfo.mCharacters.length;
|
||||||
|
++p, j = word.offsetByCodePoints(j, 1)) {
|
||||||
|
if (wordPos + p >= wordLen
|
||||||
|
|| word.codePointAt(j) != currentInfo.mCharacters[p]) {
|
||||||
|
same = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (same) {
|
||||||
|
if (wordPos + currentInfo.mCharacters.length == wordLen) {
|
||||||
|
if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
|
||||||
|
return FormatSpec.NOT_VALID_WORD;
|
||||||
|
} else {
|
||||||
|
return charGroupPos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wordPos += currentInfo.mCharacters.length;
|
||||||
|
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
|
||||||
|
return FormatSpec.NOT_VALID_WORD;
|
||||||
|
}
|
||||||
|
buffer.position(currentInfo.mChildrenAddress);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
groupOffset = currentInfo.mEndAddress;
|
||||||
|
|
||||||
|
// not found
|
||||||
|
if (i >= charGroupCount - 1) {
|
||||||
|
return FormatSpec.NOT_VALID_WORD;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return FormatSpec.NOT_VALID_WORD;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1242,8 +1242,9 @@ public class BinaryDictInputOutput {
|
||||||
* @param formatOptions file format options.
|
* @param formatOptions file format options.
|
||||||
* @return the word, as a string.
|
* @return the word, as a string.
|
||||||
*/
|
*/
|
||||||
private static String getWordAtAddress(final FusionDictionaryBufferInterface buffer,
|
/* packages for tests */ static String getWordAtAddress(
|
||||||
final int headerSize, final int address, final FormatOptions formatOptions) {
|
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
|
||||||
|
final FormatOptions formatOptions) {
|
||||||
final String cachedString = wordCache.get(address);
|
final String cachedString = wordCache.get(address);
|
||||||
if (null != cachedString) return cachedString;
|
if (null != cachedString) return cachedString;
|
||||||
|
|
||||||
|
|
|
@ -207,6 +207,9 @@ public final class FormatSpec {
|
||||||
static final int MAX_TERMINAL_FREQUENCY = 255;
|
static final int MAX_TERMINAL_FREQUENCY = 255;
|
||||||
static final int MAX_BIGRAM_FREQUENCY = 15;
|
static final int MAX_BIGRAM_FREQUENCY = 15;
|
||||||
|
|
||||||
|
// This option needs to be the same numeric value as the one in binary_format.h.
|
||||||
|
static final int NOT_VALID_WORD = -99;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Options about file format.
|
* Options about file format.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict;
|
||||||
import com.android.inputmethod.latin.CollectionUtils;
|
import com.android.inputmethod.latin.CollectionUtils;
|
||||||
import com.android.inputmethod.latin.UserHistoryDictIOUtils;
|
import com.android.inputmethod.latin.UserHistoryDictIOUtils;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec;
|
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
@ -475,4 +475,93 @@ public class BinaryDictIOTests extends AndroidTestCase {
|
||||||
Log.d(TAG, result);
|
Log.d(TAG, result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tests for getTerminalPosition
|
||||||
|
private String getWordFromBinary(final FusionDictionaryBufferInterface buffer,
|
||||||
|
final int address) {
|
||||||
|
if (buffer.position() != 0) buffer.position(0);
|
||||||
|
|
||||||
|
FileHeader header = null;
|
||||||
|
try {
|
||||||
|
header = BinaryDictInputOutput.readHeader(buffer);
|
||||||
|
} catch (IOException e) {
|
||||||
|
return null;
|
||||||
|
} catch (UnsupportedFormatException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (header == null) return null;
|
||||||
|
return BinaryDictInputOutput.getWordAtAddress(buffer, header.mHeaderSize,
|
||||||
|
address - header.mHeaderSize, header.mFormatOptions);
|
||||||
|
}
|
||||||
|
|
||||||
|
private long runGetTerminalPosition(final FusionDictionaryBufferInterface buffer,
|
||||||
|
final String word, int index, boolean contained) {
|
||||||
|
final int expectedFrequency = (UNIGRAM_FREQ + index) % 255;
|
||||||
|
long diff = -1;
|
||||||
|
int position = -1;
|
||||||
|
try {
|
||||||
|
final long now = System.nanoTime();
|
||||||
|
position = BinaryDictIOUtils.getTerminalPosition(buffer, word);
|
||||||
|
diff = System.nanoTime() - now;
|
||||||
|
} catch (IOException e) {
|
||||||
|
Log.e(TAG, "IOException while getTerminalPosition: " + e);
|
||||||
|
} catch (UnsupportedFormatException e) {
|
||||||
|
Log.e(TAG, "UnsupportedFormatException while getTermianlPosition: " + e);
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(FormatSpec.NOT_VALID_WORD != position, contained);
|
||||||
|
if (contained) assertEquals(getWordFromBinary(buffer, position), word);
|
||||||
|
return diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testGetTerminalPosition() {
|
||||||
|
File file = null;
|
||||||
|
try {
|
||||||
|
file = File.createTempFile("runReadUnigrams", ".dict");
|
||||||
|
} catch (IOException e) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
assertNotNull(file);
|
||||||
|
|
||||||
|
final FusionDictionary dict = new FusionDictionary(new Node(),
|
||||||
|
new FusionDictionary.DictionaryOptions(
|
||||||
|
new HashMap<String, String>(), false, false));
|
||||||
|
addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */);
|
||||||
|
timeWritingDictToFile(file, dict, VERSION3_WITH_LINKEDLIST_NODE);
|
||||||
|
|
||||||
|
final FusionDictionaryBufferInterface buffer = getBuffer(file, USE_BYTE_ARRAY);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// too long word
|
||||||
|
final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
|
||||||
|
assertEquals(FormatSpec.NOT_VALID_WORD,
|
||||||
|
BinaryDictIOUtils.getTerminalPosition(buffer, longWord));
|
||||||
|
|
||||||
|
// null
|
||||||
|
assertEquals(FormatSpec.NOT_VALID_WORD,
|
||||||
|
BinaryDictIOUtils.getTerminalPosition(buffer, null));
|
||||||
|
|
||||||
|
// empty string
|
||||||
|
assertEquals(FormatSpec.NOT_VALID_WORD,
|
||||||
|
BinaryDictIOUtils.getTerminalPosition(buffer, ""));
|
||||||
|
} catch (IOException e) {
|
||||||
|
} catch (UnsupportedFormatException e) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test a word that is contained within the dictionary.
|
||||||
|
long sum = 0;
|
||||||
|
for (int i = 0; i < sWords.size(); ++i) {
|
||||||
|
final long time = runGetTerminalPosition(buffer, sWords.get(i), i, true);
|
||||||
|
sum += time == -1 ? 0 : time;
|
||||||
|
}
|
||||||
|
Log.d(TAG, "per a search : " + (((double)sum) / sWords.size() / 1000000));
|
||||||
|
|
||||||
|
// Test a word that isn't contained within the dictionary.
|
||||||
|
final Random random = new Random((int)System.currentTimeMillis());
|
||||||
|
for (int i = 0; i < 1000; ++i) {
|
||||||
|
final String word = generateWord(random.nextInt());
|
||||||
|
if (sWords.indexOf(word) != -1) continue;
|
||||||
|
runGetTerminalPosition(buffer, word, i, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue