Support migration/dump of Beginning-of-Sentence entries.

Bug: 14119293
Change-Id: Ie975138f819794d5c34a7a547be5a6117050e084
main
Keisuke Kuroyanagi 2014-06-24 12:37:07 +09:00
parent f7322b166b
commit 88fa47a27d
9 changed files with 111 additions and 46 deletions

View File

@ -64,11 +64,12 @@ public final class BinaryDictionary extends Dictionary {
public static final int NOT_A_VALID_TIMESTAMP = -1; public static final int NOT_A_VALID_TIMESTAMP = -1;
// Format to get unigram flags from native side via getWordPropertyNative(). // Format to get unigram flags from native side via getWordPropertyNative().
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 4; private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5;
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0; private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1; private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2; private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2;
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3; private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4;
// Format to get probability and historical info from native side via getWordPropertyNative(). // Format to get probability and historical info from native side via getWordPropertyNative().
public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4; public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4;
@ -176,10 +177,12 @@ public final class BinaryDictionary extends Dictionary {
private static native int getBigramProbabilityNative(long dict, int[] word0, private static native int getBigramProbabilityNative(long dict, int[] word0,
boolean isBeginningOfSentence, int[] word1); boolean isBeginningOfSentence, int[] word1);
private static native void getWordPropertyNative(long dict, int[] word, private static native void getWordPropertyNative(long dict, int[] word,
int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo, boolean isBeginningOfSentence, int[] outCodePoints, boolean[] outFlags,
ArrayList<int[]> outBigramTargets, ArrayList<int[]> outBigramProbabilityInfo, int[] outProbabilityInfo, ArrayList<int[]> outBigramTargets,
ArrayList<int[]> outShortcutTargets, ArrayList<Integer> outShortcutProbabilities); ArrayList<int[]> outBigramProbabilityInfo, ArrayList<int[]> outShortcutTargets,
private static native int getNextWordNative(long dict, int token, int[] outCodePoints); ArrayList<Integer> outShortcutProbabilities);
private static native int getNextWordNative(long dict, int token, int[] outCodePoints,
boolean[] outIsBeginningOfSentence);
private static native void getSuggestionsNative(long dict, long proximityInfo, private static native void getSuggestionsNative(long dict, long proximityInfo,
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
int[] pointerIds, int[] inputCodePoints, int inputSize, int[] suggestOptions, int[] pointerIds, int[] inputCodePoints, int inputSize, int[] suggestOptions,
@ -358,8 +361,8 @@ public final class BinaryDictionary extends Dictionary {
prevWordsInfo.mIsBeginningOfSentence, codePoints1); prevWordsInfo.mIsBeginningOfSentence, codePoints1);
} }
public WordProperty getWordProperty(final String word) { public WordProperty getWordProperty(final String word, final boolean isBeginningOfSentence) {
if (TextUtils.isEmpty(word)) { if (word == null) {
return null; return null;
} }
final int[] codePoints = StringUtils.toCodePointArray(word); final int[] codePoints = StringUtils.toCodePointArray(word);
@ -371,14 +374,15 @@ public final class BinaryDictionary extends Dictionary {
final ArrayList<int[]> outBigramProbabilityInfo = new ArrayList<>(); final ArrayList<int[]> outBigramProbabilityInfo = new ArrayList<>();
final ArrayList<int[]> outShortcutTargets = new ArrayList<>(); final ArrayList<int[]> outShortcutTargets = new ArrayList<>();
final ArrayList<Integer> outShortcutProbabilities = new ArrayList<>(); final ArrayList<Integer> outShortcutProbabilities = new ArrayList<>();
getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo, getWordPropertyNative(mNativeDict, codePoints, isBeginningOfSentence, outCodePoints,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, outFlags, outProbabilityInfo, outBigramTargets, outBigramProbabilityInfo,
outShortcutProbabilities); outShortcutTargets, outShortcutProbabilities);
return new WordProperty(codePoints, return new WordProperty(codePoints,
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX], outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX], outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX], outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX],
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo, outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX],
outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities); outShortcutProbabilities);
} }
@ -399,9 +403,12 @@ public final class BinaryDictionary extends Dictionary {
*/ */
public GetNextWordPropertyResult getNextWordProperty(final int token) { public GetNextWordPropertyResult getNextWordProperty(final int token) {
final int[] codePoints = new int[Constants.DICTIONARY_MAX_WORD_LENGTH]; final int[] codePoints = new int[Constants.DICTIONARY_MAX_WORD_LENGTH];
final int nextToken = getNextWordNative(mNativeDict, token, codePoints); final boolean[] isBeginningOfSentence = new boolean[1];
final int nextToken = getNextWordNative(mNativeDict, token, codePoints,
isBeginningOfSentence);
final String word = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); final String word = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints);
return new GetNextWordPropertyResult(getWordProperty(word), nextToken); return new GetNextWordPropertyResult(
getWordProperty(word, isBeginningOfSentence[0]), nextToken);
} }
// Add a unigram entry to binary dictionary with unigram attributes in native code. // Add a unigram entry to binary dictionary with unigram attributes in native code.

View File

@ -70,8 +70,8 @@ public final class WordProperty implements Comparable<WordProperty> {
// Construct word property using information from native code. // Construct word property using information from native code.
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY. // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
public WordProperty(final int[] codePoints, final boolean isNotAWord, public WordProperty(final int[] codePoints, final boolean isNotAWord,
final boolean isBlacklisted, final boolean hasBigram, final boolean isBlacklisted, final boolean hasBigram, final boolean hasShortcuts,
final boolean hasShortcuts, final int[] probabilityInfo, final boolean isBeginningOfSentence, final int[] probabilityInfo,
final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo, final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo,
final ArrayList<int[]> shortcutTargets, final ArrayList<int[]> shortcutTargets,
final ArrayList<Integer> shortcutProbabilities) { final ArrayList<Integer> shortcutProbabilities) {
@ -79,7 +79,7 @@ public final class WordProperty implements Comparable<WordProperty> {
mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo);
mShortcutTargets = new ArrayList<>(); mShortcutTargets = new ArrayList<>();
mBigrams = new ArrayList<>(); mBigrams = new ArrayList<>();
mIsBeginningOfSentence = false; mIsBeginningOfSentence = isBeginningOfSentence;
mIsNotAWord = isNotAWord; mIsNotAWord = isNotAWord;
mIsBlacklistEntry = isBlacklisted; mIsBlacklistEntry = isBlacklisted;
mHasShortcuts = hasShortcuts; mHasShortcuts = hasShortcuts;

View File

@ -31,6 +31,7 @@ public class CombinedFormatUtils {
public static final String HISTORICAL_INFO_TAG = "historicalInfo"; public static final String HISTORICAL_INFO_TAG = "historicalInfo";
public static final String HISTORICAL_INFO_SEPARATOR = ":"; public static final String HISTORICAL_INFO_SEPARATOR = ":";
public static final String WORD_TAG = "word"; public static final String WORD_TAG = "word";
public static final String BEGINNING_OF_SENTENCE_TAG = "beginning_of_sentence";
public static final String NOT_A_WORD_TAG = "not_a_word"; public static final String NOT_A_WORD_TAG = "not_a_word";
public static final String BLACKLISTED_TAG = "blacklisted"; public static final String BLACKLISTED_TAG = "blacklisted";
@ -56,6 +57,9 @@ public class CombinedFormatUtils {
builder.append(" " + WORD_TAG + "=" + wordProperty.mWord); builder.append(" " + WORD_TAG + "=" + wordProperty.mWord);
builder.append(","); builder.append(",");
builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo)); builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo));
if (wordProperty.mIsBeginningOfSentence) {
builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
}
if (wordProperty.mIsNotAWord) { if (wordProperty.mIsNotAWord) {
builder.append("," + NOT_A_WORD_TAG + "=true"); builder.append("," + NOT_A_WORD_TAG + "=true");
} }

View File

@ -301,7 +301,7 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c
// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when // If token is 0, this method newly starts iterating the dictionary. This method returns 0 when
// the dictionary does not have a next word. // the dictionary does not have a next word.
static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
jlong dict, jint token, jintArray outCodePoints) { jlong dict, jint token, jintArray outCodePoints, jbooleanArray outIsBeginningOfSentence) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return 0; if (!dictionary) return 0;
const jsize codePointBufSize = env->GetArrayLength(outCodePoints); const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
@ -317,19 +317,39 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount, MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
false /* needsNullTermination */); false /* needsNullTermination */);
bool isBeginningOfSentence = false;
if (wordCodePointCount > 0 && wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
isBeginningOfSentence = true;
}
JniDataUtils::putBooleanToArray(env, outIsBeginningOfSentence, 0 /* index */,
isBeginningOfSentence);
return nextToken; return nextToken;
} }
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags, jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo, jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
jobject outShortcutTargets, jobject outShortcutProbabilities) { jobject outBigramProbabilityInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return; if (!dictionary) return;
const jsize wordLength = env->GetArrayLength(word); const jsize wordLength = env->GetArrayLength(word);
int wordCodePoints[wordLength]; if (wordLength > MAX_WORD_LENGTH) {
AKLOGE("Invalid wordLength: %d", wordLength);
return;
}
int wordCodePoints[MAX_WORD_LENGTH];
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength); int codePointCount = wordLength;
if (isBeginningOfSentence) {
codePointCount = CharUtils::attachBeginningOfSentenceMarker(
wordCodePoints, wordLength, MAX_WORD_LENGTH);
if (codePointCount < 0) {
AKLOGE("Cannot attach Beginning-of-Sentence marker.");
return;
}
}
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, codePointCount);
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities); outShortcutProbabilities);
@ -554,7 +574,6 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false; return false;
} }
// TODO: Migrate historical information.
int wordCodePoints[MAX_WORD_LENGTH]; int wordCodePoints[MAX_WORD_LENGTH];
int wordCodePointCount = 0; int wordCodePointCount = 0;
int token = 0; int token = 0;
@ -563,6 +582,10 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount); token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
wordCodePointCount); wordCodePointCount);
if (wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
// Skip beginning-of-sentence unigram.
continue;
}
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) { if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy( dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars); std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
@ -592,7 +615,7 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
} }
} }
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount, const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
false /* isBeginningOfSentence */); wordProperty.getUnigramProperty()->representsBeginningOfSentence());
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) { for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo, if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
&bigramProperty)) { &bigramProperty)) {
@ -669,13 +692,13 @@ static const JNINativeMethod sMethods[] = {
}, },
{ {
const_cast<char *>("getWordPropertyNative"), const_cast<char *>("getWordPropertyNative"),
const_cast<char *>("(J[I[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;" const_cast<char *>("(J[IZ[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;"
"Ljava/util/ArrayList;Ljava/util/ArrayList;)V"), "Ljava/util/ArrayList;Ljava/util/ArrayList;)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty) reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty)
}, },
{ {
const_cast<char *>("getNextWordNative"), const_cast<char *>("getNextWordNative"),
const_cast<char *>("(JI[I)I"), const_cast<char *>("(JI[I[Z)I"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getNextWord) reinterpret_cast<void *>(latinime_BinaryDictionary_getNextWord)
}, },
{ {

View File

@ -28,7 +28,8 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
false /* needsNullTermination */); false /* needsNullTermination */);
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(), jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
!mBigrams.empty(), mUnigramProperty.hasShortcuts()}; !mBigrams.empty(), mUnigramProperty.hasShortcuts(),
mUnigramProperty.representsBeginningOfSentence()};
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(), int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
mUnigramProperty.getLevel(), mUnigramProperty.getCount()}; mUnigramProperty.getLevel(), mUnigramProperty.getCount()};

View File

@ -98,6 +98,10 @@ class CharUtils {
// Beginning-of-Sentence. // Beginning-of-Sentence.
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
const int codePointCount, const int maxCodePoint) { const int codePointCount, const int maxCodePoint) {
if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
// Marker has already been attached.
return codePointCount;
}
if (codePointCount >= maxCodePoint) { if (codePointCount >= maxCodePoint) {
// the code points cannot be marked as a Beginning-of-Sentence. // the code points cannot be marked as a Beginning-of-Sentence.
return 0; return 0;

View File

@ -69,18 +69,23 @@ class JniDataUtils {
static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start, static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start,
const int maxLength, const int *const codePoints, const int codePointCount, const int maxLength, const int *const codePoints, const int codePointCount,
const bool needsNullTermination) { const bool needsNullTermination) {
const int outputCodePointCount = std::min(maxLength, codePointCount); const int codePointBufSize = std::min(maxLength, codePointCount);
int outputCodePonts[outputCodePointCount]; int outputCodePonts[codePointBufSize];
for (int i = 0; i < outputCodePointCount; ++i) { int outputCodePointCount = 0;
for (int i = 0; i < codePointBufSize; ++i) {
const int codePoint = codePoints[i]; const int codePoint = codePoints[i];
int codePointToOutput = codePoint;
if (!CharUtils::isInUnicodeSpace(codePoint)) { if (!CharUtils::isInUnicodeSpace(codePoint)) {
outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; if (codePoint == CODE_POINT_BEGINNING_OF_SENTENCE) {
// Just skip Beginning-of-Sentence marker.
continue;
}
codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
} else if (codePoint >= 0x01 && codePoint <= 0x1F) { } else if (codePoint >= 0x01 && codePoint <= 0x1F) {
// Control code. // Control code.
outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
} else {
outputCodePonts[i] = codePoint;
} }
outputCodePonts[outputCodePointCount++] = codePointToOutput;
} }
env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount, env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount,
outputCodePonts); outputCodePonts);
@ -90,6 +95,11 @@ class JniDataUtils {
} }
} }
static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index,
const jboolean value) {
env->SetBooleanArrayRegion(array, index, 1 /* len */, &value);
}
static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) { static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) {
env->SetIntArrayRegion(array, index, 1 /* len */, &value); env->SetIntArrayRegion(array, index, 1 /* len */, &value);
} }

View File

@ -994,7 +994,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord"); final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
false /* isBeginningOfSentence */);
assertFalse(invalidWordProperty.isValid()); assertFalse(invalidWordProperty.isValid());
final ArrayList<String> words = new ArrayList<>(); final ArrayList<String> words = new ArrayList<>();
@ -1017,7 +1018,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
words.add(word); words.add(word);
wordProbabilities.put(word, unigramProbability); wordProbabilities.put(word, unigramProbability);
final WordProperty wordProperty = binaryDictionary.getWordProperty(word); final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
false /* isBeginningOfSentence */);
assertEquals(word, wordProperty.mWord); assertEquals(word, wordProperty.mWord);
assertTrue(wordProperty.isValid()); assertTrue(wordProperty.isValid());
assertEquals(isNotAWord, wordProperty.mIsNotAWord); assertEquals(isNotAWord, wordProperty.mIsNotAWord);
@ -1057,7 +1059,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
continue; continue;
} }
final HashSet<String> bigramWord1s = bigrams.get(word0); final HashSet<String> bigramWord1s = bigrams.get(word0);
final WordProperty wordProperty = binaryDictionary.getWordProperty(word0); final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
false /* isBeginningOfSentence */);
assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size()); assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
for (int j = 0; j < wordProperty.mBigrams.size(); j++) { for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
final String word1 = wordProperty.mBigrams.get(j).mWord; final String word1 = wordProperty.mBigrams.get(j).mWord;
@ -1094,7 +1097,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
0 /* offset */, dictFile.length(), true /* useFullEditDistance */, 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord"); final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
false /* isBeginningOfSentence */);
assertFalse(invalidWordProperty.isValid()); assertFalse(invalidWordProperty.isValid());
final ArrayList<String> words = new ArrayList<>(); final ArrayList<String> words = new ArrayList<>();
@ -1188,7 +1192,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz", binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
shortcutProbability, false /* isBeginningOfSentence */, shortcutProbability, false /* isBeginningOfSentence */,
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */); false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa"); WordProperty wordProperty = binaryDictionary.getWordProperty("aaa",
false /* isBeginningOfSentence */);
assertEquals(1, wordProperty.mShortcutTargets.size()); assertEquals(1, wordProperty.mShortcutTargets.size());
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord); assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability()); assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());
@ -1196,7 +1201,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz", binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
updatedShortcutProbability, false /* isBeginningOfSentence */, updatedShortcutProbability, false /* isBeginningOfSentence */,
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */); false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
wordProperty = binaryDictionary.getWordProperty("aaa"); wordProperty = binaryDictionary.getWordProperty("aaa",
false /* isBeginningOfSentence */);
assertEquals(1, wordProperty.mShortcutTargets.size()); assertEquals(1, wordProperty.mShortcutTargets.size());
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord); assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
assertEquals(updatedShortcutProbability, assertEquals(updatedShortcutProbability,
@ -1207,7 +1213,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
final HashMap<String, Integer> shortcutTargets = new HashMap<>(); final HashMap<String, Integer> shortcutTargets = new HashMap<>();
shortcutTargets.put("zzz", updatedShortcutProbability); shortcutTargets.put("zzz", updatedShortcutProbability);
shortcutTargets.put("yyy", shortcutProbability); shortcutTargets.put("yyy", shortcutProbability);
wordProperty = binaryDictionary.getWordProperty("aaa"); wordProperty = binaryDictionary.getWordProperty("aaa",
false /* isBeginningOfSentence */);
assertEquals(2, wordProperty.mShortcutTargets.size()); assertEquals(2, wordProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
@ -1218,7 +1225,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
shortcutTargets.put("zzz", updatedShortcutProbability); shortcutTargets.put("zzz", updatedShortcutProbability);
shortcutTargets.put("yyy", shortcutProbability); shortcutTargets.put("yyy", shortcutProbability);
binaryDictionary.flushWithGC(); binaryDictionary.flushWithGC();
wordProperty = binaryDictionary.getWordProperty("aaa"); wordProperty = binaryDictionary.getWordProperty("aaa",
false /* isBeginningOfSentence */);
assertEquals(2, wordProperty.mShortcutTargets.size()); assertEquals(2, wordProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
@ -1288,7 +1296,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
for (final String word : words) { for (final String word : words) {
final WordProperty wordProperty = binaryDictionary.getWordProperty(word); final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
false /* isBeginningOfSentence */);
assertEquals((int)unigramProbabilities.get(word), assertEquals((int)unigramProbabilities.get(word),
wordProperty.mProbabilityInfo.mProbability); wordProperty.mProbabilityInfo.mProbability);
if (!shortcutTargets.containsKey(word)) { if (!shortcutTargets.containsKey(word)) {
@ -1332,6 +1341,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */, binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */, Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */); true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */);
binaryDictionary.addNgramEntry(PrevWordsInfo.BEGINNING_OF_SENTENCE,
"aaa", bigramProbability, 0 /* timestamp */);
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa")); assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb")); assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb")); assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
@ -1343,12 +1354,16 @@ public class BinaryDictionaryTests extends AndroidTestCase {
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb")); assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
if (canCheckBigramProbability(toFormatVersion)) { if (canCheckBigramProbability(toFormatVersion)) {
assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb")); assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
assertEquals(bigramProbability, binaryDictionary.getNgramProbability(
PrevWordsInfo.BEGINNING_OF_SENTENCE, "aaa"));
} }
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb")); assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
WordProperty wordProperty = binaryDictionary.getWordProperty("ccc"); WordProperty wordProperty = binaryDictionary.getWordProperty("ccc",
false /* isBeginningOfSentence */);
assertEquals(1, wordProperty.mShortcutTargets.size()); assertEquals(1, wordProperty.mShortcutTargets.size());
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord); assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
wordProperty = binaryDictionary.getWordProperty("ddd"); wordProperty = binaryDictionary.getWordProperty("ddd",
false /* isBeginningOfSentence */);
assertTrue(wordProperty.mIsBlacklistEntry); assertTrue(wordProperty.mIsBlacklistEntry);
assertTrue(wordProperty.mIsNotAWord); assertTrue(wordProperty.mIsNotAWord);
} }

View File

@ -614,7 +614,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
0 /* offset */, file.length(), true /* useFullEditDistance */, 0 /* offset */, file.length(), true /* useFullEditDistance */,
Locale.ENGLISH, dictName, false /* isUpdatable */); Locale.ENGLISH, dictName, false /* isUpdatable */);
for (final String word : words) { for (final String word : words) {
final WordProperty wordProperty = binaryDictionary.getWordProperty(word); final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
false /* isBeginningOfSentence */);
assertEquals(word, wordProperty.mWord); assertEquals(word, wordProperty.mWord);
assertEquals(UNIGRAM_FREQ, wordProperty.getProbability()); assertEquals(UNIGRAM_FREQ, wordProperty.getProbability());
if (shortcuts.containsKey(word)) { if (shortcuts.containsKey(word)) {