am 702e153f
: Merge "Support migration/dump of Beginning-of-Sentence entries."
* commit '702e153fbc28d54aeb2ded40b9f3f31c1fd154e9': Support migration/dump of Beginning-of-Sentence entries.
This commit is contained in:
commit
c1ded3dd79
9 changed files with 111 additions and 46 deletions
|
@ -64,11 +64,12 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
public static final int NOT_A_VALID_TIMESTAMP = -1;
|
public static final int NOT_A_VALID_TIMESTAMP = -1;
|
||||||
|
|
||||||
// Format to get unigram flags from native side via getWordPropertyNative().
|
// Format to get unigram flags from native side via getWordPropertyNative().
|
||||||
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 4;
|
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5;
|
||||||
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
|
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
|
||||||
private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
|
private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
|
||||||
private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2;
|
private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2;
|
||||||
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
|
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
|
||||||
|
private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4;
|
||||||
|
|
||||||
// Format to get probability and historical info from native side via getWordPropertyNative().
|
// Format to get probability and historical info from native side via getWordPropertyNative().
|
||||||
public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4;
|
public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4;
|
||||||
|
@ -176,10 +177,12 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
private static native int getBigramProbabilityNative(long dict, int[] word0,
|
private static native int getBigramProbabilityNative(long dict, int[] word0,
|
||||||
boolean isBeginningOfSentence, int[] word1);
|
boolean isBeginningOfSentence, int[] word1);
|
||||||
private static native void getWordPropertyNative(long dict, int[] word,
|
private static native void getWordPropertyNative(long dict, int[] word,
|
||||||
int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo,
|
boolean isBeginningOfSentence, int[] outCodePoints, boolean[] outFlags,
|
||||||
ArrayList<int[]> outBigramTargets, ArrayList<int[]> outBigramProbabilityInfo,
|
int[] outProbabilityInfo, ArrayList<int[]> outBigramTargets,
|
||||||
ArrayList<int[]> outShortcutTargets, ArrayList<Integer> outShortcutProbabilities);
|
ArrayList<int[]> outBigramProbabilityInfo, ArrayList<int[]> outShortcutTargets,
|
||||||
private static native int getNextWordNative(long dict, int token, int[] outCodePoints);
|
ArrayList<Integer> outShortcutProbabilities);
|
||||||
|
private static native int getNextWordNative(long dict, int token, int[] outCodePoints,
|
||||||
|
boolean[] outIsBeginningOfSentence);
|
||||||
private static native void getSuggestionsNative(long dict, long proximityInfo,
|
private static native void getSuggestionsNative(long dict, long proximityInfo,
|
||||||
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
|
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
|
||||||
int[] pointerIds, int[] inputCodePoints, int inputSize, int[] suggestOptions,
|
int[] pointerIds, int[] inputCodePoints, int inputSize, int[] suggestOptions,
|
||||||
|
@ -358,8 +361,8 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
prevWordsInfo.mIsBeginningOfSentence, codePoints1);
|
prevWordsInfo.mIsBeginningOfSentence, codePoints1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public WordProperty getWordProperty(final String word) {
|
public WordProperty getWordProperty(final String word, final boolean isBeginningOfSentence) {
|
||||||
if (TextUtils.isEmpty(word)) {
|
if (word == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
final int[] codePoints = StringUtils.toCodePointArray(word);
|
final int[] codePoints = StringUtils.toCodePointArray(word);
|
||||||
|
@ -371,14 +374,15 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
final ArrayList<int[]> outBigramProbabilityInfo = new ArrayList<>();
|
final ArrayList<int[]> outBigramProbabilityInfo = new ArrayList<>();
|
||||||
final ArrayList<int[]> outShortcutTargets = new ArrayList<>();
|
final ArrayList<int[]> outShortcutTargets = new ArrayList<>();
|
||||||
final ArrayList<Integer> outShortcutProbabilities = new ArrayList<>();
|
final ArrayList<Integer> outShortcutProbabilities = new ArrayList<>();
|
||||||
getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo,
|
getWordPropertyNative(mNativeDict, codePoints, isBeginningOfSentence, outCodePoints,
|
||||||
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
|
outFlags, outProbabilityInfo, outBigramTargets, outBigramProbabilityInfo,
|
||||||
outShortcutProbabilities);
|
outShortcutTargets, outShortcutProbabilities);
|
||||||
return new WordProperty(codePoints,
|
return new WordProperty(codePoints,
|
||||||
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo,
|
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX],
|
||||||
|
outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo,
|
||||||
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
|
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
|
||||||
outShortcutProbabilities);
|
outShortcutProbabilities);
|
||||||
}
|
}
|
||||||
|
@ -399,9 +403,12 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
*/
|
*/
|
||||||
public GetNextWordPropertyResult getNextWordProperty(final int token) {
|
public GetNextWordPropertyResult getNextWordProperty(final int token) {
|
||||||
final int[] codePoints = new int[Constants.DICTIONARY_MAX_WORD_LENGTH];
|
final int[] codePoints = new int[Constants.DICTIONARY_MAX_WORD_LENGTH];
|
||||||
final int nextToken = getNextWordNative(mNativeDict, token, codePoints);
|
final boolean[] isBeginningOfSentence = new boolean[1];
|
||||||
|
final int nextToken = getNextWordNative(mNativeDict, token, codePoints,
|
||||||
|
isBeginningOfSentence);
|
||||||
final String word = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints);
|
final String word = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints);
|
||||||
return new GetNextWordPropertyResult(getWordProperty(word), nextToken);
|
return new GetNextWordPropertyResult(
|
||||||
|
getWordProperty(word, isBeginningOfSentence[0]), nextToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a unigram entry to binary dictionary with unigram attributes in native code.
|
// Add a unigram entry to binary dictionary with unigram attributes in native code.
|
||||||
|
|
|
@ -70,8 +70,8 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
// Construct word property using information from native code.
|
// Construct word property using information from native code.
|
||||||
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
|
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
|
||||||
public WordProperty(final int[] codePoints, final boolean isNotAWord,
|
public WordProperty(final int[] codePoints, final boolean isNotAWord,
|
||||||
final boolean isBlacklisted, final boolean hasBigram,
|
final boolean isBlacklisted, final boolean hasBigram, final boolean hasShortcuts,
|
||||||
final boolean hasShortcuts, final int[] probabilityInfo,
|
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
||||||
final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo,
|
final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo,
|
||||||
final ArrayList<int[]> shortcutTargets,
|
final ArrayList<int[]> shortcutTargets,
|
||||||
final ArrayList<Integer> shortcutProbabilities) {
|
final ArrayList<Integer> shortcutProbabilities) {
|
||||||
|
@ -79,7 +79,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo);
|
mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo);
|
||||||
mShortcutTargets = new ArrayList<>();
|
mShortcutTargets = new ArrayList<>();
|
||||||
mBigrams = new ArrayList<>();
|
mBigrams = new ArrayList<>();
|
||||||
mIsBeginningOfSentence = false;
|
mIsBeginningOfSentence = isBeginningOfSentence;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklisted;
|
mIsBlacklistEntry = isBlacklisted;
|
||||||
mHasShortcuts = hasShortcuts;
|
mHasShortcuts = hasShortcuts;
|
||||||
|
|
|
@ -31,6 +31,7 @@ public class CombinedFormatUtils {
|
||||||
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
|
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
|
||||||
public static final String HISTORICAL_INFO_SEPARATOR = ":";
|
public static final String HISTORICAL_INFO_SEPARATOR = ":";
|
||||||
public static final String WORD_TAG = "word";
|
public static final String WORD_TAG = "word";
|
||||||
|
public static final String BEGINNING_OF_SENTENCE_TAG = "beginning_of_sentence";
|
||||||
public static final String NOT_A_WORD_TAG = "not_a_word";
|
public static final String NOT_A_WORD_TAG = "not_a_word";
|
||||||
public static final String BLACKLISTED_TAG = "blacklisted";
|
public static final String BLACKLISTED_TAG = "blacklisted";
|
||||||
|
|
||||||
|
@ -56,6 +57,9 @@ public class CombinedFormatUtils {
|
||||||
builder.append(" " + WORD_TAG + "=" + wordProperty.mWord);
|
builder.append(" " + WORD_TAG + "=" + wordProperty.mWord);
|
||||||
builder.append(",");
|
builder.append(",");
|
||||||
builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo));
|
builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo));
|
||||||
|
if (wordProperty.mIsBeginningOfSentence) {
|
||||||
|
builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
|
||||||
|
}
|
||||||
if (wordProperty.mIsNotAWord) {
|
if (wordProperty.mIsNotAWord) {
|
||||||
builder.append("," + NOT_A_WORD_TAG + "=true");
|
builder.append("," + NOT_A_WORD_TAG + "=true");
|
||||||
}
|
}
|
||||||
|
|
|
@ -301,7 +301,7 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c
|
||||||
// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when
|
// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when
|
||||||
// the dictionary does not have a next word.
|
// the dictionary does not have a next word.
|
||||||
static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
|
static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
|
||||||
jlong dict, jint token, jintArray outCodePoints) {
|
jlong dict, jint token, jintArray outCodePoints, jbooleanArray outIsBeginningOfSentence) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) return 0;
|
if (!dictionary) return 0;
|
||||||
const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
|
const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
|
||||||
|
@ -317,19 +317,39 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
|
||||||
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
||||||
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
|
MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
|
bool isBeginningOfSentence = false;
|
||||||
|
if (wordCodePointCount > 0 && wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
|
||||||
|
isBeginningOfSentence = true;
|
||||||
|
}
|
||||||
|
JniDataUtils::putBooleanToArray(env, outIsBeginningOfSentence, 0 /* index */,
|
||||||
|
isBeginningOfSentence);
|
||||||
return nextToken;
|
return nextToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
||||||
jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags,
|
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
|
||||||
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo,
|
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
|
||||||
jobject outShortcutTargets, jobject outShortcutProbabilities) {
|
jobject outBigramProbabilityInfo, jobject outShortcutTargets,
|
||||||
|
jobject outShortcutProbabilities) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) return;
|
if (!dictionary) return;
|
||||||
const jsize wordLength = env->GetArrayLength(word);
|
const jsize wordLength = env->GetArrayLength(word);
|
||||||
int wordCodePoints[wordLength];
|
if (wordLength > MAX_WORD_LENGTH) {
|
||||||
|
AKLOGE("Invalid wordLength: %d", wordLength);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int wordCodePoints[MAX_WORD_LENGTH];
|
||||||
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength);
|
int codePointCount = wordLength;
|
||||||
|
if (isBeginningOfSentence) {
|
||||||
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(
|
||||||
|
wordCodePoints, wordLength, MAX_WORD_LENGTH);
|
||||||
|
if (codePointCount < 0) {
|
||||||
|
AKLOGE("Cannot attach Beginning-of-Sentence marker.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, codePointCount);
|
||||||
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
|
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
|
||||||
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
|
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
|
||||||
outShortcutProbabilities);
|
outShortcutProbabilities);
|
||||||
|
@ -554,7 +574,6 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Migrate historical information.
|
|
||||||
int wordCodePoints[MAX_WORD_LENGTH];
|
int wordCodePoints[MAX_WORD_LENGTH];
|
||||||
int wordCodePointCount = 0;
|
int wordCodePointCount = 0;
|
||||||
int token = 0;
|
int token = 0;
|
||||||
|
@ -563,6 +582,10 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
|
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints,
|
||||||
wordCodePointCount);
|
wordCodePointCount);
|
||||||
|
if (wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
|
||||||
|
// Skip beginning-of-sentence unigram.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy(
|
||||||
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars);
|
||||||
|
@ -592,7 +615,7 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
|
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
|
||||||
false /* isBeginningOfSentence */);
|
wordProperty.getUnigramProperty()->representsBeginningOfSentence());
|
||||||
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
||||||
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
||||||
&bigramProperty)) {
|
&bigramProperty)) {
|
||||||
|
@ -669,13 +692,13 @@ static const JNINativeMethod sMethods[] = {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
const_cast<char *>("getWordPropertyNative"),
|
const_cast<char *>("getWordPropertyNative"),
|
||||||
const_cast<char *>("(J[I[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;"
|
const_cast<char *>("(J[IZ[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;"
|
||||||
"Ljava/util/ArrayList;Ljava/util/ArrayList;)V"),
|
"Ljava/util/ArrayList;Ljava/util/ArrayList;)V"),
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty)
|
reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
const_cast<char *>("getNextWordNative"),
|
const_cast<char *>("getNextWordNative"),
|
||||||
const_cast<char *>("(JI[I)I"),
|
const_cast<char *>("(JI[I[Z)I"),
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_getNextWord)
|
reinterpret_cast<void *>(latinime_BinaryDictionary_getNextWord)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -28,7 +28,8 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
||||||
!mBigrams.empty(), mUnigramProperty.hasShortcuts()};
|
!mBigrams.empty(), mUnigramProperty.hasShortcuts(),
|
||||||
|
mUnigramProperty.representsBeginningOfSentence()};
|
||||||
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
||||||
int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
|
int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
|
||||||
mUnigramProperty.getLevel(), mUnigramProperty.getCount()};
|
mUnigramProperty.getLevel(), mUnigramProperty.getCount()};
|
||||||
|
|
|
@ -98,6 +98,10 @@ class CharUtils {
|
||||||
// Beginning-of-Sentence.
|
// Beginning-of-Sentence.
|
||||||
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
|
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
|
||||||
const int codePointCount, const int maxCodePoint) {
|
const int codePointCount, const int maxCodePoint) {
|
||||||
|
if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
|
||||||
|
// Marker has already been attached.
|
||||||
|
return codePointCount;
|
||||||
|
}
|
||||||
if (codePointCount >= maxCodePoint) {
|
if (codePointCount >= maxCodePoint) {
|
||||||
// the code points cannot be marked as a Beginning-of-Sentence.
|
// the code points cannot be marked as a Beginning-of-Sentence.
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -69,18 +69,23 @@ class JniDataUtils {
|
||||||
static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start,
|
static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start,
|
||||||
const int maxLength, const int *const codePoints, const int codePointCount,
|
const int maxLength, const int *const codePoints, const int codePointCount,
|
||||||
const bool needsNullTermination) {
|
const bool needsNullTermination) {
|
||||||
const int outputCodePointCount = std::min(maxLength, codePointCount);
|
const int codePointBufSize = std::min(maxLength, codePointCount);
|
||||||
int outputCodePonts[outputCodePointCount];
|
int outputCodePonts[codePointBufSize];
|
||||||
for (int i = 0; i < outputCodePointCount; ++i) {
|
int outputCodePointCount = 0;
|
||||||
|
for (int i = 0; i < codePointBufSize; ++i) {
|
||||||
const int codePoint = codePoints[i];
|
const int codePoint = codePoints[i];
|
||||||
|
int codePointToOutput = codePoint;
|
||||||
if (!CharUtils::isInUnicodeSpace(codePoint)) {
|
if (!CharUtils::isInUnicodeSpace(codePoint)) {
|
||||||
outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER;
|
if (codePoint == CODE_POINT_BEGINNING_OF_SENTENCE) {
|
||||||
|
// Just skip Beginning-of-Sentence marker.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
|
||||||
} else if (codePoint >= 0x01 && codePoint <= 0x1F) {
|
} else if (codePoint >= 0x01 && codePoint <= 0x1F) {
|
||||||
// Control code.
|
// Control code.
|
||||||
outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER;
|
codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
|
||||||
} else {
|
|
||||||
outputCodePonts[i] = codePoint;
|
|
||||||
}
|
}
|
||||||
|
outputCodePonts[outputCodePointCount++] = codePointToOutput;
|
||||||
}
|
}
|
||||||
env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount,
|
env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount,
|
||||||
outputCodePonts);
|
outputCodePonts);
|
||||||
|
@ -90,6 +95,11 @@ class JniDataUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index,
|
||||||
|
const jboolean value) {
|
||||||
|
env->SetBooleanArrayRegion(array, index, 1 /* len */, &value);
|
||||||
|
}
|
||||||
|
|
||||||
static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) {
|
static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) {
|
||||||
env->SetIntArrayRegion(array, index, 1 /* len */, &value);
|
env->SetIntArrayRegion(array, index, 1 /* len */, &value);
|
||||||
}
|
}
|
||||||
|
|
|
@ -994,7 +994,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
|
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertFalse(invalidWordProperty.isValid());
|
assertFalse(invalidWordProperty.isValid());
|
||||||
|
|
||||||
final ArrayList<String> words = new ArrayList<>();
|
final ArrayList<String> words = new ArrayList<>();
|
||||||
|
@ -1017,7 +1018,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
words.add(word);
|
words.add(word);
|
||||||
wordProbabilities.put(word, unigramProbability);
|
wordProbabilities.put(word, unigramProbability);
|
||||||
final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
|
final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(word, wordProperty.mWord);
|
assertEquals(word, wordProperty.mWord);
|
||||||
assertTrue(wordProperty.isValid());
|
assertTrue(wordProperty.isValid());
|
||||||
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
|
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
|
||||||
|
@ -1057,7 +1059,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
final HashSet<String> bigramWord1s = bigrams.get(word0);
|
final HashSet<String> bigramWord1s = bigrams.get(word0);
|
||||||
final WordProperty wordProperty = binaryDictionary.getWordProperty(word0);
|
final WordProperty wordProperty = binaryDictionary.getWordProperty(word0,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
|
assertEquals(bigramWord1s.size(), wordProperty.mBigrams.size());
|
||||||
for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
|
for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
|
||||||
final String word1 = wordProperty.mBigrams.get(j).mWord;
|
final String word1 = wordProperty.mBigrams.get(j).mWord;
|
||||||
|
@ -1094,7 +1097,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
|
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertFalse(invalidWordProperty.isValid());
|
assertFalse(invalidWordProperty.isValid());
|
||||||
|
|
||||||
final ArrayList<String> words = new ArrayList<>();
|
final ArrayList<String> words = new ArrayList<>();
|
||||||
|
@ -1188,7 +1192,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||||
shortcutProbability, false /* isBeginningOfSentence */,
|
shortcutProbability, false /* isBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
||||||
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
|
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||||
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
||||||
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());
|
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).getProbability());
|
||||||
|
@ -1196,7 +1201,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||||
updatedShortcutProbability, false /* isBeginningOfSentence */,
|
updatedShortcutProbability, false /* isBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
||||||
wordProperty = binaryDictionary.getWordProperty("aaa");
|
wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||||
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
|
||||||
assertEquals(updatedShortcutProbability,
|
assertEquals(updatedShortcutProbability,
|
||||||
|
@ -1207,7 +1213,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final HashMap<String, Integer> shortcutTargets = new HashMap<>();
|
final HashMap<String, Integer> shortcutTargets = new HashMap<>();
|
||||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||||
shortcutTargets.put("yyy", shortcutProbability);
|
shortcutTargets.put("yyy", shortcutProbability);
|
||||||
wordProperty = binaryDictionary.getWordProperty("aaa");
|
wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(2, wordProperty.mShortcutTargets.size());
|
assertEquals(2, wordProperty.mShortcutTargets.size());
|
||||||
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
||||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||||
|
@ -1218,7 +1225,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||||
shortcutTargets.put("yyy", shortcutProbability);
|
shortcutTargets.put("yyy", shortcutProbability);
|
||||||
binaryDictionary.flushWithGC();
|
binaryDictionary.flushWithGC();
|
||||||
wordProperty = binaryDictionary.getWordProperty("aaa");
|
wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(2, wordProperty.mShortcutTargets.size());
|
assertEquals(2, wordProperty.mShortcutTargets.size());
|
||||||
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
|
||||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||||
|
@ -1288,7 +1296,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (final String word : words) {
|
for (final String word : words) {
|
||||||
final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
|
final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals((int)unigramProbabilities.get(word),
|
assertEquals((int)unigramProbabilities.get(word),
|
||||||
wordProperty.mProbabilityInfo.mProbability);
|
wordProperty.mProbabilityInfo.mProbability);
|
||||||
if (!shortcutTargets.containsKey(word)) {
|
if (!shortcutTargets.containsKey(word)) {
|
||||||
|
@ -1332,6 +1341,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
|
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
|
||||||
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
|
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
|
||||||
true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */);
|
true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */);
|
||||||
|
binaryDictionary.addNgramEntry(PrevWordsInfo.BEGINNING_OF_SENTENCE,
|
||||||
|
"aaa", bigramProbability, 0 /* timestamp */);
|
||||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
||||||
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
||||||
|
@ -1343,12 +1354,16 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("bbb"));
|
||||||
if (canCheckBigramProbability(toFormatVersion)) {
|
if (canCheckBigramProbability(toFormatVersion)) {
|
||||||
assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
|
assertEquals(bigramProbability, getBigramProbability(binaryDictionary, "aaa", "bbb"));
|
||||||
|
assertEquals(bigramProbability, binaryDictionary.getNgramProbability(
|
||||||
|
PrevWordsInfo.BEGINNING_OF_SENTENCE, "aaa"));
|
||||||
}
|
}
|
||||||
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb"));
|
||||||
WordProperty wordProperty = binaryDictionary.getWordProperty("ccc");
|
WordProperty wordProperty = binaryDictionary.getWordProperty("ccc",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||||
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
|
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
|
||||||
wordProperty = binaryDictionary.getWordProperty("ddd");
|
wordProperty = binaryDictionary.getWordProperty("ddd",
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertTrue(wordProperty.mIsBlacklistEntry);
|
assertTrue(wordProperty.mIsBlacklistEntry);
|
||||||
assertTrue(wordProperty.mIsNotAWord);
|
assertTrue(wordProperty.mIsNotAWord);
|
||||||
}
|
}
|
||||||
|
|
|
@ -614,7 +614,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
0 /* offset */, file.length(), true /* useFullEditDistance */,
|
0 /* offset */, file.length(), true /* useFullEditDistance */,
|
||||||
Locale.ENGLISH, dictName, false /* isUpdatable */);
|
Locale.ENGLISH, dictName, false /* isUpdatable */);
|
||||||
for (final String word : words) {
|
for (final String word : words) {
|
||||||
final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
|
final WordProperty wordProperty = binaryDictionary.getWordProperty(word,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(word, wordProperty.mWord);
|
assertEquals(word, wordProperty.mWord);
|
||||||
assertEquals(UNIGRAM_FREQ, wordProperty.getProbability());
|
assertEquals(UNIGRAM_FREQ, wordProperty.getProbability());
|
||||||
if (shortcuts.containsKey(word)) {
|
if (shortcuts.containsKey(word)) {
|
||||||
|
|
Loading…
Reference in a new issue