Merge "Fix handling multi-bytes characters and add a test."
commit
280fb1a14e
|
@ -18,7 +18,8 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
|
||||||
const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
|
const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -135,7 +135,7 @@ class ByteArrayUtils {
|
||||||
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
const uint8_t firstByte = readUint8(buffer, *pos);
|
const uint8_t firstByte = readUint8(buffer, *pos);
|
||||||
if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
|
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
||||||
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
|
||||||
*pos += 1;
|
*pos += 1;
|
||||||
return NOT_A_CODE_POINT;
|
return NOT_A_CODE_POINT;
|
||||||
|
@ -187,7 +187,8 @@ class ByteArrayUtils {
|
||||||
const int codePoint = codePoints[i];
|
const int codePoint = codePoints[i];
|
||||||
if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
|
if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
|
||||||
break;
|
break;
|
||||||
} else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
|
} else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
|
||||||
|
|| codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
||||||
// three bytes character.
|
// three bytes character.
|
||||||
writeUint24AndAdvancePosition(buffer, codePoint, pos);
|
writeUint24AndAdvancePosition(buffer, codePoint, pos);
|
||||||
} else {
|
} else {
|
||||||
|
@ -207,7 +208,8 @@ class ByteArrayUtils {
|
||||||
const int codePoint = codePoints[i];
|
const int codePoint = codePoints[i];
|
||||||
if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
|
if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
|
||||||
break;
|
break;
|
||||||
} else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
|
} else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
|
||||||
|
|| codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
|
||||||
// three bytes character.
|
// three bytes character.
|
||||||
byteCount += 3;
|
byteCount += 3;
|
||||||
} else {
|
} else {
|
||||||
|
@ -225,7 +227,8 @@ class ByteArrayUtils {
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
|
||||||
|
|
||||||
static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE;
|
static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
|
||||||
|
static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
|
||||||
static const uint8_t CHARACTER_ARRAY_TERMINATOR;
|
static const uint8_t CHARACTER_ARRAY_TERMINATOR;
|
||||||
|
|
||||||
static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
|
static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
|
||||||
|
|
|
@ -19,6 +19,7 @@ package com.android.inputmethod.latin;
|
||||||
import android.test.AndroidTestCase;
|
import android.test.AndroidTestCase;
|
||||||
import android.test.suitebuilder.annotation.LargeTest;
|
import android.test.suitebuilder.annotation.LargeTest;
|
||||||
|
|
||||||
|
import com.android.inputmethod.latin.makedict.CodePointUtils;
|
||||||
import com.android.inputmethod.latin.makedict.DictEncoder;
|
import com.android.inputmethod.latin.makedict.DictEncoder;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec;
|
import com.android.inputmethod.latin.makedict.FormatSpec;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
|
@ -30,6 +31,7 @@ import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
@LargeTest
|
@LargeTest
|
||||||
public class BinaryDictionaryTests extends AndroidTestCase {
|
public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
|
@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
|
|
||||||
assertEquals(probability, binaryDictionary.getFrequency("aab"));
|
assertEquals(probability, binaryDictionary.getFrequency("aab"));
|
||||||
assertEquals(probability, binaryDictionary.getFrequency("aac"));
|
assertEquals(probability, binaryDictionary.getFrequency("aac"));
|
||||||
assertEquals(probability, binaryDictionary.getFrequency("aac"));
|
assertEquals(probability, binaryDictionary.getFrequency("aa"));
|
||||||
assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
|
assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
|
||||||
assertEquals(probability, binaryDictionary.getFrequency("a"));
|
assertEquals(probability, binaryDictionary.getFrequency("a"));
|
||||||
assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
|
assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));
|
||||||
|
|
||||||
|
dictFile.delete();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomlyAddUnigramWord() {
|
||||||
|
final int wordCount = 1000;
|
||||||
|
final int codePointSetSize = 50;
|
||||||
|
final int seed = 123456789;
|
||||||
|
|
||||||
|
File dictFile = null;
|
||||||
|
try {
|
||||||
|
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
} catch (UnsupportedFormatException e) {
|
||||||
|
fail("UnsupportedFormatException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
|
final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>();
|
||||||
|
// Test a word that isn't contained within the dictionary.
|
||||||
|
final Random random = new Random(seed);
|
||||||
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||||
|
for (int i = 0; i < wordCount; ++i) {
|
||||||
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
|
probabilityMap.put(word, random.nextInt() & 0xFF);
|
||||||
|
}
|
||||||
|
for (String word : probabilityMap.keySet()) {
|
||||||
|
binaryDictionary.addUnigramWord(word, probabilityMap.get(word));
|
||||||
|
}
|
||||||
|
for (String word : probabilityMap.keySet()) {
|
||||||
|
assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word));
|
||||||
|
}
|
||||||
|
dictFile.delete();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAddBigramWords() {
|
public void testAddBigramWords() {
|
||||||
|
|
|
@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
Log.e(TAG, "Testing dictionary: seed is " + seed);
|
Log.e(TAG, "Testing dictionary: seed is " + seed);
|
||||||
final Random random = new Random(seed);
|
final Random random = new Random(seed);
|
||||||
sWords.clear();
|
sWords.clear();
|
||||||
final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
|
||||||
|
random);
|
||||||
generateWords(maxUnigrams, random, codePointSet);
|
generateWords(maxUnigrams, random, codePointSet);
|
||||||
|
|
||||||
for (int i = 0; i < sWords.size(); ++i) {
|
for (int i = 0; i < sWords.size(); ++i) {
|
||||||
|
@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int[] generateCodePointSet(final int codePointSetSize, final Random random) {
|
|
||||||
final int[] codePointSet = new int[codePointSetSize];
|
|
||||||
for (int i = codePointSet.length - 1; i >= 0; ) {
|
|
||||||
final int r = Math.abs(random.nextInt());
|
|
||||||
if (r < 0) continue;
|
|
||||||
// Don't insert 0~0x20, but insert any other code point.
|
|
||||||
// Code points are in the range 0~0x10FFFF.
|
|
||||||
final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20);
|
|
||||||
// Code points between MIN_ and MAX_SURROGATE are not valid on their own.
|
|
||||||
if (candidateCodePoint >= Character.MIN_SURROGATE
|
|
||||||
&& candidateCodePoint <= Character.MAX_SURROGATE) continue;
|
|
||||||
codePointSet[i] = candidateCodePoint;
|
|
||||||
--i;
|
|
||||||
}
|
|
||||||
return codePointSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Utilities for test
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generates a random word.
|
|
||||||
*/
|
|
||||||
private String generateWord(final Random random, final int[] codePointSet) {
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
// 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward
|
|
||||||
// longer words. This should be closer to natural language, and more importantly, it will
|
|
||||||
// exercise the algorithms in dicttool much more.
|
|
||||||
final int count = 1 + (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5)
|
|
||||||
+ (Math.abs(random.nextInt()) % 5);
|
|
||||||
while (builder.length() < count) {
|
|
||||||
builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]);
|
|
||||||
}
|
|
||||||
return builder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void generateWords(final int number, final Random random, final int[] codePointSet) {
|
private void generateWords(final int number, final Random random, final int[] codePointSet) {
|
||||||
final Set<String> wordSet = CollectionUtils.newHashSet();
|
final Set<String> wordSet = CollectionUtils.newHashSet();
|
||||||
while (wordSet.size() < number) {
|
while (wordSet.size() < number) {
|
||||||
wordSet.add(generateWord(random, codePointSet));
|
wordSet.add(CodePointUtils.generateWord(random, codePointSet));
|
||||||
}
|
}
|
||||||
sWords.addAll(wordSet);
|
sWords.addAll(wordSet);
|
||||||
}
|
}
|
||||||
|
@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
|
|
||||||
// Test a word that isn't contained within the dictionary.
|
// Test a word that isn't contained within the dictionary.
|
||||||
final Random random = new Random((int)System.currentTimeMillis());
|
final Random random = new Random((int)System.currentTimeMillis());
|
||||||
final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
|
||||||
|
random);
|
||||||
for (int i = 0; i < 1000; ++i) {
|
for (int i = 0; i < 1000; ++i) {
|
||||||
final String word = generateWord(random, codePointSet);
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
if (sWords.indexOf(word) != -1) continue;
|
if (sWords.indexOf(word) != -1) continue;
|
||||||
runGetTerminalPosition(dictDecoder, word, i, false);
|
runGetTerminalPosition(dictDecoder, word, i, false);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
/**
 * Utility methods related to code points, used by tests.
 */
public class CodePointUtils {
    // Number of random draws that are summed up to choose the length of a generated word.
    private static final int WORD_LENGTH_DRAW_COUNT = 8;
    // Each draw contributes 0~4 to the length of a generated word.
    private static final int WORD_LENGTH_DRAW_BOUND = 5;

    private CodePointUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Generates an array of random code points.
     *
     * Every element is in the range 0x20~0x10FFFE and is never a surrogate. The same code
     * point may appear more than once.
     *
     * @param codePointSetSize the number of code points to generate.
     * @param random the source of randomness.
     * @return a newly allocated array holding {@code codePointSetSize} code points.
     */
    public static int[] generateCodePointSet(final int codePointSetSize, final Random random) {
        final int[] codePointSet = new int[codePointSetSize];
        for (int i = codePointSet.length - 1; i >= 0; ) {
            final int r = Math.abs(random.nextInt());
            // Math.abs(Integer.MIN_VALUE) is still negative; draw again in that case.
            if (r < 0) continue;
            // Don't insert 0~0x1F, but insert any other code point up to 0x10FFFE.
            final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20);
            // Code points between MIN_ and MAX_SURROGATE are not valid on their own.
            if (candidateCodePoint >= Character.MIN_SURROGATE
                    && candidateCodePoint <= Character.MAX_SURROGATE) continue;
            codePointSet[i] = candidateCodePoint;
            --i;
        }
        return codePointSet;
    }

    /**
     * Generates a random word.
     *
     * @param random the source of randomness.
     * @param codePointSet the code points to pick from. Must not be empty.
     * @return a word made of at least one code point taken from {@code codePointSet}.
     * @throws IllegalArgumentException if {@code codePointSet} is empty.
     */
    public static String generateWord(final Random random, final int[] codePointSet) {
        if (codePointSet.length == 0) {
            throw new IllegalArgumentException("codePointSet must not be empty");
        }
        final StringBuilder builder = new StringBuilder();
        // The target length is 1 + 8 * (0~4), i.e. 1~33, but we do it the following way so as
        // to bias the random toward longer words. This should be closer to natural language,
        // and more importantly, it will exercise the algorithms in dicttool much more.
        int count = 1;
        for (int i = 0; i < WORD_LENGTH_DRAW_COUNT; ++i) {
            count += nextNonNegativeInt(random) % WORD_LENGTH_DRAW_BOUND;
        }
        // The length is measured in UTF-16 chars, so a supplementary code point counts twice.
        while (builder.length() < count) {
            builder.appendCodePoint(
                    codePointSet[nextNonNegativeInt(random) % codePointSet.length]);
        }
        return builder.toString();
    }

    /**
     * Returns the absolute value of the next int of {@code random}.
     *
     * Math.abs(Integer.MIN_VALUE) is negative, which would produce negative remainders and
     * array indices in the callers, so that single value is mapped to 0 instead.
     */
    private static int nextNonNegativeInt(final Random random) {
        final int value = random.nextInt();
        return value == Integer.MIN_VALUE ? 0 : Math.abs(value);
    }
}
|
Loading…
Reference in New Issue