From 78b55a31cb158b1e14ccf678133269b0f99c7f9a Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Tue, 17 Sep 2013 15:11:24 +0900 Subject: [PATCH] Fix handling multi-bytes characters and add a test. Bug: 6669677 Change-Id: Id2154db47adea2929559a4187a726f9dfa83363e --- .../dictionary/utils/byte_array_utils.cpp | 3 +- .../dictionary/utils/byte_array_utils.h | 11 ++-- .../latin/BinaryDictionaryTests.java | 40 +++++++++++- .../BinaryDictDecoderEncoderTests.java | 51 ++------------- .../latin/makedict/CodePointUtils.java | 65 +++++++++++++++++++ 5 files changed, 119 insertions(+), 51 deletions(-) create mode 100644 tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp index a84cfb9d5..1833e8832 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp @@ -18,7 +18,8 @@ namespace latinime { -const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h index 6bafb64ee..0c1576818 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -135,7 +135,7 @@ class ByteArrayUtils { static AK_FORCE_INLINE int readCodePointAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint8_t firstByte = readUint8(buffer, *pos); - if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte < 
MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { *pos += 1; return NOT_A_CODE_POINT; @@ -187,7 +187,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; - } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. writeUint24AndAdvancePosition(buffer, codePoint, pos); } else { @@ -207,7 +208,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; - } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. byteCount += 3; } else { @@ -225,7 +227,8 @@ class ByteArrayUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); - static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 6f05d428c..501a035e7 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -19,6 +19,7 @@ package com.android.inputmethod.latin; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; +import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.DictEncoder; import com.android.inputmethod.latin.makedict.FormatSpec; 
import com.android.inputmethod.latin.makedict.FusionDictionary; @@ -30,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Locale; +import java.util.Random; @LargeTest public class BinaryDictionaryTests extends AndroidTestCase { @@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase { assertEquals(probability, binaryDictionary.getFrequency("aab")); assertEquals(probability, binaryDictionary.getFrequency("aac")); - assertEquals(probability, binaryDictionary.getFrequency("aac")); + assertEquals(probability, binaryDictionary.getFrequency("aa")); assertEquals(probability, binaryDictionary.getFrequency("aaaa")); assertEquals(probability, binaryDictionary.getFrequency("a")); assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); + + dictFile.delete(); + } + + public void testRandomlyAddUnigramWord() { + final int wordCount = 1000; + final int codePointSetSize = 50; + final int seed = 123456789; + + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } catch (UnsupportedFormatException e) { + fail("UnsupportedFormatException while writing an initial dictionary : " + e); + } + BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>(); + // Add randomly generated words and verify that each probability can be read back.
+ final Random random = new Random(seed); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + for (int i = 0; i < wordCount; ++i) { + final String word = CodePointUtils.generateWord(random, codePointSet); + probabilityMap.put(word, random.nextInt() & 0xFF); + } + for (String word : probabilityMap.keySet()) { + binaryDictionary.addUnigramWord(word, probabilityMap.get(word)); + } + for (String word : probabilityMap.keySet()) { + assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); + } + dictFile.delete(); } public void testAddBigramWords() { diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java index 807c25244..8bc0095a5 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { Log.e(TAG, "Testing dictionary: seed is " + seed); final Random random = new Random(seed); sWords.clear(); - final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + random); generateWords(maxUnigrams, random, codePointSet); for (int i = 0; i < sWords.size(); ++i) { @@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { } } - private int[] generateCodePointSet(final int codePointSetSize, final Random random) { - final int[] codePointSet = new int[codePointSetSize]; - for (int i = codePointSet.length - 1; i >= 0; ) { - final int r = Math.abs(random.nextInt()); - if (r < 0) continue; - // Don't insert 0~0x20, but insert any other code point. - // Code points are in the range 0~0x10FFFF. 
- final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20); - // Code points between MIN_ and MAX_SURROGATE are not valid on their own. - if (candidateCodePoint >= Character.MIN_SURROGATE - && candidateCodePoint <= Character.MAX_SURROGATE) continue; - codePointSet[i] = candidateCodePoint; - --i; - } - return codePointSet; - } - - // Utilities for test - - /** - * Generates a random word. - */ - private String generateWord(final Random random, final int[] codePointSet) { - StringBuilder builder = new StringBuilder(); - // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward - // longer words. This should be closer to natural language, and more importantly, it will - // exercise the algorithms in dicttool much more. - final int count = 1 + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5) - + (Math.abs(random.nextInt()) % 5); - while (builder.length() < count) { - builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]); - } - return builder.toString(); - } - private void generateWords(final int number, final Random random, final int[] codePointSet) { final Set wordSet = CollectionUtils.newHashSet(); while (wordSet.size() < number) { - wordSet.add(generateWord(random, codePointSet)); + wordSet.add(CodePointUtils.generateWord(random, codePointSet)); } sWords.addAll(wordSet); } @@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { // Test a word that isn't contained within the dictionary. 
final Random random = new Random((int)System.currentTimeMillis()); - final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + random); for (int i = 0; i < 1000; ++i) { - final String word = generateWord(random, codePointSet); + final String word = CodePointUtils.generateWord(random, codePointSet); if (sWords.indexOf(word) != -1) continue; runGetTerminalPosition(dictDecoder, word, i, false); } diff --git a/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java b/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java new file mode 100644 index 000000000..36b958af8 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import java.util.Random; + +// Utility methods related to code points, used for tests. +public class CodePointUtils { + private CodePointUtils() { + // This utility class is not publicly instantiable.
+ } + + public static int[] generateCodePointSet(final int codePointSetSize, final Random random) { + final int[] codePointSet = new int[codePointSetSize]; + for (int i = codePointSet.length - 1; i >= 0; ) { + final int r = Math.abs(random.nextInt()); + if (r < 0) continue; + // Don't insert 0~0x1F, but insert any other code point. + // Code points are in the range 0~0x10FFFF. + final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20); + // Code points between MIN_ and MAX_SURROGATE are not valid on their own. + if (candidateCodePoint >= Character.MIN_SURROGATE + && candidateCodePoint <= Character.MAX_SURROGATE) continue; + codePointSet[i] = candidateCodePoint; + --i; + } + return codePointSet; + } + + /** + * Generates a random word. + */ + public static String generateWord(final Random random, final int[] codePointSet) { + StringBuilder builder = new StringBuilder(); + // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward + // longer words. This should be closer to natural language, and more importantly, it will + // exercise the algorithms in dicttool much more. + final int count = 1 + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5) + + (Math.abs(random.nextInt()) % 5); + while (builder.length() < count) { + builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]); + } + return builder.toString(); + } +}