Fix two nasty bugs with surrogate pairs.

The important bug is in findWordInTree. The problem, which is
not obvious, is that we were calling codePointAt() with the
code point index in the string, instead of the char index.
The other bug this change fixes was harmless in the practice,
because it's in the iteration which is only used for debug and
pretty printing purposes. It's very similar in that it would
substract a length in code point to a length in chars and
truncate a StringBuilder at that length, so it would fail in a
quite similar manner. This changes the meaning of the "length"
attribute in Position, but it's clearer this way anyway.

Bug: 8450145
Change-Id: If396f883a9e6449de39351553ba83f5be5bd30f0
main
Jean Chalard 2013-04-01 15:23:24 +09:00
parent 48d8d8d0ae
commit a411595b16
3 changed files with 141 additions and 27 deletions

View File

@ -620,34 +620,34 @@ public final class FusionDictionary implements Iterable<Word> {
* Helper method to find a word in a given branch. * Helper method to find a word in a given branch.
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
public static CharGroup findWordInTree(Node node, final String s) { public static CharGroup findWordInTree(Node node, final String string) {
int index = 0; int index = 0;
final StringBuilder checker = DBG ? new StringBuilder() : null; final StringBuilder checker = DBG ? new StringBuilder() : null;
final int[] codePoints = getCodePoints(string);
CharGroup currentGroup; CharGroup currentGroup;
final int codePointCountInS = s.codePointCount(0, s.length());
do { do {
int indexOfGroup = findIndexOfChar(node, s.codePointAt(index)); int indexOfGroup = findIndexOfChar(node, codePoints[index]);
if (CHARACTER_NOT_FOUND == indexOfGroup) return null; if (CHARACTER_NOT_FOUND == indexOfGroup) return null;
currentGroup = node.mData.get(indexOfGroup); currentGroup = node.mData.get(indexOfGroup);
if (s.length() - index < currentGroup.mChars.length) return null; if (codePoints.length - index < currentGroup.mChars.length) return null;
int newIndex = index; int newIndex = index;
while (newIndex < s.length() && newIndex - index < currentGroup.mChars.length) { while (newIndex < codePoints.length && newIndex - index < currentGroup.mChars.length) {
if (currentGroup.mChars[newIndex - index] != s.codePointAt(newIndex)) return null; if (currentGroup.mChars[newIndex - index] != codePoints[newIndex]) return null;
newIndex++; newIndex++;
} }
index = newIndex; index = newIndex;
if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
if (index < codePointCountInS) { if (index < codePoints.length) {
node = currentGroup.mChildren; node = currentGroup.mChildren;
} }
} while (null != node && index < codePointCountInS); } while (null != node && index < codePoints.length);
if (index < codePointCountInS) return null; if (index < codePoints.length) return null;
if (!currentGroup.isTerminal()) return null; if (!currentGroup.isTerminal()) return null;
if (DBG && !s.equals(checker.toString())) return null; if (DBG && !codePoints.equals(checker.toString())) return null;
return currentGroup; return currentGroup;
} }
@ -847,12 +847,12 @@ public final class FusionDictionary implements Iterable<Word> {
@Override @Override
public Word next() { public Word next() {
Position currentPos = mPositions.getLast(); Position currentPos = mPositions.getLast();
mCurrentString.setLength(mCurrentString.length() - currentPos.length); mCurrentString.setLength(currentPos.length);
do { do {
if (currentPos.pos.hasNext()) { if (currentPos.pos.hasNext()) {
final CharGroup currentGroup = currentPos.pos.next(); final CharGroup currentGroup = currentPos.pos.next();
currentPos.length = currentGroup.mChars.length; currentPos.length = mCurrentString.length();
for (int i : currentGroup.mChars) for (int i : currentGroup.mChars)
mCurrentString.append(Character.toChars(i)); mCurrentString.append(Character.toChars(i));
if (null != currentGroup.mChildren) { if (null != currentGroup.mChildren) {
@ -866,7 +866,7 @@ public final class FusionDictionary implements Iterable<Word> {
} else { } else {
mPositions.removeLast(); mPositions.removeLast();
currentPos = mPositions.getLast(); currentPos = mPositions.getLast();
mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length); mCurrentString.setLength(mPositions.getLast().length);
} }
} while (true); } while (true);
} }

View File

@ -72,15 +72,12 @@ public class BinaryDictIOTests extends AndroidTestCase {
private static final FormatSpec.FormatOptions VERSION3_WITH_DYNAMIC_UPDATE = private static final FormatSpec.FormatOptions VERSION3_WITH_DYNAMIC_UPDATE =
new FormatSpec.FormatOptions(3, true /* supportsDynamicUpdate */); new FormatSpec.FormatOptions(3, true /* supportsDynamicUpdate */);
private static final String[] CHARACTERS = {
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
};
public BinaryDictIOTests() { public BinaryDictIOTests() {
super(); super();
final Random random = new Random(123456); final long time = System.currentTimeMillis();
Log.e(TAG, "Testing dictionary: seed is " + time);
final Random random = new Random(time);
sWords.clear(); sWords.clear();
generateWords(MAX_UNIGRAMS, random); generateWords(MAX_UNIGRAMS, random);
@ -132,13 +129,16 @@ public class BinaryDictIOTests extends AndroidTestCase {
/** /**
* Generates a random word. * Generates a random word.
*/ */
private String generateWord(final int value) { private String generateWord(final Random random) {
final int lengthOfChars = CHARACTERS.length;
StringBuilder builder = new StringBuilder("a"); StringBuilder builder = new StringBuilder("a");
long lvalue = Math.abs((long)value); int count = random.nextInt() % 30; // Arbitrarily 30 chars max
while (lvalue > 0) { while (count > 0) {
builder.append(CHARACTERS[(int)(lvalue % lengthOfChars)]); final long r = Math.abs(random.nextInt());
lvalue /= lengthOfChars; if (r < 0) continue;
// Don't insert 0~20, but insert any other code point.
// Code points are in the range 0~0x10FFFF.
builder.appendCodePoint((int)(20 + r % (0x10FFFF - 20)));
--count;
} }
return builder.toString(); return builder.toString();
} }
@ -146,7 +146,7 @@ public class BinaryDictIOTests extends AndroidTestCase {
private void generateWords(final int number, final Random random) { private void generateWords(final int number, final Random random) {
final Set<String> wordSet = CollectionUtils.newHashSet(); final Set<String> wordSet = CollectionUtils.newHashSet();
while (wordSet.size() < number) { while (wordSet.size() < number) {
wordSet.add(generateWord(random.nextInt())); wordSet.add(generateWord(random));
} }
sWords.addAll(wordSet); sWords.addAll(wordSet);
} }
@ -555,7 +555,7 @@ public class BinaryDictIOTests extends AndroidTestCase {
// Test a word that isn't contained within the dictionary. // Test a word that isn't contained within the dictionary.
final Random random = new Random((int)System.currentTimeMillis()); final Random random = new Random((int)System.currentTimeMillis());
for (int i = 0; i < 1000; ++i) { for (int i = 0; i < 1000; ++i) {
final String word = generateWord(random.nextInt()); final String word = generateWord(random);
if (sWords.indexOf(word) != -1) continue; if (sWords.indexOf(word) != -1) continue;
runGetTerminalPosition(buffer, word, i, false); runGetTerminalPosition(buffer, word, i, false);
} }

View File

@ -0,0 +1,114 @@
/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
import com.android.inputmethod.latin.makedict.Word;
import junit.framework.TestCase;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;
/**
* Unit tests for BinaryDictInputOutput.
*/
public class FusionDictionaryTest extends TestCase {
private static final ArrayList<String> sWords = new ArrayList<String>();
private static final int MAX_UNIGRAMS = 1000;
private void prepare(final long seed) {
System.out.println("Seed is " + seed);
final Random random = new Random(seed);
sWords.clear();
generateWords(MAX_UNIGRAMS, random);
}
/**
* Generates a random word.
*/
private String generateWord(final Random random) {
StringBuilder builder = new StringBuilder("a");
int count = random.nextInt() % 30;
while (count > 0) {
final long r = Math.abs(random.nextInt());
if (r < 0) continue;
// Don't insert 0~20, but insert any other code point.
// Code points are in the range 0~0x10FFFF.
if (builder.length() < 7)
builder.appendCodePoint((int)(20 +r % (0x10FFFF - 20)));
--count;
}
if (builder.length() == 1) return generateWord(random);
return builder.toString();
}
private void generateWords(final int number, final Random random) {
while (sWords.size() < number) {
sWords.add(generateWord(random));
}
}
private void checkDictionary(final FusionDictionary dict, final ArrayList<String> words,
int limit) {
assertNotNull(dict);
for (final String word : words) {
if (--limit < 0) return;
final CharGroup cg = FusionDictionary.findWordInTree(dict.mRoot, word);
if (null == cg) {
System.out.println("word " + dumpWord(word));
dumpDict(dict);
}
assertNotNull(cg);
}
}
private String dumpWord(final String word) {
final StringBuilder sb = new StringBuilder("");
for (int i = 0; i < word.length(); i = word.offsetByCodePoints(i, 1)) {
sb.append(word.codePointAt(i));
sb.append(" ");
}
return sb.toString();
}
private void dumpDict(final FusionDictionary dict) {
for (Word w : dict) {
System.out.println("Word " + dumpWord(w.mWord));
}
}
// Test the flattened array contains the expected number of nodes, and
// that it does not contain any duplicates.
public void testFusion() {
final FusionDictionary dict = new FusionDictionary(new Node(),
new DictionaryOptions(new HashMap<String, String>(),
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
final long time = System.currentTimeMillis();
prepare(time);
for (int i = 0; i < sWords.size(); ++i) {
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
dict.add(sWords.get(i), 180, null, false);
dumpDict(dict);
checkDictionary(dict, sWords, i);
}
}
}