Add insertWord.
bug: 6669677 Change-Id: Ide55a4931071de9cd42c1cddae63ddd531d2feba
This commit is contained in:
parent
c3a98ca306
commit
3c6d9fe148
4 changed files with 639 additions and 2 deletions
|
@ -27,12 +27,15 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Stack;
|
||||
|
||||
public final class BinaryDictIOUtils {
|
||||
private static final boolean DBG = false;
|
||||
private static final int MSB24 = 0x800000;
|
||||
private static final int SINT24_MAX = 0x7FFFFF;
|
||||
private static final int MAX_JUMPS = 10000;
|
||||
|
||||
private BinaryDictIOUtils() {
|
||||
|
@ -646,4 +649,302 @@ public final class BinaryDictIOUtils {
|
|||
writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
|
||||
return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Move a group that is referred to by oldGroupOrigin to the tail of the file.
|
||||
* And set the children address to the byte after the group.
|
||||
*
|
||||
* @param nodeOrigin the address of the tail of the file.
|
||||
* @param characters
|
||||
* @param length
|
||||
* @param flags
|
||||
* @param frequency
|
||||
* @param parentAddress
|
||||
* @param shortcutTargets
|
||||
* @param bigrams
|
||||
* @param destination the stream representing the tail of the file.
|
||||
* @param buffer the buffer representing the (constant-size) body of the file.
|
||||
* @param oldNodeOrigin
|
||||
* @param oldGroupOrigin
|
||||
* @param formatOptions
|
||||
* @return the size written, in bytes.
|
||||
* @throws IOException
|
||||
*/
|
||||
private static int moveGroup(final int nodeOrigin, final int[] characters, final int length,
|
||||
final int flags, final int frequency, final int parentAddress,
|
||||
final ArrayList<WeightedString> shortcutTargets,
|
||||
final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
|
||||
final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin,
|
||||
final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
|
||||
int size = 0;
|
||||
final int newGroupOrigin = nodeOrigin + 1;
|
||||
final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length);
|
||||
final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
|
||||
flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
|
||||
shortcutTargets, bigrams);
|
||||
size = computeGroupSize(tmpInfo, formatOptions);
|
||||
final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
|
||||
flags, writtenCharacters, frequency, parentAddress,
|
||||
nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
|
||||
bigrams);
|
||||
moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions);
|
||||
return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
||||
}
|
||||
|
||||
/**
 * Insert a word into a binary dictionary.
 *
 * The trie stored in buffer is walked from the root, following the code points of the word.
 * Depending on where the walk stops, one of these updates is performed:
 * <ul>
 * <li>the word ends inside an existing group: the group is split, its prefix becomes the new
 * terminal and the remainder is written as its child;</li>
 * <li>the word diverges inside an existing group: the group is split into a non-terminal
 * prefix whose children are the old remainder and the new remainder of the word;</li>
 * <li>the word already exists: its group is moved with the new attributes;</li>
 * <li>the word extends a group that has no children: a new child node is written and
 * linked from that group;</li>
 * <li>no group of the node matches: a new node is written and chained through the forward
 * link of the node.</li>
 * </ul>
 * Bigrams whose target word is not found in the dictionary are skipped. Inserting an empty
 * word does nothing.
 *
 * @param buffer the buffer holding the existing dictionary; it is read and updated in place.
 * @param destination the stream new nodes are written to.
 * @param word the word to insert.
 * @param frequency the frequency of the word.
 * @param bigramStrings the bigrams of the word, or null.
 * @param shortcuts the shortcut targets of the word, or null.
 * @param isNotAWord passed to makeCharGroupFlags for the inserted word.
 * @param isBlackListEntry passed to makeCharGroupFlags for the inserted word.
 * @throws IOException
 * @throws UnsupportedFormatException
 */
// TODO: Support batch insertion.
public static void insertWord(final FusionDictionaryBufferInterface buffer,
        final OutputStream destination, final String word, final int frequency,
        final ArrayList<WeightedString> bigramStrings,
        final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
        final boolean isBlackListEntry)
        throws IOException, UnsupportedFormatException {
    final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>();
    if (bigramStrings != null) {
        for (final WeightedString bigram : bigramStrings) {
            final int position = getTerminalPosition(buffer, bigram.mWord);
            if (position == FormatSpec.NOT_VALID_WORD) {
                // TODO: figure out what is the correct thing to do here.
            } else {
                bigrams.add(new PendingAttribute(position, bigram.mFrequency));
            }
        }
    }

    final boolean isTerminal = true;
    final boolean hasBigrams = !bigrams.isEmpty();
    final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty();

    // find the insert position of the word.
    if (buffer.position() != 0) buffer.position(0);
    final FileHeader header = BinaryDictInputOutput.readHeader(buffer);

    int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position();
    final int[] codePoints = FusionDictionary.getCodePoints(word);
    final int wordLen = codePoints.length;

    for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
        if (wordPos >= wordLen) break;
        nodeOriginAddress = buffer.position();
        int nodeParentAddress = -1;
        final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
        boolean foundNextGroup = false;

        for (int i = 0; i < charGroupCount; ++i) {
            address = buffer.position();
            final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
                    buffer.position(), header.mFormatOptions);
            final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
                    header.mFormatOptions);
            if (isMovedGroup) continue;
            // The stored parent address is relative to the group; make it absolute.
            nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
                    ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
            boolean matched = true;
            for (int p = 0; p < currentInfo.mCharacters.length; ++p) {
                if (wordPos + p >= wordLen) {
                    /*
                     * splitting
                     * before
                     * abcd - ef
                     *
                     * insert "abc"
                     *
                     * after
                     * abc - d - ef
                     */
                    final int newNodeAddress = buffer.limit();
                    // The moved prefix is the inserted word itself, so it carries the
                    // attributes requested by the caller, including isNotAWord and
                    // isBlackListEntry.
                    final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
                            isTerminal, 0, hasShortcuts, hasBigrams, isNotAWord,
                            isBlackListEntry, header.mFormatOptions);
                    int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
                            frequency, nodeParentAddress, shortcuts, bigrams, destination,
                            buffer, nodeOriginAddress, address, header.mFormatOptions);

                    final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p,
                            currentInfo.mCharacters.length);
                    if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
                        updateParentAddresses(buffer, currentInfo.mChildrenAddress,
                                newNodeAddress + written + 1, header.mFormatOptions);
                    }
                    // NOTE(review): the remainder reuses currentInfo.mFlags unchanged even
                    // though characters2 is shorter than the original group; confirm that
                    // writeNode does not depend on the multiple-characters bit here.
                    final CharGroupInfo newInfo2 = new CharGroupInfo(
                            newNodeAddress + written + 1, -1 /* endAddress */,
                            currentInfo.mFlags, characters2, currentInfo.mFrequency,
                            newNodeAddress + 1, currentInfo.mChildrenAddress,
                            currentInfo.mShortcutTargets, currentInfo.mBigrams);
                    writeNode(destination, new CharGroupInfo[] { newInfo2 });
                    return;
                } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
                    if (p > 0) {
                        /*
                         * splitting
                         * before
                         * ab - cd
                         *
                         * insert "ac"
                         *
                         * after
                         * a - b - cd
                         *   |
                         *   - c
                         */

                        final int newNodeAddress = buffer.limit();

                        // move prefix
                        final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
                                false /* isTerminal */, 0 /* childrenAddressSize*/,
                                false /* hasShortcut */, false /* hasBigrams */,
                                false /* isNotAWord */, false /* isBlackListEntry */,
                                header.mFormatOptions);
                        int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
                                prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
                                destination, buffer, nodeOriginAddress, address,
                                header.mFormatOptions);

                        final int[] suffixCharacters = Arrays.copyOfRange(
                                currentInfo.mCharacters, p, currentInfo.mCharacters.length);
                        if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
                            updateParentAddresses(buffer, currentInfo.mChildrenAddress,
                                    newNodeAddress + written + 1, header.mFormatOptions);
                        }
                        final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags(
                                suffixCharacters.length > 1,
                                (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
                                0 /* childrenAddressSize */,
                                (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)
                                        != 0,
                                (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
                                isNotAWord, isBlackListEntry, header.mFormatOptions);
                        final CharGroupInfo suffixInfo = new CharGroupInfo(
                                newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
                                suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
                                currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
                                currentInfo.mBigrams);
                        written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1;

                        final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
                                codePoints.length);
                        final int flags = BinaryDictInputOutput.makeCharGroupFlags(
                                newCharacters.length > 1, isTerminal,
                                0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
                                isNotAWord, isBlackListEntry, header.mFormatOptions);
                        final CharGroupInfo newInfo = new CharGroupInfo(
                                newNodeAddress + written, -1 /* endAddress */, flags,
                                newCharacters, frequency, newNodeAddress + 1,
                                FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
                        writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo });
                        return;
                    }
                    matched = false;
                    break;
                }
            }

            if (matched) {
                if (wordPos + currentInfo.mCharacters.length == wordLen) {
                    // the word exists in the dictionary.
                    // only update group.
                    final int newNodeAddress = buffer.limit();
                    final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
                    final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
                            isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
                            isNotAWord, isBlackListEntry, header.mFormatOptions);
                    final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
                            -1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
                            nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
                            bigrams);
                    moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address,
                            header.mFormatOptions);
                    return;
                }
                wordPos += currentInfo.mCharacters.length;
                if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
                    /*
                     * found the prefix of the word.
                     * make new node and link to the node from this group.
                     *
                     * before
                     * ab - cd
                     *
                     * insert "abcde"
                     *
                     * after
                     * ab - cd - e
                     */
                    final int newNodeAddress = buffer.limit();
                    updateChildrenAddress(buffer, address, newNodeAddress,
                            header.mFormatOptions);
                    final int newGroupAddress = newNodeAddress + 1;
                    final boolean hasMultipleChars = (wordLen - wordPos) > 1;
                    final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
                            isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
                            isNotAWord, isBlackListEntry, header.mFormatOptions);
                    final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
                    final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
                            characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
                            shortcuts, bigrams);
                    writeNode(destination, new CharGroupInfo[] { newInfo });
                    return;
                }
                buffer.position(currentInfo.mChildrenAddress);
                foundNextGroup = true;
                break;
            }
        }

        if (foundNextGroup) continue;

        // reached the end of the array.
        final int linkAddressPosition = buffer.position();
        int nextLink = buffer.readUnsignedInt24();
        // The link is stored as sign bit + 23-bit magnitude; decode it.
        if ((nextLink & MSB24) != 0) {
            nextLink = -(nextLink & SINT24_MAX);
        }
        if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
            /*
             * expand this node.
             *
             * before
             * ab - cd
             *
             * insert "abef"
             *
             * after
             * ab - cd
             *    |
             *    - ef
             */

            // change the forward link address.
            final int newNodeAddress = buffer.limit();
            buffer.position(linkAddressPosition);
            writeSInt24ToBuffer(buffer, newNodeAddress);

            final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
            final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1,
                    isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
                    isNotAWord, isBlackListEntry, header.mFormatOptions);
            final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
                    -1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
                    FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
            writeNode(destination, new CharGroupInfo[]{ newInfo });
            return;
        } else {
            // Following a forward link stays on the same trie level, so it must not
            // consume one unit of depth.
            depth--;
            buffer.position(nextLink);
        }
    }
}
|
||||
}
|
||||
|
|
|
@ -411,7 +411,8 @@ public final class BinaryDictInputOutput {
|
|||
* Helper method to check whether the group is moved.
|
||||
*/
|
||||
public static boolean isMovedGroup(final int flags, final FormatOptions options) {
|
||||
return options.mSupportsDynamicUpdate && ((flags & FormatSpec.FLAG_IS_MOVED) == 1);
|
||||
return options.mSupportsDynamicUpdate
|
||||
&& ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -279,7 +279,7 @@ public final class FusionDictionary implements Iterable<Word> {
|
|||
/**
|
||||
* Helper method to convert a String to an int array.
|
||||
*/
|
||||
static private int[] getCodePoints(final String word) {
|
||||
static int[] getCodePoints(final String word) {
|
||||
// TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
|
||||
// which is not visible from the makedict package. Factor this code.
|
||||
final char[] characters = word.toCharArray();
|
||||
|
|
|
@ -0,0 +1,335 @@
|
|||
/*
|
||||
* Copyright (C) 2012 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.android.inputmethod.latin.makedict;
|
||||
|
||||
import com.android.inputmethod.latin.CollectionUtils;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.ByteBufferWrapper;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
|
||||
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
|
||||
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||
|
||||
import android.test.AndroidTestCase;
|
||||
import android.test.MoreAsserts;
|
||||
import android.util.Log;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Random;
|
||||
|
||||
public class BinaryDictIOUtilsTests extends AndroidTestCase{
|
||||
private static final String TAG = BinaryDictIOUtilsTests.class.getSimpleName();
|
||||
private static final FormatSpec.FormatOptions FORMAT_OPTIONS =
|
||||
new FormatSpec.FormatOptions(3, true);
|
||||
private static final int MAX_UNIGRAMS = 1500;
|
||||
|
||||
private static final ArrayList<String> sWords = CollectionUtils.newArrayList();
|
||||
|
||||
private static final String[] CHARACTERS = {
|
||||
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
|
||||
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
|
||||
"\u00FC" /* ü */, "\u00E2" /* â */, "\u00F1" /* ñ */, // accented characters
|
||||
"\u4E9C" /* 亜 */, "\u4F0A" /* 伊 */, "\u5B87" /* 宇 */, // kanji
|
||||
"\uD841\uDE28" /* 𠘨 */, "\uD840\uDC0B" /* 𠀋 */, "\uD861\uDeD7" /* 𨛗 */ // surrogate pair
|
||||
};
|
||||
|
||||
public BinaryDictIOUtilsTests() {
|
||||
super();
|
||||
final Random random = new Random(123456);
|
||||
sWords.clear();
|
||||
for (int i = 0; i < MAX_UNIGRAMS; ++i) {
|
||||
sWords.add(generateWord(random.nextInt()));
|
||||
}
|
||||
}
|
||||
|
||||
// Utilities for test
|
||||
private String generateWord(final int value) {
|
||||
final int lengthOfChars = CHARACTERS.length;
|
||||
StringBuilder builder = new StringBuilder("");
|
||||
long lvalue = Math.abs((long)value);
|
||||
while (lvalue > 0) {
|
||||
builder.append(CHARACTERS[(int)(lvalue % lengthOfChars)]);
|
||||
lvalue /= lengthOfChars;
|
||||
}
|
||||
if (builder.toString().equals("")) return "a";
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private static void printCharGroup(final CharGroupInfo info) {
|
||||
Log.d(TAG, " CharGroup at " + info.mOriginalAddress);
|
||||
Log.d(TAG, " flags = " + info.mFlags);
|
||||
Log.d(TAG, " parentAddress = " + info.mParentAddress);
|
||||
Log.d(TAG, " characters = " + new String(info.mCharacters, 0,
|
||||
info.mCharacters.length));
|
||||
if (info.mFrequency != -1) Log.d(TAG, " frequency = " + info.mFrequency);
|
||||
if (info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
|
||||
Log.d(TAG, " children address = no children address");
|
||||
} else {
|
||||
Log.d(TAG, " children address = " + info.mChildrenAddress);
|
||||
}
|
||||
if (info.mShortcutTargets != null) {
|
||||
for (final WeightedString ws : info.mShortcutTargets) {
|
||||
Log.d(TAG, " shortcuts = " + ws.mWord);
|
||||
}
|
||||
}
|
||||
if (info.mBigrams != null) {
|
||||
for (final PendingAttribute attr : info.mBigrams) {
|
||||
Log.d(TAG, " bigram = " + attr.mAddress);
|
||||
}
|
||||
}
|
||||
Log.d(TAG, " end address = " + info.mEndAddress);
|
||||
}
|
||||
|
||||
private static void printNode(final FusionDictionaryBufferInterface buffer,
|
||||
final FormatSpec.FormatOptions formatOptions) {
|
||||
Log.d(TAG, "Node at " + buffer.position());
|
||||
final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
|
||||
Log.d(TAG, " charGroupCount = " + count);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
|
||||
buffer.position(), formatOptions);
|
||||
printCharGroup(currentInfo);
|
||||
}
|
||||
if (formatOptions.mSupportsDynamicUpdate) {
|
||||
final int forwardLinkAddress = buffer.readUnsignedInt24();
|
||||
Log.d(TAG, " forwardLinkAddress = " + forwardLinkAddress);
|
||||
}
|
||||
}
|
||||
|
||||
private static void printBinaryFile(final FusionDictionaryBufferInterface buffer)
|
||||
throws IOException, UnsupportedFormatException {
|
||||
FileHeader header = BinaryDictInputOutput.readHeader(buffer);
|
||||
while (buffer.position() < buffer.limit()) {
|
||||
printNode(buffer, header.mFormatOptions);
|
||||
}
|
||||
}
|
||||
|
||||
private int getWordPosition(final File file, final String word) {
|
||||
int position = FormatSpec.NOT_VALID_WORD;
|
||||
FileInputStream inStream = null;
|
||||
try {
|
||||
inStream = new FileInputStream(file);
|
||||
final FusionDictionaryBufferInterface buffer = new ByteBufferWrapper(
|
||||
inStream.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, file.length()));
|
||||
position = BinaryDictIOUtils.getTerminalPosition(buffer, word);
|
||||
} catch (IOException e) {
|
||||
} catch (UnsupportedFormatException e) {
|
||||
} finally {
|
||||
if (inStream != null) {
|
||||
try {
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
return position;
|
||||
}
|
||||
|
||||
// return amount of time to insert a word
|
||||
private long insertAndCheckWord(final File file, final String word, final int frequency,
|
||||
final boolean exist) {
|
||||
RandomAccessFile raFile = null;
|
||||
FileOutputStream outStream = null;
|
||||
FusionDictionaryBufferInterface buffer = null;
|
||||
long amountOfTime = -1;
|
||||
try {
|
||||
raFile = new RandomAccessFile(file, "rw");
|
||||
buffer = new ByteBufferWrapper(raFile.getChannel().map(
|
||||
FileChannel.MapMode.READ_WRITE, 0, file.length()));
|
||||
outStream = new FileOutputStream(file, true);
|
||||
|
||||
if (!exist) {
|
||||
assertEquals(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word));
|
||||
}
|
||||
final long now = System.nanoTime();
|
||||
BinaryDictIOUtils.insertWord(buffer, outStream, word, frequency, null, null, false,
|
||||
false);
|
||||
amountOfTime = System.nanoTime() - now;
|
||||
MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word));
|
||||
outStream.close();
|
||||
raFile.close();
|
||||
} catch (IOException e) {
|
||||
} catch (UnsupportedFormatException e) {
|
||||
} finally {
|
||||
if (outStream != null) {
|
||||
try {
|
||||
outStream.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
if (raFile != null) {
|
||||
try {
|
||||
raFile.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
return amountOfTime;
|
||||
}
|
||||
|
||||
private void deleteWord(final File file, final String word) {
|
||||
RandomAccessFile raFile = null;
|
||||
FusionDictionaryBufferInterface buffer = null;
|
||||
try {
|
||||
raFile = new RandomAccessFile(file, "rw");
|
||||
buffer = new ByteBufferWrapper(raFile.getChannel().map(
|
||||
FileChannel.MapMode.READ_WRITE, 0, file.length()));
|
||||
BinaryDictIOUtils.deleteWord(buffer, word);
|
||||
} catch (IOException e) {
|
||||
} catch (UnsupportedFormatException e) {
|
||||
} finally {
|
||||
if (raFile != null) {
|
||||
try {
|
||||
raFile.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void checkReverseLookup(final File file, final String word, final int position) {
|
||||
FileInputStream inStream = null;
|
||||
try {
|
||||
inStream = new FileInputStream(file);
|
||||
final FusionDictionaryBufferInterface buffer = new ByteBufferWrapper(
|
||||
inStream.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, file.length()));
|
||||
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
|
||||
assertEquals(word, BinaryDictInputOutput.getWordAtAddress(buffer, header.mHeaderSize,
|
||||
position - header.mHeaderSize, header.mFormatOptions));
|
||||
} catch (IOException e) {
|
||||
} catch (UnsupportedFormatException e) {
|
||||
} finally {
|
||||
if (inStream != null) {
|
||||
try {
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testInsertWord() {
|
||||
File file = null;
|
||||
try {
|
||||
file = File.createTempFile("testInsertWord", ".dict");
|
||||
} catch (IOException e) {
|
||||
fail("IOException while creating temporary file: " + e);
|
||||
}
|
||||
|
||||
// set an initial dictionary.
|
||||
final FusionDictionary dict = new FusionDictionary(new Node(),
|
||||
new FusionDictionary.DictionaryOptions(new HashMap<String,String>(), false, false));
|
||||
dict.add("abcd", 10, null, false);
|
||||
|
||||
try {
|
||||
final FileOutputStream out = new FileOutputStream(file);
|
||||
BinaryDictInputOutput.writeDictionaryBinary(out, dict, FORMAT_OPTIONS);
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
} catch (UnsupportedFormatException e) {
|
||||
fail("UnsupportedFormatException while writing an initial dictionary : " + e);
|
||||
}
|
||||
|
||||
MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, "abcd"));
|
||||
insertAndCheckWord(file, "abcde", 10, false);
|
||||
|
||||
insertAndCheckWord(file, "abcdefghijklmn", 10, false);
|
||||
checkReverseLookup(file, "abcdefghijklmn", getWordPosition(file, "abcdefghijklmn"));
|
||||
|
||||
insertAndCheckWord(file, "abcdabcd", 10, false);
|
||||
checkReverseLookup(file, "abcdabcd", getWordPosition(file, "abcdabcd"));
|
||||
|
||||
// update the existing word.
|
||||
insertAndCheckWord(file, "abcdabcd", 15, true);
|
||||
|
||||
// split 1
|
||||
insertAndCheckWord(file, "ab", 20, false);
|
||||
|
||||
// split 2
|
||||
insertAndCheckWord(file, "ami", 30, false);
|
||||
|
||||
deleteWord(file, "ami");
|
||||
assertEquals(FormatSpec.NOT_VALID_WORD, getWordPosition(file, "ami"));
|
||||
|
||||
insertAndCheckWord(file, "abcdabfg", 30, false);
|
||||
|
||||
deleteWord(file, "abcd");
|
||||
assertEquals(FormatSpec.NOT_VALID_WORD, getWordPosition(file, "abcd"));
|
||||
}
|
||||
|
||||
public void testRandomWords() {
|
||||
File file = null;
|
||||
try {
|
||||
file = File.createTempFile("testRandomWord", ".dict");
|
||||
} catch (IOException e) {
|
||||
}
|
||||
assertNotNull(file);
|
||||
|
||||
// set an initial dictionary.
|
||||
final FusionDictionary dict = new FusionDictionary(new Node(),
|
||||
new FusionDictionary.DictionaryOptions(new HashMap<String, String>(), false,
|
||||
false));
|
||||
dict.add("initial", 10, null, false);
|
||||
|
||||
try {
|
||||
final FileOutputStream out = new FileOutputStream(file);
|
||||
BinaryDictInputOutput.writeDictionaryBinary(out, dict, FORMAT_OPTIONS);
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
assertTrue(false);
|
||||
} catch (UnsupportedFormatException e) {
|
||||
assertTrue(false);
|
||||
}
|
||||
|
||||
long maxTimeToInsert = 0, sum = 0;
|
||||
long minTimeToInsert = 100000000; // 1000000000 is an upper bound for minTimeToInsert.
|
||||
int cnt = 0;
|
||||
for (final String word : sWords) {
|
||||
final long diff = insertAndCheckWord(file, word, cnt%255, false);
|
||||
maxTimeToInsert = Math.max(maxTimeToInsert, diff);
|
||||
minTimeToInsert = Math.min(minTimeToInsert, diff);
|
||||
sum += diff;
|
||||
cnt++;
|
||||
}
|
||||
cnt = 0;
|
||||
for (final String word : sWords) {
|
||||
MoreAsserts.assertNotEqual(FormatSpec.NOT_VALID_WORD, getWordPosition(file, word));
|
||||
}
|
||||
|
||||
Log.d(TAG, "max = " + ((double)maxTimeToInsert/1000000) + " ms.");
|
||||
Log.d(TAG, "min = " + ((double)minTimeToInsert/1000000) + " ms.");
|
||||
Log.d(TAG, "avg = " + ((double)sum/MAX_UNIGRAMS/1000000) + " ms.");
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue