From d4fe7fda303ff937d2e44c15dde9d90cbf59376b Mon Sep 17 00:00:00 2001 From: Yuichiro Hanada Date: Tue, 21 Aug 2012 18:30:18 +0900 Subject: [PATCH] Use ByteBuffer when reading FusionDictionary from file. Change-Id: Ia71561648e17f846d277c22309ac37c21c67a537 --- .../latin/BinaryDictionaryGetter.java | 27 ++- .../latin/makedict/BinaryDictInputOutput.java | 225 +++++++++++------- .../inputmethod/latin/BinaryDictIOTests.java | 224 +++++++++++++++++ .../latin/dicttool/DictionaryMaker.java | 22 +- 4 files changed, 399 insertions(+), 99 deletions(-) create mode 100644 tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java b/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java index 4ada909de..e1cb195bc 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java @@ -25,7 +25,10 @@ import android.content.res.AssetFileDescriptor; import android.util.Log; import java.io.File; -import java.io.RandomAccessFile; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.HashMap; import java.util.Locale; @@ -349,17 +352,21 @@ class BinaryDictionaryGetter { // ad-hock ## HACK ## if (!Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) return true; + FileInputStream inStream = null; try { // Read the version of the file - final RandomAccessFile raf = new RandomAccessFile(f, "r"); - final int magic = raf.readInt(); + inStream = new FileInputStream(f); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, f.length()); + final int magic = buffer.getInt(); if (magic != BinaryDictInputOutput.VERSION_2_MAGIC_NUMBER) { return false; } - final int formatVersion = raf.readInt(); - final int headerSize = raf.readInt(); + final int formatVersion 
= buffer.getInt(); + final int headerSize = buffer.getInt(); final HashMap options = CollectionUtils.newHashMap(); - BinaryDictInputOutput.populateOptionsFromFile(raf, headerSize, options); + BinaryDictInputOutput.populateOptions(buffer, headerSize, options); + final String version = options.get(VERSION_KEY); if (null == version) { // No version in the options : the format is unexpected @@ -374,6 +381,14 @@ class BinaryDictionaryGetter { return false; } catch (NumberFormatException e) { return false; + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index b23b7db34..161b94ca0 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -22,10 +22,13 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; -import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -307,33 +310,32 @@ public class BinaryDictInputOutput { } /** - * Reads a string from a RandomAccessFile. This is the converse of the above method. + * Reads a string from a ByteBuffer. This is the converse of the above method. 
*/ - private static String readString(final RandomAccessFile source) throws IOException { + private static String readString(final ByteBuffer buffer) { final StringBuilder s = new StringBuilder(); - int character = readChar(source); + int character = readChar(buffer); while (character != INVALID_CHARACTER) { s.appendCodePoint(character); - character = readChar(source); + character = readChar(buffer); } return s.toString(); } /** - * Reads a character from the file. + * Reads a character from the ByteBuffer. * * This follows the character format documented earlier in this source file. * - * @param source the file, positioned over an encoded character. + * @param buffer the buffer, positioned over an encoded character. * @return the character code. */ - private static int readChar(RandomAccessFile source) throws IOException { - int character = source.readUnsignedByte(); + private static int readChar(final ByteBuffer buffer) { + int character = readUnsignedByte(buffer); if (!fitsOnOneByte(character)) { - if (GROUP_CHARACTERS_TERMINATOR == character) - return INVALID_CHARACTER; + if (GROUP_CHARACTERS_TERMINATOR == character) return INVALID_CHARACTER; character <<= 16; - character += source.readUnsignedShort(); + character += readUnsignedShort(buffer); } return character; } @@ -1091,46 +1093,46 @@ public class BinaryDictInputOutput { // readDictionaryBinary is the public entry point for them. 
static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; - private static CharGroupInfo readCharGroup(RandomAccessFile source, - final int originalGroupAddress) throws IOException { + private static CharGroupInfo readCharGroup(final ByteBuffer buffer, + final int originalGroupAddress) { int addressPointer = originalGroupAddress; - final int flags = source.readUnsignedByte(); + final int flags = readUnsignedByte(buffer); ++addressPointer; final int characters[]; if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { int index = 0; - int character = CharEncoding.readChar(source); + int character = CharEncoding.readChar(buffer); addressPointer += CharEncoding.getCharSize(character); while (-1 != character) { characterBuffer[index++] = character; - character = CharEncoding.readChar(source); + character = CharEncoding.readChar(buffer); addressPointer += CharEncoding.getCharSize(character); } characters = Arrays.copyOfRange(characterBuffer, 0, index); } else { - final int character = CharEncoding.readChar(source); + final int character = CharEncoding.readChar(buffer); addressPointer += CharEncoding.getCharSize(character); characters = new int[] { character }; } final int frequency; if (0 != (FLAG_IS_TERMINAL & flags)) { ++addressPointer; - frequency = source.readUnsignedByte(); + frequency = readUnsignedByte(buffer); } else { frequency = CharGroup.NOT_A_TERMINAL; } int childrenAddress = addressPointer; switch (flags & MASK_GROUP_ADDRESS_TYPE) { case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: - childrenAddress += source.readUnsignedByte(); + childrenAddress += readUnsignedByte(buffer); addressPointer += 1; break; case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: - childrenAddress += source.readUnsignedShort(); + childrenAddress += readUnsignedShort(buffer); addressPointer += 2; break; case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: - childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort(); + childrenAddress += readUnsignedInt24(buffer); addressPointer += 3; break; case 
FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: @@ -1140,38 +1142,38 @@ public class BinaryDictInputOutput { } ArrayList shortcutTargets = null; if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { - final long pointerBefore = source.getFilePointer(); + final int pointerBefore = buffer.position(); shortcutTargets = new ArrayList(); - source.readUnsignedShort(); // Skip the size + buffer.getShort(); // Skip the size while (true) { - final int targetFlags = source.readUnsignedByte(); - final String word = CharEncoding.readString(source); + final int targetFlags = readUnsignedByte(buffer); + final String word = CharEncoding.readString(buffer); shortcutTargets.add(new WeightedString(word, targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; } - addressPointer += (source.getFilePointer() - pointerBefore); + addressPointer += buffer.position() - pointerBefore; } ArrayList bigrams = null; if (0 != (flags & FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList(); while (true) { - final int bigramFlags = source.readUnsignedByte(); + final int bigramFlags = readUnsignedByte(buffer); ++addressPointer; final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 
1 : -1; int bigramAddress = addressPointer; switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: - bigramAddress += sign * source.readUnsignedByte(); + bigramAddress += sign * readUnsignedByte(buffer); addressPointer += 1; break; case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: - bigramAddress += sign * source.readUnsignedShort(); + bigramAddress += sign * readUnsignedShort(buffer); addressPointer += 2; break; case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: - final int offset = ((source.readUnsignedByte() << 16) - + source.readUnsignedShort()); + final int offset = (readUnsignedByte(buffer) << 16) + + readUnsignedShort(buffer); bigramAddress += sign * offset; addressPointer += 3; break; @@ -1188,15 +1190,15 @@ public class BinaryDictInputOutput { } /** - * Reads and returns the char group count out of a file and forwards the pointer. + * Reads and returns the char group count out of a buffer and forwards the pointer. */ - private static int readCharGroupCount(RandomAccessFile source) throws IOException { - final int msb = source.readUnsignedByte(); + private static int readCharGroupCount(final ByteBuffer buffer) { + final int msb = readUnsignedByte(buffer); if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { return msb; } else { return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) - + source.readUnsignedByte(); + + readUnsignedByte(buffer); } } @@ -1204,31 +1206,29 @@ public class BinaryDictInputOutput { // of this method. Since it performs direct, unbuffered random access to the file and // may be called hundreds of thousands of times, the resulting performance is not // reasonable without some kind of cache. Thus: - // TODO: perform buffered I/O here and in other places in the code. private static TreeMap wordCache = new TreeMap(); /** * Finds, as a string, the word at the address passed as an argument. * - * @param source the file to read from. + * @param buffer the buffer to read from. 
* @param headerSize the size of the header. * @param address the address to seek. * @return the word, as a string. - * @throws IOException if the file can't be read. */ - private static String getWordAtAddress(final RandomAccessFile source, final long headerSize, - int address) throws IOException { + private static String getWordAtAddress(final ByteBuffer buffer, final int headerSize, + final int address) { final String cachedString = wordCache.get(address); if (null != cachedString) return cachedString; - final long originalPointer = source.getFilePointer(); - source.seek(headerSize); - final int count = readCharGroupCount(source); + final int originalPointer = buffer.position(); + buffer.position(headerSize); + final int count = readCharGroupCount(buffer); int groupOffset = getGroupCountSize(count); final StringBuilder builder = new StringBuilder(); String result = null; CharGroupInfo last = null; for (int i = count - 1; i >= 0; --i) { - CharGroupInfo info = readCharGroup(source, groupOffset); + CharGroupInfo info = readCharGroup(buffer, groupOffset); groupOffset = info.mEndAddress; if (info.mOriginalAddress == address) { builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); @@ -1239,9 +1239,9 @@ public class BinaryDictInputOutput { if (info.mChildrenAddress > address) { if (null == last) continue; builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - source.seek(last.mChildrenAddress + headerSize); + buffer.position(last.mChildrenAddress + headerSize); groupOffset = last.mChildrenAddress + 1; - i = source.readUnsignedByte(); + i = readUnsignedByte(buffer); last = null; continue; } @@ -1249,14 +1249,14 @@ public class BinaryDictInputOutput { } if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - source.seek(last.mChildrenAddress + headerSize); + buffer.position(last.mChildrenAddress + headerSize); groupOffset = last.mChildrenAddress + 1; - i 
= source.readUnsignedByte(); + i = readUnsignedByte(buffer); last = null; continue; } } - source.seek(originalPointer); + buffer.position(originalPointer); wordCache.put(address, result); return result; } @@ -1269,44 +1269,47 @@ public class BinaryDictInputOutput { * This will recursively read other nodes into the structure, populating the reverse * maps on the fly and using them to keep track of already read nodes. * - * @param source the data file, correctly positioned at the start of a node. + * @param buffer the buffer, correctly positioned at the start of a node. * @param headerSize the size, in bytes, of the file header. * @param reverseNodeMap a mapping from addresses to already read nodes. * @param reverseGroupMap a mapping from addresses to already read character groups. * @return the read node with all his children already read. */ - private static Node readNode(RandomAccessFile source, long headerSize, - Map reverseNodeMap, Map reverseGroupMap) + private static Node readNode(final ByteBuffer buffer, final int headerSize, + final Map reverseNodeMap, final Map reverseGroupMap) throws IOException { - final int nodeOrigin = (int)(source.getFilePointer() - headerSize); - final int count = readCharGroupCount(source); + final int nodeOrigin = buffer.position() - headerSize; + final int count = readCharGroupCount(buffer); final ArrayList nodeContents = new ArrayList(); int groupOffset = nodeOrigin + getGroupCountSize(count); for (int i = count; i > 0; --i) { - CharGroupInfo info = readCharGroup(source, groupOffset); + CharGroupInfo info =readCharGroup(buffer, groupOffset); ArrayList shortcutTargets = info.mShortcutTargets; ArrayList bigrams = null; if (null != info.mBigrams) { bigrams = new ArrayList(); for (PendingAttribute bigram : info.mBigrams) { - final String word = getWordAtAddress(source, headerSize, bigram.mAddress); + final String word = getWordAtAddress( + buffer, headerSize, bigram.mAddress); bigrams.add(new WeightedString(word, bigram.mFrequency)); 
} } if (hasChildrenAddress(info.mChildrenAddress)) { Node children = reverseNodeMap.get(info.mChildrenAddress); if (null == children) { - final long currentPosition = source.getFilePointer(); - source.seek(info.mChildrenAddress + headerSize); - children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap); - source.seek(currentPosition); + final int currentPosition = buffer.position(); + buffer.position(info.mChildrenAddress + headerSize); + children = readNode( + buffer, headerSize, reverseNodeMap, reverseGroupMap); + buffer.position(currentPosition); } nodeContents.add( - new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, - children)); + new CharGroup(info.mCharacters, shortcutTargets, + bigrams, info.mFrequency, children)); } else { nodeContents.add( - new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency)); + new CharGroup(info.mCharacters, shortcutTargets, + bigrams, info.mFrequency)); } groupOffset = info.mEndAddress; } @@ -1318,12 +1321,13 @@ public class BinaryDictInputOutput { /** * Helper function to get the binary format version from the header. 
+ * @throws IOException */ - private static int getFormatVersion(final RandomAccessFile source) throws IOException { - final int magic_v1 = source.readUnsignedShort(); - if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte(); - final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort(); - if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort(); + private static int getFormatVersion(final ByteBuffer buffer) throws IOException { + final int magic_v1 = readUnsignedShort(buffer); + if (VERSION_1_MAGIC_NUMBER == magic_v1) return readUnsignedByte(buffer); + final int magic_v2 = (magic_v1 << 16) + readUnsignedShort(buffer); + if (VERSION_2_MAGIC_NUMBER == magic_v2) return readUnsignedShort(buffer); return NOT_A_VERSION_NUMBER; } @@ -1333,53 +1337,60 @@ public class BinaryDictInputOutput { * The file is read at the current file pointer, so the caller must take care the pointer * is in the right place before calling this. */ - public static void populateOptionsFromFile(final RandomAccessFile source, final long headerSize, - final HashMap options) throws IOException { - while (source.getFilePointer() < headerSize) { - final String key = CharEncoding.readString(source); - final String value = CharEncoding.readString(source); + public static void populateOptions(final ByteBuffer buffer, final int headerSize, + final HashMap options) { + while (buffer.position() < headerSize) { + final String key = CharEncoding.readString(buffer); + final String value = CharEncoding.readString(buffer); options.put(key, value); } } /** - * Reads a random access file and returns the memory representation of the dictionary. + * Reads a byte buffer and returns the memory representation of the dictionary. * * This high-level method takes a binary file and reads its contents, populating a * FusionDictionary structure. The optional dict argument is an existing dictionary to * which words from the file should be added. 
If it is null, a new dictionary is created. * - * @param source the file to read. + * @param buffer the buffer to read. * @param dict an optional dictionary to add words to, or null. * @return the created (or merged) dictionary. */ - public static FusionDictionary readDictionaryBinary(final RandomAccessFile source, + public static FusionDictionary readDictionaryBinary(final ByteBuffer buffer, final FusionDictionary dict) throws IOException, UnsupportedFormatException { // Check file version - final int version = getFormatVersion(source); - if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) { + final int version = getFormatVersion(buffer); + if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { throw new UnsupportedFormatException("This file has version " + version + ", but this implementation does not support versions above " + MAXIMUM_SUPPORTED_VERSION); } - // Read options - final int optionsFlags = source.readUnsignedShort(); + // clear cache + wordCache.clear(); - final long headerSize; + // Read options + final int optionsFlags = readUnsignedShort(buffer); + + final int headerSize; final HashMap options = new HashMap(); if (version < FIRST_VERSION_WITH_HEADER_SIZE) { - headerSize = source.getFilePointer(); + headerSize = buffer.position(); } else { - headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) - + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); - populateOptionsFromFile(source, headerSize, options); - source.seek(headerSize); + headerSize = buffer.getInt(); + populateOptions(buffer, headerSize, options); + buffer.position(headerSize); + } + + if (headerSize < 0) { + throw new UnsupportedFormatException("header size can't be negative."); } Map reverseNodeMapping = new TreeMap(); Map reverseGroupMapping = new TreeMap(); - final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); + final Node root = readNode( + buffer, 
headerSize, reverseNodeMapping, reverseGroupMapping); FusionDictionary newDict = new FusionDictionary(root, new FusionDictionary.DictionaryOptions(options, @@ -1402,6 +1413,28 @@ public class BinaryDictInputOutput { return newDict; } + /** + * Helper function to read one byte from ByteBuffer. + */ + private static int readUnsignedByte(final ByteBuffer buffer) { + return ((int)buffer.get()) & 0xFF; + } + + /** + * Helper function to read two bytes from ByteBuffer. + */ + private static int readUnsignedShort(final ByteBuffer buffer) { + return ((int)buffer.getShort()) & 0xFFFF; + } + + /** + * Helper function to read three bytes from ByteBuffer. + */ + private static int readUnsignedInt24(final ByteBuffer buffer) { + final int value = readUnsignedByte(buffer) << 16; + return value + readUnsignedShort(buffer); + } + /** * Basic test to find out whether the file is a binary dictionary or not. * @@ -1411,14 +1444,26 @@ public class BinaryDictInputOutput { * @return true if it's a binary dictionary, false otherwise */ public static boolean isBinaryDictionary(final String filename) { + FileInputStream inStream = null; try { - RandomAccessFile f = new RandomAccessFile(filename, "r"); - final int version = getFormatVersion(f); + final File file = new File(filename); + inStream = new FileInputStream(file); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, file.length()); + final int version = getFormatVersion(buffer); return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); } catch (FileNotFoundException e) { return false; } catch (IOException e) { return false; + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } } } diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java new file mode 100644 index 000000000..0094db8a7 --- /dev/null +++
b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import android.test.AndroidTestCase; +import android.util.Log; +import android.util.SparseArray; + +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.Set; + +/** + * Unit tests for BinaryDictInputOutput + */ +public class BinaryDictIOTests extends AndroidTestCase { + private static final String TAG = BinaryDictIOTests.class.getSimpleName(); + private static final int MAX_UNIGRAMS = 1000; + private static final int UNIGRAM_FREQ = 10; + private static final int BIGRAM_FREQ = 50; + + private static final String[] CHARACTERS = + { + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", 
"r", "s", "t", "u", "v", "w", "x", "y", "z" + }; + + /** + * Generates a random word. + */ + private String generateWord(final int value) { + final int lengthOfChars = CHARACTERS.length; + StringBuilder builder = new StringBuilder("a"); + long lvalue = Math.abs((long)value); + while (lvalue > 0) { + builder.append(CHARACTERS[(int)(lvalue % lengthOfChars)]); + lvalue /= lengthOfChars; + } + return builder.toString(); + } + + private List generateWords(final int number, final Random random) { + final Set wordSet = CollectionUtils.newHashSet(); + while (wordSet.size() < number) { + wordSet.add(generateWord(random.nextInt())); + } + return new ArrayList(wordSet); + } + + private void addUnigrams(final int number, + final FusionDictionary dict, + final List words) { + for (int i = 0; i < number; ++i) { + final String word = words.get(i); + dict.add(word, UNIGRAM_FREQ, null); + } + } + + private void addBigrams(final FusionDictionary dict, + final List words, + final SparseArray> sparseArray) { + for (int i = 0; i < sparseArray.size(); ++i) { + final int w1 = sparseArray.keyAt(i); + for (int w2 : sparseArray.valueAt(i)) { + dict.setBigram(words.get(w1), words.get(w2), BIGRAM_FREQ); + } + } + } + + private long timeWritingDictToFile(final String fileName, + final FusionDictionary dict) { + + final File file = new File(getContext().getFilesDir(), fileName); + long now = -1, diff = -1; + + try { + final FileOutputStream out = new FileOutputStream(file); + + now = System.currentTimeMillis(); + BinaryDictInputOutput.writeDictionaryBinary(out, dict, 2); + diff = System.currentTimeMillis() - now; + + out.flush(); + out.close(); + } catch (IOException e) { + Log.e(TAG, "IO exception while writing file: " + e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException: " + e); + } + + return diff; + } + + private void checkDictionary(final FusionDictionary dict, + final List words, + final SparseArray> bigrams) { + assertNotNull(dict); + + // check 
unigram + for (final String word : words) { + final CharGroup cg = FusionDictionary.findWordInTree(dict.mRoot, word); + assertNotNull(cg); + } + + // check bigram + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (final int w2 : bigrams.valueAt(i)) { + final CharGroup cg = FusionDictionary.findWordInTree(dict.mRoot, words.get(w1)); + assertNotNull(words.get(w1) + "," + words.get(w2), cg.getBigram(words.get(w2))); + } + } + } + + private long timeReadingAndCheckDict(final String fileName, + final List words, + final SparseArray> bigrams) { + + long now, diff = -1; + + try { + final File file = new File(getContext().getFilesDir(), fileName); + final FileInputStream inStream = new FileInputStream(file); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, file.length()); + + now = System.currentTimeMillis(); + + final FusionDictionary dict = + BinaryDictInputOutput.readDictionaryBinary(buffer, null); + + diff = System.currentTimeMillis() - now; + + checkDictionary(dict, words, bigrams); + return diff; + + } catch (IOException e) { + Log.e(TAG, "raise IOException while reading file " + e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "Unsupported format: " + e); + } + + return diff; + } + + private String runReadAndWrite(final List words, + final SparseArray> bigrams, + final String message) { + final FusionDictionary dict = new FusionDictionary(new Node(), + new FusionDictionary.DictionaryOptions( + new HashMap(), false, false)); + + final String fileName = generateWord((int)System.currentTimeMillis()) + ".dict"; + + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + // check original dictionary + checkDictionary(dict, words, bigrams); + + final long write = timeWritingDictToFile(fileName, dict); + final long read = timeReadingAndCheckDict(fileName, words, bigrams); + deleteFile(fileName); + + return "PROF: read=" + read + "ms, write=" + write + "ms :" + 
message; + } + + private void deleteFile(final String fileName) { + final File file = new File(getContext().getFilesDir(), fileName); + file.delete(); + } + + public void testReadAndWrite() { + final List results = new ArrayList(); + + final Random random = new Random(123456); + final List words = generateWords(MAX_UNIGRAMS, random); + final SparseArray> emptyArray = CollectionUtils.newSparseArray(); + + final SparseArray> chain = CollectionUtils.newSparseArray(); + for (int i = 0; i < words.size(); ++i) chain.put(i, new ArrayList()); + for (int i = 1; i < words.size(); ++i) chain.get(i-1).add(i); + + final SparseArray> star = CollectionUtils.newSparseArray(); + final List list0 = CollectionUtils.newArrayList(); + star.put(0, list0); + for (int i = 1; i < words.size(); ++i) star.get(0).add(i); + + results.add(runReadAndWrite(words, emptyArray, "only unigram")); + results.add(runReadAndWrite(words, chain, "chain")); + results.add(runReadAndWrite(words, star, "star")); + + for (final String result : results) { + Log.d(TAG, result); + } + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java index 25e1740cb..fbfc1dabb 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -27,7 +27,8 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; -import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; import java.util.Arrays; import java.util.LinkedList; @@ -238,8 +239,23 @@ public class DictionaryMaker { */ private static FusionDictionary readBinaryFile(final String binaryFilename) throws FileNotFoundException, IOException, UnsupportedFormatException { - final RandomAccessFile inputFile = new RandomAccessFile(binaryFilename, 
"r"); - return BinaryDictInputOutput.readDictionaryBinary(inputFile, null); + FileInputStream inStream = null; + + try { + final File file = new File(binaryFilename); + inStream = new FileInputStream(file); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, file.length()); + return BinaryDictInputOutput.readDictionaryBinary(buffer, null); + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } } /**