From b2a43a2ed4df8c3cacf21168cd742e30fa37e964 Mon Sep 17 00:00:00 2001 From: Yuichiro Hanada Date: Wed, 29 Aug 2012 17:32:50 +0900 Subject: [PATCH] add readUnigramsAndBigramsBinary. Change-Id: I7967f11211221d4877bf0a0c30183af885f45390 --- .../latin/makedict/BinaryDictInputOutput.java | 123 +++++++++++- .../inputmethod/latin/BinaryDictIOTests.java | 187 ++++++++++++++++-- 2 files changed, 281 insertions(+), 29 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index d4f7cab5c..9f7f41331 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -34,6 +34,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Stack; import java.util.TreeMap; /** @@ -197,20 +198,21 @@ public class BinaryDictInputOutput { public void position(int newPosition); } - private static final class ByteBufferWrapper implements FusionDictionaryBufferInterface { - private ByteBuffer buffer; - ByteBufferWrapper(final ByteBuffer buffer) { - this.buffer = buffer; + public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface { + private ByteBuffer mBuffer; + + public ByteBufferWrapper(final ByteBuffer buffer) { + mBuffer = buffer; } @Override public int readUnsignedByte() { - return ((int)buffer.get()) & 0xFF; + return ((int)mBuffer.get()) & 0xFF; } @Override public int readUnsignedShort() { - return ((int)buffer.getShort()) & 0xFFFF; + return ((int)mBuffer.getShort()) & 0xFFFF; } @Override @@ -221,17 +223,17 @@ public class BinaryDictInputOutput { @Override public int readInt() { - return buffer.getInt(); + return mBuffer.getInt(); } @Override public int position() { - return buffer.position(); + return mBuffer.position(); } @Override public void position(int newPos) { 
- buffer.position(newPos); + mBuffer.position(newPos); return; } } @@ -1367,6 +1369,109 @@ public class BinaryDictInputOutput { return node; } + // TODO: move these methods (readUnigramsAndBigramsBinary(|Inner)) and an inner class (Position) + // out of this class. + private static class Position { + public static final int NOT_READ_GROUPCOUNT = -1; + + public int mAddress; + public int mNumOfCharGroup; + public int mPosition; + public int mLength; + + public Position(int address, int length) { + mAddress = address; + mLength = length; + mNumOfCharGroup = NOT_READ_GROUPCOUNT; + } + } + + /** + * Tours all nodes without recursive calls. + */ + private static void readUnigramsAndBigramsBinaryInner( + final FusionDictionaryBufferInterface buffer, final int headerSize, + final Map words, final Map frequencies, + final Map> bigrams) { + + int[] pushedChars = new int[MAX_WORD_LENGTH + 1]; + + Stack stack = new Stack(); + int index = 0; + + Position initPos = new Position(headerSize, 0); + stack.push(initPos); + + while (!stack.empty()) { + Position p = stack.peek(); + + if (DBG) { + MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + + p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); + } + + if (buffer.position() != p.mAddress) buffer.position(p.mAddress); + if (index != p.mLength) index = p.mLength; + + if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { + p.mNumOfCharGroup = readCharGroupCount(buffer); + p.mAddress += getGroupCountSize(p.mNumOfCharGroup); + p.mPosition = 0; + } + + CharGroupInfo info = readCharGroup(buffer, p.mAddress - headerSize); + for (int i = 0; i < info.mCharacters.length; ++i) { + pushedChars[index++] = info.mCharacters[i]; + } + p.mPosition++; + + if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word + words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); + frequencies.put(info.mOriginalAddress, info.mFrequency); + if (info.mBigrams != null) 
bigrams.put(info.mOriginalAddress, info.mBigrams); + } + + if (p.mPosition == p.mNumOfCharGroup) { + stack.pop(); + } else { + // the node has more groups. + p.mAddress = buffer.position(); + } + + if (hasChildrenAddress(info.mChildrenAddress)) { + Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); + stack.push(childrenPos); + } + } + + return; + } + + /** + * Reads unigrams and bigrams from the binary file. + * Doesn't make the memory representation of the dictionary. + * + * @param buffer the buffer to read. + * @param words the map to store the address as a key and the word as a value. + * @param frequencies the map to store the address as a key and the frequency as a value. + * @param bigrams the map to store the address as a key and the list of addresses as a value. + * @throws IOException + * @throws UnsupportedFormatException + */ + public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, + final Map words, final Map frequencies, + final Map> bigrams) throws IOException, + UnsupportedFormatException { + + // Read header + final int version = checkFormatVersion(buffer); + final int optionsFlags = buffer.readUnsignedShort(); + final HashMap options = new HashMap(); + final int headerSize = readHeader(buffer, options, version); + + readUnigramsAndBigramsBinaryInner(buffer, headerSize, words, frequencies, bigrams); + } + /** * Helper function to get the binary format version from the header. 
* @throws IOException diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java index 0094db8a7..2cedb5445 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java @@ -20,6 +20,7 @@ import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.PendingAttribute; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; import android.test.AndroidTestCase; @@ -34,7 +35,10 @@ import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import java.util.Random; import java.util.Set; @@ -46,6 +50,7 @@ public class BinaryDictIOTests extends AndroidTestCase { private static final int MAX_UNIGRAMS = 1000; private static final int UNIGRAM_FREQ = 10; private static final int BIGRAM_FREQ = 50; + private static final int TOLERANCE_OF_BIGRAM_FREQ = 5; private static final String[] CHARACTERS = { @@ -53,6 +58,7 @@ public class BinaryDictIOTests extends AndroidTestCase { "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" }; + // Utilities for test /** * Generates a random word. */ @@ -75,6 +81,9 @@ public class BinaryDictIOTests extends AndroidTestCase { return new ArrayList(wordSet); } + /** + * Adds unigrams to the dictionary. 
+ */ private void addUnigrams(final int number, final FusionDictionary dict, final List words) { @@ -86,19 +95,17 @@ public class BinaryDictIOTests extends AndroidTestCase { private void addBigrams(final FusionDictionary dict, final List words, - final SparseArray> sparseArray) { - for (int i = 0; i < sparseArray.size(); ++i) { - final int w1 = sparseArray.keyAt(i); - for (int w2 : sparseArray.valueAt(i)) { + final SparseArray> bigrams) { + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (int w2 : bigrams.valueAt(i)) { dict.setBigram(words.get(w1), words.get(w2), BIGRAM_FREQ); } } } - private long timeWritingDictToFile(final String fileName, - final FusionDictionary dict) { + private long timeWritingDictToFile(final File file, final FusionDictionary dict) { - final File file = new File(getContext().getFilesDir(), fileName); long now = -1, diff = -1; try { @@ -140,15 +147,16 @@ public class BinaryDictIOTests extends AndroidTestCase { } } - private long timeReadingAndCheckDict(final String fileName, - final List words, + // Tests for readDictionaryBinary and writeDictionaryBinary + + private long timeReadingAndCheckDict(final File file, final List words, final SparseArray> bigrams) { long now, diff = -1; + FileInputStream inStream = null; try { - final File file = new File(getContext().getFilesDir(), fileName); - final FileInputStream inStream = new FileInputStream(file); + inStream = new FileInputStream(file); final ByteBuffer buffer = inStream.getChannel().map( FileChannel.MapMode.READ_ONLY, 0, file.length()); @@ -166,6 +174,14 @@ public class BinaryDictIOTests extends AndroidTestCase { Log.e(TAG, "raise IOException while reading file " + e); } catch (UnsupportedFormatException e) { Log.e(TAG, "Unsupported format: " + e); + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } } return diff; @@ -178,25 +194,26 @@ public class BinaryDictIOTests extends AndroidTestCase 
{ new FusionDictionary.DictionaryOptions( new HashMap(), false, false)); - final String fileName = generateWord((int)System.currentTimeMillis()) + ".dict"; + File file = null; + try { + file = File.createTempFile("runReadAndWrite", ".dict"); + } catch (IOException e) { + Log.e(TAG, "IOException: " + e); + } + + assertNotNull(file); addUnigrams(words.size(), dict, words); addBigrams(dict, words, bigrams); // check original dictionary checkDictionary(dict, words, bigrams); - final long write = timeWritingDictToFile(fileName, dict); - final long read = timeReadingAndCheckDict(fileName, words, bigrams); - deleteFile(fileName); + final long write = timeWritingDictToFile(file, dict); + final long read = timeReadingAndCheckDict(file, words, bigrams); return "PROF: read=" + read + "ms, write=" + write + "ms :" + message; } - private void deleteFile(final String fileName) { - final File file = new File(getContext().getFilesDir(), fileName); - file.delete(); - } - public void testReadAndWrite() { final List results = new ArrayList(); @@ -221,4 +238,134 @@ public class BinaryDictIOTests extends AndroidTestCase { Log.d(TAG, result); } } + + // Tests for readUnigramsAndBigramsBinary + + private void checkWordMap(final List expectedWords, + final SparseArray> expectedBigrams, + final Map resultWords, + final Map resultFrequencies, + final Map> resultBigrams) { + // check unigrams + final Set actualWordsSet = new HashSet(resultWords.values()); + final Set expectedWordsSet = new HashSet(expectedWords); + assertEquals(actualWordsSet, expectedWordsSet); + + for (int freq : resultFrequencies.values()) { + assertEquals(freq, UNIGRAM_FREQ); + } + + // check bigrams + final Map> expBigrams = new HashMap>(); + for (int i = 0; i < expectedBigrams.size(); ++i) { + final String word1 = expectedWords.get(expectedBigrams.keyAt(i)); + for (int w2 : expectedBigrams.valueAt(i)) { + if (expBigrams.get(word1) == null) { + expBigrams.put(word1, new ArrayList()); + } + 
expBigrams.get(word1).add(expectedWords.get(w2)); + } + } + + final Map> actBigrams = new HashMap>(); + for (Entry> entry : resultBigrams.entrySet()) { + final String word1 = resultWords.get(entry.getKey()); + final int unigramFreq = resultFrequencies.get(entry.getKey()); + for (PendingAttribute attr : entry.getValue()) { + final String word2 = resultWords.get(attr.mAddress); + if (actBigrams.get(word1) == null) { + actBigrams.put(word1, new ArrayList()); + } + actBigrams.get(word1).add(word2); + + final int bigramFreq = BinaryDictInputOutput.reconstructBigramFrequency( + unigramFreq, attr.mFrequency); + assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ); + } + } + + assertEquals(actBigrams, expBigrams); + } + + private long timeAndCheckReadUnigramsAndBigramsBinary(final File file, final List words, + final SparseArray> bigrams) { + FileInputStream inStream = null; + + final Map resultWords = CollectionUtils.newTreeMap(); + final Map> resultBigrams = + CollectionUtils.newTreeMap(); + final Map resultFreqs = CollectionUtils.newTreeMap(); + + long now = -1, diff = -1; + try { + inStream = new FileInputStream(file); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, file.length()); + + now = System.currentTimeMillis(); + BinaryDictInputOutput.readUnigramsAndBigramsBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), resultWords, resultFreqs, + resultBigrams); + diff = System.currentTimeMillis() - now; + checkWordMap(words, bigrams, resultWords, resultFreqs, resultBigrams); + } catch (IOException e) { + Log.e(TAG, "IOException " + e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException: " + e); + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } + + return diff; + } + + private void runReadUnigramsAndBigramsBinary(final List words, + final SparseArray> bigrams) { + + // making the dictionary 
from lists of words. + final FusionDictionary dict = new FusionDictionary(new Node(), + new FusionDictionary.DictionaryOptions( + new HashMap(), false, false)); + + File file = null; + try { + file = File.createTempFile("runReadUnigrams", ".dict"); + } catch (IOException e) { + Log.e(TAG, "IOException: " + e); + } + + assertNotNull(file); + + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + timeWritingDictToFile(file, dict); + + long wordMap = timeAndCheckReadUnigramsAndBigramsBinary(file, words, bigrams); + long fullReading = timeReadingAndCheckDict(file, words, bigrams); + + Log.d(TAG, "read=" + fullReading + ", bytearray=" + wordMap); + } + + public void testReadUnigramsAndBigramsBinary() { + final List results = new ArrayList(); + + final Random random = new Random(123456); + final List words = generateWords(MAX_UNIGRAMS, random); + final SparseArray> emptyArray = CollectionUtils.newSparseArray(); + + runReadUnigramsAndBigramsBinary(words, emptyArray); + + final SparseArray> star = CollectionUtils.newSparseArray(); + for (int i = 1; i < words.size(); ++i) { + star.put(i-1, new ArrayList()); + star.get(i-1).add(i); + } + runReadUnigramsAndBigramsBinary(words, star); + } }