From b3c98901c5fc1460b54cdf27d74405f27c88e74b Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 23 Oct 2012 17:14:12 +0900 Subject: [PATCH] Add auto detection and decoding of dictionary files. (A2) Bug: 7388852 Change-Id: I25e755fc15f5b383acc046f668e9681efa4f0c2f --- .../latin/makedict/BinaryDictInputOutput.java | 11 +- .../latin/makedict/FusionDictionary.java | 6 + .../dicttool/BinaryDictOffdeviceUtils.java | 82 +++++++++++++- .../inputmethod/latin/dicttool/Compress.java | 14 ++- .../BinaryDictOffdeviceUtilsTests.java | 106 ++++++++++++++++++ tools/dicttool/tests/etc/test-dicttool.sh | 1 + 6 files changed, 206 insertions(+), 14 deletions(-) create mode 100644 tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index da5236974..031306e1d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1698,6 +1698,14 @@ public final class BinaryDictInputOutput { return newDict; } + /** + * Helper method to pass a file name instead of a File object to isBinaryDictionary. + */ + public static boolean isBinaryDictionary(final String filename) { + final File file = new File(filename); + return isBinaryDictionary(file); + } + /** * Basic test to find out whether the file is a binary dictionary or not. * @@ -1706,10 +1714,9 @@ public final class BinaryDictInputOutput { * @param filename The name of the file to test. 
* @return true if it's a binary dictionary, false otherwise */ - public static boolean isBinaryDictionary(final String filename) { + public static boolean isBinaryDictionary(final File file) { FileInputStream inStream = null; try { - final File file = new File(filename); inStream = new FileInputStream(file); final ByteBuffer buffer = inStream.getChannel().map( FileChannel.MapMode.READ_ONLY, 0, file.length()); diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 7fd13d78b..44537986b 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -16,6 +16,7 @@ package com.android.inputmethod.latin.makedict; +import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; import java.util.ArrayList; @@ -141,6 +142,11 @@ public final class FusionDictionary implements Iterable { return NOT_A_TERMINAL != mFrequency; } + @UsedForTesting + public int getFrequency() { + return mFrequency; + } + public boolean hasSeveralChars() { assert(mChars.length > 0); return 1 < mChars.length; diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java index 83c5d9ac6..9dcd7eb42 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -16,19 +16,42 @@ package com.android.inputmethod.latin.dicttool; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; + +import java.io.File; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import 
java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.InputStream; import java.io.OutputStream; +import java.util.ArrayList; /** -* Class grouping utilities for offline dictionary making. -* -* Those should not be used on-device, essentially because they are quite -* liberal about I/O and performance. -*/ -public class BinaryDictOffdeviceUtils { + * Class grouping utilities for offline dictionary making. + * + * Those should not be used on-device, essentially because they are quite + * liberal about I/O and performance. + */ +public final class BinaryDictOffdeviceUtils { + // Prefix and suffix are arbitrary, the values do not really matter + private final static String PREFIX = "dicttool"; + private final static String SUFFIX = ".tmp"; + + public final static String COMPRESSION = "compression"; + + public static class DecoderChainSpec { + ArrayList mDecoderSpec = new ArrayList(); + File mFile; + public DecoderChainSpec addStep(final String stepDescription) { + mDecoderSpec.add(stepDescription); + return this; + } + } + public static void copy(final InputStream input, final OutputStream output) throws IOException { final byte[] buffer = new byte[1000]; final BufferedInputStream in = new BufferedInputStream(input); @@ -38,4 +61,51 @@ public class BinaryDictOffdeviceUtils { in.close(); out.close(); } + + /** + * Returns a decrypted/uncompressed binary dictionary. + * + * This will decrypt/uncompress any number of times as necessary until it finds the binary + * dictionary signature, and copy the decoded file to a temporary place. + * If this is not a binary dictionary, the method returns null. 
+ */ + public static DecoderChainSpec getRawBinaryDictionaryOrNull(final File src) { + return getRawBinaryDictionaryOrNullInternal(new DecoderChainSpec(), src); + } + + private static DecoderChainSpec getRawBinaryDictionaryOrNullInternal( + final DecoderChainSpec spec, final File src) { + // TODO: arrange for the intermediary files to be deleted + if (BinaryDictInputOutput.isBinaryDictionary(src)) { + spec.mFile = src; + return spec; + } + // It's not a raw dictionary - try to see if it's compressed. + final File uncompressedFile = tryGetUncompressedFile(src); + if (null != uncompressedFile) { + final DecoderChainSpec newSpec = + getRawBinaryDictionaryOrNullInternal(spec, uncompressedFile); + if (null == newSpec) return null; + return newSpec.addStep(COMPRESSION); + } + return null; + } + + /* Try to uncompress the file passed as an argument. + * + * If the file can be uncompressed, the uncompressed version is returned. Otherwise, null + * is returned. + */ + private static File tryGetUncompressedFile(final File src) { + try { + final File dst = File.createTempFile(PREFIX, SUFFIX); + final FileOutputStream dstStream = new FileOutputStream(dst); + copy(Compress.getUncompressedStream(new BufferedInputStream(new FileInputStream(src))), + new BufferedOutputStream(dstStream)); // #copy() closes the streams + return dst; + } catch (IOException e) { + // Could not uncompress the file: presumably the file is simply not a compressed file + return null; + } + } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java index 49e90ada2..072de5c01 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin.dicttool; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.File; import 
java.io.FileInputStream; import java.io.FileOutputStream; @@ -27,12 +29,12 @@ import java.util.zip.GZIPOutputStream; public class Compress { - private static OutputStream getCompressedStream(final OutputStream out) + public static OutputStream getCompressedStream(final OutputStream out) throws java.io.IOException { return new GZIPOutputStream(out); } - private static InputStream getUncompressedStream(final InputStream in) throws IOException { + public static InputStream getUncompressedStream(final InputStream in) throws IOException { return new GZIPInputStream(in); } @@ -55,9 +57,9 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? System.out - : new FileOutputStream(new File(outFilename)); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); BinaryDictOffdeviceUtils.copy(input, new GZIPOutputStream(output)); } } @@ -81,9 +83,9 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? 
System.out - : new FileOutputStream(new File(outFilename)); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); BinaryDictOffdeviceUtils.copy(new GZIPInputStream(input), output); } } diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java new file mode 100644 index 000000000..7a686e556 --- /dev/null +++ b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * Unit tests for BinaryDictOffdeviceUtils + */ +public class BinaryDictOffdeviceUtilsTests extends TestCase { + private static final int TEST_FREQ = 37; // Some arbitrary value unlikely to happen by chance + + public void testGetRawDictWorks() throws IOException, UnsupportedFormatException { + // Create a thrice-compressed dictionary file. 
+ final FusionDictionary dict = new FusionDictionary(new Node(), + new DictionaryOptions(new HashMap(), + false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); + dict.add("foo", TEST_FREQ, null, false /* isNotAWord */); + dict.add("fta", 1, null, false /* isNotAWord */); + dict.add("ftb", 1, null, false /* isNotAWord */); + dict.add("bar", 1, null, false /* isNotAWord */); + dict.add("fool", 1, null, false /* isNotAWord */); + + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream out = Compress.getCompressedStream( + Compress.getCompressedStream( + Compress.getCompressedStream( + new BufferedOutputStream(new FileOutputStream(dst))))); + + BinaryDictInputOutput.writeDictionaryBinary(out, dict, new FormatOptions(2, false)); + + // Test for an actually compressed dictionary and its contents + final BinaryDictOffdeviceUtils.DecoderChainSpec decodeSpec = + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst); + for (final String step : decodeSpec.mDecoderSpec) { + assertEquals("Wrong decode spec", BinaryDictOffdeviceUtils.COMPRESSION, step); + } + assertEquals("Wrong decode spec", 3, decodeSpec.mDecoderSpec.size()); + final FileInputStream inStream = new FileInputStream(decodeSpec.mFile); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, decodeSpec.mFile.length()); + final FusionDictionary resultDict = BinaryDictInputOutput.readDictionaryBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), + null /* dict : an optional dictionary to add words to, or null */); + assertEquals("Dictionary can't be read back correctly", + resultDict.findWordInTree(resultDict.mRoot, "foo").getFrequency(), TEST_FREQ); + } + + public void testGetRawDictFails() throws IOException { + // Create a 1k file of garbage: write(int) only emits the low byte of its argument + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream out = new BufferedOutputStream(new FileOutputStream(dst)); + for 
(int i = 0; i < 1024; ++i) { + out.write(0x12345678); + } + out.close(); + + // Test that a random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst)); + + final File gzDst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream gzOut = + Compress.getCompressedStream(new BufferedOutputStream(new FileOutputStream(gzDst))); + for (int i = 0; i < 1024; ++i) { + gzOut.write(0x12345678); + } + gzOut.close(); + + // Test that a compressed random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(gzDst)); + } +} diff --git a/tools/dicttool/tests/etc/test-dicttool.sh b/tools/dicttool/tests/etc/test-dicttool.sh index 1283be21a..0f3ed6d62 100755 --- a/tools/dicttool/tests/etc/test-dicttool.sh +++ b/tools/dicttool/tests/etc/test-dicttool.sh @@ -14,3 +14,4 @@ # limitations under the License. java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.makedict.BinaryDictInputOutputTest +java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtilsTests