Merge "Add reader and writer for the combined dict format."

2012-10-22 23:11:20 -07:00 · 2012-10-22 23:11:20 -07:00 · c7020f54f9
commit c7020f54f9
parent 751f3dc0e1 9bb4eebf48
4 changed files with 319 additions and 19 deletions
--- a/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java
@ -149,13 +149,7 @@ public final class BinaryDictionaryFileDumper {

        final Uri.Builder wordListUriBuilder = getProviderUriBuilder(id);
        final String finalFileName = BinaryDictionaryGetter.getCacheFileName(id, locale, context);
-        String tempFileName;
-        try {
-            tempFileName = BinaryDictionaryGetter.getTempFileName(id, context);
-        } catch (IOException e) {
-            Log.e(TAG, "Can't open the temporary file", e);
-            return null;
-        }
+        final String tempFileName = BinaryDictionaryGetter.getTempFileName(id, context);

        for (int mode = MODE_MIN; mode <= MODE_MAX; ++mode) {
            InputStream originalSourceStream = null;
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java
@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.android.inputmethod.latin.dicttool;
+
+import com.android.inputmethod.latin.makedict.FusionDictionary;
+import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
+import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
+import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
+import com.android.inputmethod.latin.makedict.Word;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.TreeSet;
+
+/**
+ * Reads and writes combined format for a FusionDictionary.
+ *
+ * All functions in this class are static.
+ */
+public class CombinedInputOutput {
+
+    private static final String DICTIONARY_TAG = "dictionary";
+    private static final String BIGRAM_TAG = "bigram";
+    private static final String SHORTCUT_TAG = "shortcut";
+    private static final String FREQUENCY_TAG = "f";
+    private static final String WORD_TAG = "word";
+    private static final String NOT_A_WORD_TAG = "not_a_word";
+    private static final String WHITELIST_TAG = "whitelist";
+    private static final String OPTIONS_TAG = "options";
+    private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
+    private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
+
+    /**
+     * Basic test to find out whether the file is in the combined format or not.
+     *
+     * Concretely this only tests the header line.
+     *
+     * @param filename The name of the file to test.
+     * @return true if the file is in the combined format, false otherwise
+     */
+    public static boolean isCombinedDictionary(final String filename) {
+        BufferedReader reader = null;
+        try {
+            reader = new BufferedReader(new FileReader(new File(filename)));
+            final String firstLine = reader.readLine();
+            return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
+        } catch (FileNotFoundException e) {
+            return false;
+        } catch (IOException e) {
+            return false;
+        } finally {
+            if (reader != null) {
+                try {
+                    reader.close();
+                } catch (IOException e) {
+                    // do nothing
+                }
+            }
+        }
+    }
+
+    /**
+     * Reads a dictionary from a combined format file.
+     *
+     * This is the public method that will read a combined file and return the corresponding memory
+     * representation.
+     *
+     * @param source the file to read the data from.
+     * @return the in-memory representation of the dictionary.
+     */
+    public static FusionDictionary readDictionaryCombined(final InputStream source)
+            throws IOException {
+        final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8"));
+        final String headerLine = reader.readLine();
+        final String header[] = headerLine.split(",");
+        final HashMap<String, String> attributes = new HashMap<String, String>();
+        for (String item : header) {
+            final String keyValue[] = item.split("=");
+            if (2 != keyValue.length) {
+                throw new RuntimeException("Wrong header format : " + headerLine);
+            }
+            attributes.put(keyValue[0], keyValue[1]);
+        }
+
+        final boolean processUmlauts =
+                GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
+        final boolean processLigatures =
+                FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
+        attributes.remove(OPTIONS_TAG);
+        final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions(
+                attributes, processUmlauts, processLigatures));
+
+        String line;
+        String word = null;
+        int freq = 0;
+        boolean isNotAWord = false;
+        ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
+        ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
+        while (null != (line = reader.readLine())) {
+            final String args[] = line.trim().split(",");
+            if (args[0].matches(WORD_TAG + "=.*")) {
+                if (null != word) {
+                    dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
+                    for (WeightedString s : bigrams) {
+                        dict.setBigram(word, s.mWord, s.mFrequency);
+                    }
+                }
+                if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
+                if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>();
+                isNotAWord = false;
+                for (String param : args) {
+                    final String params[] = param.split("=", 2);
+                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
+                    if (WORD_TAG.equals(params[0])) {
+                        word = params[1];
+                    } else if (FREQUENCY_TAG.equals(params[0])) {
+                        freq = Integer.parseInt(params[1]);
+                    } else if (NOT_A_WORD_TAG.equals(params[0])) {
+                        isNotAWord = "true".equals(params[1]);
+                    }
+                }
+            } else if (args[0].matches(SHORTCUT_TAG + "=.*")) {
+                String shortcut = null;
+                int shortcutFreq = 0;
+                for (String param : args) {
+                    final String params[] = param.split("=", 2);
+                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
+                    if (SHORTCUT_TAG.equals(params[0])) {
+                        shortcut = params[1];
+                    } else if (FREQUENCY_TAG.equals(params[0])) {
+                        shortcutFreq =
+                                WHITELIST_TAG.equals(params[1]) ? 15 : Integer.parseInt(params[1]);
+                    }
+                }
+                if (null != shortcut) {
+                    shortcuts.add(new WeightedString(shortcut, shortcutFreq));
+                } else {
+                    throw new RuntimeException("Wrong format : " + line);
+                }
+            } else if (args[0].matches(BIGRAM_TAG + "=.*")) {
+                String secondWordOfBigram = null;
+                int bigramFreq = 0;
+                for (String param : args) {
+                    final String params[] = param.split("=", 2);
+                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
+                    if (BIGRAM_TAG.equals(params[0])) {
+                        secondWordOfBigram = params[1];
+                    } else if (FREQUENCY_TAG.equals(params[0])) {
+                        bigramFreq = Integer.parseInt(params[1]);
+                    }
+                }
+                if (null != secondWordOfBigram) {
+                    bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq));
+                } else {
+                    throw new RuntimeException("Wrong format : " + line);
+                }
+            }
+        }
+        if (null != word) {
+            dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
+            for (WeightedString s : bigrams) {
+                dict.setBigram(word, s.mWord, s.mFrequency);
+            }
+        }
+
+        return dict;
+    }
+
+    /**
+     * Writes a dictionary to a combined file.
+     *
+     * @param destination a destination stream to write to.
+     * @param dict the dictionary to write.
+     */
+    public static void writeDictionaryCombined(Writer destination, FusionDictionary dict)
+            throws IOException {
+        final TreeSet<Word> set = new TreeSet<Word>();
+        for (Word word : dict) {
+            set.add(word); // This for ordering by frequency, then by asciibetic order
+        }
+        final HashMap<String, String> options = dict.mOptions.mAttributes;
+        destination.write(DICTIONARY_TAG + "=");
+        if (options.containsKey(DICTIONARY_TAG)) {
+            destination.write(options.get(DICTIONARY_TAG));
+            options.remove(DICTIONARY_TAG);
+        }
+        if (dict.mOptions.mGermanUmlautProcessing) {
+            destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION);
+        } else if (dict.mOptions.mFrenchLigatureProcessing) {
+            destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION);
+        }
+        for (final String key : dict.mOptions.mAttributes.keySet()) {
+            final String value = dict.mOptions.mAttributes.get(key);
+            destination.write("," + key + "=" + value);
+        }
+        destination.write("\n");
+        for (Word word : set) {
+            destination.write("\t" + WORD_TAG + "=" + word.mWord + ","
+                    + FREQUENCY_TAG + "=" + word.mFrequency
+                    + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n"));
+            if (null != word.mShortcutTargets) {
+                for (WeightedString target : word.mShortcutTargets) {
+                    destination.write("\t\t" + SHORTCUT_TAG + "=" + target.mWord + ","
+                            + FREQUENCY_TAG + "=" + target.mFrequency + "\n");
+                }
+            }
+            if (null != word.mBigrams) {
+                for (WeightedString bigram : word.mBigrams) {
+                    destination.write("\t\t" + BIGRAM_TAG + "=" + bigram.mWord + ","
+                            + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n");
+                }
+            }
+        }
+        destination.close();
+    }
+}
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java
@ -52,13 +52,16 @@ public class DictionaryMaker {
        private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
        private static final String OPTION_OUTPUT_BINARY = "-d";
        private static final String OPTION_OUTPUT_XML = "-x";
+        private static final String OPTION_OUTPUT_COMBINED = "-o";
        private static final String OPTION_HELP = "-h";
        public final String mInputBinary;
+        public final String mInputCombined;
        public final String mInputUnigramXml;
        public final String mInputShortcutXml;
        public final String mInputBigramXml;
        public final String mOutputBinary;
        public final String mOutputXml;
+        public final String mOutputCombined;
        public final int mOutputBinaryFormatVersion;

        private void checkIntegrity() throws IOException {
@ -66,28 +69,38 @@ public class DictionaryMaker {
            checkHasAtLeastOneOutput();
            checkNotSameFile(mInputBinary, mOutputBinary);
            checkNotSameFile(mInputBinary, mOutputXml);
+            checkNotSameFile(mInputCombined, mOutputBinary);
+            checkNotSameFile(mInputCombined, mOutputXml);
            checkNotSameFile(mInputUnigramXml, mOutputBinary);
            checkNotSameFile(mInputUnigramXml, mOutputXml);
+            checkNotSameFile(mInputUnigramXml, mOutputCombined);
            checkNotSameFile(mInputShortcutXml, mOutputBinary);
            checkNotSameFile(mInputShortcutXml, mOutputXml);
+            checkNotSameFile(mInputShortcutXml, mOutputCombined);
            checkNotSameFile(mInputBigramXml, mOutputBinary);
            checkNotSameFile(mInputBigramXml, mOutputXml);
+            checkNotSameFile(mInputBigramXml, mOutputCombined);
            checkNotSameFile(mOutputBinary, mOutputXml);
+            checkNotSameFile(mOutputBinary, mOutputCombined);
+            checkNotSameFile(mOutputXml, mOutputCombined);
        }

        private void checkHasExactlyOneInput() {
-            if (null == mInputUnigramXml && null == mInputBinary) {
+            if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
                throw new RuntimeException("No input file specified");
-            } else if (null != mInputUnigramXml && null != mInputBinary) {
-                throw new RuntimeException("Both input XML and binary specified");
-            } else if (null != mInputBinary && null != mInputBigramXml) {
-                throw new RuntimeException("Cannot specify a binary input and a separate bigram "
-                        + "file");
+            } else if ((null != mInputUnigramXml && null != mInputBinary)
+                    || (null != mInputUnigramXml && null != mInputCombined)
+                    || (null != mInputBinary && null != mInputCombined)) {
+                throw new RuntimeException("Several input files specified");
+            } else if ((null != mInputBinary || null != mInputCombined)
+                    && (null != mInputBigramXml || null != mInputShortcutXml)) {
+                throw new RuntimeException("Separate bigrams/shortcut files are only supported"
+                        + " with XML input (other formats include bigrams and shortcuts already)");
            }
        }

        private void checkHasAtLeastOneOutput() {
-            if (null == mOutputBinary && null == mOutputXml) {
+            if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
                throw new RuntimeException("No output specified");
            }
        }
@ -111,17 +124,16 @@ public class DictionaryMaker {
        public static String getHelp() {
            return "Usage: makedict "
                    + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
+                    + "| [-s <combined format input]"
                    + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
+                    + " [-o <combined output>]"
                    + "[-1] [-2] [-3]\n"
                    + "\n"
                    + "  Converts a source dictionary file to one or several outputs.\n"
                    + "  Source can be an XML file, with an optional XML bigrams file, or a\n"
                    + "  binary dictionary file.\n"
-                    + "  Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3 and XML outputs\n"
-                    + "  are supported. All three can be output at the same time, but the same\n"
-                    + "  output format cannot be specified several times. The behavior is\n"
-                    + "  unspecified if the same file is specified for input and output, or for\n"
-                    + "  several outputs.";
+                    + "  Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3, XML and\n"
+                    + "  combined format outputs are supported.";
        }

        public Arguments(String[] argsArray) throws IOException {
@ -130,11 +142,13 @@ public class DictionaryMaker {
                displayHelp();
            }
            String inputBinary = null;
+            String inputCombined = null;
            String inputUnigramXml = null;
            String inputShortcutXml = null;
            String inputBigramXml = null;
            String outputBinary = null;
            String outputXml = null;
+            String outputCombined = null;
            int outputBinaryFormatVersion = 2; // the default version is 2.

            while (!args.isEmpty()) {
@ -160,6 +174,8 @@ public class DictionaryMaker {
                        if (OPTION_INPUT_SOURCE.equals(arg)) {
                            if (BinaryDictInputOutput.isBinaryDictionary(filename)) {
                                inputBinary = filename;
+                            } else if (CombinedInputOutput.isCombinedDictionary(filename)) {
+                                inputCombined = filename;
                            } else {
                                inputUnigramXml = filename;
                            }
@ -171,6 +187,8 @@ public class DictionaryMaker {
                            outputBinary = filename;
                        } else if (OPTION_OUTPUT_XML.equals(arg)) {
                            outputXml = filename;
+                        } else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
+                            outputCombined = filename;
                        } else {
                            throw new IllegalArgumentException("Unknown option : " + arg);
                        }
@ -179,6 +197,8 @@ public class DictionaryMaker {
                    if (null == inputBinary && null == inputUnigramXml) {
                        if (BinaryDictInputOutput.isBinaryDictionary(arg)) {
                            inputBinary = arg;
+                        } else if (CombinedInputOutput.isCombinedDictionary(arg)) {
+                            inputCombined = arg;
                        } else {
                            inputUnigramXml = arg;
                        }
@ -191,11 +211,13 @@ public class DictionaryMaker {
            }

            mInputBinary = inputBinary;
+            mInputCombined = inputCombined;
            mInputUnigramXml = inputUnigramXml;
            mInputShortcutXml = inputShortcutXml;
            mInputBigramXml = inputBigramXml;
            mOutputBinary = outputBinary;
            mOutputXml = outputXml;
+            mOutputCombined = outputCombined;
            mOutputBinaryFormatVersion = outputBinaryFormatVersion;
            checkIntegrity();
        }
@ -220,6 +242,8 @@ public class DictionaryMaker {
            SAXException, FileNotFoundException {
        if (null != args.mInputBinary) {
            return readBinaryFile(args.mInputBinary);
+        } else if (null != args.mInputCombined) {
+            return readCombinedFile(args.mInputCombined);
        } else if (null != args.mInputUnigramXml) {
            return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
        } else {
@ -258,6 +282,32 @@ public class DictionaryMaker {
        }
    }

+    /**
+     * Read a dictionary from the name of a combined file.
+     *
+     * @param combinedFilename the name of the file in the combined format.
+     * @return the read dictionary.
+     * @throws FileNotFoundException if the file can't be found
+     * @throws IOException if the input file can't be read
+     */
+    private static FusionDictionary readCombinedFile(final String combinedFilename)
+        throws FileNotFoundException, IOException {
+        FileInputStream inStream = null;
+        try {
+            final File file = new File(combinedFilename);
+            inStream = new FileInputStream(file);
+            return CombinedInputOutput.readDictionaryCombined(inStream);
+        } finally {
+            if (null != inStream) {
+                try {
+                    inStream.close();
+                } catch (IOException e) {
+                    // do nothing
+                }
+            }
+        }
+    }
+
    /**
     * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
     *
@ -299,6 +349,9 @@ public class DictionaryMaker {
        if (null != args.mOutputXml) {
            writeXmlDictionary(args.mOutputXml, dict);
        }
+        if (null != args.mOutputCombined) {
+            writeCombinedDictionary(args.mOutputCombined, dict);
+        }
    }

    /**
@ -332,4 +385,18 @@ public class DictionaryMaker {
        XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)),
                dict);
    }
+
+    /**
+     * Write the dictionary in the combined format to the specified filename.
+     *
+     * @param outputFilename the name of the file to write to.
+     * @param dict the dictionary to write.
+     * @throws FileNotFoundException if the output file can't be created.
+     * @throws IOException if the output file can't be written to.
+     */
+    private static void writeCombinedDictionary(final String outputFilename,
+            final FusionDictionary dict) throws FileNotFoundException, IOException {
+        CombinedInputOutput.writeDictionaryCombined(
+                new BufferedWriter(new FileWriter(outputFilename)), dict);
+    }
 }
--- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java
+++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java
@ -80,6 +80,7 @@ public class Dicttool {
        } catch (Exception e) {
            System.out.println("Exception while processing command "
                    + command.getClass().getSimpleName() + " : " + e);
+            e.printStackTrace();
            return;
        }
    }