From 9bb4eebf48f3e15bcdfe37d73f26693bd4eaf0f4 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Mon, 22 Oct 2012 17:00:39 +0900 Subject: [PATCH] Add reader and writer for the combined dict format. This introduces a new textual format for the dictionary that combines words, bigrams and shortcuts to avoid complexity. It is also extensible to n-grams to fool-proof for the future, and easier to read than XML. Bug: 7388540 Change-Id: I942bbad51bd0c905a5a54c278667563fd6dd66ec --- .../latin/BinaryDictionaryFileDumper.java | 8 +- .../latin/dicttool/CombinedInputOutput.java | 238 ++++++++++++++++++ .../latin/dicttool/DictionaryMaker.java | 91 ++++++- .../inputmethod/latin/dicttool/Dicttool.java | 1 + 4 files changed, 319 insertions(+), 19 deletions(-) create mode 100644 tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java b/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java index bed31a7d1..b0b65edb6 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionaryFileDumper.java @@ -149,13 +149,7 @@ public final class BinaryDictionaryFileDumper { final Uri.Builder wordListUriBuilder = getProviderUriBuilder(id); final String finalFileName = BinaryDictionaryGetter.getCacheFileName(id, locale, context); - String tempFileName; - try { - tempFileName = BinaryDictionaryGetter.getTempFileName(id, context); - } catch (IOException e) { - Log.e(TAG, "Can't open the temporary file", e); - return null; - } + final String tempFileName = BinaryDictionaryGetter.getTempFileName(id, context); for (int mode = MODE_MIN; mode <= MODE_MAX; ++mode) { InputStream originalSourceStream = null; diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java new file mode 
100644 index 000000000..c17667536 --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeSet; + +/** + * Reads and writes combined format for a FusionDictionary. + * + * All functions in this class are static. 
+ */ +public class CombinedInputOutput { + + private static final String DICTIONARY_TAG = "dictionary"; + private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; + private static final String FREQUENCY_TAG = "f"; + private static final String WORD_TAG = "word"; + private static final String NOT_A_WORD_TAG = "not_a_word"; + private static final String WHITELIST_TAG = "whitelist"; + private static final String OPTIONS_TAG = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + + /** + * Basic test to find out whether the file is in the combined format or not. + * + * Concretely this only tests the header line. + * + * @param filename The name of the file to test. + * @return true if the file is in the combined format, false otherwise + */ + public static boolean isCombinedDictionary(final String filename) { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(new File(filename))); + final String firstLine = reader.readLine(); + return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** + * Reads a dictionary from a combined format file. + * + * This is the public method that will read a combined file and return the corresponding memory + * representation. + * + * @param source the file to read the data from. + * @return the in-memory representation of the dictionary. 
+ */ + public static FusionDictionary readDictionaryCombined(final InputStream source) + throws IOException { + final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8")); + final String headerLine = reader.readLine(); + final String header[] = headerLine.split(","); + final HashMap attributes = new HashMap(); + for (String item : header) { + final String keyValue[] = item.split("="); + if (2 != keyValue.length) { + throw new RuntimeException("Wrong header format : " + headerLine); + } + attributes.put(keyValue[0], keyValue[1]); + } + + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + final boolean processLigatures = + FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + attributes.remove(OPTIONS_TAG); + final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions( + attributes, processUmlauts, processLigatures)); + + String line; + String word = null; + int freq = 0; + boolean isNotAWord = false; + ArrayList bigrams = new ArrayList(); + ArrayList shortcuts = new ArrayList(); + while (null != (line = reader.readLine())) { + final String args[] = line.trim().split(","); + if (args[0].matches(WORD_TAG + "=.*")) { + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? 
null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + if (!shortcuts.isEmpty()) shortcuts = new ArrayList(); + if (!bigrams.isEmpty()) bigrams = new ArrayList(); + isNotAWord = false; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (WORD_TAG.equals(params[0])) { + word = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + freq = Integer.parseInt(params[1]); + } else if (NOT_A_WORD_TAG.equals(params[0])) { + isNotAWord = "true".equals(params[1]); + } + } + } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { + String shortcut = null; + int shortcutFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (SHORTCUT_TAG.equals(params[0])) { + shortcut = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + shortcutFreq = + WHITELIST_TAG.equals(params[1]) ? 15 : Integer.parseInt(params[1]); + } + } + if (null != shortcut) { + shortcuts.add(new WeightedString(shortcut, shortcutFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } else if (args[0].matches(BIGRAM_TAG + "=.*")) { + String secondWordOfBigram = null; + int bigramFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (BIGRAM_TAG.equals(params[0])) { + secondWordOfBigram = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + bigramFreq = Integer.parseInt(params[1]); + } + } + if (null != secondWordOfBigram) { + bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } + } + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? 
null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + + return dict; + } + + /** + * Writes a dictionary to a combined file. + * + * @param destination a destination stream to write to. + * @param dict the dictionary to write. + */ + public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) + throws IOException { + final TreeSet set = new TreeSet(); + for (Word word : dict) { + set.add(word); // This for ordering by frequency, then by asciibetic order + } + final HashMap options = dict.mOptions.mAttributes; + destination.write(DICTIONARY_TAG + "="); + if (options.containsKey(DICTIONARY_TAG)) { + destination.write(options.get(DICTIONARY_TAG)); + options.remove(DICTIONARY_TAG); + } + if (dict.mOptions.mGermanUmlautProcessing) { + destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION); + } else if (dict.mOptions.mFrenchLigatureProcessing) { + destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION); + } + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write("," + key + "=" + value); + } + destination.write("\n"); + for (Word word : set) { + destination.write("\t" + WORD_TAG + "=" + word.mWord + "," + + FREQUENCY_TAG + "=" + word.mFrequency + + (word.mIsNotAWord ? 
"," + NOT_A_WORD_TAG + "=true\n" : "\n")); + if (null != word.mShortcutTargets) { + for (WeightedString target : word.mShortcutTargets) { + destination.write("\t\t" + SHORTCUT_TAG + "=" + target.mWord + "," + + FREQUENCY_TAG + "=" + target.mFrequency + "\n"); + } + } + if (null != word.mBigrams) { + for (WeightedString bigram : word.mBigrams) { + destination.write("\t\t" + BIGRAM_TAG + "=" + bigram.mWord + "," + + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n"); + } + } + } + destination.close(); + } +} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 2cdd83e96..561db7363 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -52,13 +52,16 @@ public class DictionaryMaker { private static final String OPTION_INPUT_SHORTCUT_XML = "-c"; private static final String OPTION_OUTPUT_BINARY = "-d"; private static final String OPTION_OUTPUT_XML = "-x"; + private static final String OPTION_OUTPUT_COMBINED = "-o"; private static final String OPTION_HELP = "-h"; public final String mInputBinary; + public final String mInputCombined; public final String mInputUnigramXml; public final String mInputShortcutXml; public final String mInputBigramXml; public final String mOutputBinary; public final String mOutputXml; + public final String mOutputCombined; public final int mOutputBinaryFormatVersion; private void checkIntegrity() throws IOException { @@ -66,28 +69,38 @@ public class DictionaryMaker { checkHasAtLeastOneOutput(); checkNotSameFile(mInputBinary, mOutputBinary); checkNotSameFile(mInputBinary, mOutputXml); + checkNotSameFile(mInputCombined, mOutputBinary); + checkNotSameFile(mInputCombined, mOutputXml); checkNotSameFile(mInputUnigramXml, mOutputBinary); checkNotSameFile(mInputUnigramXml, mOutputXml); + 
checkNotSameFile(mInputUnigramXml, mOutputCombined); checkNotSameFile(mInputShortcutXml, mOutputBinary); checkNotSameFile(mInputShortcutXml, mOutputXml); + checkNotSameFile(mInputShortcutXml, mOutputCombined); checkNotSameFile(mInputBigramXml, mOutputBinary); checkNotSameFile(mInputBigramXml, mOutputXml); + checkNotSameFile(mInputBigramXml, mOutputCombined); checkNotSameFile(mOutputBinary, mOutputXml); + checkNotSameFile(mOutputBinary, mOutputCombined); + checkNotSameFile(mOutputXml, mOutputCombined); } private void checkHasExactlyOneInput() { - if (null == mInputUnigramXml && null == mInputBinary) { + if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) { throw new RuntimeException("No input file specified"); - } else if (null != mInputUnigramXml && null != mInputBinary) { - throw new RuntimeException("Both input XML and binary specified"); - } else if (null != mInputBinary && null != mInputBigramXml) { - throw new RuntimeException("Cannot specify a binary input and a separate bigram " - + "file"); + } else if ((null != mInputUnigramXml && null != mInputBinary) + || (null != mInputUnigramXml && null != mInputCombined) + || (null != mInputBinary && null != mInputCombined)) { + throw new RuntimeException("Several input files specified"); + } else if ((null != mInputBinary || null != mInputCombined) + && (null != mInputBigramXml || null != mInputShortcutXml)) { + throw new RuntimeException("Separate bigrams/shortcut files are only supported" + + " with XML input (other formats include bigrams and shortcuts already)"); } } private void checkHasAtLeastOneOutput() { - if (null == mOutputBinary && null == mOutputXml) { + if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) { throw new RuntimeException("No output specified"); } } @@ -111,17 +124,16 @@ public class DictionaryMaker { public static String getHelp() { return "Usage: makedict " + "[-s [-b ] [-c ] " + + "| [-s ] [-d ] [-x ] " + + " [-o ]" + "[-1] [-2] [-3]\n" + 
"\n" + " Converts a source dictionary file to one or several outputs.\n" + " Source can be an XML file, with an optional XML bigrams file, or a\n" + " binary dictionary file.\n" - + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3 and XML outputs\n" - + " are supported. All three can be output at the same time, but the same\n" - + " output format cannot be specified several times. The behavior is\n" - + " unspecified if the same file is specified for input and output, or for\n" - + " several outputs."; + + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3, XML and\n" + + " combined format outputs are supported."; } public Arguments(String[] argsArray) throws IOException { @@ -130,11 +142,13 @@ public class DictionaryMaker { displayHelp(); } String inputBinary = null; + String inputCombined = null; String inputUnigramXml = null; String inputShortcutXml = null; String inputBigramXml = null; String outputBinary = null; String outputXml = null; + String outputCombined = null; int outputBinaryFormatVersion = 2; // the default version is 2. 
while (!args.isEmpty()) { @@ -160,6 +174,8 @@ public class DictionaryMaker { if (OPTION_INPUT_SOURCE.equals(arg)) { if (BinaryDictInputOutput.isBinaryDictionary(filename)) { inputBinary = filename; + } else if (CombinedInputOutput.isCombinedDictionary(filename)) { + inputCombined = filename; } else { inputUnigramXml = filename; } @@ -171,6 +187,8 @@ public class DictionaryMaker { outputBinary = filename; } else if (OPTION_OUTPUT_XML.equals(arg)) { outputXml = filename; + } else if (OPTION_OUTPUT_COMBINED.equals(arg)) { + outputCombined = filename; } else { throw new IllegalArgumentException("Unknown option : " + arg); } @@ -179,6 +197,8 @@ public class DictionaryMaker { if (null == inputBinary && null == inputUnigramXml) { if (BinaryDictInputOutput.isBinaryDictionary(arg)) { inputBinary = arg; + } else if (CombinedInputOutput.isCombinedDictionary(arg)) { + inputCombined = arg; } else { inputUnigramXml = arg; } @@ -191,11 +211,13 @@ public class DictionaryMaker { } mInputBinary = inputBinary; + mInputCombined = inputCombined; mInputUnigramXml = inputUnigramXml; mInputShortcutXml = inputShortcutXml; mInputBigramXml = inputBigramXml; mOutputBinary = outputBinary; mOutputXml = outputXml; + mOutputCombined = outputCombined; mOutputBinaryFormatVersion = outputBinaryFormatVersion; checkIntegrity(); } @@ -220,6 +242,8 @@ public class DictionaryMaker { SAXException, FileNotFoundException { if (null != args.mInputBinary) { return readBinaryFile(args.mInputBinary); + } else if (null != args.mInputCombined) { + return readCombinedFile(args.mInputCombined); } else if (null != args.mInputUnigramXml) { return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); } else { @@ -258,6 +282,32 @@ public class DictionaryMaker { } } + /** + * Read a dictionary from the name of a combined file. + * + * @param combinedFilename the name of the file in the combined format. + * @return the read dictionary. 
+ * @throws FileNotFoundException if the file can't be found + * @throws IOException if the input file can't be read + */ + private static FusionDictionary readCombinedFile(final String combinedFilename) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + try { + final File file = new File(combinedFilename); + inStream = new FileInputStream(file); + return CombinedInputOutput.readDictionaryCombined(inStream); + } finally { + if (null != inStream) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + /** * Read a dictionary from a unigram XML file, and optionally a bigram XML file. * @@ -299,6 +349,9 @@ public class DictionaryMaker { if (null != args.mOutputXml) { writeXmlDictionary(args.mOutputXml, dict); } + if (null != args.mOutputCombined) { + writeCombinedDictionary(args.mOutputCombined, dict); + } } /** @@ -332,4 +385,18 @@ public class DictionaryMaker { XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)), dict); } + + /** + * Write the dictionary in the combined format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. 
+ */ + private static void writeCombinedDictionary(final String outputFilename, + final FusionDictionary dict) throws FileNotFoundException, IOException { + CombinedInputOutput.writeDictionaryCombined( + new BufferedWriter(new FileWriter(outputFilename)), dict); + } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java index bf417fb5a..75ce104e0 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java @@ -80,6 +80,7 @@ public class Dicttool { } catch (Exception e) { System.out.println("Exception while processing command " + command.getClass().getSimpleName() + " : " + e); + e.printStackTrace(); return; } }