Add reader and writer for the combined dict format.
This introduces a new textual format for the dictionary that combines words, bigrams and shortcuts to avoid complexity. It is also extensible to n-grams, which makes it future-proof, and it is easier to read than XML. Bug: 7388540 Change-Id: I942bbad51bd0c905a5a54c278667563fd6dd66ec
This commit is contained in:
parent
cbb3252731
commit
9bb4eebf48
4 changed files with 319 additions and 19 deletions
|
@ -149,13 +149,7 @@ public final class BinaryDictionaryFileDumper {
|
||||||
|
|
||||||
final Uri.Builder wordListUriBuilder = getProviderUriBuilder(id);
|
final Uri.Builder wordListUriBuilder = getProviderUriBuilder(id);
|
||||||
final String finalFileName = BinaryDictionaryGetter.getCacheFileName(id, locale, context);
|
final String finalFileName = BinaryDictionaryGetter.getCacheFileName(id, locale, context);
|
||||||
String tempFileName;
|
final String tempFileName = BinaryDictionaryGetter.getTempFileName(id, context);
|
||||||
try {
|
|
||||||
tempFileName = BinaryDictionaryGetter.getTempFileName(id, context);
|
|
||||||
} catch (IOException e) {
|
|
||||||
Log.e(TAG, "Can't open the temporary file", e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int mode = MODE_MIN; mode <= MODE_MAX; ++mode) {
|
for (int mode = MODE_MIN; mode <= MODE_MAX; ++mode) {
|
||||||
InputStream originalSourceStream = null;
|
InputStream originalSourceStream = null;
|
||||||
|
|
|
@ -0,0 +1,238 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2012 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||||
|
* use this file except in compliance with the License. You may obtain a copy of
|
||||||
|
* the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
* License for the specific language governing permissions and limitations under
|
||||||
|
* the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.android.inputmethod.latin.dicttool;
|
||||||
|
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||||
|
import com.android.inputmethod.latin.makedict.Word;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Writer;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads and writes combined format for a FusionDictionary.
|
||||||
|
*
|
||||||
|
* All functions in this class are static.
|
||||||
|
*/
|
||||||
|
public class CombinedInputOutput {
|
||||||
|
|
||||||
|
private static final String DICTIONARY_TAG = "dictionary";
|
||||||
|
private static final String BIGRAM_TAG = "bigram";
|
||||||
|
private static final String SHORTCUT_TAG = "shortcut";
|
||||||
|
private static final String FREQUENCY_TAG = "f";
|
||||||
|
private static final String WORD_TAG = "word";
|
||||||
|
private static final String NOT_A_WORD_TAG = "not_a_word";
|
||||||
|
private static final String WHITELIST_TAG = "whitelist";
|
||||||
|
private static final String OPTIONS_TAG = "options";
|
||||||
|
private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
|
||||||
|
private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic test to find out whether the file is in the combined format or not.
|
||||||
|
*
|
||||||
|
* Concretely this only tests the header line.
|
||||||
|
*
|
||||||
|
* @param filename The name of the file to test.
|
||||||
|
* @return true if the file is in the combined format, false otherwise
|
||||||
|
*/
|
||||||
|
public static boolean isCombinedDictionary(final String filename) {
|
||||||
|
BufferedReader reader = null;
|
||||||
|
try {
|
||||||
|
reader = new BufferedReader(new FileReader(new File(filename)));
|
||||||
|
final String firstLine = reader.readLine();
|
||||||
|
return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
return false;
|
||||||
|
} catch (IOException e) {
|
||||||
|
return false;
|
||||||
|
} finally {
|
||||||
|
if (reader != null) {
|
||||||
|
try {
|
||||||
|
reader.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a dictionary from a combined format file.
|
||||||
|
*
|
||||||
|
* This is the public method that will read a combined file and return the corresponding memory
|
||||||
|
* representation.
|
||||||
|
*
|
||||||
|
* @param source the file to read the data from.
|
||||||
|
* @return the in-memory representation of the dictionary.
|
||||||
|
*/
|
||||||
|
public static FusionDictionary readDictionaryCombined(final InputStream source)
|
||||||
|
throws IOException {
|
||||||
|
final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8"));
|
||||||
|
final String headerLine = reader.readLine();
|
||||||
|
final String header[] = headerLine.split(",");
|
||||||
|
final HashMap<String, String> attributes = new HashMap<String, String>();
|
||||||
|
for (String item : header) {
|
||||||
|
final String keyValue[] = item.split("=");
|
||||||
|
if (2 != keyValue.length) {
|
||||||
|
throw new RuntimeException("Wrong header format : " + headerLine);
|
||||||
|
}
|
||||||
|
attributes.put(keyValue[0], keyValue[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
final boolean processUmlauts =
|
||||||
|
GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
|
||||||
|
final boolean processLigatures =
|
||||||
|
FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG));
|
||||||
|
attributes.remove(OPTIONS_TAG);
|
||||||
|
final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions(
|
||||||
|
attributes, processUmlauts, processLigatures));
|
||||||
|
|
||||||
|
String line;
|
||||||
|
String word = null;
|
||||||
|
int freq = 0;
|
||||||
|
boolean isNotAWord = false;
|
||||||
|
ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
|
||||||
|
ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
|
||||||
|
while (null != (line = reader.readLine())) {
|
||||||
|
final String args[] = line.trim().split(",");
|
||||||
|
if (args[0].matches(WORD_TAG + "=.*")) {
|
||||||
|
if (null != word) {
|
||||||
|
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||||
|
for (WeightedString s : bigrams) {
|
||||||
|
dict.setBigram(word, s.mWord, s.mFrequency);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>();
|
||||||
|
if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>();
|
||||||
|
isNotAWord = false;
|
||||||
|
for (String param : args) {
|
||||||
|
final String params[] = param.split("=", 2);
|
||||||
|
if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
|
||||||
|
if (WORD_TAG.equals(params[0])) {
|
||||||
|
word = params[1];
|
||||||
|
} else if (FREQUENCY_TAG.equals(params[0])) {
|
||||||
|
freq = Integer.parseInt(params[1]);
|
||||||
|
} else if (NOT_A_WORD_TAG.equals(params[0])) {
|
||||||
|
isNotAWord = "true".equals(params[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (args[0].matches(SHORTCUT_TAG + "=.*")) {
|
||||||
|
String shortcut = null;
|
||||||
|
int shortcutFreq = 0;
|
||||||
|
for (String param : args) {
|
||||||
|
final String params[] = param.split("=", 2);
|
||||||
|
if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
|
||||||
|
if (SHORTCUT_TAG.equals(params[0])) {
|
||||||
|
shortcut = params[1];
|
||||||
|
} else if (FREQUENCY_TAG.equals(params[0])) {
|
||||||
|
shortcutFreq =
|
||||||
|
WHITELIST_TAG.equals(params[1]) ? 15 : Integer.parseInt(params[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != shortcut) {
|
||||||
|
shortcuts.add(new WeightedString(shortcut, shortcutFreq));
|
||||||
|
} else {
|
||||||
|
throw new RuntimeException("Wrong format : " + line);
|
||||||
|
}
|
||||||
|
} else if (args[0].matches(BIGRAM_TAG + "=.*")) {
|
||||||
|
String secondWordOfBigram = null;
|
||||||
|
int bigramFreq = 0;
|
||||||
|
for (String param : args) {
|
||||||
|
final String params[] = param.split("=", 2);
|
||||||
|
if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
|
||||||
|
if (BIGRAM_TAG.equals(params[0])) {
|
||||||
|
secondWordOfBigram = params[1];
|
||||||
|
} else if (FREQUENCY_TAG.equals(params[0])) {
|
||||||
|
bigramFreq = Integer.parseInt(params[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != secondWordOfBigram) {
|
||||||
|
bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq));
|
||||||
|
} else {
|
||||||
|
throw new RuntimeException("Wrong format : " + line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != word) {
|
||||||
|
dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||||
|
for (WeightedString s : bigrams) {
|
||||||
|
dict.setBigram(word, s.mWord, s.mFrequency);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return dict;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a dictionary to a combined file.
|
||||||
|
*
|
||||||
|
* @param destination a destination stream to write to.
|
||||||
|
* @param dict the dictionary to write.
|
||||||
|
*/
|
||||||
|
public static void writeDictionaryCombined(Writer destination, FusionDictionary dict)
|
||||||
|
throws IOException {
|
||||||
|
final TreeSet<Word> set = new TreeSet<Word>();
|
||||||
|
for (Word word : dict) {
|
||||||
|
set.add(word); // This for ordering by frequency, then by asciibetic order
|
||||||
|
}
|
||||||
|
final HashMap<String, String> options = dict.mOptions.mAttributes;
|
||||||
|
destination.write(DICTIONARY_TAG + "=");
|
||||||
|
if (options.containsKey(DICTIONARY_TAG)) {
|
||||||
|
destination.write(options.get(DICTIONARY_TAG));
|
||||||
|
options.remove(DICTIONARY_TAG);
|
||||||
|
}
|
||||||
|
if (dict.mOptions.mGermanUmlautProcessing) {
|
||||||
|
destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION);
|
||||||
|
} else if (dict.mOptions.mFrenchLigatureProcessing) {
|
||||||
|
destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION);
|
||||||
|
}
|
||||||
|
for (final String key : dict.mOptions.mAttributes.keySet()) {
|
||||||
|
final String value = dict.mOptions.mAttributes.get(key);
|
||||||
|
destination.write("," + key + "=" + value);
|
||||||
|
}
|
||||||
|
destination.write("\n");
|
||||||
|
for (Word word : set) {
|
||||||
|
destination.write("\t" + WORD_TAG + "=" + word.mWord + ","
|
||||||
|
+ FREQUENCY_TAG + "=" + word.mFrequency
|
||||||
|
+ (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n"));
|
||||||
|
if (null != word.mShortcutTargets) {
|
||||||
|
for (WeightedString target : word.mShortcutTargets) {
|
||||||
|
destination.write("\t\t" + SHORTCUT_TAG + "=" + target.mWord + ","
|
||||||
|
+ FREQUENCY_TAG + "=" + target.mFrequency + "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != word.mBigrams) {
|
||||||
|
for (WeightedString bigram : word.mBigrams) {
|
||||||
|
destination.write("\t\t" + BIGRAM_TAG + "=" + bigram.mWord + ","
|
||||||
|
+ FREQUENCY_TAG + "=" + bigram.mFrequency + "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
destination.close();
|
||||||
|
}
|
||||||
|
}
|
|
@ -52,13 +52,16 @@ public class DictionaryMaker {
|
||||||
private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
|
private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
|
||||||
private static final String OPTION_OUTPUT_BINARY = "-d";
|
private static final String OPTION_OUTPUT_BINARY = "-d";
|
||||||
private static final String OPTION_OUTPUT_XML = "-x";
|
private static final String OPTION_OUTPUT_XML = "-x";
|
||||||
|
private static final String OPTION_OUTPUT_COMBINED = "-o";
|
||||||
private static final String OPTION_HELP = "-h";
|
private static final String OPTION_HELP = "-h";
|
||||||
public final String mInputBinary;
|
public final String mInputBinary;
|
||||||
|
public final String mInputCombined;
|
||||||
public final String mInputUnigramXml;
|
public final String mInputUnigramXml;
|
||||||
public final String mInputShortcutXml;
|
public final String mInputShortcutXml;
|
||||||
public final String mInputBigramXml;
|
public final String mInputBigramXml;
|
||||||
public final String mOutputBinary;
|
public final String mOutputBinary;
|
||||||
public final String mOutputXml;
|
public final String mOutputXml;
|
||||||
|
public final String mOutputCombined;
|
||||||
public final int mOutputBinaryFormatVersion;
|
public final int mOutputBinaryFormatVersion;
|
||||||
|
|
||||||
private void checkIntegrity() throws IOException {
|
private void checkIntegrity() throws IOException {
|
||||||
|
@ -66,28 +69,38 @@ public class DictionaryMaker {
|
||||||
checkHasAtLeastOneOutput();
|
checkHasAtLeastOneOutput();
|
||||||
checkNotSameFile(mInputBinary, mOutputBinary);
|
checkNotSameFile(mInputBinary, mOutputBinary);
|
||||||
checkNotSameFile(mInputBinary, mOutputXml);
|
checkNotSameFile(mInputBinary, mOutputXml);
|
||||||
|
checkNotSameFile(mInputCombined, mOutputBinary);
|
||||||
|
checkNotSameFile(mInputCombined, mOutputXml);
|
||||||
checkNotSameFile(mInputUnigramXml, mOutputBinary);
|
checkNotSameFile(mInputUnigramXml, mOutputBinary);
|
||||||
checkNotSameFile(mInputUnigramXml, mOutputXml);
|
checkNotSameFile(mInputUnigramXml, mOutputXml);
|
||||||
|
checkNotSameFile(mInputUnigramXml, mOutputCombined);
|
||||||
checkNotSameFile(mInputShortcutXml, mOutputBinary);
|
checkNotSameFile(mInputShortcutXml, mOutputBinary);
|
||||||
checkNotSameFile(mInputShortcutXml, mOutputXml);
|
checkNotSameFile(mInputShortcutXml, mOutputXml);
|
||||||
|
checkNotSameFile(mInputShortcutXml, mOutputCombined);
|
||||||
checkNotSameFile(mInputBigramXml, mOutputBinary);
|
checkNotSameFile(mInputBigramXml, mOutputBinary);
|
||||||
checkNotSameFile(mInputBigramXml, mOutputXml);
|
checkNotSameFile(mInputBigramXml, mOutputXml);
|
||||||
|
checkNotSameFile(mInputBigramXml, mOutputCombined);
|
||||||
checkNotSameFile(mOutputBinary, mOutputXml);
|
checkNotSameFile(mOutputBinary, mOutputXml);
|
||||||
|
checkNotSameFile(mOutputBinary, mOutputCombined);
|
||||||
|
checkNotSameFile(mOutputXml, mOutputCombined);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkHasExactlyOneInput() {
|
private void checkHasExactlyOneInput() {
|
||||||
if (null == mInputUnigramXml && null == mInputBinary) {
|
if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
|
||||||
throw new RuntimeException("No input file specified");
|
throw new RuntimeException("No input file specified");
|
||||||
} else if (null != mInputUnigramXml && null != mInputBinary) {
|
} else if ((null != mInputUnigramXml && null != mInputBinary)
|
||||||
throw new RuntimeException("Both input XML and binary specified");
|
|| (null != mInputUnigramXml && null != mInputCombined)
|
||||||
} else if (null != mInputBinary && null != mInputBigramXml) {
|
|| (null != mInputBinary && null != mInputCombined)) {
|
||||||
throw new RuntimeException("Cannot specify a binary input and a separate bigram "
|
throw new RuntimeException("Several input files specified");
|
||||||
+ "file");
|
} else if ((null != mInputBinary || null != mInputCombined)
|
||||||
|
&& (null != mInputBigramXml || null != mInputShortcutXml)) {
|
||||||
|
throw new RuntimeException("Separate bigrams/shortcut files are only supported"
|
||||||
|
+ " with XML input (other formats include bigrams and shortcuts already)");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkHasAtLeastOneOutput() {
|
private void checkHasAtLeastOneOutput() {
|
||||||
if (null == mOutputBinary && null == mOutputXml) {
|
if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
|
||||||
throw new RuntimeException("No output specified");
|
throw new RuntimeException("No output specified");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,17 +124,16 @@ public class DictionaryMaker {
|
||||||
public static String getHelp() {
|
public static String getHelp() {
|
||||||
return "Usage: makedict "
|
return "Usage: makedict "
|
||||||
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
|
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
|
||||||
|
+ "| [-s <combined format input]"
|
||||||
+ "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
|
+ "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
|
||||||
|
+ " [-o <combined output>]"
|
||||||
+ "[-1] [-2] [-3]\n"
|
+ "[-1] [-2] [-3]\n"
|
||||||
+ "\n"
|
+ "\n"
|
||||||
+ " Converts a source dictionary file to one or several outputs.\n"
|
+ " Converts a source dictionary file to one or several outputs.\n"
|
||||||
+ " Source can be an XML file, with an optional XML bigrams file, or a\n"
|
+ " Source can be an XML file, with an optional XML bigrams file, or a\n"
|
||||||
+ " binary dictionary file.\n"
|
+ " binary dictionary file.\n"
|
||||||
+ " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3 and XML outputs\n"
|
+ " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3, XML and\n"
|
||||||
+ " are supported. All three can be output at the same time, but the same\n"
|
+ " combined format outputs are supported.";
|
||||||
+ " output format cannot be specified several times. The behavior is\n"
|
|
||||||
+ " unspecified if the same file is specified for input and output, or for\n"
|
|
||||||
+ " several outputs.";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Arguments(String[] argsArray) throws IOException {
|
public Arguments(String[] argsArray) throws IOException {
|
||||||
|
@ -130,11 +142,13 @@ public class DictionaryMaker {
|
||||||
displayHelp();
|
displayHelp();
|
||||||
}
|
}
|
||||||
String inputBinary = null;
|
String inputBinary = null;
|
||||||
|
String inputCombined = null;
|
||||||
String inputUnigramXml = null;
|
String inputUnigramXml = null;
|
||||||
String inputShortcutXml = null;
|
String inputShortcutXml = null;
|
||||||
String inputBigramXml = null;
|
String inputBigramXml = null;
|
||||||
String outputBinary = null;
|
String outputBinary = null;
|
||||||
String outputXml = null;
|
String outputXml = null;
|
||||||
|
String outputCombined = null;
|
||||||
int outputBinaryFormatVersion = 2; // the default version is 2.
|
int outputBinaryFormatVersion = 2; // the default version is 2.
|
||||||
|
|
||||||
while (!args.isEmpty()) {
|
while (!args.isEmpty()) {
|
||||||
|
@ -160,6 +174,8 @@ public class DictionaryMaker {
|
||||||
if (OPTION_INPUT_SOURCE.equals(arg)) {
|
if (OPTION_INPUT_SOURCE.equals(arg)) {
|
||||||
if (BinaryDictInputOutput.isBinaryDictionary(filename)) {
|
if (BinaryDictInputOutput.isBinaryDictionary(filename)) {
|
||||||
inputBinary = filename;
|
inputBinary = filename;
|
||||||
|
} else if (CombinedInputOutput.isCombinedDictionary(filename)) {
|
||||||
|
inputCombined = filename;
|
||||||
} else {
|
} else {
|
||||||
inputUnigramXml = filename;
|
inputUnigramXml = filename;
|
||||||
}
|
}
|
||||||
|
@ -171,6 +187,8 @@ public class DictionaryMaker {
|
||||||
outputBinary = filename;
|
outputBinary = filename;
|
||||||
} else if (OPTION_OUTPUT_XML.equals(arg)) {
|
} else if (OPTION_OUTPUT_XML.equals(arg)) {
|
||||||
outputXml = filename;
|
outputXml = filename;
|
||||||
|
} else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
|
||||||
|
outputCombined = filename;
|
||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException("Unknown option : " + arg);
|
throw new IllegalArgumentException("Unknown option : " + arg);
|
||||||
}
|
}
|
||||||
|
@ -179,6 +197,8 @@ public class DictionaryMaker {
|
||||||
if (null == inputBinary && null == inputUnigramXml) {
|
if (null == inputBinary && null == inputUnigramXml) {
|
||||||
if (BinaryDictInputOutput.isBinaryDictionary(arg)) {
|
if (BinaryDictInputOutput.isBinaryDictionary(arg)) {
|
||||||
inputBinary = arg;
|
inputBinary = arg;
|
||||||
|
} else if (CombinedInputOutput.isCombinedDictionary(arg)) {
|
||||||
|
inputCombined = arg;
|
||||||
} else {
|
} else {
|
||||||
inputUnigramXml = arg;
|
inputUnigramXml = arg;
|
||||||
}
|
}
|
||||||
|
@ -191,11 +211,13 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
|
|
||||||
mInputBinary = inputBinary;
|
mInputBinary = inputBinary;
|
||||||
|
mInputCombined = inputCombined;
|
||||||
mInputUnigramXml = inputUnigramXml;
|
mInputUnigramXml = inputUnigramXml;
|
||||||
mInputShortcutXml = inputShortcutXml;
|
mInputShortcutXml = inputShortcutXml;
|
||||||
mInputBigramXml = inputBigramXml;
|
mInputBigramXml = inputBigramXml;
|
||||||
mOutputBinary = outputBinary;
|
mOutputBinary = outputBinary;
|
||||||
mOutputXml = outputXml;
|
mOutputXml = outputXml;
|
||||||
|
mOutputCombined = outputCombined;
|
||||||
mOutputBinaryFormatVersion = outputBinaryFormatVersion;
|
mOutputBinaryFormatVersion = outputBinaryFormatVersion;
|
||||||
checkIntegrity();
|
checkIntegrity();
|
||||||
}
|
}
|
||||||
|
@ -220,6 +242,8 @@ public class DictionaryMaker {
|
||||||
SAXException, FileNotFoundException {
|
SAXException, FileNotFoundException {
|
||||||
if (null != args.mInputBinary) {
|
if (null != args.mInputBinary) {
|
||||||
return readBinaryFile(args.mInputBinary);
|
return readBinaryFile(args.mInputBinary);
|
||||||
|
} else if (null != args.mInputCombined) {
|
||||||
|
return readCombinedFile(args.mInputCombined);
|
||||||
} else if (null != args.mInputUnigramXml) {
|
} else if (null != args.mInputUnigramXml) {
|
||||||
return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
|
return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
|
||||||
} else {
|
} else {
|
||||||
|
@ -258,6 +282,32 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a dictionary from the name of a combined file.
|
||||||
|
*
|
||||||
|
* @param combinedFilename the name of the file in the combined format.
|
||||||
|
* @return the read dictionary.
|
||||||
|
* @throws FileNotFoundException if the file can't be found
|
||||||
|
* @throws IOException if the input file can't be read
|
||||||
|
*/
|
||||||
|
private static FusionDictionary readCombinedFile(final String combinedFilename)
|
||||||
|
throws FileNotFoundException, IOException {
|
||||||
|
FileInputStream inStream = null;
|
||||||
|
try {
|
||||||
|
final File file = new File(combinedFilename);
|
||||||
|
inStream = new FileInputStream(file);
|
||||||
|
return CombinedInputOutput.readDictionaryCombined(inStream);
|
||||||
|
} finally {
|
||||||
|
if (null != inStream) {
|
||||||
|
try {
|
||||||
|
inStream.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
||||||
*
|
*
|
||||||
|
@ -299,6 +349,9 @@ public class DictionaryMaker {
|
||||||
if (null != args.mOutputXml) {
|
if (null != args.mOutputXml) {
|
||||||
writeXmlDictionary(args.mOutputXml, dict);
|
writeXmlDictionary(args.mOutputXml, dict);
|
||||||
}
|
}
|
||||||
|
if (null != args.mOutputCombined) {
|
||||||
|
writeCombinedDictionary(args.mOutputCombined, dict);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -332,4 +385,18 @@ public class DictionaryMaker {
|
||||||
XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)),
|
XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)),
|
||||||
dict);
|
dict);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the dictionary in the combined format to the specified filename.
|
||||||
|
*
|
||||||
|
* @param outputFilename the name of the file to write to.
|
||||||
|
* @param dict the dictionary to write.
|
||||||
|
* @throws FileNotFoundException if the output file can't be created.
|
||||||
|
* @throws IOException if the output file can't be written to.
|
||||||
|
*/
|
||||||
|
private static void writeCombinedDictionary(final String outputFilename,
|
||||||
|
final FusionDictionary dict) throws FileNotFoundException, IOException {
|
||||||
|
CombinedInputOutput.writeDictionaryCombined(
|
||||||
|
new BufferedWriter(new FileWriter(outputFilename)), dict);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,6 +80,7 @@ public class Dicttool {
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
System.out.println("Exception while processing command "
|
System.out.println("Exception while processing command "
|
||||||
+ command.getClass().getSimpleName() + " : " + e);
|
+ command.getClass().getSimpleName() + " : " + e);
|
||||||
|
e.printStackTrace();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue