am 90aa229f: Remove XML input/output from dicttool.
* commit '90aa229f01f2a14ae5b4542e065d27d000dafb82': Remove XML input/output from dicttool.
This commit is contained in:
commit
c86945e319
4 changed files with 13 additions and 503 deletions
|
@ -22,8 +22,6 @@ import com.android.inputmethod.latin.makedict.DictDecoder;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.BufferedOutputStream;
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
@ -36,8 +34,6 @@ import java.io.InputStreamReader;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class grouping utilities for offline dictionary making.
|
* Class grouping utilities for offline dictionary making.
|
||||||
*
|
*
|
||||||
|
@ -177,14 +173,6 @@ public final class BinaryDictOffdeviceUtils {
|
||||||
System.out.println("Size : " + file.length() + " bytes");
|
System.out.println("Size : " + file.length() + " bytes");
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) {
|
|
||||||
if (report) {
|
|
||||||
System.out.println("Format : XML unigram list");
|
|
||||||
}
|
|
||||||
return XmlDictInputOutput.readDictionaryXml(
|
|
||||||
new BufferedInputStream(new FileInputStream(file)),
|
|
||||||
null /* shortcuts */, null /* bigrams */);
|
|
||||||
}
|
|
||||||
final DecoderChainSpec decodedSpec = getRawDictionaryOrNull(file);
|
final DecoderChainSpec decodedSpec = getRawDictionaryOrNull(file);
|
||||||
if (null == decodedSpec) {
|
if (null == decodedSpec) {
|
||||||
throw new RuntimeException("Does not seem to be a dictionary file " + filename);
|
throw new RuntimeException("Does not seem to be a dictionary file " + filename);
|
||||||
|
@ -209,8 +197,7 @@ public final class BinaryDictOffdeviceUtils {
|
||||||
System.out.println("Uncompressed size : " + decodedSpec.mFile.length());
|
System.out.println("Uncompressed size : " + decodedSpec.mFile.length());
|
||||||
}
|
}
|
||||||
return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
|
return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
|
||||||
} catch (final IOException | SAXException | ParserConfigurationException |
|
} catch (final IOException | UnsupportedFormatException e) {
|
||||||
UnsupportedFormatException e) {
|
|
||||||
throw new RuntimeException("Can't read file " + filename, e);
|
throw new RuntimeException("Can't read file " + filename, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,8 +27,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
|
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
|
||||||
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;
|
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;
|
||||||
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
|
@ -41,8 +39,6 @@ import java.io.InputStreamReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main class/method for DictionaryMaker.
|
* Main class/method for DictionaryMaker.
|
||||||
*/
|
*/
|
||||||
|
@ -52,10 +48,7 @@ public class DictionaryMaker {
|
||||||
private static final String OPTION_VERSION_2 = "-2";
|
private static final String OPTION_VERSION_2 = "-2";
|
||||||
private static final String OPTION_VERSION_4 = "-4";
|
private static final String OPTION_VERSION_4 = "-4";
|
||||||
private static final String OPTION_INPUT_SOURCE = "-s";
|
private static final String OPTION_INPUT_SOURCE = "-s";
|
||||||
private static final String OPTION_INPUT_BIGRAM_XML = "-b";
|
|
||||||
private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
|
|
||||||
private static final String OPTION_OUTPUT_BINARY = "-d";
|
private static final String OPTION_OUTPUT_BINARY = "-d";
|
||||||
private static final String OPTION_OUTPUT_XML = "-x";
|
|
||||||
private static final String OPTION_OUTPUT_COMBINED = "-o";
|
private static final String OPTION_OUTPUT_COMBINED = "-o";
|
||||||
private static final String OPTION_HELP = "-h";
|
private static final String OPTION_HELP = "-h";
|
||||||
private static final String OPTION_CODE_POINT_TABLE = "-t";
|
private static final String OPTION_CODE_POINT_TABLE = "-t";
|
||||||
|
@ -63,11 +56,7 @@ public class DictionaryMaker {
|
||||||
private static final String OPTION_CODE_POINT_TABLE_ON = "on";
|
private static final String OPTION_CODE_POINT_TABLE_ON = "on";
|
||||||
public final String mInputBinary;
|
public final String mInputBinary;
|
||||||
public final String mInputCombined;
|
public final String mInputCombined;
|
||||||
public final String mInputUnigramXml;
|
|
||||||
public final String mInputShortcutXml;
|
|
||||||
public final String mInputBigramXml;
|
|
||||||
public final String mOutputBinary;
|
public final String mOutputBinary;
|
||||||
public final String mOutputXml;
|
|
||||||
public final String mOutputCombined;
|
public final String mOutputCombined;
|
||||||
public final int mOutputBinaryFormatVersion;
|
public final int mOutputBinaryFormatVersion;
|
||||||
public final int mCodePointTableMode;
|
public final int mCodePointTableMode;
|
||||||
|
@ -76,39 +65,20 @@ public class DictionaryMaker {
|
||||||
checkHasExactlyOneInput();
|
checkHasExactlyOneInput();
|
||||||
checkHasAtLeastOneOutput();
|
checkHasAtLeastOneOutput();
|
||||||
checkNotSameFile(mInputBinary, mOutputBinary);
|
checkNotSameFile(mInputBinary, mOutputBinary);
|
||||||
checkNotSameFile(mInputBinary, mOutputXml);
|
|
||||||
checkNotSameFile(mInputCombined, mOutputBinary);
|
checkNotSameFile(mInputCombined, mOutputBinary);
|
||||||
checkNotSameFile(mInputCombined, mOutputXml);
|
|
||||||
checkNotSameFile(mInputUnigramXml, mOutputBinary);
|
|
||||||
checkNotSameFile(mInputUnigramXml, mOutputXml);
|
|
||||||
checkNotSameFile(mInputUnigramXml, mOutputCombined);
|
|
||||||
checkNotSameFile(mInputShortcutXml, mOutputBinary);
|
|
||||||
checkNotSameFile(mInputShortcutXml, mOutputXml);
|
|
||||||
checkNotSameFile(mInputShortcutXml, mOutputCombined);
|
|
||||||
checkNotSameFile(mInputBigramXml, mOutputBinary);
|
|
||||||
checkNotSameFile(mInputBigramXml, mOutputXml);
|
|
||||||
checkNotSameFile(mInputBigramXml, mOutputCombined);
|
|
||||||
checkNotSameFile(mOutputBinary, mOutputXml);
|
|
||||||
checkNotSameFile(mOutputBinary, mOutputCombined);
|
checkNotSameFile(mOutputBinary, mOutputCombined);
|
||||||
checkNotSameFile(mOutputXml, mOutputCombined);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkHasExactlyOneInput() {
|
private void checkHasExactlyOneInput() {
|
||||||
if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
|
if (null == mInputBinary && null == mInputCombined) {
|
||||||
throw new RuntimeException("No input file specified");
|
throw new RuntimeException("No input file specified");
|
||||||
} else if ((null != mInputUnigramXml && null != mInputBinary)
|
} else if (null != mInputBinary && null != mInputCombined) {
|
||||||
|| (null != mInputUnigramXml && null != mInputCombined)
|
|
||||||
|| (null != mInputBinary && null != mInputCombined)) {
|
|
||||||
throw new RuntimeException("Several input files specified");
|
throw new RuntimeException("Several input files specified");
|
||||||
} else if ((null != mInputBinary || null != mInputCombined)
|
|
||||||
&& (null != mInputBigramXml || null != mInputShortcutXml)) {
|
|
||||||
throw new RuntimeException("Separate bigrams/shortcut files are only supported"
|
|
||||||
+ " with XML input (other formats include bigrams and shortcuts already)");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkHasAtLeastOneOutput() {
|
private void checkHasAtLeastOneOutput() {
|
||||||
if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
|
if (null == mOutputBinary && null == mOutputCombined) {
|
||||||
throw new RuntimeException("No output specified");
|
throw new RuntimeException("No output specified");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,16 +101,14 @@ public class DictionaryMaker {
|
||||||
|
|
||||||
public static String getHelp() {
|
public static String getHelp() {
|
||||||
return "Usage: makedict "
|
return "Usage: makedict "
|
||||||
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
|
|
||||||
+ "| [-s <combined format input]"
|
+ "| [-s <combined format input]"
|
||||||
+ "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
|
+ "| [-s <binary input>] [-d <binary output>]"
|
||||||
+ " [-o <combined output>] [-t <code point table switch: on/off/auto>]"
|
+ " [-o <combined output>] [-t <code point table switch: on/off/auto>]"
|
||||||
+ "[-2] [-3] [-4]\n"
|
+ "[-2] [-3] [-4]\n"
|
||||||
+ "\n"
|
+ "\n"
|
||||||
+ " Converts a source dictionary file to one or several outputs.\n"
|
+ " Converts a source dictionary file to one or several outputs.\n"
|
||||||
+ " Source can be an XML file, with an optional XML bigrams file, or a\n"
|
+ " Source can be a binary dictionary file or a combined format file.\n"
|
||||||
+ " binary dictionary file.\n"
|
+ " Binary version 2 (Jelly Bean), 3, 4, and\n"
|
||||||
+ " Binary version 2 (Jelly Bean), 3, 4, XML and\n"
|
|
||||||
+ " combined format outputs are supported.";
|
+ " combined format outputs are supported.";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -151,11 +119,7 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
String inputBinary = null;
|
String inputBinary = null;
|
||||||
String inputCombined = null;
|
String inputCombined = null;
|
||||||
String inputUnigramXml = null;
|
|
||||||
String inputShortcutXml = null;
|
|
||||||
String inputBigramXml = null;
|
|
||||||
String outputBinary = null;
|
String outputBinary = null;
|
||||||
String outputXml = null;
|
|
||||||
String outputCombined = null;
|
String outputCombined = null;
|
||||||
int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201.
|
int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201.
|
||||||
// Don't use code point table by default.
|
// Don't use code point table by default.
|
||||||
|
@ -180,9 +144,7 @@ public class DictionaryMaker {
|
||||||
String argValue = args.get(0);
|
String argValue = args.get(0);
|
||||||
args.remove(0);
|
args.remove(0);
|
||||||
if (OPTION_INPUT_SOURCE.equals(arg)) {
|
if (OPTION_INPUT_SOURCE.equals(arg)) {
|
||||||
if (XmlDictInputOutput.isXmlUnigramDictionary(argValue)) {
|
if (CombinedInputOutput.isCombinedDictionary(argValue)) {
|
||||||
inputUnigramXml = argValue;
|
|
||||||
} else if (CombinedInputOutput.isCombinedDictionary(argValue)) {
|
|
||||||
inputCombined = argValue;
|
inputCombined = argValue;
|
||||||
} else if (BinaryDictDecoderUtils.isBinaryDictionary(argValue)) {
|
} else if (BinaryDictDecoderUtils.isBinaryDictionary(argValue)) {
|
||||||
inputBinary = argValue;
|
inputBinary = argValue;
|
||||||
|
@ -190,14 +152,8 @@ public class DictionaryMaker {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"Unknown format for file " + argValue);
|
"Unknown format for file " + argValue);
|
||||||
}
|
}
|
||||||
} else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
|
|
||||||
inputShortcutXml = argValue;
|
|
||||||
} else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
|
|
||||||
inputBigramXml = argValue;
|
|
||||||
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
|
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
|
||||||
outputBinary = argValue;
|
outputBinary = argValue;
|
||||||
} else if (OPTION_OUTPUT_XML.equals(arg)) {
|
|
||||||
outputXml = argValue;
|
|
||||||
} else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
|
} else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
|
||||||
outputCombined = argValue;
|
outputCombined = argValue;
|
||||||
} else if (OPTION_CODE_POINT_TABLE.equals(arg)) {
|
} else if (OPTION_CODE_POINT_TABLE.equals(arg)) {
|
||||||
|
@ -214,13 +170,13 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (null == inputBinary && null == inputUnigramXml) {
|
if (null == inputBinary) {
|
||||||
if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
|
if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
|
||||||
inputBinary = arg;
|
inputBinary = arg;
|
||||||
} else if (CombinedInputOutput.isCombinedDictionary(arg)) {
|
} else if (CombinedInputOutput.isCombinedDictionary(arg)) {
|
||||||
inputCombined = arg;
|
inputCombined = arg;
|
||||||
} else {
|
} else {
|
||||||
inputUnigramXml = arg;
|
throw new IllegalArgumentException("Unknown format for file " + arg);
|
||||||
}
|
}
|
||||||
} else if (null == outputBinary) {
|
} else if (null == outputBinary) {
|
||||||
outputBinary = arg;
|
outputBinary = arg;
|
||||||
|
@ -232,11 +188,7 @@ public class DictionaryMaker {
|
||||||
|
|
||||||
mInputBinary = inputBinary;
|
mInputBinary = inputBinary;
|
||||||
mInputCombined = inputCombined;
|
mInputCombined = inputCombined;
|
||||||
mInputUnigramXml = inputUnigramXml;
|
|
||||||
mInputShortcutXml = inputShortcutXml;
|
|
||||||
mInputBigramXml = inputBigramXml;
|
|
||||||
mOutputBinary = outputBinary;
|
mOutputBinary = outputBinary;
|
||||||
mOutputXml = outputXml;
|
|
||||||
mOutputCombined = outputCombined;
|
mOutputCombined = outputCombined;
|
||||||
mOutputBinaryFormatVersion = outputBinaryFormatVersion;
|
mOutputBinaryFormatVersion = outputBinaryFormatVersion;
|
||||||
mCodePointTableMode = codePointTableMode;
|
mCodePointTableMode = codePointTableMode;
|
||||||
|
@ -245,8 +197,7 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args)
|
public static void main(String[] args)
|
||||||
throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
|
throws FileNotFoundException, IOException, UnsupportedFormatException {
|
||||||
UnsupportedFormatException {
|
|
||||||
final Arguments parsedArgs = new Arguments(args);
|
final Arguments parsedArgs = new Arguments(args);
|
||||||
FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
|
FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
|
||||||
writeOutputToParsedArgs(parsedArgs, dictionary);
|
writeOutputToParsedArgs(parsedArgs, dictionary);
|
||||||
|
@ -259,14 +210,11 @@ public class DictionaryMaker {
|
||||||
* @return the read dictionary.
|
* @return the read dictionary.
|
||||||
*/
|
*/
|
||||||
private static FusionDictionary readInputFromParsedArgs(final Arguments args)
|
private static FusionDictionary readInputFromParsedArgs(final Arguments args)
|
||||||
throws IOException, UnsupportedFormatException, ParserConfigurationException,
|
throws IOException, UnsupportedFormatException, FileNotFoundException {
|
||||||
SAXException, FileNotFoundException {
|
|
||||||
if (null != args.mInputBinary) {
|
if (null != args.mInputBinary) {
|
||||||
return readBinaryFile(args.mInputBinary);
|
return readBinaryFile(args.mInputBinary);
|
||||||
} else if (null != args.mInputCombined) {
|
} else if (null != args.mInputCombined) {
|
||||||
return readCombinedFile(args.mInputCombined);
|
return readCombinedFile(args.mInputCombined);
|
||||||
} else if (null != args.mInputUnigramXml) {
|
|
||||||
return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
|
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("No input file specified");
|
throw new RuntimeException("No input file specified");
|
||||||
}
|
}
|
||||||
|
@ -313,30 +261,6 @@ public class DictionaryMaker {
|
||||||
return new BufferedInputStream(new FileInputStream(filename));
|
return new BufferedInputStream(new FileInputStream(filename));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
|
||||||
*
|
|
||||||
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
|
|
||||||
* @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
|
|
||||||
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
|
|
||||||
* @return the read dictionary.
|
|
||||||
* @throws FileNotFoundException if one of the files can't be found
|
|
||||||
* @throws SAXException if one or more of the XML files is not well-formed
|
|
||||||
* @throws IOException if one the input files can't be read
|
|
||||||
* @throws ParserConfigurationException if the system can't create a SAX parser
|
|
||||||
*/
|
|
||||||
private static FusionDictionary readXmlFile(final String unigramXmlFilename,
|
|
||||||
final String shortcutXmlFilename, final String bigramXmlFilename)
|
|
||||||
throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
|
|
||||||
try (
|
|
||||||
final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename);
|
|
||||||
final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename);
|
|
||||||
final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename);
|
|
||||||
) {
|
|
||||||
return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoke the right output method according to args.
|
* Invoke the right output method according to args.
|
||||||
*
|
*
|
||||||
|
@ -353,9 +277,6 @@ public class DictionaryMaker {
|
||||||
writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion,
|
writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion,
|
||||||
args.mCodePointTableMode);
|
args.mCodePointTableMode);
|
||||||
}
|
}
|
||||||
if (null != args.mOutputXml) {
|
|
||||||
writeXmlDictionary(args.mOutputXml, dict);
|
|
||||||
}
|
|
||||||
if (null != args.mOutputCombined) {
|
if (null != args.mOutputCombined) {
|
||||||
writeCombinedDictionary(args.mOutputCombined, dict);
|
writeCombinedDictionary(args.mOutputCombined, dict);
|
||||||
}
|
}
|
||||||
|
@ -386,21 +307,6 @@ public class DictionaryMaker {
|
||||||
dictEncoder.writeDictionary(dict, formatOptions);
|
dictEncoder.writeDictionary(dict, formatOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Write the dictionary in XML format to the specified filename.
|
|
||||||
*
|
|
||||||
* @param outputFilename the name of the file to write to.
|
|
||||||
* @param dict the dictionary to write.
|
|
||||||
* @throws FileNotFoundException if the output file can't be created.
|
|
||||||
* @throws IOException if the output file can't be written to.
|
|
||||||
*/
|
|
||||||
private static void writeXmlDictionary(final String outputFilename,
|
|
||||||
final FusionDictionary dict) throws FileNotFoundException, IOException {
|
|
||||||
try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
|
|
||||||
XmlDictInputOutput.writeDictionaryXml(writer, dict);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write the dictionary in the combined format to the specified filename.
|
* Write the dictionary in the combined format to the specified filename.
|
||||||
*
|
*
|
||||||
|
|
|
@ -20,8 +20,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
|
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
public class Makedict extends Dicttool.Command {
|
public class Makedict extends Dicttool.Command {
|
||||||
public static final String COMMAND = "makedict";
|
public static final String COMMAND = "makedict";
|
||||||
|
@ -35,8 +33,7 @@ public class Makedict extends Dicttool.Command {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() throws FileNotFoundException, IOException, ParserConfigurationException,
|
public void run() throws FileNotFoundException, IOException, UnsupportedFormatException {
|
||||||
SAXException, UnsupportedFormatException {
|
|
||||||
DictionaryMaker.main(mArgs);
|
DictionaryMaker.main(mArgs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,380 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (C) 2011 The Android Open Source Project
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
||||||
* use this file except in compliance with the License. You may obtain a copy of
|
|
||||||
* the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
||||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
||||||
* License for the specific language governing permissions and limitations under
|
|
||||||
* the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package com.android.inputmethod.latin.dicttool;
|
|
||||||
|
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
|
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
|
||||||
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
|
||||||
import com.android.inputmethod.latin.makedict.WeightedString;
|
|
||||||
import com.android.inputmethod.latin.makedict.WordProperty;
|
|
||||||
|
|
||||||
import org.xml.sax.Attributes;
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
import org.xml.sax.helpers.DefaultHandler;
|
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.BufferedWriter;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
import javax.xml.parsers.SAXParser;
|
|
||||||
import javax.xml.parsers.SAXParserFactory;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads and writes XML files for a FusionDictionary.
|
|
||||||
*
|
|
||||||
* All functions in this class are static.
|
|
||||||
*/
|
|
||||||
public class XmlDictInputOutput {
|
|
||||||
|
|
||||||
private static final String ROOT_TAG = "wordlist";
|
|
||||||
private static final String WORD_TAG = "w";
|
|
||||||
private static final String BIGRAM_TAG = "bigram";
|
|
||||||
private static final String SHORTCUT_TAG = "shortcut";
|
|
||||||
private static final String PROBABILITY_ATTR = "f";
|
|
||||||
private static final String WORD_ATTR = "word";
|
|
||||||
private static final String NOT_A_WORD_ATTR = "not_a_word";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* SAX handler for a unigram XML file.
|
|
||||||
*/
|
|
||||||
static private class UnigramHandler extends DefaultHandler {
|
|
||||||
// Parser states
|
|
||||||
private static final int START = 1;
|
|
||||||
private static final int WORD = 2;
|
|
||||||
private static final int UNKNOWN = 3;
|
|
||||||
private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1;
|
|
||||||
|
|
||||||
FusionDictionary mDictionary;
|
|
||||||
int mState; // the state of the parser
|
|
||||||
int mFreq; // the currently read freq
|
|
||||||
String mWord; // the current word
|
|
||||||
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create the handler.
|
|
||||||
*
|
|
||||||
* @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
|
|
||||||
*/
|
|
||||||
public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
|
|
||||||
mDictionary = null;
|
|
||||||
mShortcutsMap = shortcuts;
|
|
||||||
mWord = "";
|
|
||||||
mState = START;
|
|
||||||
mFreq = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public FusionDictionary getFinalDictionary() {
|
|
||||||
final FusionDictionary dict = mDictionary;
|
|
||||||
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
|
||||||
if (dict.hasWord(shortcutOnly)) continue;
|
|
||||||
dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
|
|
||||||
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */,
|
|
||||||
false /* isPossiblyOffensive */);
|
|
||||||
}
|
|
||||||
mDictionary = null;
|
|
||||||
mShortcutsMap.clear();
|
|
||||||
mWord = "";
|
|
||||||
mState = START;
|
|
||||||
mFreq = 0;
|
|
||||||
return dict;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void startElement(String uri, String localName, String qName, Attributes attrs) {
|
|
||||||
if (WORD_TAG.equals(localName)) {
|
|
||||||
mState = WORD;
|
|
||||||
mWord = "";
|
|
||||||
for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
|
|
||||||
final String attrName = attrs.getLocalName(attrIndex);
|
|
||||||
if (PROBABILITY_ATTR.equals(attrName)) {
|
|
||||||
mFreq = Integer.parseInt(attrs.getValue(attrIndex));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (ROOT_TAG.equals(localName)) {
|
|
||||||
final HashMap<String, String> attributes = new HashMap<>();
|
|
||||||
for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
|
|
||||||
final String attrName = attrs.getLocalName(attrIndex);
|
|
||||||
attributes.put(attrName, attrs.getValue(attrIndex));
|
|
||||||
}
|
|
||||||
mDictionary = new FusionDictionary(new PtNodeArray(),
|
|
||||||
new DictionaryOptions(attributes));
|
|
||||||
} else {
|
|
||||||
mState = UNKNOWN;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void characters(char[] ch, int start, int length) {
|
|
||||||
if (WORD == mState) {
|
|
||||||
// The XML parser is free to return text in arbitrary chunks one after the
|
|
||||||
// other. In particular, this happens in some implementations when it finds
|
|
||||||
// an escape code like "&amp;".
|
|
||||||
mWord += String.copyValueOf(ch, start, length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void endElement(String uri, String localName, String qName) {
|
|
||||||
if (WORD == mState) {
|
|
||||||
mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
|
|
||||||
false /* isNotAWord */, false /* isPossiblyOffensive */);
|
|
||||||
mState = START;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static private class AssociativeListHandler extends DefaultHandler {
|
|
||||||
private final String SRC_TAG;
|
|
||||||
private final String SRC_ATTRIBUTE;
|
|
||||||
private final String DST_TAG;
|
|
||||||
private final String DST_ATTRIBUTE;
|
|
||||||
private final String DST_FREQ;
|
|
||||||
|
|
||||||
// In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
|
|
||||||
private final static int XML_MAX = 256;
|
|
||||||
// In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
|
|
||||||
private final static int MEMORY_MAX = 256;
|
|
||||||
private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;
|
|
||||||
|
|
||||||
private String mSrc;
|
|
||||||
private final HashMap<String, ArrayList<WeightedString>> mAssocMap;
|
|
||||||
|
|
||||||
public AssociativeListHandler(final String srcTag, final String srcAttribute,
|
|
||||||
final String dstTag, final String dstAttribute, final String dstFreq) {
|
|
||||||
SRC_TAG = srcTag;
|
|
||||||
SRC_ATTRIBUTE = srcAttribute;
|
|
||||||
DST_TAG = dstTag;
|
|
||||||
DST_ATTRIBUTE = dstAttribute;
|
|
||||||
DST_FREQ = dstFreq;
|
|
||||||
mSrc = null;
|
|
||||||
mAssocMap = new HashMap<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void startElement(String uri, String localName, String qName, Attributes attrs) {
|
|
||||||
if (SRC_TAG.equals(localName)) {
|
|
||||||
mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
|
|
||||||
} else if (DST_TAG.equals(localName)) {
|
|
||||||
String dst = attrs.getValue(uri, DST_ATTRIBUTE);
|
|
||||||
int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
|
|
||||||
WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
|
|
||||||
ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
|
|
||||||
if (null == bigramList) bigramList = new ArrayList<>();
|
|
||||||
bigramList.add(bigram);
|
|
||||||
mAssocMap.put(mSrc, bigramList);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected int getValueFromFreqString(final String freqString) {
|
|
||||||
return Integer.parseInt(freqString);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This may return an empty map, but will never return null.
|
|
||||||
public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
|
|
||||||
return mAssocMap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* SAX handler for a bigram XML file.
|
|
||||||
*/
|
|
||||||
static private class BigramHandler extends AssociativeListHandler {
|
|
||||||
private final static String BIGRAM_W1_TAG = "bi";
|
|
||||||
private final static String BIGRAM_W2_TAG = "w";
|
|
||||||
private final static String BIGRAM_W1_ATTRIBUTE = "w1";
|
|
||||||
private final static String BIGRAM_W2_ATTRIBUTE = "w2";
|
|
||||||
private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
|
|
||||||
|
|
||||||
public BigramHandler() {
|
|
||||||
super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
|
|
||||||
BIGRAM_FREQ_ATTRIBUTE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// As per getAssocMap(), this never returns null.
|
|
||||||
public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
|
|
||||||
return getAssocMap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* SAX handler for a shortcut & whitelist XML file.
|
|
||||||
*/
|
|
||||||
static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
|
|
||||||
private final static String ENTRY_TAG = "entry";
|
|
||||||
private final static String ENTRY_ATTRIBUTE = "shortcut";
|
|
||||||
private final static String TARGET_TAG = "target";
|
|
||||||
private final static String REPLACEMENT_ATTRIBUTE = "replacement";
|
|
||||||
private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
|
|
||||||
private final static String WHITELIST_MARKER = "whitelist";
|
|
||||||
private final static int WHITELIST_FREQ_VALUE = 15;
|
|
||||||
private final static int MIN_FREQ = 0;
|
|
||||||
private final static int MAX_FREQ = 14;
|
|
||||||
|
|
||||||
public ShortcutAndWhitelistHandler() {
|
|
||||||
super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
|
|
||||||
TARGET_PRIORITY_ATTRIBUTE);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int getValueFromFreqString(final String freqString) {
|
|
||||||
if (WHITELIST_MARKER.equals(freqString)) {
|
|
||||||
return WHITELIST_FREQ_VALUE;
|
|
||||||
}
|
|
||||||
final int intValue = super.getValueFromFreqString(freqString);
|
|
||||||
if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
|
|
||||||
throw new RuntimeException("Shortcut freq out of range. Accepted range is "
|
|
||||||
+ MIN_FREQ + ".." + MAX_FREQ);
|
|
||||||
}
|
|
||||||
return intValue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// As per getAssocMap(), this never returns null.
|
|
||||||
public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
|
|
||||||
return getAssocMap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Basic test to find out whether the file is in the unigram XML format or not.
|
|
||||||
*
|
|
||||||
* Concretely this only tests the header line.
|
|
||||||
*
|
|
||||||
* @param filename The name of the file to test.
|
|
||||||
* @return true if the file is in the unigram XML format, false otherwise
|
|
||||||
*/
|
|
||||||
public static boolean isXmlUnigramDictionary(final String filename) {
|
|
||||||
try (final BufferedReader reader = new BufferedReader(
|
|
||||||
new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
|
|
||||||
final String firstLine = reader.readLine();
|
|
||||||
return firstLine.matches("^\\s*<wordlist .*>\\s*$");
|
|
||||||
} catch (final IOException e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a dictionary from an XML file.
|
|
||||||
*
|
|
||||||
* This is the public method that will parse an XML file and return the corresponding memory
|
|
||||||
* representation.
|
|
||||||
*
|
|
||||||
* @param unigrams the file to read the data from.
|
|
||||||
* @param shortcuts the file to read the shortcuts & whitelist from, or null.
|
|
||||||
* @param bigrams the file to read the bigrams from, or null.
|
|
||||||
* @return the in-memory representation of the dictionary.
|
|
||||||
*/
|
|
||||||
public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams,
|
|
||||||
final BufferedInputStream shortcuts, final BufferedInputStream bigrams)
|
|
||||||
throws SAXException, IOException, ParserConfigurationException {
|
|
||||||
final SAXParserFactory factory = SAXParserFactory.newInstance();
|
|
||||||
factory.setNamespaceAware(true);
|
|
||||||
final SAXParser parser = factory.newSAXParser();
|
|
||||||
final BigramHandler bigramHandler = new BigramHandler();
|
|
||||||
if (null != bigrams) parser.parse(bigrams, bigramHandler);
|
|
||||||
|
|
||||||
final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
|
|
||||||
new ShortcutAndWhitelistHandler();
|
|
||||||
if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
|
|
||||||
|
|
||||||
final UnigramHandler unigramHandler =
|
|
||||||
new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
|
|
||||||
parser.parse(unigrams, unigramHandler);
|
|
||||||
final FusionDictionary dict = unigramHandler.getFinalDictionary();
|
|
||||||
final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
|
|
||||||
for (final String firstWord : bigramMap.keySet()) {
|
|
||||||
if (!dict.hasWord(firstWord)) continue;
|
|
||||||
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
|
|
||||||
for (final WeightedString bigram : bigramList) {
|
|
||||||
if (!dict.hasWord(bigram.mWord)) continue;
|
|
||||||
dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return dict;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a dictionary in the first, legacy XML format.
|
|
||||||
*
|
|
||||||
* This method reads data from the parser and creates a new FusionDictionary with it.
|
|
||||||
* The format parsed by this method is the format used before Ice Cream Sandwich,
|
|
||||||
* which has no support for bigrams or shortcuts/whitelist.
|
|
||||||
* It is important to note that this method expects the parser to have already eaten
|
|
||||||
* the first, all-encompassing tag.
|
|
||||||
*
|
|
||||||
* @param xpp the parser to read the data from.
|
|
||||||
* @return the parsed dictionary.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Writes a dictionary to an XML file.
|
|
||||||
*
|
|
||||||
* The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
|
|
||||||
*
|
|
||||||
* @param destination a destination stream to write to.
|
|
||||||
* @param dict the dictionary to write.
|
|
||||||
*/
|
|
||||||
public static void writeDictionaryXml(final BufferedWriter destination,
|
|
||||||
final FusionDictionary dict) throws IOException {
|
|
||||||
final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
|
|
||||||
for (WordProperty wordProperty : dict) {
|
|
||||||
wordPropertiesInDict.add(wordProperty);
|
|
||||||
}
|
|
||||||
// TODO: use an XMLSerializer if this gets big
|
|
||||||
destination.write("<wordlist format=\"2\"");
|
|
||||||
for (final String key : dict.mOptions.mAttributes.keySet()) {
|
|
||||||
final String value = dict.mOptions.mAttributes.get(key);
|
|
||||||
destination.write(" " + key + "=\"" + value + "\"");
|
|
||||||
}
|
|
||||||
destination.write(">\n");
|
|
||||||
destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
|
|
||||||
for (WordProperty wordProperty : wordPropertiesInDict) {
|
|
||||||
destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord
|
|
||||||
+ "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability()
|
|
||||||
+ (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "")
|
|
||||||
+ "\">");
|
|
||||||
if (wordProperty.mHasShortcuts) {
|
|
||||||
destination.write("\n");
|
|
||||||
for (WeightedString target : wordProperty.mShortcutTargets) {
|
|
||||||
destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\""
|
|
||||||
+ target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG
|
|
||||||
+ ">\n");
|
|
||||||
}
|
|
||||||
destination.write(" ");
|
|
||||||
}
|
|
||||||
if (wordProperty.mHasNgrams) {
|
|
||||||
destination.write("\n");
|
|
||||||
for (WeightedString bigram : wordProperty.getBigrams()) {
|
|
||||||
destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\""
|
|
||||||
+ bigram.getProbability() + "\">" + bigram.mWord
|
|
||||||
+ "</" + BIGRAM_TAG + ">\n");
|
|
||||||
}
|
|
||||||
destination.write(" ");
|
|
||||||
}
|
|
||||||
destination.write("</" + WORD_TAG + ">\n");
|
|
||||||
}
|
|
||||||
destination.write("</wordlist>\n");
|
|
||||||
destination.close();
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in a new issue