Wire the Xml-read shortcuts into the dict creation code (B6)

Change-Id: I352064835abb62c294b48e080d9709ff013c7bb0
This commit is contained in:
Jean Chalard 2011-12-26 19:33:37 +09:00
parent 8edd306718
commit 342d5d5dd0
2 changed files with 31 additions and 12 deletions

View file

@ -39,11 +39,13 @@ public class DictionaryMaker {
private final static String OPTION_VERSION_2 = "-2";
private final static String OPTION_INPUT_SOURCE = "-s";
private final static String OPTION_INPUT_BIGRAM_XML = "-b";
private final static String OPTION_INPUT_SHORTCUT_XML = "-c";
private final static String OPTION_OUTPUT_BINARY = "-d";
private final static String OPTION_OUTPUT_XML = "-x";
private final static String OPTION_HELP = "-h";
public final String mInputBinary;
public final String mInputUnigramXml;
public final String mInputShortcutXml;
public final String mInputBigramXml;
public final String mOutputBinary;
public final String mOutputXml;
@ -72,7 +74,8 @@ public class DictionaryMaker {
private void displayHelp() {
MakedictLog.i("Usage: makedict "
+ "[-s <unigrams.xml> [-b <bigrams.xml>] | -s <binary input>] "
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] "
+ "| -s <binary input>] "
+ "[-d <binary output>] [-x <xml output>] [-2]\n"
+ "\n"
+ " Converts a source dictionary file to one or several outputs.\n"
@ -90,6 +93,7 @@ public class DictionaryMaker {
}
String inputBinary = null;
String inputUnigramXml = null;
String inputShortcutXml = null;
String inputBigramXml = null;
String outputBinary = null;
String outputXml = null;
@ -116,6 +120,8 @@ public class DictionaryMaker {
} else {
inputUnigramXml = filename;
}
} else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
inputShortcutXml = filename;
} else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
inputBigramXml = filename;
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
@ -143,6 +149,7 @@ public class DictionaryMaker {
mInputBinary = inputBinary;
mInputUnigramXml = inputUnigramXml;
mInputShortcutXml = inputShortcutXml;
mInputBigramXml = inputBigramXml;
mOutputBinary = outputBinary;
mOutputXml = outputXml;
@ -170,7 +177,7 @@ public class DictionaryMaker {
if (null != args.mInputBinary) {
return readBinaryFile(args.mInputBinary);
} else if (null != args.mInputUnigramXml) {
return readXmlFile(args.mInputUnigramXml, args.mInputBigramXml);
return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
} else {
throw new RuntimeException("No input file specified");
}
@ -195,6 +202,7 @@ public class DictionaryMaker {
* Read a dictionary from a unigram XML file, and optionally a shortcut XML file and a bigram XML file.
*
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
* @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none.
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
* @return the read dictionary.
* @throws FileNotFoundException if one of the files can't be found
@ -203,12 +211,14 @@ public class DictionaryMaker {
* @throws ParserConfigurationException if the system can't create a SAX parser
*/
private static FusionDictionary readXmlFile(final String unigramXmlFilename,
final String bigramXmlFilename) throws FileNotFoundException, SAXException,
IOException, ParserConfigurationException {
final String shortcutXmlFilename, final String bigramXmlFilename)
throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename));
final FileInputStream shortcuts = null == shortcutXmlFilename ? null :
new FileInputStream(new File(shortcutXmlFilename));
final FileInputStream bigrams = null == bigramXmlFilename ? null :
new FileInputStream(new File(bigramXmlFilename));
return XmlDictInputOutput.readDictionaryXml(unigrams, bigrams);
return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
}
/**

View file

@ -61,6 +61,7 @@ public class XmlDictInputOutput {
int mState; // the state of the parser
int mFreq; // the currently read freq
String mWord; // the current word
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
/**
@ -69,9 +70,11 @@ public class XmlDictInputOutput {
* @param dict the dictionary to construct.
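* @param shortcuts the shortcuts as a map. This may not be null, as it is looked up for every word.
* @param bigrams the bigrams as a map. This may be empty, but may not be null.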
*/
public UnigramHandler(FusionDictionary dict,
HashMap<String, ArrayList<WeightedString>> bigrams) {
public UnigramHandler(final FusionDictionary dict,
final HashMap<String, ArrayList<WeightedString>> shortcuts,
final HashMap<String, ArrayList<WeightedString>> bigrams) {
mDictionary = dict;
mShortcutsMap = shortcuts;
mBigramsMap = bigrams;
mWord = "";
mState = START;
@ -107,8 +110,7 @@ public class XmlDictInputOutput {
@Override
public void endElement(String uri, String localName, String qName) {
if (WORD == mState) {
// TODO: pass the shortcut targets
mDictionary.add(mWord, mFreq, null, mBigramsMap.get(mWord));
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
mState = START;
}
}
@ -208,9 +210,12 @@ public class XmlDictInputOutput {
* representation.
*
* @param unigrams the file to read the data from.
* @param shortcuts the file to read the shortcuts from, or null.
* @param bigrams the file to read the bigrams from, or null.
* @return the in-memory representation of the dictionary.
*/
public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
public static FusionDictionary readDictionaryXml(final InputStream unigrams,
final InputStream shortcuts, final InputStream bigrams)
throws SAXException, IOException, ParserConfigurationException {
final SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
@ -218,9 +223,13 @@ public class XmlDictInputOutput {
final BigramHandler bigramHandler = new BigramHandler();
if (null != bigrams) parser.parse(bigrams, bigramHandler);
final ShortcutHandler shortcutHandler = new ShortcutHandler();
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
final FusionDictionary dict = new FusionDictionary();
final UnigramHandler unigramHandler =
new UnigramHandler(dict, bigramHandler.getBigramMap());
new UnigramHandler(dict, shortcutHandler.getShortcutMap(),
bigramHandler.getBigramMap());
parser.parse(unigrams, unigramHandler);
return dict;
}