Wire the Xml-read shortcuts into the dict creation code (B6)

Change-Id: I352064835abb62c294b48e080d9709ff013c7bb0
This commit is contained in:
Jean Chalard 2011-12-26 19:33:37 +09:00
parent 8edd306718
commit 342d5d5dd0
2 changed files with 31 additions and 12 deletions

View file

@ -39,11 +39,13 @@ public class DictionaryMaker {
private final static String OPTION_VERSION_2 = "-2"; private final static String OPTION_VERSION_2 = "-2";
private final static String OPTION_INPUT_SOURCE = "-s"; private final static String OPTION_INPUT_SOURCE = "-s";
private final static String OPTION_INPUT_BIGRAM_XML = "-b"; private final static String OPTION_INPUT_BIGRAM_XML = "-b";
private final static String OPTION_INPUT_SHORTCUT_XML = "-c";
private final static String OPTION_OUTPUT_BINARY = "-d"; private final static String OPTION_OUTPUT_BINARY = "-d";
private final static String OPTION_OUTPUT_XML = "-x"; private final static String OPTION_OUTPUT_XML = "-x";
private final static String OPTION_HELP = "-h"; private final static String OPTION_HELP = "-h";
public final String mInputBinary; public final String mInputBinary;
public final String mInputUnigramXml; public final String mInputUnigramXml;
public final String mInputShortcutXml;
public final String mInputBigramXml; public final String mInputBigramXml;
public final String mOutputBinary; public final String mOutputBinary;
public final String mOutputXml; public final String mOutputXml;
@ -72,8 +74,9 @@ public class DictionaryMaker {
private void displayHelp() { private void displayHelp() {
MakedictLog.i("Usage: makedict " MakedictLog.i("Usage: makedict "
+ "[-s <unigrams.xml> [-b <bigrams.xml>] | -s <binary input>] " + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] "
+ " [-d <binary output>] [-x <xml output>] [-2]\n" + "| -s <binary input>] "
+ "[-d <binary output>] [-x <xml output>] [-2]\n"
+ "\n" + "\n"
+ " Converts a source dictionary file to one or several outputs.\n" + " Converts a source dictionary file to one or several outputs.\n"
+ " Source can be an XML file, with an optional XML bigrams file, or a\n" + " Source can be an XML file, with an optional XML bigrams file, or a\n"
@ -90,6 +93,7 @@ public class DictionaryMaker {
} }
String inputBinary = null; String inputBinary = null;
String inputUnigramXml = null; String inputUnigramXml = null;
String inputShortcutXml = null;
String inputBigramXml = null; String inputBigramXml = null;
String outputBinary = null; String outputBinary = null;
String outputXml = null; String outputXml = null;
@ -116,6 +120,8 @@ public class DictionaryMaker {
} else { } else {
inputUnigramXml = filename; inputUnigramXml = filename;
} }
} else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
inputShortcutXml = filename;
} else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) { } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
inputBigramXml = filename; inputBigramXml = filename;
} else if (OPTION_OUTPUT_BINARY.equals(arg)) { } else if (OPTION_OUTPUT_BINARY.equals(arg)) {
@ -143,6 +149,7 @@ public class DictionaryMaker {
mInputBinary = inputBinary; mInputBinary = inputBinary;
mInputUnigramXml = inputUnigramXml; mInputUnigramXml = inputUnigramXml;
mInputShortcutXml = inputShortcutXml;
mInputBigramXml = inputBigramXml; mInputBigramXml = inputBigramXml;
mOutputBinary = outputBinary; mOutputBinary = outputBinary;
mOutputXml = outputXml; mOutputXml = outputXml;
@ -170,7 +177,7 @@ public class DictionaryMaker {
if (null != args.mInputBinary) { if (null != args.mInputBinary) {
return readBinaryFile(args.mInputBinary); return readBinaryFile(args.mInputBinary);
} else if (null != args.mInputUnigramXml) { } else if (null != args.mInputUnigramXml) {
return readXmlFile(args.mInputUnigramXml, args.mInputBigramXml); return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
} else { } else {
throw new RuntimeException("No input file specified"); throw new RuntimeException("No input file specified");
} }
@ -195,6 +202,7 @@ public class DictionaryMaker {
* Read a dictionary from a unigram XML file, and optionally a bigram XML file. * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
* *
* @param unigramXmlFilename the name of the unigram XML file. May not be null. * @param unigramXmlFilename the name of the unigram XML file. May not be null.
* @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none.
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams. * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
* @return the read dictionary. * @return the read dictionary.
* @throws FileNotFoundException if one of the files can't be found * @throws FileNotFoundException if one of the files can't be found
@ -203,12 +211,14 @@ public class DictionaryMaker {
* @throws ParserConfigurationException if the system can't create a SAX parser * @throws ParserConfigurationException if the system can't create a SAX parser
*/ */
private static FusionDictionary readXmlFile(final String unigramXmlFilename, private static FusionDictionary readXmlFile(final String unigramXmlFilename,
final String bigramXmlFilename) throws FileNotFoundException, SAXException, final String shortcutXmlFilename, final String bigramXmlFilename)
IOException, ParserConfigurationException { throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename)); final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename));
final FileInputStream shortcuts = null == shortcutXmlFilename ? null :
new FileInputStream(new File(shortcutXmlFilename));
final FileInputStream bigrams = null == bigramXmlFilename ? null : final FileInputStream bigrams = null == bigramXmlFilename ? null :
new FileInputStream(new File(bigramXmlFilename)); new FileInputStream(new File(bigramXmlFilename));
return XmlDictInputOutput.readDictionaryXml(unigrams, bigrams); return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
} }
/** /**

View file

@ -61,6 +61,7 @@ public class XmlDictInputOutput {
int mState; // the state of the parser int mState; // the state of the parser
int mFreq; // the currently read freq int mFreq; // the currently read freq
String mWord; // the current word String mWord; // the current word
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
final HashMap<String, ArrayList<WeightedString>> mBigramsMap; final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
/** /**
@ -69,9 +70,11 @@ public class XmlDictInputOutput {
* @param dict the dictionary to construct. * @param dict the dictionary to construct.
* @param bigrams the bigrams as a map. This may be empty, but may not be null. * @param bigrams the bigrams as a map. This may be empty, but may not be null.
*/ */
public UnigramHandler(FusionDictionary dict, public UnigramHandler(final FusionDictionary dict,
HashMap<String, ArrayList<WeightedString>> bigrams) { final HashMap<String, ArrayList<WeightedString>> shortcuts,
final HashMap<String, ArrayList<WeightedString>> bigrams) {
mDictionary = dict; mDictionary = dict;
mShortcutsMap = shortcuts;
mBigramsMap = bigrams; mBigramsMap = bigrams;
mWord = ""; mWord = "";
mState = START; mState = START;
@ -107,8 +110,7 @@ public class XmlDictInputOutput {
@Override @Override
public void endElement(String uri, String localName, String qName) { public void endElement(String uri, String localName, String qName) {
if (WORD == mState) { if (WORD == mState) {
// TODO: pass the shortcut targets mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
mDictionary.add(mWord, mFreq, null, mBigramsMap.get(mWord));
mState = START; mState = START;
} }
} }
@ -208,9 +210,12 @@ public class XmlDictInputOutput {
* representation. * representation.
* *
* @param unigrams the file to read the data from. * @param unigrams the file to read the data from.
* @param shortcuts the file to read the shortcuts from, or null.
* @param bigrams the file to read the bigrams from, or null.
* @return the in-memory representation of the dictionary. * @return the in-memory representation of the dictionary.
*/ */
public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams) public static FusionDictionary readDictionaryXml(final InputStream unigrams,
final InputStream shortcuts, final InputStream bigrams)
throws SAXException, IOException, ParserConfigurationException { throws SAXException, IOException, ParserConfigurationException {
final SAXParserFactory factory = SAXParserFactory.newInstance(); final SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true); factory.setNamespaceAware(true);
@ -218,9 +223,13 @@ public class XmlDictInputOutput {
final BigramHandler bigramHandler = new BigramHandler(); final BigramHandler bigramHandler = new BigramHandler();
if (null != bigrams) parser.parse(bigrams, bigramHandler); if (null != bigrams) parser.parse(bigrams, bigramHandler);
final ShortcutHandler shortcutHandler = new ShortcutHandler();
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
final FusionDictionary dict = new FusionDictionary(); final FusionDictionary dict = new FusionDictionary();
final UnigramHandler unigramHandler = final UnigramHandler unigramHandler =
new UnigramHandler(dict, bigramHandler.getBigramMap()); new UnigramHandler(dict, shortcutHandler.getShortcutMap(),
bigramHandler.getBigramMap());
parser.parse(unigrams, unigramHandler); parser.parse(unigrams, unigramHandler);
return dict; return dict;
} }