am 342d5d5d: Wire the Xml-read shortcuts into the dict creation code (B6)
* commit '342d5d5dd095a5f73413a630b5de9db334ca45e4': Wire the Xml-read shortcuts into the dict creation code (B6)
This commit is contained in:
commit
41dbae9ea6
2 changed files with 31 additions and 12 deletions
|
@ -39,11 +39,13 @@ public class DictionaryMaker {
|
||||||
private final static String OPTION_VERSION_2 = "-2";
|
private final static String OPTION_VERSION_2 = "-2";
|
||||||
private final static String OPTION_INPUT_SOURCE = "-s";
|
private final static String OPTION_INPUT_SOURCE = "-s";
|
||||||
private final static String OPTION_INPUT_BIGRAM_XML = "-b";
|
private final static String OPTION_INPUT_BIGRAM_XML = "-b";
|
||||||
|
private final static String OPTION_INPUT_SHORTCUT_XML = "-c";
|
||||||
private final static String OPTION_OUTPUT_BINARY = "-d";
|
private final static String OPTION_OUTPUT_BINARY = "-d";
|
||||||
private final static String OPTION_OUTPUT_XML = "-x";
|
private final static String OPTION_OUTPUT_XML = "-x";
|
||||||
private final static String OPTION_HELP = "-h";
|
private final static String OPTION_HELP = "-h";
|
||||||
public final String mInputBinary;
|
public final String mInputBinary;
|
||||||
public final String mInputUnigramXml;
|
public final String mInputUnigramXml;
|
||||||
|
public final String mInputShortcutXml;
|
||||||
public final String mInputBigramXml;
|
public final String mInputBigramXml;
|
||||||
public final String mOutputBinary;
|
public final String mOutputBinary;
|
||||||
public final String mOutputXml;
|
public final String mOutputXml;
|
||||||
|
@ -72,8 +74,9 @@ public class DictionaryMaker {
|
||||||
|
|
||||||
private void displayHelp() {
|
private void displayHelp() {
|
||||||
MakedictLog.i("Usage: makedict "
|
MakedictLog.i("Usage: makedict "
|
||||||
+ "[-s <unigrams.xml> [-b <bigrams.xml>] | -s <binary input>] "
|
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] "
|
||||||
+ " [-d <binary output>] [-x <xml output>] [-2]\n"
|
+ "| -s <binary input>] "
|
||||||
|
+ "[-d <binary output>] [-x <xml output>] [-2]\n"
|
||||||
+ "\n"
|
+ "\n"
|
||||||
+ " Converts a source dictionary file to one or several outputs.\n"
|
+ " Converts a source dictionary file to one or several outputs.\n"
|
||||||
+ " Source can be an XML file, with an optional XML bigrams file, or a\n"
|
+ " Source can be an XML file, with an optional XML bigrams file, or a\n"
|
||||||
|
@ -90,6 +93,7 @@ public class DictionaryMaker {
|
||||||
}
|
}
|
||||||
String inputBinary = null;
|
String inputBinary = null;
|
||||||
String inputUnigramXml = null;
|
String inputUnigramXml = null;
|
||||||
|
String inputShortcutXml = null;
|
||||||
String inputBigramXml = null;
|
String inputBigramXml = null;
|
||||||
String outputBinary = null;
|
String outputBinary = null;
|
||||||
String outputXml = null;
|
String outputXml = null;
|
||||||
|
@ -116,6 +120,8 @@ public class DictionaryMaker {
|
||||||
} else {
|
} else {
|
||||||
inputUnigramXml = filename;
|
inputUnigramXml = filename;
|
||||||
}
|
}
|
||||||
|
} else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
|
||||||
|
inputShortcutXml = filename;
|
||||||
} else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
|
} else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
|
||||||
inputBigramXml = filename;
|
inputBigramXml = filename;
|
||||||
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
|
} else if (OPTION_OUTPUT_BINARY.equals(arg)) {
|
||||||
|
@ -143,6 +149,7 @@ public class DictionaryMaker {
|
||||||
|
|
||||||
mInputBinary = inputBinary;
|
mInputBinary = inputBinary;
|
||||||
mInputUnigramXml = inputUnigramXml;
|
mInputUnigramXml = inputUnigramXml;
|
||||||
|
mInputShortcutXml = inputShortcutXml;
|
||||||
mInputBigramXml = inputBigramXml;
|
mInputBigramXml = inputBigramXml;
|
||||||
mOutputBinary = outputBinary;
|
mOutputBinary = outputBinary;
|
||||||
mOutputXml = outputXml;
|
mOutputXml = outputXml;
|
||||||
|
@ -170,7 +177,7 @@ public class DictionaryMaker {
|
||||||
if (null != args.mInputBinary) {
|
if (null != args.mInputBinary) {
|
||||||
return readBinaryFile(args.mInputBinary);
|
return readBinaryFile(args.mInputBinary);
|
||||||
} else if (null != args.mInputUnigramXml) {
|
} else if (null != args.mInputUnigramXml) {
|
||||||
return readXmlFile(args.mInputUnigramXml, args.mInputBigramXml);
|
return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("No input file specified");
|
throw new RuntimeException("No input file specified");
|
||||||
}
|
}
|
||||||
|
@ -195,6 +202,7 @@ public class DictionaryMaker {
|
||||||
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
||||||
*
|
*
|
||||||
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
|
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
|
||||||
|
* @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none.
|
||||||
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
|
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
|
||||||
* @return the read dictionary.
|
* @return the read dictionary.
|
||||||
* @throws FileNotFoundException if one of the files can't be found
|
* @throws FileNotFoundException if one of the files can't be found
|
||||||
|
@ -203,12 +211,14 @@ public class DictionaryMaker {
|
||||||
* @throws ParserConfigurationException if the system can't create a SAX parser
|
* @throws ParserConfigurationException if the system can't create a SAX parser
|
||||||
*/
|
*/
|
||||||
private static FusionDictionary readXmlFile(final String unigramXmlFilename,
|
private static FusionDictionary readXmlFile(final String unigramXmlFilename,
|
||||||
final String bigramXmlFilename) throws FileNotFoundException, SAXException,
|
final String shortcutXmlFilename, final String bigramXmlFilename)
|
||||||
IOException, ParserConfigurationException {
|
throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
|
||||||
final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename));
|
final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename));
|
||||||
|
final FileInputStream shortcuts = null == shortcutXmlFilename ? null :
|
||||||
|
new FileInputStream(new File(shortcutXmlFilename));
|
||||||
final FileInputStream bigrams = null == bigramXmlFilename ? null :
|
final FileInputStream bigrams = null == bigramXmlFilename ? null :
|
||||||
new FileInputStream(new File(bigramXmlFilename));
|
new FileInputStream(new File(bigramXmlFilename));
|
||||||
return XmlDictInputOutput.readDictionaryXml(unigrams, bigrams);
|
return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -61,6 +61,7 @@ public class XmlDictInputOutput {
|
||||||
int mState; // the state of the parser
|
int mState; // the state of the parser
|
||||||
int mFreq; // the currently read freq
|
int mFreq; // the currently read freq
|
||||||
String mWord; // the current word
|
String mWord; // the current word
|
||||||
|
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
|
||||||
final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
|
final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -69,9 +70,11 @@ public class XmlDictInputOutput {
|
||||||
* @param dict the dictionary to construct.
|
* @param dict the dictionary to construct.
|
||||||
* @param bigrams the bigrams as a map. This may be empty, but may not be null.
|
* @param bigrams the bigrams as a map. This may be empty, but may not be null.
|
||||||
*/
|
*/
|
||||||
public UnigramHandler(FusionDictionary dict,
|
public UnigramHandler(final FusionDictionary dict,
|
||||||
HashMap<String, ArrayList<WeightedString>> bigrams) {
|
final HashMap<String, ArrayList<WeightedString>> shortcuts,
|
||||||
|
final HashMap<String, ArrayList<WeightedString>> bigrams) {
|
||||||
mDictionary = dict;
|
mDictionary = dict;
|
||||||
|
mShortcutsMap = shortcuts;
|
||||||
mBigramsMap = bigrams;
|
mBigramsMap = bigrams;
|
||||||
mWord = "";
|
mWord = "";
|
||||||
mState = START;
|
mState = START;
|
||||||
|
@ -107,8 +110,7 @@ public class XmlDictInputOutput {
|
||||||
@Override
|
@Override
|
||||||
public void endElement(String uri, String localName, String qName) {
|
public void endElement(String uri, String localName, String qName) {
|
||||||
if (WORD == mState) {
|
if (WORD == mState) {
|
||||||
// TODO: pass the shortcut targets
|
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
|
||||||
mDictionary.add(mWord, mFreq, null, mBigramsMap.get(mWord));
|
|
||||||
mState = START;
|
mState = START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -208,9 +210,12 @@ public class XmlDictInputOutput {
|
||||||
* representation.
|
* representation.
|
||||||
*
|
*
|
||||||
* @param unigrams the file to read the data from.
|
* @param unigrams the file to read the data from.
|
||||||
|
* @param shortcuts the file to read the shortcuts from, or null.
|
||||||
|
* @param bigrams the file to read the bigrams from, or null.
|
||||||
* @return the in-memory representation of the dictionary.
|
* @return the in-memory representation of the dictionary.
|
||||||
*/
|
*/
|
||||||
public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
|
public static FusionDictionary readDictionaryXml(final InputStream unigrams,
|
||||||
|
final InputStream shortcuts, final InputStream bigrams)
|
||||||
throws SAXException, IOException, ParserConfigurationException {
|
throws SAXException, IOException, ParserConfigurationException {
|
||||||
final SAXParserFactory factory = SAXParserFactory.newInstance();
|
final SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||||
factory.setNamespaceAware(true);
|
factory.setNamespaceAware(true);
|
||||||
|
@ -218,9 +223,13 @@ public class XmlDictInputOutput {
|
||||||
final BigramHandler bigramHandler = new BigramHandler();
|
final BigramHandler bigramHandler = new BigramHandler();
|
||||||
if (null != bigrams) parser.parse(bigrams, bigramHandler);
|
if (null != bigrams) parser.parse(bigrams, bigramHandler);
|
||||||
|
|
||||||
|
final ShortcutHandler shortcutHandler = new ShortcutHandler();
|
||||||
|
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
|
||||||
|
|
||||||
final FusionDictionary dict = new FusionDictionary();
|
final FusionDictionary dict = new FusionDictionary();
|
||||||
final UnigramHandler unigramHandler =
|
final UnigramHandler unigramHandler =
|
||||||
new UnigramHandler(dict, bigramHandler.getBigramMap());
|
new UnigramHandler(dict, shortcutHandler.getShortcutMap(),
|
||||||
|
bigramHandler.getBigramMap());
|
||||||
parser.parse(unigrams, unigramHandler);
|
parser.parse(unigrams, unigramHandler);
|
||||||
return dict;
|
return dict;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue