Merge "Support a syntax to add whitelist entries in shortcuts" into jb-mr1-dev
commit
e6af6c2f08
|
@ -112,7 +112,7 @@ public class DictionaryMaker {
|
||||||
|
|
||||||
public static String getHelp() {
|
public static String getHelp() {
|
||||||
return "Usage: makedict "
|
return "Usage: makedict "
|
||||||
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] "
|
+ "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
|
||||||
+ "| -s <binary input>] [-d <binary output format version 2>] "
|
+ "| -s <binary input>] [-d <binary output format version 2>] "
|
||||||
+ "[-d1 <binary output format version 1>] [-x <xml output>] [-2]\n"
|
+ "[-d1 <binary output format version 1>] [-x <xml output>] [-2]\n"
|
||||||
+ "\n"
|
+ "\n"
|
||||||
|
@ -246,7 +246,7 @@ public class DictionaryMaker {
|
||||||
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
* Read a dictionary from a unigram XML file, and optionally a bigram XML file.
|
||||||
*
|
*
|
||||||
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
|
* @param unigramXmlFilename the name of the unigram XML file. May not be null.
|
||||||
* @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none.
|
* @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
|
||||||
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
|
* @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
|
||||||
* @return the read dictionary.
|
* @return the read dictionary.
|
||||||
* @throws FileNotFoundException if one of the files can't be found
|
* @throws FileNotFoundException if one of the files can't be found
|
||||||
|
|
|
@ -179,7 +179,7 @@ public class XmlDictInputOutput {
|
||||||
mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
|
mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
|
||||||
} else if (DST_TAG.equals(localName)) {
|
} else if (DST_TAG.equals(localName)) {
|
||||||
String dst = attrs.getValue(uri, DST_ATTRIBUTE);
|
String dst = attrs.getValue(uri, DST_ATTRIBUTE);
|
||||||
int freq = Integer.parseInt(attrs.getValue(uri, DST_FREQ));
|
int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
|
||||||
WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
|
WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
|
||||||
ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
|
ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
|
||||||
if (null == bigramList) bigramList = new ArrayList<WeightedString>();
|
if (null == bigramList) bigramList = new ArrayList<WeightedString>();
|
||||||
|
@ -188,6 +188,10 @@ public class XmlDictInputOutput {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts the raw frequency string read from an XML attribute into an integer.
 *
 * This base implementation only understands what Integer.parseInt accepts. It is
 * protected so that a handler subclass can override it to accept other spellings.
 *
 * @param freqString the frequency attribute value as read from the XML.
 * @return the parsed integer frequency.
 * @throws NumberFormatException if the string is null or not a parsable integer.
 */
protected int getValueFromFreqString(final String freqString) {
|
||||||
|
return Integer.parseInt(freqString);
|
||||||
|
}
|
||||||
|
|
||||||
// This may return an empty map, but will never return null.
|
// This may return an empty map, but will never return null.
|
||||||
public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
|
public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
|
||||||
return mAssocMap;
|
return mAssocMap;
|
||||||
|
@ -216,22 +220,40 @@ public class XmlDictInputOutput {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SAX handler for a shortcut XML file.
|
* SAX handler for a shortcut & whitelist XML file.
|
||||||
*/
|
*/
|
||||||
static private class ShortcutHandler extends AssociativeListHandler {
|
static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
|
||||||
private final static String ENTRY_TAG = "entry";
|
private final static String ENTRY_TAG = "entry";
|
||||||
private final static String ENTRY_ATTRIBUTE = "shortcut";
|
private final static String ENTRY_ATTRIBUTE = "shortcut";
|
||||||
private final static String TARGET_TAG = "target";
|
private final static String TARGET_TAG = "target";
|
||||||
private final static String REPLACEMENT_ATTRIBUTE = "replacement";
|
private final static String REPLACEMENT_ATTRIBUTE = "replacement";
|
||||||
private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
|
private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
|
||||||
|
private final static String WHITELIST_MARKER = "whitelist";
|
||||||
|
private final static int WHITELIST_FREQ_VALUE = 15;
|
||||||
|
private final static int MIN_FREQ = 0;
|
||||||
|
private final static int MAX_FREQ = 14;
|
||||||
|
|
||||||
public ShortcutHandler() {
|
public ShortcutAndWhitelistHandler() {
|
||||||
super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
|
super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
|
||||||
TARGET_PRIORITY_ATTRIBUTE);
|
TARGET_PRIORITY_ATTRIBUTE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts the frequency string of a shortcut/whitelist target into an integer.
 *
 * The literal string WHITELIST_MARKER ("whitelist") yields WHITELIST_FREQ_VALUE (15).
 * Any other string is parsed by the superclass implementation and must lie within
 * MIN_FREQ..MAX_FREQ (0..14), bounds included.
 *
 * @param freqString the frequency attribute value as read from the XML.
 * @return WHITELIST_FREQ_VALUE for the whitelist marker, otherwise the parsed value.
 * @throws RuntimeException if the parsed value is outside MIN_FREQ..MAX_FREQ.
 * @throws NumberFormatException if the string is neither the marker nor a parsable integer.
 */
@Override
|
||||||
|
protected int getValueFromFreqString(final String freqString) {
|
||||||
|
// Constant-first equals() keeps this comparison safe when freqString is null.
if (WHITELIST_MARKER.equals(freqString)) {
|
||||||
|
return WHITELIST_FREQ_VALUE;
|
||||||
|
} else {
|
||||||
|
final int intValue = super.getValueFromFreqString(freqString);
|
||||||
|
// WHITELIST_FREQ_VALUE (15) sits just above MAX_FREQ (14), so rejecting values
// outside the range also prevents a numeric entry from posing as a whitelist entry.
if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
|
||||||
|
throw new RuntimeException("Shortcut freq out of range. Accepted range is "
|
||||||
|
+ MIN_FREQ + ".." + MAX_FREQ);
|
||||||
|
}
|
||||||
|
return intValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// As per getAssocMap(), this never returns null.
|
// As per getAssocMap(), this never returns null.
|
||||||
public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
|
public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
|
||||||
return getAssocMap();
|
return getAssocMap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -243,7 +265,7 @@ public class XmlDictInputOutput {
|
||||||
* representation.
|
* representation.
|
||||||
*
|
*
|
||||||
* @param unigrams the file to read the data from.
|
* @param unigrams the file to read the data from.
|
||||||
* @param shortcuts the file to read the shortcuts from, or null.
|
* @param shortcuts the file to read the shortcuts & whitelist from, or null.
|
||||||
* @param bigrams the file to read the bigrams from, or null.
|
* @param bigrams the file to read the bigrams from, or null.
|
||||||
* @return the in-memory representation of the dictionary.
|
* @return the in-memory representation of the dictionary.
|
||||||
*/
|
*/
|
||||||
|
@ -256,11 +278,12 @@ public class XmlDictInputOutput {
|
||||||
final BigramHandler bigramHandler = new BigramHandler();
|
final BigramHandler bigramHandler = new BigramHandler();
|
||||||
if (null != bigrams) parser.parse(bigrams, bigramHandler);
|
if (null != bigrams) parser.parse(bigrams, bigramHandler);
|
||||||
|
|
||||||
final ShortcutHandler shortcutHandler = new ShortcutHandler();
|
final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
|
||||||
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
|
new ShortcutAndWhitelistHandler();
|
||||||
|
if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
|
||||||
|
|
||||||
final UnigramHandler unigramHandler =
|
final UnigramHandler unigramHandler =
|
||||||
new UnigramHandler(shortcutHandler.getShortcutMap());
|
new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
|
||||||
parser.parse(unigrams, unigramHandler);
|
parser.parse(unigrams, unigramHandler);
|
||||||
final FusionDictionary dict = unigramHandler.getFinalDictionary();
|
final FusionDictionary dict = unigramHandler.getFinalDictionary();
|
||||||
final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
|
final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
|
||||||
|
@ -280,7 +303,7 @@ public class XmlDictInputOutput {
|
||||||
*
|
*
|
||||||
* This method reads data from the parser and creates a new FusionDictionary with it.
|
* This method reads data from the parser and creates a new FusionDictionary with it.
|
||||||
* The format parsed by this method is the format used before Ice Cream Sandwich,
|
* The format parsed by this method is the format used before Ice Cream Sandwich,
|
||||||
* which has no support for bigrams or shortcuts.
|
* which has no support for bigrams or shortcuts/whitelist.
|
||||||
* It is important to note that this method expects the parser to have already eaten
|
* It is important to note that this method expects the parser to have already eaten
|
||||||
* the first, all-encompassing tag.
|
* the first, all-encompassing tag.
|
||||||
*
|
*
|
||||||
|
@ -291,7 +314,7 @@ public class XmlDictInputOutput {
|
||||||
/**
|
/**
|
||||||
* Writes a dictionary to an XML file.
|
* Writes a dictionary to an XML file.
|
||||||
*
|
*
|
||||||
* The output format is the "second" format, which supports bigrams and shortcuts.
|
* The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
|
||||||
*
|
*
|
||||||
* @param destination a destination stream to write to.
|
* @param destination a destination stream to write to.
|
||||||
* @param dict the dictionary to write.
|
* @param dict the dictionary to write.
|
||||||
|
|
Loading…
Reference in New Issue