Accept comments in the combined file format

Change-Id: I17b745281b8b7b1922e1c2d3717986e9bb11aa0b
main
Jean Chalard 2012-10-29 17:21:50 +09:00
parent cfbfb1c50a
commit e5b68fb51d
1 changed file with 10 additions and 2 deletions

View File

@ -52,6 +52,7 @@ public class CombinedInputOutput {
private static final String OPTIONS_TAG = "options";
private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
private static final String COMMENT_LINE_STARTER = "#";
/**
* Basic test to find out whether the file is in the combined format or not.
@ -65,7 +66,10 @@ public class CombinedInputOutput {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(new File(filename)));
final String firstLine = reader.readLine();
String firstLine = reader.readLine();
while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
firstLine = reader.readLine();
}
return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
} catch (FileNotFoundException e) {
return false;
@ -94,7 +98,10 @@ public class CombinedInputOutput {
public static FusionDictionary readDictionaryCombined(final InputStream source)
throws IOException {
final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8"));
final String headerLine = reader.readLine();
String headerLine = reader.readLine();
while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
headerLine = reader.readLine();
}
final String header[] = headerLine.split(",");
final HashMap<String, String> attributes = new HashMap<String, String>();
for (String item : header) {
@ -120,6 +127,7 @@ public class CombinedInputOutput {
ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>();
ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>();
while (null != (line = reader.readLine())) {
if (line.startsWith(COMMENT_LINE_STARTER)) continue;
final String args[] = line.trim().split(",");
if (args[0].matches(WORD_TAG + "=.*")) {
if (null != word) {