Move tools/makedict from platform/development to platform/packages/inputmethods/LatinIME

The corresponding change is I559207ab

Change-Id: I01ef7084cffa72468e87147e0ec7a9b16dd19990
This commit is contained in:
Ken Wakasa 2010-12-10 20:59:10 +09:00
parent 35bcd5c27b
commit bd6f4bfc59
7 changed files with 854 additions and 0 deletions

17
tools/Android.mk Normal file
View file

@ -0,0 +1,17 @@
# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOCAL_PATH := $(call my-dir)
include $(call all-makefiles-under,$(LOCAL_PATH))

24
tools/makedict/Android.mk Normal file
View file

@ -0,0 +1,24 @@
#
# Copyright (C) 2009 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_SRC_FILES := $(call all-java-files-under,src)
LOCAL_JAR_MANIFEST := etc/manifest.txt
LOCAL_MODULE := makedict
include $(BUILD_HOST_JAVA_LIBRARY)
include $(LOCAL_PATH)/etc/Android.mk

View file

@ -0,0 +1,20 @@
# Copyright (C) 2009 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_PREBUILT_EXECUTABLES := makedict
include $(BUILD_HOST_PREBUILT)

63
tools/makedict/etc/makedict Executable file
View file

@ -0,0 +1,63 @@
#!/bin/sh
# Copyright 2009, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Set up prog to be the path of this script, including following symlinks,
# and set up progdir to be the fully-qualified pathname of its directory.
prog="$0"
while [ -h "${prog}" ]; do
newProg=`/bin/ls -ld "${prog}"`
newProg=`expr "${newProg}" : ".* -> \(.*\)$"`
if expr "x${newProg}" : 'x/' >/dev/null; then
prog="${newProg}"
else
progdir=`dirname "${prog}"`
prog="${progdir}/${newProg}"
fi
done
oldwd=`pwd`
progdir=`dirname "${prog}"`
cd "${progdir}"
progdir=`pwd`
prog="${progdir}"/`basename "${prog}"`
cd "${oldwd}"
jarfile=makedict.jar
frameworkdir="$progdir"
if [ ! -r "$frameworkdir/$jarfile" ]
then
frameworkdir=`dirname "$progdir"`/tools/lib
libdir=`dirname "$progdir"`/tools/lib
fi
if [ ! -r "$frameworkdir/$jarfile" ]
then
frameworkdir=`dirname "$progdir"`/framework
libdir=`dirname "$progdir"`/lib
fi
if [ ! -r "$frameworkdir/$jarfile" ]
then
echo `basename "$prog"`": can't find $jarfile"
exit 1
fi
if [ "$OSTYPE" = "cygwin" ] ; then
jarpath=`cygpath -w "$frameworkdir/$jarfile"`
progdir=`cygpath -w "$progdir"`
else
jarpath="$frameworkdir/$jarfile"
fi
# need to use "java.ext.dirs" because "-jar" causes classpath to be ignored
# might need more memory, e.g. -Xmx128M
exec java -Djava.ext.dirs="$frameworkdir" -jar "$jarpath" "$@"

View file

@ -0,0 +1 @@
Main-Class: com.android.tools.dict.MakeBinaryDictionary

View file

@ -0,0 +1,286 @@
/*
* Copyright (C) 2010 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.tools.dict;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
/**
* Helper for MakeBinaryDictionary
* Deals with all the bigram data
*/
public class BigramDictionary {
/*
* Must match the values in the client side which is located in dictionary.cpp & dictionary.h
* Changing these values will generate totally different structure which must be also reflected
* on the client side.
*/
public static final int FLAG_BIGRAM_READ = 0x80;
public static final int FLAG_BIGRAM_CHILDEXIST = 0x40;
public static final int FLAG_BIGRAM_CONTINUED = 0x80;
public static final int FLAG_BIGRAM_FREQ = 0x7F;
public static final int FOR_REVERSE_LOOKUPALL = -99;
public ArrayList<String> mBigramToFill = new ArrayList<String>();
public ArrayList<Integer> mBigramToFillAddress = new ArrayList<Integer>();
public HashMap<String, Bigram> mBi;
public boolean mHasBigram;
public BigramDictionary(String bigramSrcFilename, boolean hasBigram) {
mHasBigram = hasBigram;
loadBigram(bigramSrcFilename);
}
private void loadBigram(String filename) {
mBi = new HashMap<String, Bigram>();
if (!mHasBigram) {
System.out.println("Number of bigrams = " + Bigram.sBigramNum);
return;
}
try {
SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(new File(filename), new DefaultHandler() {
String w1 = null;
boolean inWord1 = false;
boolean inWord2 = false;
int freq = 0, counter = 0;
Bigram tempBigram = null;
@Override
public void startElement(String uri, String localName,
String qName, Attributes attributes) {
if (qName.equals("bi")) {
inWord1 = true;
w1 = attributes.getValue(0);
int count = Integer.parseInt(attributes.getValue(1));
tempBigram = new Bigram(count);
counter = 0;
} else if (qName.equals("w")) {
inWord2 = true;
String word2 = attributes.getValue(0);
int freq = Integer.parseInt(attributes.getValue(1));
tempBigram.setWord2(counter, word2, freq);
counter++;
Bigram.sBigramNum++;
}
}
@Override
public void endElement(String uri, String localName,
String qName) {
if (inWord2) {
inWord2 = false;
} else if (inWord1) {
inWord1 = false;
mBi.put(w1, tempBigram);
}
}
});
} catch (Exception ioe) {
System.err.println("Exception in parsing bigram\n" + ioe);
ioe.printStackTrace();
}
System.out.println("Number of bigrams = " + Bigram.sBigramNum);
}
byte[] writeBigrams(byte[] dict, Map<String, Integer> mDictionary) {
for (int i = 0; i < mBigramToFill.size(); i++) {
String w1 = mBigramToFill.get(i);
int address = mBigramToFillAddress.get(i);
Bigram temp = mBi.get(w1);
int word2Count = temp.count;
int j4;
for (int j = 0; j < word2Count; j++) {
if (!mDictionary.containsKey(temp.word2[j])) {
System.out.println("Not in dictionary: " + temp.word2[j]);
System.exit(0);
} else {
j4 = (j * 4);
int addressOfWord2 = mDictionary.get(temp.word2[j]);
dict[address + j4 + 0] = (byte) (((addressOfWord2 & 0x3F0000) >> 16)
| FLAG_BIGRAM_READ);
dict[address + j4 + 1] = (byte) ((addressOfWord2 & 0x00FF00) >> 8);
dict[address + j4 + 2] = (byte) ((addressOfWord2 & 0x0000FF));
if (j == (word2Count - 1)) {
dict[address + j4 + 3] = (byte) (temp.freq[j] & FLAG_BIGRAM_FREQ);
} else {
dict[address + j4 + 3] = (byte) ((temp.freq[j] & FLAG_BIGRAM_FREQ)
| FLAG_BIGRAM_CONTINUED);
}
}
}
}
return dict;
}
void reverseLookupAll(Map<String, Integer> mDictionary, byte[] dict) {
Set<String> st = mDictionary.keySet();
for (String s : st) {
searchForTerminalNode(mDictionary.get(s), FOR_REVERSE_LOOKUPALL, dict);
}
}
void searchForTerminalNode(int bigramAddress, int frequency, byte[] dict) {
StringBuilder sb = new StringBuilder(48);
int pos;
boolean found = false;
int followDownBranchAddress = 2;
char followingChar = ' ';
int depth = 0;
int totalLoopCount = 0;
while (!found) {
boolean followDownAddressSearchStop = false;
boolean firstAddress = true;
boolean haveToSearchAll = true;
if (depth > 0) {
sb.append(followingChar);
}
pos = followDownBranchAddress; // pos start at count
int count = dict[pos] & 0xFF;
pos++;
for (int i = 0; i < count; i++) {
totalLoopCount++;
// pos at data
pos++;
// pos now at flag
if (!MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // non-terminal
if (!followDownAddressSearchStop) {
int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
if (addr > bigramAddress) {
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
followDownBranchAddress = addr;
followingChar = (char) (0xFF & dict[pos-1]);
if(firstAddress) {
firstAddress = false;
haveToSearchAll = false;
}
}
}
pos += 3;
} else if (MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // terminal
// found !!
if (bigramAddress == (pos-1)) {
sb.append((char) (0xFF & dict[pos-1]));
found = true;
break;
}
// address + freq (4 byte)
if (MakeBinaryDictionary.getSecondBitOfByte(pos, dict)) {
if (!followDownAddressSearchStop) {
int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
if (addr > bigramAddress) {
followDownAddressSearchStop = true;
if (firstAddress) {
firstAddress = false;
haveToSearchAll = true;
} else if (!haveToSearchAll) {
break;
}
} else {
followDownBranchAddress = addr;
followingChar = (char) (0xFF & dict[pos-1]);
if(firstAddress) {
firstAddress = false;
haveToSearchAll = true;
}
}
}
pos += 4;
} else { // freq only (2 byte)
pos += 2;
}
// skipping bigram
int bigramExist = (dict[pos] & FLAG_BIGRAM_READ);
if (bigramExist > 0) {
int nextBigramExist = 1;
while (nextBigramExist > 0) {
pos += 3;
nextBigramExist = (dict[pos++] & FLAG_BIGRAM_CONTINUED);
}
} else {
pos++;
}
}
}
depth++;
if (followDownBranchAddress == 2) {
System.out.println("ERROR!!! Cannot find bigram!!");
System.exit(0);
}
}
if (frequency == FOR_REVERSE_LOOKUPALL) {
System.out.println("Reverse: " + sb.toString() + " (" + bigramAddress + ")"
+ " Loop: " + totalLoopCount);
} else {
System.out.println(" bigram: " + sb.toString() + " (" + bigramAddress + ") freq: "
+ frequency + " Loop: " + totalLoopCount);
}
}
static class Bigram {
String[] word2;
int[] freq;
int count;
static int sBigramNum = 0;
String getSecondWord(int i) {
return word2[i];
}
int getFrequency(int i) {
return (freq[i] == 0) ? 1 : freq[i];
}
void setWord2(int index, String word2, int freq) {
this.word2[index] = word2;
this.freq[index] = freq;
}
public Bigram(int word2Count) {
count = word2Count;
word2 = new String[word2Count];
freq = new int[word2Count];
}
}
}

View file

@ -0,0 +1,443 @@
/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.tools.dict;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
/**
* Compresses a list of words, frequencies, and bigram data
* into a tree structured binary dictionary.
* Dictionary Version: 200 (may contain bigrams)
* Version number started from 200 rather than 1 because we wanted to prevent number of roots in
* any old dictionaries being mistaken as the version number. There is not a chance that there
* will be more than 200 roots. Version number should be increased when there is structural change
* in the data. There is no need to increase the version when only the words in the data changes.
*/
public class MakeBinaryDictionary {
private static final int VERSION_NUM = 200;
public static final int ALPHA_SIZE = 256;
public static final String TAG_WORD = "w";
public static final String ATTR_FREQ = "f";
private static final int FLAG_ADDRESS_MASK = 0x400000;
private static final int FLAG_TERMINAL_MASK = 0x800000;
private static final int ADDRESS_MASK = 0x3FFFFF;
/**
* Unit for this variable is in bytes
* If destination file name is main.dict and file limit causes dictionary to be separated into
* multiple file, it will generate main0.dict, main1.dict, and so forth.
*/
private static int sOutputFileSize;
private static boolean sSplitOutput;
public static final CharNode EMPTY_NODE = new CharNode();
List<CharNode> roots;
Map<String, Integer> mDictionary;
int mWordCount;
BigramDictionary bigramDict;
static class CharNode {
char data;
int freq;
boolean terminal;
List<CharNode> children;
static int sNodes;
public CharNode() {
sNodes++;
}
}
public static void usage() {
System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] "
+ "-d <dest.dict> [--size filesize]");
System.exit(-1);
}
public static void main(String[] args) {
int checkSource = -1;
int checkBigram = -1;
int checkDest = -1;
int checkFileSize = -1;
for (int i = 0; i < args.length; i+=2) {
if (args[i].equals("-s")) checkSource = (i + 1);
if (args[i].equals("-b")) checkBigram = (i + 1);
if (args[i].equals("-d")) checkDest = (i + 1);
if (args[i].equals("--size")) checkFileSize = (i + 1);
}
if (checkFileSize >= 0) {
sSplitOutput = true;
sOutputFileSize = Integer.parseInt(args[checkFileSize]);
} else {
sSplitOutput = false;
}
if (checkDest >= 0 && !args[checkDest].endsWith(".dict")) {
System.err.println("Error: Dictionary output file extension should be \".dict\"");
usage();
} else if (checkSource >= 0 && checkBigram >= 0 && checkDest >= 0 &&
((!sSplitOutput && args.length == 6) || (sSplitOutput && args.length == 8))) {
new MakeBinaryDictionary(args[checkSource], args[checkBigram], args[checkDest]);
} else if (checkSource >= 0 && checkDest >= 0 &&
((!sSplitOutput && args.length == 4) || (sSplitOutput && args.length == 6))) {
new MakeBinaryDictionary(args[checkSource], null, args[checkDest]);
} else {
usage();
}
}
public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){
System.out.println("Generating dictionary version " + VERSION_NUM);
bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null));
populateDictionary(srcFilename);
writeToDict(destFilename);
// Enable the code below to verify that the generated tree is traversable
// and bigram data is stored correctly.
if (false) {
bigramDict.reverseLookupAll(mDictionary, dict);
traverseDict(2, new char[32], 0);
}
}
private void populateDictionary(String filename) {
roots = new ArrayList<CharNode>();
mDictionary = new HashMap<String, Integer>();
try {
SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(new File(filename), new DefaultHandler() {
boolean inWord;
int freq;
StringBuilder wordBuilder = new StringBuilder(48);
@Override
public void startElement(String uri, String localName,
String qName, Attributes attributes) {
if (qName.equals("w")) {
inWord = true;
freq = Integer.parseInt(attributes.getValue(0));
wordBuilder.setLength(0);
}
}
@Override
public void characters(char[] data, int offset, int length) {
// Ignore other whitespace
if (!inWord) return;
wordBuilder.append(data, offset, length);
}
@Override
public void endElement(String uri, String localName,
String qName) {
if (qName.equals("w")) {
if (wordBuilder.length() > 1) {
addWordTop(wordBuilder.toString(), freq);
mWordCount++;
}
inWord = false;
}
}
});
} catch (Exception ioe) {
System.err.println("Exception in parsing\n" + ioe);
ioe.printStackTrace();
}
System.out.println("Nodes = " + CharNode.sNodes);
}
private int indexOf(List<CharNode> children, char c) {
if (children == null) {
return -1;
}
for (int i = 0; i < children.size(); i++) {
if (children.get(i).data == c) {
return i;
}
}
return -1;
}
private void addWordTop(String word, int occur) {
if (occur > 255) occur = 255;
char firstChar = word.charAt(0);
int index = indexOf(roots, firstChar);
if (index == -1) {
CharNode newNode = new CharNode();
newNode.data = firstChar;
newNode.freq = occur;
index = roots.size();
roots.add(newNode);
} else {
roots.get(index).freq += occur;
}
if (word.length() > 1) {
addWordRec(roots.get(index), word, 1, occur);
} else {
roots.get(index).terminal = true;
}
}
private void addWordRec(CharNode parent, String word, int charAt, int occur) {
CharNode child = null;
char data = word.charAt(charAt);
if (parent.children == null) {
parent.children = new ArrayList<CharNode>();
} else {
for (int i = 0; i < parent.children.size(); i++) {
CharNode node = parent.children.get(i);
if (node.data == data) {
child = node;
break;
}
}
}
if (child == null) {
child = new CharNode();
parent.children.add(child);
}
child.data = data;
if (child.freq == 0) child.freq = occur;
if (word.length() > charAt + 1) {
addWordRec(child, word, charAt + 1, occur);
} else {
child.terminal = true;
child.freq = occur;
}
}
byte[] dict;
int dictSize;
static final int CHAR_WIDTH = 8;
static final int FLAGS_WIDTH = 1; // Terminal flag (word end)
static final int ADDR_WIDTH = 23; // Offset to children
static final int FREQ_WIDTH_BYTES = 1;
static final int COUNT_WIDTH_BYTES = 1;
private void addCount(int count) {
dict[dictSize++] = (byte) (0xFF & count);
}
private void addNode(CharNode node, String word1) {
if (node.terminal) { // store address of each word1
mDictionary.put(word1, dictSize);
}
int charData = 0xFFFF & node.data;
if (charData > 254) {
dict[dictSize++] = (byte) 255;
dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
dict[dictSize++] = (byte) (node.data & 0xFF);
} else {
dict[dictSize++] = (byte) (0xFF & node.data);
}
if (node.children != null) {
dictSize += 3; // Space for children address
} else {
dictSize += 1; // Space for just the terminal/address flags
}
if ((0xFFFFFF & node.freq) > 255) {
node.freq = 255;
}
if (node.terminal) {
byte freq = (byte) (0xFF & node.freq);
dict[dictSize++] = freq;
// bigram
if (bigramDict.mBi.containsKey(word1)) {
int count = bigramDict.mBi.get(word1).count;
bigramDict.mBigramToFill.add(word1);
bigramDict.mBigramToFillAddress.add(dictSize);
dictSize += (4 * count);
} else {
dict[dictSize++] = (byte) (0x00);
}
}
}
int nullChildrenCount = 0;
int notTerminalCount = 0;
private void updateNodeAddress(int nodeAddress, CharNode node,
int childrenAddress) {
if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
nodeAddress += 2;
}
childrenAddress = ADDRESS_MASK & childrenAddress;
if (childrenAddress == 0) {
nullChildrenCount++;
} else {
childrenAddress |= FLAG_ADDRESS_MASK;
}
if (node.terminal) {
childrenAddress |= FLAG_TERMINAL_MASK;
} else {
notTerminalCount++;
}
dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
}
}
void writeWordsRec(List<CharNode> children, StringBuilder word) {
if (children == null || children.size() == 0) {
return;
}
final int childCount = children.size();
addCount(childCount);
int[] childrenAddresses = new int[childCount];
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
childrenAddresses[j] = dictSize;
word.append(children.get(j).data);
addNode(node, word.toString());
word.deleteCharAt(word.length()-1);
}
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
int nodeAddress = childrenAddresses[j];
int cacheDictSize = dictSize;
word.append(children.get(j).data);
writeWordsRec(node.children, word);
word.deleteCharAt(word.length()-1);
updateNodeAddress(nodeAddress, node, node.children != null
? cacheDictSize : 0);
}
}
void writeToDict(String dictFilename) {
// 4MB max, 22-bit offsets
dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
// < 1MB in most cases, as there is a limit in the
// resource size in apks.
dictSize = 0;
dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info
dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0));
StringBuilder word = new StringBuilder(48);
writeWordsRec(roots, word);
dict = bigramDict.writeBigrams(dict, mDictionary);
System.out.println("Dict Size = " + dictSize);
if (!sSplitOutput) {
sOutputFileSize = dictSize;
}
try {
int currentLoc = 0;
int i = 0;
int extension = dictFilename.indexOf(".dict");
String filename = dictFilename.substring(0, extension);
while (dictSize > 0) {
FileOutputStream fos;
if (sSplitOutput) {
fos = new FileOutputStream(filename + i + ".dict");
} else {
fos = new FileOutputStream(filename + ".dict");
}
if (dictSize > sOutputFileSize) {
fos.write(dict, currentLoc, sOutputFileSize);
dictSize -= sOutputFileSize;
currentLoc += sOutputFileSize;
} else {
fos.write(dict, currentLoc, dictSize);
dictSize = 0;
}
fos.close();
i++;
}
} catch (IOException ioe) {
System.err.println("Error writing dict file:" + ioe);
}
}
void traverseDict(int pos, char[] word, int depth) {
int count = dict[pos++] & 0xFF;
for (int i = 0; i < count; i++) {
char c = (char) (dict[pos++] & 0xFF);
if (c == 0xFF) { // two byte character
c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
pos += 2;
}
word[depth] = c;
boolean terminal = getFirstBitOfByte(pos, dict);
int address = 0;
if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check
address = get22BitAddress(pos, dict);
pos += 3;
} else {
pos += 1;
}
if (terminal) {
showWord(word, depth + 1, dict[pos] & 0xFF);
pos++;
int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ);
if (bigramExist > 0) {
int nextBigramExist = 1;
while (nextBigramExist > 0) {
int bigramAddress = get22BitAddress(pos, dict);
pos += 3;
int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]);
bigramDict.searchForTerminalNode(bigramAddress, frequency, dict);
nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED);
}
} else {
pos++;
}
}
if (address != 0) {
traverseDict(address, word, depth + 1);
}
}
}
void showWord(char[] word, int size, int freq) {
System.out.print(new String(word, 0, size) + " " + freq + "\n");
}
static int get22BitAddress(int pos, byte[] dict) {
return ((dict[pos + 0] & 0x3F) << 16)
| ((dict[pos + 1] & 0xFF) << 8)
| ((dict[pos + 2] & 0xFF));
}
static boolean getFirstBitOfByte(int pos, byte[] dict) {
return (dict[pos] & 0x80) > 0;
}
static boolean getSecondBitOfByte(int pos, byte[] dict) {
return (dict[pos] & 0x40) > 0;
}
}