Merge "Include a code point table in the binary dictionary."

This commit is contained in:
Akifumi Yoshimoto 2014-10-02 08:55:18 +00:00 committed by Android (Google) Code Review
commit 7e5614520a
9 changed files with 162 additions and 74 deletions

View file

@ -163,13 +163,15 @@ public final class FormatSpec {
static final int NOT_A_VERSION_NUMBER = -1; static final int NOT_A_VERSION_NUMBER = -1;
// These MUST have the same values as the relevant constants in format_utils.h. // These MUST have the same values as the relevant constants in format_utils.h.
// From version 4 on, we use version * 100 + revision as a version number. That allows // From version 2.01 on, we use version * 100 + revision as a version number. That allows
// us to change the format during development while having testing devices remove // us to change the format during development while having testing devices remove
// older files with each upgrade, while still having a readable versioning scheme. // older files with each upgrade, while still having a readable versioning scheme.
// When we bump up the dictionary format version, we should update // When we bump up the dictionary format version, we should update
// ExpandableDictionary.needsToMigrateDictionary() and // ExpandableDictionary.needsToMigrateDictionary() and
// ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType().
public static final int VERSION2 = 2; public static final int VERSION2 = 2;
public static final int VERSION201 = 201;
public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201;
// Dictionary version used for testing. // Dictionary version used for testing.
public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION4_ONLY_FOR_TESTING = 399;
public static final int VERSION401 = 401; public static final int VERSION401 = 401;

View file

@ -312,7 +312,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer); final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer);
for (final String word : sWords) { for (final String word : sWords) {
Arrays.fill(buffer, (byte) 0); Arrays.fill(buffer, (byte) 0);
CharEncoding.writeString(buffer, 0, word); CharEncoding.writeString(buffer, 0, word, null);
dictBuffer.position(0); dictBuffer.position(0);
final String str = CharEncoding.readString(dictBuffer); final String str = CharEncoding.readString(dictBuffer);
assertEquals(word, str); assertEquals(word, str);

View file

@ -17,11 +17,11 @@
package com.android.inputmethod.latin.makedict; package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.annotations.UsedForTesting;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.HashMap;
/** /**
* Decodes binary files for a FusionDictionary. * Decodes binary files for a FusionDictionary.
@ -109,15 +109,19 @@ public final class BinaryDictDecoderUtils {
* A class grouping utility function for our specific character encoding. * A class grouping utility function for our specific character encoding.
*/ */
static final class CharEncoding { static final class CharEncoding {
private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
/** /**
* Helper method to find out whether this code fits on one byte * Helper method to find out whether this code fits on one byte
*/ */
private static boolean fitsOnOneByte(final int character) { private static boolean fitsOnOneByte(int character,
return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
&& character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; if (codePointToOneByteCodeMap != null) {
if (codePointToOneByteCodeMap.containsKey(character)) {
character = codePointToOneByteCodeMap.get(character);
}
}
return character >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
&& character <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
} }
/** /**
@ -137,9 +141,10 @@ public final class BinaryDictDecoderUtils {
* @param character the character code. * @param character the character code.
* @return the size in binary encoded-form, either 1 or 3 bytes. * @return the size in binary encoded-form, either 1 or 3 bytes.
*/ */
static int getCharSize(final int character) { static int getCharSize(final int character,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
// See char encoding in FusionDictionary.java // See char encoding in FusionDictionary.java
if (fitsOnOneByte(character)) return 1; if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
if (FormatSpec.INVALID_CHARACTER == character) return 1; if (FormatSpec.INVALID_CHARACTER == character) return 1;
return 3; return 3;
} }
@ -147,9 +152,10 @@ public final class BinaryDictDecoderUtils {
/** /**
* Compute the byte size of a character array. * Compute the byte size of a character array.
*/ */
static int getCharArraySize(final int[] chars) { static int getCharArraySize(final int[] chars,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int size = 0; int size = 0;
for (int character : chars) size += getCharSize(character); for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
return size; return size;
} }
@ -159,11 +165,19 @@ public final class BinaryDictDecoderUtils {
* @param codePoints the code point array to write. * @param codePoints the code point array to write.
* @param buffer the byte buffer to write to. * @param buffer the byte buffer to write to.
* @param index the index in buffer to write the character array to. * @param index the index in buffer to write the character array to.
* @param codePointToOneByteCodeMap the map to convert the code point.
* @return the index after the last character. * @return the index after the last character.
*/ */
static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { static int writeCharArray(final int[] codePoints, final byte[] buffer, int index,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
for (int codePoint : codePoints) { for (int codePoint : codePoints) {
if (1 == getCharSize(codePoint)) { if (codePointToOneByteCodeMap != null) {
if (codePointToOneByteCodeMap.containsKey(codePoint)) {
// Convert code points
codePoint = codePointToOneByteCodeMap.get(codePoint);
}
}
if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
buffer[index++] = (byte)codePoint; buffer[index++] = (byte)codePoint;
} else { } else {
buffer[index++] = (byte)(0xFF & (codePoint >> 16)); buffer[index++] = (byte)(0xFF & (codePoint >> 16));
@ -184,12 +198,19 @@ public final class BinaryDictDecoderUtils {
* @param word the string to write. * @param word the string to write.
* @return the size written, in bytes. * @return the size written, in bytes.
*/ */
static int writeString(final byte[] buffer, final int origin, final String word) { static int writeString(final byte[] buffer, final int origin, final String word,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
final int length = word.length(); final int length = word.length();
int index = origin; int index = origin;
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i); int codePoint = word.codePointAt(i);
if (1 == getCharSize(codePoint)) { if (codePointToOneByteCodeMap != null) {
if (codePointToOneByteCodeMap.containsKey(codePoint)) {
// Convert code points
codePoint = codePointToOneByteCodeMap.get(codePoint);
}
}
if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
buffer[index++] = (byte)codePoint; buffer[index++] = (byte)codePoint;
} else { } else {
buffer[index++] = (byte)(0xFF & (codePoint >> 16)); buffer[index++] = (byte)(0xFF & (codePoint >> 16));
@ -210,12 +231,13 @@ public final class BinaryDictDecoderUtils {
* @param word the string to write. * @param word the string to write.
* @return the size written, in bytes. * @return the size written, in bytes.
*/ */
static int writeString(final OutputStream stream, final String word) throws IOException { static int writeString(final OutputStream stream, final String word,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
final int length = word.length(); final int length = word.length();
int written = 0; int written = 0;
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i); final int codePoint = word.codePointAt(i);
final int charSize = getCharSize(codePoint); final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
if (1 == charSize) { if (1 == charSize) {
stream.write((byte) codePoint); stream.write((byte) codePoint);
} else { } else {
@ -253,7 +275,7 @@ public final class BinaryDictDecoderUtils {
*/ */
static int readChar(final DictBuffer dictBuffer) { static int readChar(final DictBuffer dictBuffer) {
int character = dictBuffer.readUnsignedByte(); int character = dictBuffer.readUnsignedByte();
if (!fitsOnOneByte(character)) { if (!fitsOnOneByte(character, null)) {
if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
return FormatSpec.INVALID_CHARACTER; return FormatSpec.INVALID_CHARACTER;
} }

View file

@ -61,8 +61,9 @@ public class BinaryDictEncoderUtils {
* @param characters the character array * @param characters the character array
* @return the size of the char array, including the terminator if any * @return the size of the char array, including the terminator if any
*/ */
static int getPtNodeCharactersSize(final int[] characters) { static int getPtNodeCharactersSize(final int[] characters,
int size = CharEncoding.getCharArraySize(characters); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap);
if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE;
return size; return size;
} }
@ -76,8 +77,9 @@ public class BinaryDictEncoderUtils {
* @param ptNode the PtNode * @param ptNode the PtNode
* @return the size of the char array, including the terminator if any * @return the size of the char array, including the terminator if any
*/ */
private static int getPtNodeCharactersSize(final PtNode ptNode) { private static int getPtNodeCharactersSize(final PtNode ptNode,
return getPtNodeCharactersSize(ptNode.mChars); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap);
} }
/** /**
@ -92,13 +94,14 @@ public class BinaryDictEncoderUtils {
/** /**
* Compute the size of a shortcut in bytes. * Compute the size of a shortcut in bytes.
*/ */
private static int getShortcutSize(final WeightedString shortcut) { private static int getShortcutSize(final WeightedString shortcut,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
final String word = shortcut.mWord; final String word = shortcut.mWord;
final int length = word.length(); final int length = word.length();
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i); final int codePoint = word.codePointAt(i);
size += CharEncoding.getCharSize(codePoint); size += CharEncoding.getCharSize(codePoint, codePointToOneByteCodeMap);
} }
size += FormatSpec.PTNODE_TERMINATOR_SIZE; size += FormatSpec.PTNODE_TERMINATOR_SIZE;
return size; return size;
@ -110,11 +113,12 @@ public class BinaryDictEncoderUtils {
* This is known in advance and does not change according to position in the file * This is known in advance and does not change according to position in the file
* like address lists do. * like address lists do.
*/ */
static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { static int getShortcutListSize(final ArrayList<WeightedString> shortcutList,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
if (null == shortcutList || shortcutList.isEmpty()) return 0; if (null == shortcutList || shortcutList.isEmpty()) return 0;
int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
for (final WeightedString shortcut : shortcutList) { for (final WeightedString shortcut : shortcutList) {
size += getShortcutSize(shortcut); size += getShortcutSize(shortcut, codePointToOneByteCodeMap);
} }
return size; return size;
} }
@ -125,14 +129,16 @@ public class BinaryDictEncoderUtils {
* @param ptNode the PtNode to compute the size of. * @param ptNode the PtNode to compute the size of.
* @return the maximum size of the PtNode. * @return the maximum size of the PtNode.
*/ */
private static int getPtNodeMaximumSize(final PtNode ptNode) { private static int getPtNodeMaximumSize(final PtNode ptNode,
int size = getNodeHeaderSize(ptNode); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
if (ptNode.isTerminal()) { if (ptNode.isTerminal()) {
// If terminal, one byte for the frequency. // If terminal, one byte for the frequency.
size += FormatSpec.PTNODE_FREQUENCY_SIZE; size += FormatSpec.PTNODE_FREQUENCY_SIZE;
} }
size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
size += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts.
size += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */);
if (null != ptNode.mBigrams) { if (null != ptNode.mBigrams) {
size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE
+ FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE)
@ -148,10 +154,11 @@ public class BinaryDictEncoderUtils {
* *
* @param ptNodeArray the node array to compute the maximum size of. * @param ptNodeArray the node array to compute the maximum size of.
*/ */
private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) { private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int size = getPtNodeCountSize(ptNodeArray); int size = getPtNodeCountSize(ptNodeArray);
for (PtNode node : ptNodeArray.mData) { for (PtNode node : ptNodeArray.mData) {
final int nodeSize = getPtNodeMaximumSize(node); final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap);
node.mCachedSize = nodeSize; node.mCachedSize = nodeSize;
size += nodeSize; size += nodeSize;
} }
@ -163,8 +170,10 @@ public class BinaryDictEncoderUtils {
* *
* @param ptNode the PtNode of which to compute the size of the header * @param ptNode the PtNode of which to compute the size of the header
*/ */
private static int getNodeHeaderSize(final PtNode ptNode) { private static int getNodeHeaderSize(final PtNode ptNode,
return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode,
codePointToOneByteCodeMap);
} }
/** /**
@ -367,7 +376,8 @@ public class BinaryDictEncoderUtils {
* @return false if none of the cached addresses inside the node array changed, true otherwise. * @return false if none of the cached addresses inside the node array changed, true otherwise.
*/ */
private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray,
final FusionDictionary dict) { final FusionDictionary dict,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
boolean changed = false; boolean changed = false;
int size = getPtNodeCountSize(ptNodeArray); int size = getPtNodeCountSize(ptNodeArray);
for (PtNode ptNode : ptNodeArray.mData) { for (PtNode ptNode : ptNodeArray.mData) {
@ -375,7 +385,7 @@ public class BinaryDictEncoderUtils {
if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) {
changed = true; changed = true;
} }
int nodeSize = getNodeHeaderSize(ptNode); int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
if (ptNode.isTerminal()) { if (ptNode.isTerminal()) {
nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
} }
@ -383,7 +393,9 @@ public class BinaryDictEncoderUtils {
nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
nodeSize + size, ptNode.mChildren)); nodeSize + size, ptNode.mChildren));
} }
nodeSize += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts.
nodeSize += getShortcutListSize(ptNode.mShortcutTargets,
null /* codePointToOneByteCodeMap */);
if (null != ptNode.mBigrams) { if (null != ptNode.mBigrams) {
for (WeightedString bigram : ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) {
final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
@ -454,10 +466,11 @@ public class BinaryDictEncoderUtils {
* @return the same array it was passed. The nodes have been updated for address and size. * @return the same array it was passed. The nodes have been updated for address and size.
*/ */
/* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict,
final ArrayList<PtNodeArray> flatNodes) { final ArrayList<PtNodeArray> flatNodes,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
// First get the worst possible sizes and offsets // First get the worst possible sizes and offsets
for (final PtNodeArray n : flatNodes) { for (final PtNodeArray n : flatNodes) {
calculatePtNodeArrayMaximumSize(n); calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap);
} }
final int offset = initializePtNodeArraysCachedAddresses(flatNodes); final int offset = initializePtNodeArraysCachedAddresses(flatNodes);
@ -472,7 +485,8 @@ public class BinaryDictEncoderUtils {
for (final PtNodeArray ptNodeArray : flatNodes) { for (final PtNodeArray ptNodeArray : flatNodes) {
ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset;
final int oldNodeArraySize = ptNodeArray.mCachedSize; final int oldNodeArraySize = ptNodeArray.mCachedSize;
final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict); final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict,
codePointToOneByteCodeMap);
final int newNodeArraySize = ptNodeArray.mCachedSize; final int newNodeArraySize = ptNodeArray.mCachedSize;
if (oldNodeArraySize < newNodeArraySize) { if (oldNodeArraySize < newNodeArraySize) {
throw new RuntimeException("Increased size ?!"); throw new RuntimeException("Increased size ?!");
@ -686,9 +700,10 @@ public class BinaryDictEncoderUtils {
+ (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY);
} }
/* package */ static final int getChildrenPosition(final PtNode ptNode) { /* package */ static final int getChildrenPosition(final PtNode ptNode,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate
+ getNodeHeaderSize(ptNode); + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
if (ptNode.isTerminal()) { if (ptNode.isTerminal()) {
// A terminal node has the frequency. // A terminal node has the frequency.
// If positionOfChildrenPosField is incorrect, we may crash when jumping to the children // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children
@ -705,10 +720,12 @@ public class BinaryDictEncoderUtils {
* @param dict the dictionary the node array is a part of (for relative offsets). * @param dict the dictionary the node array is a part of (for relative offsets).
* @param dictEncoder the dictionary encoder. * @param dictEncoder the dictionary encoder.
* @param ptNodeArray the node array to write. * @param ptNodeArray the node array to write.
* @param codePointToOneByteCodeMap the map to convert the code points.
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
/* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict,
final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) { final DictEncoder dictEncoder, final PtNodeArray ptNodeArray,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
// TODO: Make the code in common with BinaryDictIOUtils#writePtNode // TODO: Make the code in common with BinaryDictIOUtils#writePtNode
dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate);
@ -727,7 +744,7 @@ public class BinaryDictEncoderUtils {
+ FormatSpec.MAX_TERMINAL_FREQUENCY + FormatSpec.MAX_TERMINAL_FREQUENCY
+ " : " + ptNode.mProbabilityInfo.toString()); + " : " + ptNode.mProbabilityInfo.toString());
} }
dictEncoder.writePtNode(ptNode, dict); dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap);
} }
if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate
+ ptNodeArray.mCachedSize) { + ptNodeArray.mCachedSize) {
@ -834,12 +851,16 @@ public class BinaryDictEncoderUtils {
// Write out the options. // Write out the options.
for (final String key : dict.mOptions.mAttributes.keySet()) { for (final String key : dict.mOptions.mAttributes.keySet()) {
final String value = dict.mOptions.mAttributes.get(key); final String value = dict.mOptions.mAttributes.get(key);
CharEncoding.writeString(headerBuffer, key); CharEncoding.writeString(headerBuffer, key, null);
CharEncoding.writeString(headerBuffer, value); CharEncoding.writeString(headerBuffer, value, null);
}
// Write out the code point table if a codePointOccurrenceArray was provided.
if (codePointOccurrenceArray != null) {
final String codePointTableString =
encodeCodePointTable(codePointOccurrenceArray);
CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null);
CharEncoding.writeString(headerBuffer, codePointTableString, null);
} }
// TODO: Write out the code point table.
final int size = headerBuffer.size(); final int size = headerBuffer.size();
final byte[] bytes = headerBuffer.toByteArray(); final byte[] bytes = headerBuffer.toByteArray();
// Write out the header size. // Write out the header size.
@ -857,10 +878,30 @@ public class BinaryDictEncoderUtils {
final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; final HashMap<Integer, Integer> mCodePointToOneByteCodeMap;
final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray;
/**
 * Creates an empty code point table: both the code-point-to-one-byte-code map and the
 * code point occurrence array are left null.
 *
 * Ver2DictEncoder#writeDictionary builds this variant when the requested format version
 * is below FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE (the version 200
 * dictionary, which is used in tests). With a null occurrence array the header writer
 * skips the code point table entry, and with a null map the CharEncoding helpers apply
 * no code point conversion.
 */
CodePointTable() {
mCodePointToOneByteCodeMap = null;
mCodePointOccurrenceArray = null;
}
CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap,
final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) {
mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; mCodePointToOneByteCodeMap = codePointToOneByteCodeMap;
mCodePointOccurrenceArray = codePointOccurrenceArray; mCodePointOccurrenceArray = codePointOccurrenceArray;
} }
} }
/**
 * Serializes the code point table as a string for the dictionary header.
 *
 * The key of each entry is appended as a code point, in list order. Output stops once one
 * entry has been emitted for every value from FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
 * to FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE inclusive; any further entries are left
 * out of the table.
 *
 * @param codePointOccurrenceArray the entries whose keys are written; must not be null.
 *        Presumably code points ordered by occurrence -- confirm against makeCodePointTable.
 * @return the string made of the retained entry keys.
 */
private static String encodeCodePointTable(
        final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) {
    final StringBuilder table = new StringBuilder();
    // Number of entries that may still be emitted before the one-byte range is used up.
    int slotsLeft = FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE
            - FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE + 1;
    for (final Entry<Integer, Integer> occurrence : codePointOccurrenceArray) {
        // Native reads the table as a string
        table.appendCodePoint(occurrence.getKey());
        slotsLeft--;
        if (slotsLeft <= 0) {
            break;
        }
    }
    return table.toString();
}
} }

View file

@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap;
/** /**
* An interface of binary dictionary encoder. * An interface of binary dictionary encoder.
@ -33,5 +34,6 @@ public interface DictEncoder {
public void setPosition(final int position); public void setPosition(final int position);
public int getPosition(); public int getPosition();
public void writePtNodeCount(final int ptNodeCount); public void writePtNodeCount(final int ptNodeCount);
public void writePtNode(final PtNode ptNode, final FusionDictionary dict); public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
final HashMap<Integer, Integer> codePointToOneByteCodeMap);
} }

View file

@ -177,7 +177,8 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
if (header == null) { if (header == null) {
throw new IOException("Cannot read the dictionary header."); throw new IOException("Cannot read the dictionary header.");
} }
if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) { if (header.mFormatOptions.mVersion != FormatSpec.VERSION2 &&
header.mFormatOptions.mVersion != FormatSpec.VERSION201) {
throw new UnsupportedFormatException("File header has a wrong version : " throw new UnsupportedFormatException("File header has a wrong version : "
+ header.mFormatOptions.mVersion); + header.mFormatOptions.mVersion);
} }
@ -200,19 +201,19 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
int index = 0; int index = 0;
int character = CharEncoding.readChar(mDictBuffer); int character = CharEncoding.readChar(mDictBuffer);
addressPointer += CharEncoding.getCharSize(character); addressPointer += CharEncoding.getCharSize(character, null);
while (FormatSpec.INVALID_CHARACTER != character) { while (FormatSpec.INVALID_CHARACTER != character) {
// FusionDictionary is making sure that the length of the word is smaller than // FusionDictionary is making sure that the length of the word is smaller than
// MAX_WORD_LENGTH. // MAX_WORD_LENGTH.
// So we'll never write past the end of mCharacterBuffer. // So we'll never write past the end of mCharacterBuffer.
mCharacterBuffer[index++] = character; mCharacterBuffer[index++] = character;
character = CharEncoding.readChar(mDictBuffer); character = CharEncoding.readChar(mDictBuffer);
addressPointer += CharEncoding.getCharSize(character); addressPointer += CharEncoding.getCharSize(character, null);
} }
characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
} else { } else {
final int character = CharEncoding.readChar(mDictBuffer); final int character = CharEncoding.readChar(mDictBuffer);
addressPointer += CharEncoding.getCharSize(character); addressPointer += CharEncoding.getCharSize(character, null);
characters = new int[] { character }; characters = new int[] { character };
} }
final ProbabilityInfo probabilityInfo; final ProbabilityInfo probabilityInfo;

View file

@ -124,7 +124,7 @@ public class Ver2DictEncoder implements DictEncoder {
@Override @Override
public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
throws IOException, UnsupportedFormatException { throws IOException, UnsupportedFormatException {
if (formatOptions.mVersion > FormatSpec.VERSION2) { if (formatOptions.mVersion > FormatSpec.VERSION201) {
throw new UnsupportedFormatException( throw new UnsupportedFormatException(
"The given format options has wrong version number : " "The given format options has wrong version number : "
+ formatOptions.mVersion); + formatOptions.mVersion);
@ -135,7 +135,13 @@ public class Ver2DictEncoder implements DictEncoder {
} }
// Make code point conversion table ordered by occurrence of code points // Make code point conversion table ordered by occurrence of code points
final CodePointTable codePointTable = makeCodePointTable(dict); // Version 201 or later have codePointTable
final CodePointTable codePointTable;
if (formatOptions.mVersion >= FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
codePointTable = makeCodePointTable(dict);
} else {
codePointTable = new CodePointTable();
}
BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
codePointTable.mCodePointOccurrenceArray); codePointTable.mCodePointOccurrenceArray);
@ -152,7 +158,8 @@ public class Ver2DictEncoder implements DictEncoder {
ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
MakedictLog.i("Computing addresses..."); MakedictLog.i("Computing addresses...");
BinaryDictEncoderUtils.computeAddresses(dict, flatNodes); BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
codePointTable.mCodePointToOneByteCodeMap);
MakedictLog.i("Checking PtNode array..."); MakedictLog.i("Checking PtNode array...");
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
@ -164,7 +171,8 @@ public class Ver2DictEncoder implements DictEncoder {
MakedictLog.i("Writing file..."); MakedictLog.i("Writing file...");
for (PtNodeArray nodeArray : flatNodes) { for (PtNodeArray nodeArray : flatNodes) {
BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray); BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
codePointTable.mCodePointToOneByteCodeMap);
} }
if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
mOutStream.write(mBuffer, 0, mPosition); mOutStream.write(mBuffer, 0, mPosition);
@ -196,15 +204,19 @@ public class Ver2DictEncoder implements DictEncoder {
countSize); countSize);
} }
private void writePtNodeFlags(final PtNode ptNode) { private void writePtNodeFlags(final PtNode ptNode,
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
codePointToOneByteCodeMap);
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
FormatSpec.PTNODE_FLAGS_SIZE); FormatSpec.PTNODE_FLAGS_SIZE);
} }
private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition,
codePointToOneByteCodeMap);
if (hasSeveralChars) { if (hasSeveralChars) {
mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
} }
@ -217,8 +229,10 @@ public class Ver2DictEncoder implements DictEncoder {
} }
} }
private void writeChildrenPosition(final PtNode ptNode) { private void writeChildrenPosition(final PtNode ptNode,
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
codePointToOneByteCodeMap);
mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
childrenPos); childrenPos);
} }
@ -228,7 +242,8 @@ public class Ver2DictEncoder implements DictEncoder {
* *
* @param shortcuts the shortcut attributes list. * @param shortcuts the shortcut attributes list.
*/ */
private void writeShortcuts(final ArrayList<WeightedString> shortcuts) { private void writeShortcuts(final ArrayList<WeightedString> shortcuts,
final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
if (null == shortcuts || shortcuts.isEmpty()) return; if (null == shortcuts || shortcuts.isEmpty()) return;
final int indexOfShortcutByteSize = mPosition; final int indexOfShortcutByteSize = mPosition;
@ -241,7 +256,8 @@ public class Ver2DictEncoder implements DictEncoder {
target.getProbability()); target.getProbability());
mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags, mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags,
FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord); final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord,
codePointToOneByteCodeMap);
mPosition += shortcutShift; mPosition += shortcutShift;
} }
final int shortcutByteSize = mPosition - indexOfShortcutByteSize; final int shortcutByteSize = mPosition - indexOfShortcutByteSize;
@ -281,12 +297,14 @@ public class Ver2DictEncoder implements DictEncoder {
} }
@Override @Override
public void writePtNode(final PtNode ptNode, final FusionDictionary dict) { public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
writePtNodeFlags(ptNode); final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
writeFrequency(ptNode.getProbability()); writeFrequency(ptNode.getProbability());
writeChildrenPosition(ptNode); writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
writeShortcuts(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts.
writeShortcuts(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */);
writeBigrams(ptNode.mBigrams, dict); writeBigrams(ptNode.mBigrams, dict);
} }
} }

View file

@ -27,6 +27,7 @@ import com.android.inputmethod.latin.utils.LocaleUtils;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap;
/** /**
* An implementation of DictEncoder for version 4 binary dictionary. * An implementation of DictEncoder for version 4 binary dictionary.
@ -142,6 +143,7 @@ public class Ver4DictEncoder implements DictEncoder {
} }
@Override @Override
public void writePtNode(PtNode ptNode, FusionDictionary dict) { public void writePtNode(PtNode ptNode, FusionDictionary dict,
HashMap<Integer, Integer> codePointToOneByteCodeMap) {
} }
} }

View file

@ -158,7 +158,7 @@ public class DictionaryMaker {
String outputBinary = null; String outputBinary = null;
String outputXml = null; String outputXml = null;
String outputCombined = null; String outputCombined = null;
int outputBinaryFormatVersion = 2; // the default version is 2. int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201.
// Don't use code point table by default. // Don't use code point table by default.
int codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF; int codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF;