Add a *FAST* dictionary header reader.
It's still unused as of this change, but the next change will use it. As a reference point, generating the metadata for Bayo takes 3'02" on my machine with the info command; it's down to 16" if made to use this instead. The gains increase with the number of dictionaries, obviously. Change-Id: I0eeea2d8f81bb74b0d1570af658e91b56f7c2b79 main
parent
5564317f83
commit
f6b0e32df3
|
@ -17,11 +17,16 @@
|
||||||
package com.android.inputmethod.latin.makedict;
|
package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
import com.android.inputmethod.annotations.UsedForTesting;
|
import com.android.inputmethod.annotations.UsedForTesting;
|
||||||
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
import javax.annotation.Nonnull;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decodes binary files for a FusionDictionary.
|
* Decodes binary files for a FusionDictionary.
|
||||||
|
@ -360,6 +365,43 @@ public final class BinaryDictDecoderUtils {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method that brutally decodes a header from a byte array.
|
||||||
|
*
|
||||||
|
* @param headerBuffer a buffer containing the bytes of the header.
|
||||||
|
* @return a hashmap of the attributes stored in the header
|
||||||
|
*/
|
||||||
|
@Nonnull
|
||||||
|
public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
|
||||||
|
throws UnsupportedFormatException {
|
||||||
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
final LinkedList<String> keyValues = new LinkedList<>();
|
||||||
|
int index = 0;
|
||||||
|
while (index < headerBuffer.length) {
|
||||||
|
if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
|
||||||
|
keyValues.add(sb.toString());
|
||||||
|
sb.setLength(0);
|
||||||
|
} else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
|
||||||
|
null /* codePointTable */)) {
|
||||||
|
sb.appendCodePoint(headerBuffer[index] & 0xFF);
|
||||||
|
} else {
|
||||||
|
sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
|
||||||
|
+ ((headerBuffer[index + 1] & 0xFF) << 8)
|
||||||
|
+ (headerBuffer[index + 2] & 0xFF));
|
||||||
|
index += 2;
|
||||||
|
}
|
||||||
|
index += 1;
|
||||||
|
}
|
||||||
|
if ((keyValues.size() & 1) != 0) {
|
||||||
|
throw new UnsupportedFormatException("Odd number of attributes");
|
||||||
|
}
|
||||||
|
final HashMap<String, String> attributes = new HashMap<>();
|
||||||
|
for (int i = 0; i < keyValues.size(); i += 2) {
|
||||||
|
attributes.put(keyValues.get(i), keyValues.get(i + 1));
|
||||||
|
}
|
||||||
|
return attributes;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method to pass a file name instead of a File object to isBinaryDictionary.
|
* Helper method to pass a file name instead of a File object to isBinaryDictionary.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -19,6 +19,10 @@ package com.android.inputmethod.latin.dicttool;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
|
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
|
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
|
||||||
import com.android.inputmethod.latin.makedict.DictDecoder;
|
import com.android.inputmethod.latin.makedict.DictDecoder;
|
||||||
|
import com.android.inputmethod.latin.makedict.DictionaryHeader;
|
||||||
|
import com.android.inputmethod.latin.makedict.FormatSpec;
|
||||||
|
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
|
||||||
|
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
||||||
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
|
|
||||||
|
@ -34,6 +38,8 @@ import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
import javax.annotation.Nonnull;
|
import javax.annotation.Nonnull;
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
@ -142,6 +148,53 @@ public final class BinaryDictOffdeviceUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class HeaderReaderProcessor implements InputProcessor<DictionaryHeader> {
|
||||||
|
// Arbitrarily limit the header length to 32k. Sounds like it would never be larger
|
||||||
|
// than this. Revisit this if needed later.
|
||||||
|
private final int MAX_HEADER_LENGTH = 32 * 1024;
|
||||||
|
@Override @Nonnull
|
||||||
|
public DictionaryHeader process(final InputStream input) throws IOException,
|
||||||
|
UnsupportedFormatException {
|
||||||
|
// Do everything as curtly and ad-hoc as possible for performance.
|
||||||
|
final byte[] tmpBuffer = new byte[12];
|
||||||
|
if (tmpBuffer.length != input.read(tmpBuffer)) {
|
||||||
|
throw new UnsupportedFormatException("File too short, not a dictionary");
|
||||||
|
}
|
||||||
|
// Ad-hoc check for the magic number. See FormatSpec.java as well as
|
||||||
|
// byte_array_utils.h and BinaryDictEncoderUtils#writeDictionaryHeader().
|
||||||
|
final int MAGIC_NUMBER_START_OFFSET = 0;
|
||||||
|
final int VERSION_START_OFFSET = 4;
|
||||||
|
final int HEADER_SIZE_OFFSET = 8;
|
||||||
|
final int magicNumber = ((tmpBuffer[MAGIC_NUMBER_START_OFFSET] & 0xFF) << 24)
|
||||||
|
+ ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 1] & 0xFF) << 16)
|
||||||
|
+ ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 2] & 0xFF) << 8)
|
||||||
|
+ (tmpBuffer[MAGIC_NUMBER_START_OFFSET + 3] & 0xFF);
|
||||||
|
if (magicNumber != FormatSpec.MAGIC_NUMBER) {
|
||||||
|
throw new UnsupportedFormatException("Wrong magic number");
|
||||||
|
}
|
||||||
|
final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8)
|
||||||
|
+ (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF);
|
||||||
|
if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201) {
|
||||||
|
throw new UnsupportedFormatException("Only versions 2 and 201 are supported");
|
||||||
|
}
|
||||||
|
final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) >> 24)
|
||||||
|
+ ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) >> 16)
|
||||||
|
+ ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) >> 8)
|
||||||
|
+ (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF);
|
||||||
|
if (totalHeaderSize > MAX_HEADER_LENGTH) {
|
||||||
|
throw new UnsupportedFormatException("Header too large");
|
||||||
|
}
|
||||||
|
final byte[] headerBuffer = new byte[totalHeaderSize - tmpBuffer.length];
|
||||||
|
if (headerBuffer.length != input.read(headerBuffer)) {
|
||||||
|
throw new UnsupportedFormatException("File shorter than specified in the header");
|
||||||
|
}
|
||||||
|
final HashMap<String, String> attributes =
|
||||||
|
BinaryDictDecoderUtils.decodeHeaderAttributes(headerBuffer);
|
||||||
|
return new DictionaryHeader(totalHeaderSize, new DictionaryOptions(attributes),
|
||||||
|
new FormatOptions(version, false /* hasTimestamp */));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static void copy(final InputStream input, final OutputStream output) throws IOException {
|
public static void copy(final InputStream input, final OutputStream output) throws IOException {
|
||||||
final byte[] buffer = new byte[COPY_BUFFER_SIZE];
|
final byte[] buffer = new byte[COPY_BUFFER_SIZE];
|
||||||
for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) {
|
for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) {
|
||||||
|
|
Loading…
Reference in New Issue