Create a code point table based on occurrence counts.
Bug: 17097992
Change-Id: Ifd76dbd4d385d800af416368e25c9e56a76d0fbf
main
parent
f4329f7fff
commit
8a6e96d286
|
@ -47,6 +47,7 @@ public final class DictionaryHeader {
|
||||||
public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
|
public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
|
||||||
public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
|
public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
|
||||||
public static final String ATTRIBUTE_VALUE_TRUE = "1";
|
public static final String ATTRIBUTE_VALUE_TRUE = "1";
|
||||||
|
public static final String CODE_POINT_TABLE_KEY = "codePointTable";
|
||||||
|
|
||||||
public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
|
public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
|
||||||
final FormatOptions formatOptions) throws UnsupportedFormatException {
|
final FormatOptions formatOptions) throws UnsupportedFormatException {
|
||||||
|
|
|
@ -237,6 +237,8 @@ public final class FormatSpec {
|
||||||
static final int UINT16_MAX = 0xFFFF;
|
static final int UINT16_MAX = 0xFFFF;
|
||||||
static final int UINT24_MAX = 0xFFFFFF;
|
static final int UINT24_MAX = 0xFFFFFF;
|
||||||
static final int MSB8 = 0x80;
|
static final int MSB8 = 0x80;
|
||||||
|
static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
|
||||||
|
static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Options about file format.
|
* Options about file format.
|
||||||
|
|
|
@ -27,6 +27,8 @@ import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Encodes binary files for a FusionDictionary.
|
* Encodes binary files for a FusionDictionary.
|
||||||
|
@ -791,10 +793,12 @@ public class BinaryDictEncoderUtils {
|
||||||
* @param destination the stream to write the file header to.
|
* @param destination the stream to write the file header to.
|
||||||
* @param dict the dictionary to write.
|
* @param dict the dictionary to write.
|
||||||
* @param formatOptions file format options.
|
* @param formatOptions file format options.
|
||||||
|
* @param codePointOccurrenceArray code points ordered by occurrence count.
|
||||||
* @return the size of the header.
|
* @return the size of the header.
|
||||||
*/
|
*/
|
||||||
/* package */ static int writeDictionaryHeader(final OutputStream destination,
|
/* package */ static int writeDictionaryHeader(final OutputStream destination,
|
||||||
final FusionDictionary dict, final FormatOptions formatOptions)
|
final FusionDictionary dict, final FormatOptions formatOptions,
|
||||||
|
final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray)
|
||||||
throws IOException, UnsupportedFormatException {
|
throws IOException, UnsupportedFormatException {
|
||||||
final int version = formatOptions.mVersion;
|
final int version = formatOptions.mVersion;
|
||||||
if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
|
if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
|
||||||
|
@ -833,6 +837,9 @@ public class BinaryDictEncoderUtils {
|
||||||
CharEncoding.writeString(headerBuffer, key);
|
CharEncoding.writeString(headerBuffer, key);
|
||||||
CharEncoding.writeString(headerBuffer, value);
|
CharEncoding.writeString(headerBuffer, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Write out the code point table.
|
||||||
|
|
||||||
final int size = headerBuffer.size();
|
final int size = headerBuffer.size();
|
||||||
final byte[] bytes = headerBuffer.toByteArray();
|
final byte[] bytes = headerBuffer.toByteArray();
|
||||||
// Write out the header size.
|
// Write out the header size.
|
||||||
|
@ -845,4 +852,15 @@ public class BinaryDictEncoderUtils {
|
||||||
headerBuffer.close();
|
headerBuffer.close();
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static final class CodePointTable {
|
||||||
|
final HashMap<Integer, Integer> mCodePointToOneByteCodeMap;
|
||||||
|
final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray;
|
||||||
|
|
||||||
|
CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap,
|
||||||
|
final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) {
|
||||||
|
mCodePointToOneByteCodeMap = codePointToOneByteCodeMap;
|
||||||
|
mCodePointOccurrenceArray = codePointOccurrenceArray;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
import com.android.inputmethod.annotations.UsedForTesting;
|
import com.android.inputmethod.annotations.UsedForTesting;
|
||||||
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
|
||||||
|
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
|
||||||
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
|
||||||
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
|
@ -28,7 +29,11 @@ import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation of DictEncoder for version 2 binary dictionary.
|
* An implementation of DictEncoder for version 2 binary dictionary.
|
||||||
|
@ -73,6 +78,46 @@ public class Ver2DictEncoder implements DictEncoder {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Package for testing
|
||||||
|
static CodePointTable makeCodePointTable(final FusionDictionary dict) {
|
||||||
|
final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
|
||||||
|
for (final WordProperty word : dict) {
|
||||||
|
// Store per code point occurrence
|
||||||
|
final String wordString = word.mWord;
|
||||||
|
for (int i = 0; i < wordString.length(); ++i) {
|
||||||
|
final int codePoint = Character.codePointAt(wordString, i);
|
||||||
|
if (codePointOccurrenceCounts.containsKey(codePoint)) {
|
||||||
|
codePointOccurrenceCounts.put(codePoint,
|
||||||
|
codePointOccurrenceCounts.get(codePoint) + 1);
|
||||||
|
} else {
|
||||||
|
codePointOccurrenceCounts.put(codePoint, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
|
||||||
|
new ArrayList<>(codePointOccurrenceCounts.entrySet());
|
||||||
|
// Descending order sort by occurrence (value side)
|
||||||
|
Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
|
||||||
|
@Override
|
||||||
|
public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
|
||||||
|
return b.getValue().compareTo(a.getValue());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
|
||||||
|
// Temporary map for writing of nodes
|
||||||
|
final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
|
||||||
|
for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
|
||||||
|
// Put a relation from the original code point to the one byte code.
|
||||||
|
codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
|
||||||
|
if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// codePointToOneByteCodeMap for writing the trie
|
||||||
|
// codePointOccurrenceArray for writing the header
|
||||||
|
return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
|
public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
|
||||||
throws IOException, UnsupportedFormatException {
|
throws IOException, UnsupportedFormatException {
|
||||||
|
@ -85,7 +130,12 @@ public class Ver2DictEncoder implements DictEncoder {
|
||||||
if (mOutStream == null) {
|
if (mOutStream == null) {
|
||||||
openStream();
|
openStream();
|
||||||
}
|
}
|
||||||
BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions);
|
|
||||||
|
// Make code point conversion table ordered by occurrence of code points
|
||||||
|
final CodePointTable codePointTable = makeCodePointTable(dict);
|
||||||
|
|
||||||
|
BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
|
||||||
|
codePointTable.mCodePointOccurrenceArray);
|
||||||
|
|
||||||
// Addresses are limited to 3 bytes, but since addresses can be relative to each node
|
// Addresses are limited to 3 bytes, but since addresses can be relative to each node
|
||||||
// array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
|
// array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
|
||||||
|
|
|
@ -0,0 +1,91 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2014 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.android.inputmethod.latin.makedict;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
|
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
|
||||||
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
||||||
|
|
||||||
|
import android.test.AndroidTestCase;
|
||||||
|
import android.test.suitebuilder.annotation.LargeTest;
|
||||||
|
import android.util.Log;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit tests for Ver2DictEncoder
|
||||||
|
*/
|
||||||
|
@LargeTest
|
||||||
|
public class Ver2DictEncoderTests extends AndroidTestCase {
|
||||||
|
private static final String TAG = Ver2DictEncoderTests.class.getSimpleName();
|
||||||
|
private static final int UNIGRAM_FREQ = 10;
|
||||||
|
|
||||||
|
public void testCodePointTable() {
|
||||||
|
final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"};
|
||||||
|
final List<String> words = Arrays.asList(wordSource);
|
||||||
|
final String correctCodePointTable = "eotdsanirfg bclwup";
|
||||||
|
final String correctCodePointOccurrenceArrayString =
|
||||||
|
"10141164111411531003110297210521142103111911171108198199132111211021";
|
||||||
|
final String correctCodePointExpectedMapString = "323433363538373940494147454644424348";
|
||||||
|
final String dictName = "codePointTableTest";
|
||||||
|
final String dictVersion = Long.toString(System.currentTimeMillis());
|
||||||
|
|
||||||
|
final FormatSpec.FormatOptions formatOptions =
|
||||||
|
new FormatSpec.FormatOptions(FormatSpec.VERSION2);
|
||||||
|
final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(),
|
||||||
|
BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
|
||||||
|
addUnigrams(sourcedict, words, null /* shortcutMap */);
|
||||||
|
final CodePointTable codePointTable = Ver2DictEncoder.makeCodePointTable(sourcedict);
|
||||||
|
|
||||||
|
// Check if mCodePointOccurrenceArray is correct
|
||||||
|
final StringBuilder codePointOccurrenceArrayString = new StringBuilder();
|
||||||
|
for (Entry<Integer, Integer> entry : codePointTable.mCodePointOccurrenceArray) {
|
||||||
|
codePointOccurrenceArrayString.append(entry.getKey());
|
||||||
|
codePointOccurrenceArrayString.append(entry.getValue());
|
||||||
|
}
|
||||||
|
assertEquals(codePointOccurrenceArrayString.toString(),
|
||||||
|
correctCodePointOccurrenceArrayString);
|
||||||
|
|
||||||
|
// Check if mCodePointToOneByteCodeMap is correct
|
||||||
|
final StringBuilder codePointExpectedMapString = new StringBuilder();
|
||||||
|
for (int i = 0; i < correctCodePointTable.length(); ++i) {
|
||||||
|
codePointExpectedMapString.append(codePointTable.mCodePointToOneByteCodeMap.get(
|
||||||
|
correctCodePointTable.codePointAt(i)));
|
||||||
|
}
|
||||||
|
assertEquals(codePointExpectedMapString.toString(), correctCodePointExpectedMapString);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds unigrams to the dictionary.
|
||||||
|
*/
|
||||||
|
private void addUnigrams(final FusionDictionary dict, final List<String> words,
|
||||||
|
final HashMap<String, List<String>> shortcutMap) {
|
||||||
|
for (final String word : words) {
|
||||||
|
final ArrayList<WeightedString> shortcuts = new ArrayList<>();
|
||||||
|
if (shortcutMap != null && shortcutMap.containsKey(word)) {
|
||||||
|
for (final String shortcut : shortcutMap.get(word)) {
|
||||||
|
shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||||
|
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue