2012-08-16 09:40:14 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 The Android Open Source Project
|
|
|
|
*
|
2013-01-21 12:52:57 +00:00
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
2012-08-16 09:40:14 +00:00
|
|
|
*
|
2013-01-21 12:52:57 +00:00
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
2012-08-16 09:40:14 +00:00
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
2013-01-21 12:52:57 +00:00
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
2012-08-16 09:40:14 +00:00
|
|
|
*/
|
|
|
|
|
2013-06-23 16:11:32 +00:00
|
|
|
package com.android.inputmethod.latin.utils;
|
2012-08-16 09:40:14 +00:00
|
|
|
|
|
|
|
import android.util.Log;
|
|
|
|
|
2012-10-03 08:36:45 +00:00
|
|
|
import com.android.inputmethod.annotations.UsedForTesting;
|
2012-09-19 04:53:21 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
|
2013-09-12 08:46:09 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.DictDecoder;
|
2013-08-21 12:15:21 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.DictEncoder;
|
2012-09-12 09:53:33 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
|
2012-08-16 09:40:14 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.FusionDictionary;
|
2013-08-16 05:51:37 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
|
2012-08-16 09:40:14 +00:00
|
|
|
import com.android.inputmethod.latin.makedict.PendingAttribute;
|
|
|
|
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
2013-07-26 03:35:11 +00:00
|
|
|
import com.android.inputmethod.latin.personalization.UserHistoryDictionaryBigramList;
|
2012-08-16 09:40:14 +00:00
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.HashMap;
|
2013-08-23 14:30:16 +00:00
|
|
|
import java.util.Map.Entry;
|
|
|
|
import java.util.TreeMap;
|
2013-09-06 12:03:09 +00:00
|
|
|
import java.util.concurrent.TimeUnit;
|
2012-08-16 09:40:14 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Reads and writes Binary files for a UserHistoryDictionary.
|
|
|
|
*
|
|
|
|
* All the methods in this class are static.
|
|
|
|
*/
|
2012-09-27 09:16:16 +00:00
|
|
|
public final class UserHistoryDictIOUtils {
|
2012-08-16 09:40:14 +00:00
|
|
|
private static final String TAG = UserHistoryDictIOUtils.class.getSimpleName();
|
|
|
|
private static final boolean DEBUG = false;
|
2013-09-06 12:03:09 +00:00
|
|
|
private static final String USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
|
|
|
|
private static final String USES_FORGETTING_CURVE_VALUE = "1";
|
|
|
|
private static final String LAST_UPDATED_TIME_KEY = "date";
|
2012-08-16 09:40:14 +00:00
|
|
|
|
|
|
|
public interface OnAddWordListener {
|
2013-10-04 14:26:18 +00:00
|
|
|
/**
|
|
|
|
* Callback to be notified when a word is added to the dictionary.
|
|
|
|
* @param word The added word.
|
|
|
|
* @param shortcutTarget A shortcut target for this word, or null if none.
|
|
|
|
* @param frequency The frequency for this word.
|
|
|
|
* @param shortcutFreq The frequency of the shortcut (0~15, with 15 = whitelist).
|
|
|
|
* Unspecified if shortcutTarget is null - do not rely on its value.
|
|
|
|
*/
|
|
|
|
public void setUnigram(final String word, final String shortcutTarget, final int frequency,
|
|
|
|
final int shortcutFreq);
|
2012-08-16 09:40:14 +00:00
|
|
|
public void setBigram(final String word1, final String word2, final int frequency);
|
|
|
|
}
|
|
|
|
|
2013-01-06 02:10:27 +00:00
|
|
|
@UsedForTesting
|
2012-08-16 09:40:14 +00:00
|
|
|
public interface BigramDictionaryInterface {
|
|
|
|
public int getFrequency(final String word1, final String word2);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Writes dictionary to file.
|
|
|
|
*/
|
2013-08-21 12:15:21 +00:00
|
|
|
public static void writeDictionary(final DictEncoder dictEncoder,
|
2012-08-16 09:40:14 +00:00
|
|
|
final BigramDictionaryInterface dict, final UserHistoryDictionaryBigramList bigrams,
|
2012-09-05 03:37:56 +00:00
|
|
|
final FormatOptions formatOptions) {
|
2012-08-16 09:40:14 +00:00
|
|
|
final FusionDictionary fusionDict = constructFusionDictionary(dict, bigrams);
|
2013-09-06 12:03:09 +00:00
|
|
|
fusionDict.addOptionAttribute(USES_FORGETTING_CURVE_KEY, USES_FORGETTING_CURVE_VALUE);
|
|
|
|
fusionDict.addOptionAttribute(LAST_UPDATED_TIME_KEY,
|
|
|
|
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
|
2012-08-16 09:40:14 +00:00
|
|
|
try {
|
2013-08-21 12:15:21 +00:00
|
|
|
dictEncoder.writeDictionary(fusionDict, formatOptions);
|
2012-08-20 10:29:20 +00:00
|
|
|
Log.d(TAG, "end writing");
|
2012-08-16 09:40:14 +00:00
|
|
|
} catch (IOException e) {
|
2013-02-12 05:14:56 +00:00
|
|
|
Log.e(TAG, "IO exception while writing file", e);
|
2012-08-16 09:40:14 +00:00
|
|
|
} catch (UnsupportedFormatException e) {
|
2013-02-12 05:14:56 +00:00
|
|
|
Log.e(TAG, "Unsupported format", e);
|
2012-08-16 09:40:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructs a new FusionDictionary from BigramDictionaryInterface.
|
|
|
|
*/
|
2012-10-03 08:36:45 +00:00
|
|
|
@UsedForTesting
|
|
|
|
static FusionDictionary constructFusionDictionary(
|
2012-08-16 09:40:14 +00:00
|
|
|
final BigramDictionaryInterface dict, final UserHistoryDictionaryBigramList bigrams) {
|
2013-08-16 05:51:37 +00:00
|
|
|
final FusionDictionary fusionDict = new FusionDictionary(new PtNodeArray(),
|
2012-08-20 10:29:20 +00:00
|
|
|
new FusionDictionary.DictionaryOptions(new HashMap<String, String>(), false,
|
|
|
|
false));
|
|
|
|
int profTotal = 0;
|
2012-08-16 09:40:14 +00:00
|
|
|
for (final String word1 : bigrams.keySet()) {
|
|
|
|
final HashMap<String, Byte> word1Bigrams = bigrams.getBigrams(word1);
|
|
|
|
for (final String word2 : word1Bigrams.keySet()) {
|
|
|
|
final int freq = dict.getFrequency(word1, word2);
|
2012-08-20 10:29:20 +00:00
|
|
|
if (freq == -1) {
|
|
|
|
// don't add this bigram.
|
|
|
|
continue;
|
|
|
|
}
|
2012-08-16 09:40:14 +00:00
|
|
|
if (DEBUG) {
|
|
|
|
if (word1 == null) {
|
|
|
|
Log.d(TAG, "add unigram: " + word2 + "," + Integer.toString(freq));
|
|
|
|
} else {
|
|
|
|
Log.d(TAG, "add bigram: " + word1
|
|
|
|
+ "," + word2 + "," + Integer.toString(freq));
|
|
|
|
}
|
2012-08-20 10:29:20 +00:00
|
|
|
profTotal++;
|
2012-08-16 09:40:14 +00:00
|
|
|
}
|
|
|
|
if (word1 == null) { // unigram
|
2012-09-03 02:58:23 +00:00
|
|
|
fusionDict.add(word2, freq, null, false /* isNotAWord */);
|
2012-08-16 09:40:14 +00:00
|
|
|
} else { // bigram
|
2013-08-16 05:51:37 +00:00
|
|
|
if (FusionDictionary.findWordInTree(fusionDict.mRootNodeArray, word1) == null) {
|
2012-08-20 10:29:20 +00:00
|
|
|
fusionDict.add(word1, 2, null, false /* isNotAWord */);
|
|
|
|
}
|
2012-08-16 09:40:14 +00:00
|
|
|
fusionDict.setBigram(word1, word2, freq);
|
|
|
|
}
|
|
|
|
bigrams.updateBigram(word1, word2, (byte)freq);
|
|
|
|
}
|
|
|
|
}
|
2012-08-20 10:29:20 +00:00
|
|
|
if (DEBUG) {
|
|
|
|
Log.d(TAG, "add " + profTotal + "words");
|
|
|
|
}
|
2012-08-16 09:40:14 +00:00
|
|
|
return fusionDict;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Reads dictionary from file.
|
|
|
|
*/
|
2013-09-12 08:46:09 +00:00
|
|
|
public static void readDictionaryBinary(final DictDecoder dictDecoder,
|
2012-08-16 09:40:14 +00:00
|
|
|
final OnAddWordListener dict) {
|
2013-08-23 14:30:16 +00:00
|
|
|
final TreeMap<Integer, String> unigrams = CollectionUtils.newTreeMap();
|
|
|
|
final TreeMap<Integer, Integer> frequencies = CollectionUtils.newTreeMap();
|
|
|
|
final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams = CollectionUtils.newTreeMap();
|
2012-08-16 09:40:14 +00:00
|
|
|
try {
|
2013-08-23 14:30:16 +00:00
|
|
|
dictDecoder.readUnigramsAndBigramsBinary(unigrams, frequencies, bigrams);
|
2012-08-16 09:40:14 +00:00
|
|
|
} catch (IOException e) {
|
2013-02-12 05:14:56 +00:00
|
|
|
Log.e(TAG, "IO exception while reading file", e);
|
2012-08-16 09:40:14 +00:00
|
|
|
} catch (UnsupportedFormatException e) {
|
2013-02-12 05:14:56 +00:00
|
|
|
Log.e(TAG, "Unsupported format", e);
|
2012-09-28 05:14:31 +00:00
|
|
|
} catch (ArrayIndexOutOfBoundsException e) {
|
2013-02-12 05:14:56 +00:00
|
|
|
Log.e(TAG, "ArrayIndexOutOfBoundsException while reading file", e);
|
2012-08-16 09:40:14 +00:00
|
|
|
}
|
2012-09-28 05:14:31 +00:00
|
|
|
addWordsFromWordMap(unigrams, frequencies, bigrams, dict);
|
2012-08-16 09:40:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Adds all unigrams and bigrams in maps to OnAddWordListener.
|
|
|
|
*/
|
2012-10-03 08:36:45 +00:00
|
|
|
@UsedForTesting
|
2013-08-23 14:30:16 +00:00
|
|
|
static void addWordsFromWordMap(final TreeMap<Integer, String> unigrams,
|
|
|
|
final TreeMap<Integer, Integer> frequencies,
|
|
|
|
final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams,
|
|
|
|
final OnAddWordListener to) {
|
|
|
|
for (Entry<Integer, String> entry : unigrams.entrySet()) {
|
2012-08-16 09:40:14 +00:00
|
|
|
final String word1 = entry.getValue();
|
|
|
|
final int unigramFrequency = frequencies.get(entry.getKey());
|
2013-10-04 14:26:18 +00:00
|
|
|
to.setUnigram(word1, null /* shortcutTarget */, unigramFrequency, 0 /* shortcutFreq */);
|
2012-08-16 09:40:14 +00:00
|
|
|
final ArrayList<PendingAttribute> attrList = bigrams.get(entry.getKey());
|
|
|
|
if (attrList != null) {
|
|
|
|
for (final PendingAttribute attr : attrList) {
|
2013-03-26 03:58:44 +00:00
|
|
|
final String word2 = unigrams.get(attr.mAddress);
|
|
|
|
if (word1 == null || word2 == null) {
|
|
|
|
Log.e(TAG, "Invalid bigram pair detected: " + word1 + ", " + word2);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
to.setBigram(word1, word2,
|
2013-08-19 05:49:57 +00:00
|
|
|
BinaryDictIOUtils.reconstructBigramFrequency(unigramFrequency,
|
2012-08-16 09:40:14 +00:00
|
|
|
attr.mFrequency));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2013-01-06 02:10:27 +00:00
|
|
|
}
|