/* * Copyright (C) 2011 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; /** * A dictionary that can fusion heads and tails of words for more compression. */ @UsedForTesting public final class FusionDictionary implements Iterable { private static final boolean DBG = MakedictLog.DBG; private static int CHARACTER_NOT_FOUND_INDEX = -1; /** * A node array of the dictionary, containing several PtNodes. * * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the * real information. * This class also contains fields to cache size and address, to help with binary * generation. */ public static final class PtNodeArray { ArrayList mData; // To help with binary generation int mCachedSize = Integer.MIN_VALUE; // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They // always hold the same value except between dictionary address compression, during which // the update process needs to know about both values at the same time. Updating will // update the AfterUpdate value, and the code will move them to BeforeUpdate before // the next update pass. int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; int mCachedAddressAfterUpdate = Integer.MIN_VALUE; int mCachedParentAddress = 0; public PtNodeArray() { mData = new ArrayList(); } public PtNodeArray(ArrayList data) { Collections.sort(data, PTNODE_COMPARATOR); mData = data; } } /** * A string with a probability. * * This represents an "attribute", that is either a bigram or a shortcut. */ public static final class WeightedString { public final String mWord; public ProbabilityInfo mProbabilityInfo; public WeightedString(final String word, final int probability) { this(word, new ProbabilityInfo(probability)); } public WeightedString(final String word, final ProbabilityInfo probabilityInfo) { mWord = word; mProbabilityInfo = probabilityInfo; } public int getProbability() { return mProbabilityInfo.mProbability; } public void setProbability(final int probability) { mProbabilityInfo = new ProbabilityInfo(probability); } @Override public int hashCode() { return Arrays.hashCode(new Object[] { mWord, mProbabilityInfo}); } @Override public boolean equals(Object o) { if (o == this) return true; if (!(o instanceof WeightedString)) return false; final WeightedString w = (WeightedString)o; return mWord.equals(w.mWord) && mProbabilityInfo.equals(w.mProbabilityInfo); } } /** * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children * (Pt means Patricia Trie). * * This is the central class of the in-memory representation. A PtNode is what can * be seen as a traditional "trie node", except it can hold several characters at the * same time. A PtNode essentially represents one or several characters in the middle * of the trie tree; as such, it can be a terminal, and it can have children. * In this in-memory representation, whether the PtNode is a terminal or not is represented * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or * bigrams, but a non-terminal may not. Moreover, children, if present, are null. */ public static final class PtNode { public static final int NOT_A_TERMINAL = -1; final int mChars[]; ArrayList mShortcutTargets; ArrayList mBigrams; int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. PtNodeArray mChildren; boolean mIsNotAWord; // Only a shortcut boolean mIsBlacklistEntry; // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary // generation. Before and After always hold the same value except during dictionary // address compression, where the update process needs to know about both values at the // same time. Updating will update the AfterUpdate value, and the code will move them // to BeforeUpdate before the next update pass. // The update process does not need two versions of mCachedSize. int mCachedSize; // The size, in bytes, of this PtNode. int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) int mCachedAddressAfterUpdate; // The address of this PtNode (after update) public PtNode(final int[] chars, final ArrayList shortcutTargets, final ArrayList bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry) { mChars = chars; mFrequency = frequency; mTerminalId = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; } public PtNode(final int[] chars, final ArrayList shortcutTargets, final ArrayList bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry, final PtNodeArray children) { mChars = chars; mFrequency = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = children; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; } public void addChild(PtNode n) { if (null == mChildren) { mChildren = new PtNodeArray(); } mChildren.mData.add(n); } public int getTerminalId() { return mTerminalId; } public boolean isTerminal() { return NOT_A_TERMINAL != mFrequency; } public int getFrequency() { return mFrequency; } public boolean getIsNotAWord() { return mIsNotAWord; } public boolean getIsBlacklistEntry() { return mIsBlacklistEntry; } public ArrayList getShortcutTargets() { // We don't want write permission to escape outside the package, so we return a copy if (null == mShortcutTargets) return null; final ArrayList copyOfShortcutTargets = new ArrayList(mShortcutTargets); return copyOfShortcutTargets; } public ArrayList getBigrams() { // We don't want write permission to escape outside the package, so we return a copy if (null == mBigrams) return null; final ArrayList copyOfBigrams = new ArrayList(mBigrams); return copyOfBigrams; } public boolean hasSeveralChars() { assert(mChars.length > 0); return 1 < mChars.length; } /** * Adds a word to the bigram list. Updates the probability if the word already * exists. */ public void addBigram(final String word, final int probability) { if (mBigrams == null) { mBigrams = new ArrayList(); } WeightedString bigram = getBigram(word); if (bigram != null) { bigram.setProbability(probability); } else { bigram = new WeightedString(word, probability); mBigrams.add(bigram); } } /** * Gets the shortcut target for the given word. Returns null if the word is not in the * shortcut list. */ public WeightedString getShortcut(final String word) { // TODO: Don't do a linear search if (mShortcutTargets != null) { final int size = mShortcutTargets.size(); for (int i = 0; i < size; ++i) { WeightedString shortcut = mShortcutTargets.get(i); if (shortcut.mWord.equals(word)) { return shortcut; } } } return null; } /** * Gets the bigram for the given word. * Returns null if the word is not in the bigrams list. */ public WeightedString getBigram(final String word) { // TODO: Don't do a linear search if (mBigrams != null) { final int size = mBigrams.size(); for (int i = 0; i < size; ++i) { WeightedString bigram = mBigrams.get(i); if (bigram.mWord.equals(word)) { return bigram; } } } return null; } /** * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only * updated if they are higher than the existing ones. */ public void update(final int frequency, final ArrayList shortcutTargets, final ArrayList bigrams, final boolean isNotAWord, final boolean isBlacklistEntry) { if (frequency > mFrequency) { mFrequency = frequency; } if (shortcutTargets != null) { if (mShortcutTargets == null) { mShortcutTargets = shortcutTargets; } else { final int size = shortcutTargets.size(); for (int i = 0; i < size; ++i) { final WeightedString shortcut = shortcutTargets.get(i); final WeightedString existingShortcut = getShortcut(shortcut.mWord); if (existingShortcut == null) { mShortcutTargets.add(shortcut); } else if (existingShortcut.getProbability() < shortcut.getProbability()) { existingShortcut.setProbability(shortcut.getProbability()); } } } } if (bigrams != null) { if (mBigrams == null) { mBigrams = bigrams; } else { final int size = bigrams.size(); for (int i = 0; i < size; ++i) { final WeightedString bigram = bigrams.get(i); final WeightedString existingBigram = getBigram(bigram.mWord); if (existingBigram == null) { mBigrams.add(bigram); } else if (existingBigram.getProbability() < bigram.getProbability()) { existingBigram.setProbability(bigram.getProbability()); } } } } mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; } } /** * Options global to the dictionary. */ public static final class DictionaryOptions { public final HashMap mAttributes; public DictionaryOptions(final HashMap attributes) { mAttributes = attributes; } @Override public String toString() { // Convenience method return toString(0, false); } public String toString(final int indentCount, final boolean plumbing) { final StringBuilder indent = new StringBuilder(); if (plumbing) { indent.append("H:"); } else { for (int i = 0; i < indentCount; ++i) { indent.append(" "); } } final StringBuilder s = new StringBuilder(); for (final String optionKey : mAttributes.keySet()) { s.append(indent); s.append(optionKey); s.append(" = "); if ("date".equals(optionKey) && !plumbing) { // Date needs a number of milliseconds, but the dictionary contains seconds s.append(new Date( 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); } else { s.append(mAttributes.get(optionKey)); } s.append("\n"); } return s.toString(); } } public final DictionaryOptions mOptions; public final PtNodeArray mRootNodeArray; public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { mRootNodeArray = rootNodeArray; mOptions = options; } public void addOptionAttribute(final String key, final String value) { mOptions.mAttributes.put(key, value); } /** * Helper method to convert a String to an int array. */ static int[] getCodePoints(final String word) { // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, // which is not visible from the makedict package. Factor this code. final int length = word.length(); if (length <= 0) return new int[] {}; final char[] characters = word.toCharArray(); final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; int codePoint = Character.codePointAt(characters, 0); int dsti = 0; for (int srci = Character.charCount(codePoint); srci < length; srci += Character.charCount(codePoint), ++dsti) { codePoints[dsti] = codePoint; codePoint = Character.codePointAt(characters, srci); } codePoints[dsti] = codePoint; return codePoints; } /** * Helper method to add a word as a string. * * This method adds a word to the dictionary with the given frequency. Optional * lists of bigrams and shortcuts can be passed here. For each word inside, * they will be added to the dictionary as necessary. * * @param word the word to add. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets a list of shortcut targets for this word, or null. * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) */ public void add(final String word, final int frequency, final ArrayList shortcutTargets, final boolean isNotAWord) { add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, false /* isBlacklistEntry */); } /** * Helper method to add a blacklist entry as a string. * * @param word the word to add as a blacklist entry. * @param shortcutTargets a list of shortcut targets for this word, or null. * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) */ public void addBlacklistEntry(final String word, final ArrayList shortcutTargets, final boolean isNotAWord) { add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); } /** * Sanity check for a PtNode array. * * This method checks that all PtNodes in a node array are ordered as expected. * If they are, nothing happens. If they aren't, an exception is thrown. */ private void checkStack(PtNodeArray ptNodeArray) { ArrayList stack = ptNodeArray.mData; int lastValue = -1; for (int i = 0; i < stack.size(); ++i) { int currentValue = stack.get(i).mChars[0]; if (currentValue <= lastValue) throw new RuntimeException("Invalid stack"); else lastValue = currentValue; } } /** * Helper method to add a new bigram to the dictionary. * * @param word1 the previous word of the context * @param word2 the next word of the context * @param frequency the bigram frequency */ public void setBigram(final String word1, final String word2, final int frequency) { PtNode ptNode = findWordInTree(mRootNodeArray, word1); if (ptNode != null) { final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2); if (ptNode2 == null) { add(getCodePoints(word2), 0, null, false /* isNotAWord */, false /* isBlacklistEntry */); // The PtNode for the first word may have moved by the above insertion, // if word1 and word2 share a common stem that happens not to have been // a cutting point until now. In this case, we need to refresh ptNode. ptNode = findWordInTree(mRootNodeArray, word1); } ptNode.addBigram(word2, frequency); } else { throw new RuntimeException("First word of bigram not found"); } } /** * Add a word to this dictionary. * * The shortcuts, if any, have to be in the dictionary already. If they aren't, * an exception is thrown. * * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets an optional list of shortcut targets for this word (null if none). * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) * @param isBlacklistEntry true if this is a blacklisted word, false otherwise */ private void add(final int[] word, final int frequency, final ArrayList shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry) { assert(frequency >= 0 && frequency <= 255); if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); return; } PtNodeArray currentNodeArray = mRootNodeArray; int charIndex = 0; PtNode currentPtNode = null; int differentCharIndex = 0; // Set by the loop to the index of the char that differs int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { currentPtNode = currentNodeArray.mData.get(nodeIndex); differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); if (ARRAYS_ARE_EQUAL != differentCharIndex && differentCharIndex < currentPtNode.mChars.length) break; if (null == currentPtNode.mChildren) break; charIndex += currentPtNode.mChars.length; if (charIndex >= word.length) break; currentNodeArray = currentPtNode.mChildren; nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); } if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { // No node at this point to accept the word. Create one. final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); currentNodeArray.mData.add(insertionIndex, newPtNode); if (DBG) checkStack(currentNodeArray); } else { // There is a word with a common prefix. if (differentCharIndex == currentPtNode.mChars.length) { if (charIndex + differentCharIndex >= word.length) { // The new word is a prefix of an existing word, but the node on which it // should end already exists as is. Since the old PtNode was not a terminal, // make it one by filling in its frequency and other attributes currentPtNode.update(frequency, shortcutTargets, null, isNotAWord, isBlacklistEntry); } else { // The new word matches the full old word and extends past it. // We only have to create a new node and add it to the end of this. final PtNode newNode = new PtNode( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); currentPtNode.mChildren = new PtNodeArray(); currentPtNode.mChildren.mData.add(newNode); } } else { if (0 == differentCharIndex) { // Exact same word. Update the frequency if higher. This will also add the // new shortcuts to the existing shortcut list if it already exists. currentPtNode.update(frequency, shortcutTargets, null, currentPtNode.mIsNotAWord && isNotAWord, currentPtNode.mIsBlacklistEntry || isBlacklistEntry); } else { // Partial prefix match only. We have to replace the current node with a node // containing the current prefix and create two new ones for the tails. PtNodeArray newChildren = new PtNodeArray(); final PtNode newOldWord = new PtNode( Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, currentPtNode.mChars.length), currentPtNode.mShortcutTargets, currentPtNode.mBigrams, currentPtNode.mFrequency, currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, currentPtNode.mChildren); newChildren.mData.add(newOldWord); final PtNode newParent; if (charIndex + differentCharIndex >= word.length) { newParent = new PtNode( Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry, newChildren); } else { newParent = new PtNode( Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), null /* shortcutTargets */, null /* bigrams */, -1, false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); final PtNode newWord = new PtNode(Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); final int addIndex = word[charIndex + differentCharIndex] > currentPtNode.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); } currentNodeArray.mData.set(nodeIndex, newParent); } if (DBG) checkStack(currentNodeArray); } } } private static int ARRAYS_ARE_EQUAL = 0; /** * Custom comparison of two int arrays taken to contain character codes. * * This method compares the two arrays passed as an argument in a lexicographic way, * with an offset in the dst string. * This method does NOT test for the first character. It is taken to be equal. * I repeat: this method starts the comparison at 1 <> dstOffset + 1. * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the * strings are equal. This works BECAUSE we don't look at the first character. * * @param src the left-hand side string of the comparison. * @param dst the right-hand side string of the comparison. * @param dstOffset the offset in the right-hand side string. * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. */ private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { // We do NOT test the first char, because we come from a method that already // tested it. for (int i = 1; i < src.length; ++i) { if (dstOffset + i >= dst.length) return i; if (src[i] != dst[dstOffset + i]) return i; } if (dst.length > src.length) return src.length; return ARRAYS_ARE_EQUAL; } /** * Helper class that compares and sorts two PtNodes according to their * first element only. I repeat: ONLY the first element is considered, the rest * is ignored. * This comparator imposes orderings that are inconsistent with equals. */ static private final class PtNodeComparator implements java.util.Comparator { @Override public int compare(PtNode p1, PtNode p2) { if (p1.mChars[0] == p2.mChars[0]) return 0; return p1.mChars[0] < p2.mChars[0] ? -1 : 1; } } final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); /** * Finds the insertion index of a character within a node array. */ private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { final ArrayList data = nodeArray.mData; final PtNode reference = new PtNode(new int[] { character }, null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, false /* isBlacklistEntry */); int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); return result >= 0 ? result : -result - 1; } /** * Find the index of a char in a node array, if it exists. * * @param nodeArray the node array to search in. * @param character the character to search for. * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. */ private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { final int insertionIndex = findInsertionIndex(nodeArray, character); if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex : CHARACTER_NOT_FOUND_INDEX; } /** * Helper method to find a word in a given branch. */ @SuppressWarnings("unused") public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { int index = 0; final StringBuilder checker = DBG ? new StringBuilder() : null; final int[] codePoints = getCodePoints(string); PtNode currentPtNode; do { int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; currentPtNode = nodeArray.mData.get(indexOfGroup); if (codePoints.length - index < currentPtNode.mChars.length) return null; int newIndex = index; while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; newIndex++; } index = newIndex; if (DBG) { checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); } if (index < codePoints.length) { nodeArray = currentPtNode.mChildren; } } while (null != nodeArray && index < codePoints.length); if (index < codePoints.length) return null; if (!currentPtNode.isTerminal()) return null; if (DBG && !string.equals(checker.toString())) return null; return currentPtNode; } /** * Helper method to find out whether a word is in the dict or not. */ public boolean hasWord(final String s) { if (null == s || "".equals(s)) { throw new RuntimeException("Can't search for a null or empty string"); } return null != findWordInTree(mRootNodeArray, s); } /** * Recursively count the number of PtNodes in a given branch of the trie. * * @param nodeArray the parent node. * @return the number of PtNodes in all the branch under this node. */ public static int countPtNodes(final PtNodeArray nodeArray) { final int nodeSize = nodeArray.mData.size(); int size = nodeSize; for (int i = nodeSize - 1; i >= 0; --i) { PtNode ptNode = nodeArray.mData.get(i); if (null != ptNode.mChildren) size += countPtNodes(ptNode.mChildren); } return size; } /** * Iterator to walk through a dictionary. * * This is purely for convenience. */ public static final class DictionaryIterator implements Iterator { private static final class Position { public Iterator pos; public int length; public Position(ArrayList ptNodes) { pos = ptNodes.iterator(); length = 0; } } final StringBuilder mCurrentString; final LinkedList mPositions; public DictionaryIterator(ArrayList ptRoot) { mCurrentString = new StringBuilder(); mPositions = new LinkedList(); final Position rootPos = new Position(ptRoot); mPositions.add(rootPos); } @Override public boolean hasNext() { for (Position p : mPositions) { if (p.pos.hasNext()) { return true; } } return false; } @Override public WordProperty next() { Position currentPos = mPositions.getLast(); mCurrentString.setLength(currentPos.length); do { if (currentPos.pos.hasNext()) { final PtNode currentPtNode = currentPos.pos.next(); currentPos.length = mCurrentString.length(); for (int i : currentPtNode.mChars) { mCurrentString.append(Character.toChars(i)); } if (null != currentPtNode.mChildren) { currentPos = new Position(currentPtNode.mChildren.mData); currentPos.length = mCurrentString.length(); mPositions.addLast(currentPos); } if (currentPtNode.mFrequency >= 0) { return new WordProperty(mCurrentString.toString(), currentPtNode.mFrequency, currentPtNode.mShortcutTargets, currentPtNode.mBigrams, currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); } } else { mPositions.removeLast(); currentPos = mPositions.getLast(); mCurrentString.setLength(mPositions.getLast().length); } } while (true); } @Override public void remove() { throw new UnsupportedOperationException("Unsupported yet"); } } /** * Method to return an iterator. * * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x * and say : for (Word w : x) {} */ @Override public Iterator iterator() { return new DictionaryIterator(mRootNodeArray.mData); } }