From d36245fad292ea660ca49f38a3ec36e07727dda5 Mon Sep 17 00:00:00 2001 From: Yuichiro Hanada Date: Wed, 19 Sep 2012 14:38:17 +0900 Subject: [PATCH] Add getTerminalPosition. Change-Id: If04d779db23b1aea2cc12e5e9b8cecfcb35a5737 --- .../latin/makedict/BinaryDictIOUtils.java | 68 +++++++++++++- .../latin/makedict/BinaryDictInputOutput.java | 5 +- .../latin/makedict/FormatSpec.java | 3 + .../latin/makedict/BinaryDictIOTests.java | 91 ++++++++++++++++++- 4 files changed, 163 insertions(+), 4 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 1a85e71ce..7a1b9dcb7 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -16,10 +16,11 @@ package com.android.inputmethod.latin.makedict; -import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.Constants; import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; import java.io.IOException; import java.util.ArrayList; @@ -124,4 +125,69 @@ public class BinaryDictIOUtils { readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams, header.mFormatOptions); } + + /** + * Gets the address of the last CharGroup of the exact matching word in the dictionary. + * If no match is found, returns NOT_VALID_WORD. + * + * @param buffer the buffer to read. + * @param word the word we search for. + * @return the address of the terminal node. + * @throws IOException + * @throws UnsupportedFormatException + */ + public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + if (word == null) return FormatSpec.NOT_VALID_WORD; + if (buffer.position() != 0) buffer.position(0); + + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + int wordPos = 0; + final int wordLen = word.codePointCount(0, word.length()); + for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; + int groupOffset = buffer.position() - header.mHeaderSize; + final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); + groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); + + for (int i = 0; i < charGroupCount; ++i) { + final int charGroupPos = buffer.position(); + final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, + buffer.position(), header.mFormatOptions); + boolean same = true; + for (int p = 0, j = word.offsetByCodePoints(0, wordPos); + p < currentInfo.mCharacters.length; + ++p, j = word.offsetByCodePoints(j, 1)) { + if (wordPos + p >= wordLen + || word.codePointAt(j) != currentInfo.mCharacters[p]) { + same = false; + break; + } + } + + if (same) { + if (wordPos + currentInfo.mCharacters.length == wordLen) { + if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { + return FormatSpec.NOT_VALID_WORD; + } else { + return charGroupPos; + } + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + buffer.position(currentInfo.mChildrenAddress); + break; + } + groupOffset = currentInfo.mEndAddress; + + // not found + if (i >= charGroupCount - 1) { + return FormatSpec.NOT_VALID_WORD; + } + } + } + return FormatSpec.NOT_VALID_WORD; + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index c865702d6..1d3e94bb7 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1242,8 +1242,9 @@ public class BinaryDictInputOutput { * @param formatOptions file format options. * @return the word, as a string. */ - private static String getWordAtAddress(final FusionDictionaryBufferInterface buffer, - final int headerSize, final int address, final FormatOptions formatOptions) { + /* packages for tests */ static String getWordAtAddress( + final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, + final FormatOptions formatOptions) { final String cachedString = wordCache.get(address); if (null != cachedString) return cachedString; diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index f8f13b197..adc6037bb 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -207,6 +207,9 @@ public final class FormatSpec { static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; + // This option needs to be the same numeric value as the one in binary_format.h. + static final int NOT_VALID_WORD = -99; + /** * Options about file format. */ diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java index 4c2d3f6fe..24776d536 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOTests.java @@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.CollectionUtils; import com.android.inputmethod.latin.UserHistoryDictIOUtils; import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; -import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; @@ -475,4 +475,93 @@ public class BinaryDictIOTests extends AndroidTestCase { Log.d(TAG, result); } } + + // Tests for getTerminalPosition + private String getWordFromBinary(final FusionDictionaryBufferInterface buffer, + final int address) { + if (buffer.position() != 0) buffer.position(0); + + FileHeader header = null; + try { + header = BinaryDictInputOutput.readHeader(buffer); + } catch (IOException e) { + return null; + } catch (UnsupportedFormatException e) { + return null; + } + if (header == null) return null; + return BinaryDictInputOutput.getWordAtAddress(buffer, header.mHeaderSize, + address - header.mHeaderSize, header.mFormatOptions); + } + + private long runGetTerminalPosition(final FusionDictionaryBufferInterface buffer, + final String word, int index, boolean contained) { + final int expectedFrequency = (UNIGRAM_FREQ + index) % 255; + long diff = -1; + int position = -1; + try { + final long now = System.nanoTime(); + position = BinaryDictIOUtils.getTerminalPosition(buffer, word); + diff = System.nanoTime() - now; + } catch (IOException e) { + Log.e(TAG, "IOException while getTerminalPosition: " + e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException while getTermianlPosition: " + e); + } + + assertEquals(FormatSpec.NOT_VALID_WORD != position, contained); + if (contained) assertEquals(getWordFromBinary(buffer, position), word); + return diff; + } + + public void testGetTerminalPosition() { + File file = null; + try { + file = File.createTempFile("runReadUnigrams", ".dict"); + } catch (IOException e) { + // do nothing + } + assertNotNull(file); + + final FusionDictionary dict = new FusionDictionary(new Node(), + new FusionDictionary.DictionaryOptions( + new HashMap(), false, false)); + addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */); + timeWritingDictToFile(file, dict, VERSION3_WITH_LINKEDLIST_NODE); + + final FusionDictionaryBufferInterface buffer = getBuffer(file, USE_BYTE_ARRAY); + + try { + // too long word + final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + assertEquals(FormatSpec.NOT_VALID_WORD, + BinaryDictIOUtils.getTerminalPosition(buffer, longWord)); + + // null + assertEquals(FormatSpec.NOT_VALID_WORD, + BinaryDictIOUtils.getTerminalPosition(buffer, null)); + + // empty string + assertEquals(FormatSpec.NOT_VALID_WORD, + BinaryDictIOUtils.getTerminalPosition(buffer, "")); + } catch (IOException e) { + } catch (UnsupportedFormatException e) { + } + + // Test a word that is contained within the dictionary. + long sum = 0; + for (int i = 0; i < sWords.size(); ++i) { + final long time = runGetTerminalPosition(buffer, sWords.get(i), i, true); + sum += time == -1 ? 0 : time; + } + Log.d(TAG, "per a search : " + (((double)sum) / sWords.size() / 1000000)); + + // Test a word that isn't contained within the dictionary. + final Random random = new Random((int)System.currentTimeMillis()); + for (int i = 0; i < 1000; ++i) { + final String word = generateWord(random.nextInt()); + if (sWords.indexOf(word) != -1) continue; + runGetTerminalPosition(buffer, word, i, false); + } + } }