Start to support adding shortcuts.

Bug: 11073222
Bug: 11956652

Change-Id: Iea81603a140697594cfea4f4939e82cd1d3963ca
main
Keisuke Kuroyanagi 2013-12-09 13:05:44 +09:00
parent a2bbb1213d
commit 5f88c1e0f1
11 changed files with 201 additions and 11 deletions

View File

@ -88,11 +88,13 @@ class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
} }
// Overwrite existing entry. // Overwrite existing entry.
int writingPos = entryPos; bool hasNext = false;
if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
codePointCount, probability, true /* hasNext */, &writingPos)) { 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
if (!mShortcutDictContent->writeShortcutEntry(codePoints,
codePointCount, probability, hasNext, entryPos)) {
AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId, AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
writingPos); entryPos);
return false; return false;
} }
return true; return true;

View File

@ -74,6 +74,10 @@ class PtNodeWriter {
const DictPositionRelocationMap *const dictPositionRelocationMap, const DictPositionRelocationMap *const dictPositionRelocationMap,
int *const outBigramEntryCount) = 0; int *const outBigramEntryCount) = 0;
// Adds a shortcut target to the PtNode described by ptNodeParams.
// targetCodePoints / targetCodePointCount give the code points of the shortcut target and
// shortcutProbability is the probability stored with it.
// Returns false when the shortcut target could not be added.
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability) = 0;
protected: protected:
PtNodeWriter() {}; PtNodeWriter() {};

View File

@ -105,6 +105,14 @@ bool DynamicPatriciaTrieUpdatingHelper::removeBigramWords(const int word0Pos, co
return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams); return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams);
} }
bool DynamicPatriciaTrieUpdatingHelper::addShortcutTarget(const int wordPos,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability) {
const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos));
return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
shortcutProbability);
}
bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
const int *const nodeCodePoints, const int nodeCodePointCount, const int *const nodeCodePoints, const int nodeCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probability, const bool isNotAWord, const bool isBlacklisted, const int probability,

View File

@ -52,6 +52,10 @@ class DynamicPatriciaTrieUpdatingHelper {
// Remove a bigram relation from word0Pos to word1Pos. // Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos); bool removeBigramWords(const int word0Pos, const int word1Pos);
// Add a shortcut target to the word whose PtNode is at wordPos.
// targetCodePoints / targetCodePointCount describe the shortcut target and
// shortcutProbability is its probability. Returns false if the target cannot be added.
bool addShortcutTarget(const int wordPos, const int *const targetCodePoints,
const int targetCodePointCount, const int shortcutProbability);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper); DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper);

View File

@ -38,6 +38,14 @@ class ShortcutDictContent : public SparseTableDictContent {
: SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
// Reads the shortcut entry stored at shortcutEntryPos and reports it through the out
// parameters. Unlike getShortcutEntryAndAdvancePosition(), the caller's position is taken
// by value and is therefore never modified.
// NOTE(review): this only calls a const member, so it could probably be declared const.
void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
        int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
        const int shortcutEntryPos) {
    // The underlying reader advances the position it is given, so hand it a local copy.
    int entryPos = shortcutEntryPos;
    getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, outCodePointCount,
            outProbability, outhasNext, &entryPos);
}
void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
int *const outCodePoint, int *const outCodePointCount, int *const outProbability, int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
bool *const outhasNext, int *const shortcutEntryPos) const; bool *const outhasNext, int *const shortcutEntryPos) const;

View File

@ -217,6 +217,13 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
return true; return true;
} }
// Adds a shortcut target for the given PtNode by handing the PtNode's terminal id and the
// target to the shortcut policy. Returns the policy's result.
bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
        const int *const targetCodePoints, const int targetCodePointCount,
        const int shortcutProbability) {
    const int terminalId = ptNodeParams->getTerminalId();
    return mShortcutPolicy->addNewShortcut(terminalId, targetCodePoints, targetCodePointCount,
            shortcutProbability);
}
bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
const PtNodeParams *const ptNodeParams, int *const outTerminalId, const PtNodeParams *const ptNodeParams, int *const outTerminalId,
int *const ptNodeWritingPos) { int *const ptNodeWritingPos) {

View File

@ -83,6 +83,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const DictPositionRelocationMap *const dictPositionRelocationMap, const DictPositionRelocationMap *const dictPositionRelocationMap,
int *const outBigramEntryCount); int *const outBigramEntryCount);
// Implements PtNodeWriter::addShortcutTarget.
// Adds the shortcut target (targetCodePoints / targetCodePointCount) with
// shortcutProbability to the PtNode described by ptNodeParams.
// Returns false when the shortcut target could not be added.
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability);
private: private:
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);

View File

@ -155,12 +155,26 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
DynamicPatriciaTrieReadingHelper readingHelper(mDictBuffer, &mNodeReader); DynamicPatriciaTrieReadingHelper readingHelper(mDictBuffer, &mNodeReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition()); readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false; bool addedNewUnigram = false;
// TODO: Add shortcut.
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord, if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord,
isBlacklisted, timestamp, &addedNewUnigram)) { isBlacklisted, timestamp, &addedNewUnigram)) {
if (addedNewUnigram) { if (addedNewUnigram) {
mUnigramCount++; mUnigramCount++;
} }
if (shortcutLength > 0) {
// Add shortcut target.
const int wordPos = getTerminalPtNodePositionOfWord(word, length,
false /* forceLowerCaseSearch */);
if (wordPos == NOT_A_DICT_POS) {
AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
return false;
}
if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcutTargetCodePoints,
shortcutLength, shortcutProbability)) {
AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, probability: %d",
wordPos, shortcutLength, shortcutProbability);
return false;
}
}
return true; return true;
} else { } else {
return false; return false;
@ -308,12 +322,12 @@ const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *cons
// Fetch shortcut information. // Fetch shortcut information.
std::vector<std::vector<int> > shortcutTargets; std::vector<std::vector<int> > shortcutTargets;
std::vector<int> shortcutProbabilities; std::vector<int> shortcutProbabilities;
if (ptNodeParams.hasShortcutTargets()) { int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
if (shortcutPos != NOT_A_DICT_POS) {
int shortcutTarget[MAX_WORD_LENGTH]; int shortcutTarget[MAX_WORD_LENGTH];
const ShortcutDictContent *const shortcutDictContent = const ShortcutDictContent *const shortcutDictContent =
mBuffers.get()->getShortcutDictContent(); mBuffers.get()->getShortcutDictContent();
bool hasNext = true; bool hasNext = true;
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
while (hasNext) { while (hasNext) {
int shortcutTargetLength = 0; int shortcutTargetLength = 0;
int shortcutProbability = NOT_A_PROBABILITY; int shortcutProbability = NOT_A_PROBABILITY;

View File

@ -43,7 +43,7 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
*pos -= mOriginalBufferSize; *pos -= mOriginalBufferSize;
} }
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePointCount, pos); getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) { if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBufferSize; *pos += mOriginalBufferSize;
} }

View File

@ -44,6 +44,7 @@ bool SparseTable::set(const int id, const uint32_t value) {
int tailPos = mIndexTableBuffer->getTailPosition(); int tailPos = mIndexTableBuffer->getTailPosition();
while(tailPos < posInIndexTable) { while(tailPos < posInIndexTable) {
if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) {
AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable);
return false; return false;
} }
} }
@ -51,12 +52,19 @@ bool SparseTable::set(const int id, const uint32_t value) {
if (contains(id)) { if (contains(id)) {
// The entry is already in the content table. // The entry is already in the content table.
const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable);
return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) {
AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value,
getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(),
mDataSize);
return false;
}
return true;
} }
// The entry is not in the content table. // The entry is not in the content table.
// Create new entry in the content table. // Create new entry in the content table.
const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition());
if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) {
AKLOGE("cannot write index %d. pos %d", index, posInIndexTable);
return false; return false;
} }
// Write a new block that containing the entry to be set. // Write a new block that containing the entry to be set.
@ -64,6 +72,8 @@ bool SparseTable::set(const int id, const uint32_t value) {
for (int i = 0; i < mBlockSize; ++i) { for (int i = 0; i < mBlockSize; ++i) {
if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_A_DICT_POS, mDataSize, if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_A_DICT_POS, mDataSize,
&writingPos)) { &writingPos)) {
AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, "
"mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize);
return false; return false;
} }
} }
@ -80,7 +90,7 @@ int SparseTable::getPosInIndexTable(const int id) const {
int SparseTable::getPosInContentTable(const int id, const int index) const { int SparseTable::getPosInContentTable(const int id, const int index) const {
const int offset = id % mBlockSize; const int offset = id % mBlockSize;
return (index * mDataSize + offset) * mBlockSize; return (index * mBlockSize + offset) * mDataSize;
} }
} // namespace latinime } // namespace latinime

View File

@ -24,6 +24,7 @@ import android.util.Pair;
import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam; import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.CodePointUtils;
import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.UnigramProperty; import com.android.inputmethod.latin.utils.UnigramProperty;
import java.io.File; import java.io.File;
@ -856,7 +857,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
final int unigramProbability = random.nextInt(0xFF); final int unigramProbability = random.nextInt(0xFF);
final boolean isNotAWord = random.nextBoolean(); final boolean isNotAWord = random.nextBoolean();
final boolean isBlacklisted = random.nextBoolean(); final boolean isBlacklisted = random.nextBoolean();
// TODO: Add tests for shortcut.
// TODO: Add tests for historical info. // TODO: Add tests for historical info.
binaryDictionary.addUnigramWord(word, unigramProbability, binaryDictionary.addUnigramWord(word, unigramProbability,
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY, null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
@ -873,4 +873,133 @@ public class BinaryDictionaryTests extends AndroidTestCase {
assertTrue(unigramProperty.mShortcutTargets.isEmpty()); assertTrue(unigramProperty.mShortcutTargets.isEmpty());
} }
} }
/** Runs the shortcut adding test against a version 4 dictionary. */
public void testAddShortcuts() {
    final int formatVersion = 4;
    testAddShortcuts(formatVersion);
}
/**
 * Adds a shortcut target to a word, updates its probability, adds a second target, and
 * checks what {@code getUnigramProperty} reports after each step and after a flush with GC.
 *
 * @param formatVersion the dictionary format version to create the test dictionary with.
 */
private void testAddShortcuts(final int formatVersion) {
    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    final String word = "aaa";
    final String firstTarget = "zzz";
    final String secondTarget = "yyy";
    final int unigramProbability = 100;
    final int shortcutProbability = 10;
    final int updatedShortcutProbability = 2;

    // Step 1: a single shortcut target.
    binaryDictionary.addUnigramWord(word, unigramProbability, firstTarget,
            shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
            0 /* timestamp */);
    UnigramProperty property = binaryDictionary.getUnigramProperty(word);
    assertEquals(1, property.mShortcutTargets.size());
    assertEquals(firstTarget, property.mShortcutTargets.get(0).mWord);
    assertEquals(shortcutProbability, property.mShortcutTargets.get(0).mFrequency);

    // Step 2: adding the same target again only updates its probability.
    binaryDictionary.addUnigramWord(word, unigramProbability, firstTarget,
            updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
            0 /* timestamp */);
    property = binaryDictionary.getUnigramProperty(word);
    assertEquals(1, property.mShortcutTargets.size());
    assertEquals(firstTarget, property.mShortcutTargets.get(0).mWord);
    assertEquals(updatedShortcutProbability, property.mShortcutTargets.get(0).mFrequency);

    // Step 3: a second, different target.
    binaryDictionary.addUnigramWord(word, unigramProbability, secondTarget,
            shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
            0 /* timestamp */);
    final HashMap<String, Integer> expectedTargets = new HashMap<String, Integer>();
    expectedTargets.put(firstTarget, updatedShortcutProbability);
    expectedTargets.put(secondTarget, shortcutProbability);

    // Each reported target is removed from a working copy so that a target reported twice
    // fails the containsKey check.
    HashMap<String, Integer> remainingTargets = new HashMap<String, Integer>(expectedTargets);
    property = binaryDictionary.getUnigramProperty(word);
    assertEquals(2, property.mShortcutTargets.size());
    for (final WeightedString reported : property.mShortcutTargets) {
        assertTrue(remainingTargets.containsKey(reported.mWord));
        assertEquals((int)remainingTargets.get(reported.mWord), reported.mFrequency);
        remainingTargets.remove(reported.mWord);
    }

    // Step 4: the same content must be reported after flushing with GC.
    remainingTargets = new HashMap<String, Integer>(expectedTargets);
    binaryDictionary.flushWithGC();
    property = binaryDictionary.getUnigramProperty(word);
    assertEquals(2, property.mShortcutTargets.size());
    for (final WeightedString reported : property.mShortcutTargets) {
        assertTrue(remainingTargets.containsKey(reported.mWord));
        assertEquals((int)remainingTargets.get(reported.mWord), reported.mFrequency);
        remainingTargets.remove(reported.mWord);
    }
}
/** Runs the bulk shortcut adding test against a version 4 dictionary. */
public void testAddManyShortcuts() {
    final int formatVersion = 4;
    testAddManyShortcuts(formatVersion);
}
/**
 * Adds many random shortcut targets to randomly chosen words and verifies that every word
 * reports its unigram probability and exactly the shortcut targets that were added to it.
 *
 * The input is random; the seed is part of every assertion message so that a failing run
 * can be reproduced.
 *
 * @param formatVersion the dictionary format version to create the test dictionary with.
 */
private void testAddManyShortcuts(final int formatVersion) {
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    final String seedInfo = " (seed: " + seed + ")";
    final int UNIGRAM_COUNT = 1000;
    final int SHORTCUT_COUNT = 10000;
    final int codePointSetSize = 20;
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

    final ArrayList<String> words = new ArrayList<String>();
    final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
    final HashMap<String, HashMap<String, Integer>> shortcutTargets =
            new HashMap<String, HashMap<String, Integer>>();

    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    for (int i = 0; i < UNIGRAM_COUNT; i++) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        final int unigramProbability = random.nextInt(0xFF);
        addUnigramWord(binaryDictionary, word, unigramProbability);
        words.add(word);
        unigramProbabilities.put(word, unigramProbability);
        if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
    }

    for (int i = 0; i < SHORTCUT_COUNT; i++) {
        final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
        final int shortcutProbability = random.nextInt(0xF);
        final String word = words.get(random.nextInt(words.size()));
        final int unigramProbability = unigramProbabilities.get(word);
        binaryDictionary.addUnigramWord(word, unigramProbability, shortcutTarget,
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
        if (shortcutTargetsOfWord == null) {
            shortcutTargetsOfWord = new HashMap<String, Integer>();
            shortcutTargets.put(word, shortcutTargetsOfWord);
        }
        // Adding the same target again replaces its probability.
        shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
        if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
    }

    for (final String word : words) {
        final UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty(word);
        assertEquals("Unexpected unigram probability of " + word + seedInfo,
                (int)unigramProbabilities.get(word), unigramProperty.mProbability);
        // Words are picked at random above, so a word may never have received a shortcut
        // target. Such a word has no entry in the map and must report no shortcut targets.
        final HashMap<String, Integer> expectedTargets = shortcutTargets.get(word);
        if (expectedTargets == null) {
            assertTrue("Unexpected shortcut targets for " + word + seedInfo,
                    unigramProperty.mShortcutTargets.isEmpty());
            continue;
        }
        assertEquals("Unexpected shortcut target count of " + word + seedInfo,
                expectedTargets.size(), unigramProperty.mShortcutTargets.size());
        for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
            final String targetCodePoints = shortcutTarget.mWord;
            // Fail with a readable message rather than a NullPointerException on unboxing.
            assertTrue("Unexpected shortcut target " + targetCodePoints + " of " + word
                    + seedInfo, expectedTargets.containsKey(targetCodePoints));
            assertEquals("Unexpected probability of shortcut target " + targetCodePoints
                    + " of " + word + seedInfo,
                    (int)expectedTargets.get(targetCodePoints), shortcutTarget.mFrequency);
        }
    }
}
} }