am 0a213fd6: Merge "Start to support adding shortcuts."

* commit '0a213fd625e72a8d7d4ba7ed3cb995bf4888d9fb':
  Start to support adding shortcuts.
This commit is contained in:
Keisuke Kuroyanagi 2013-12-08 21:37:41 -08:00 committed by Android Git Automerger
commit 7b840bc41d
11 changed files with 201 additions and 11 deletions

View file

@ -88,11 +88,13 @@ class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
}
// Overwrite existing entry.
int writingPos = entryPos;
if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
codePointCount, probability, true /* hasNext */, &writingPos)) {
bool hasNext = false;
mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
if (!mShortcutDictContent->writeShortcutEntry(codePoints,
codePointCount, probability, hasNext, entryPos)) {
AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
writingPos);
entryPos);
return false;
}
return true;

View file

@ -74,6 +74,10 @@ class PtNodeWriter {
const DictPositionRelocationMap *const dictPositionRelocationMap,
int *const outBigramEntryCount) = 0;
// Adds a shortcut target for the PtNode described by ptNodeParams. The target is passed as
// targetCodePointCount code points in targetCodePoints, together with shortcutProbability.
// Pure virtual: each concrete PtNodeWriter supplies the implementation.
// NOTE(review): the bool result presumably means "written successfully" -- confirm against
// the implementations.
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability) = 0;
protected:
PtNodeWriter() {};

View file

@ -105,6 +105,14 @@ bool DynamicPatriciaTrieUpdatingHelper::removeBigramWords(const int word0Pos, co
return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams);
}
bool DynamicPatriciaTrieUpdatingHelper::addShortcutTarget(const int wordPos,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability) {
const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos));
return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
shortcutProbability);
}
bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
const int *const nodeCodePoints, const int nodeCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probability,

View file

@ -52,6 +52,10 @@ class DynamicPatriciaTrieUpdatingHelper {
// Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos);
// Add a shortcut target to the PtNode at wordPos. The target consists of
// targetCodePointCount code points stored in targetCodePoints and is registered with
// shortcutProbability. The return value is whatever the PtNode writer reports.
bool addShortcutTarget(const int wordPos, const int *const targetCodePoints,
const int targetCodePointCount, const int shortcutProbability);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper);

View file

@ -38,6 +38,14 @@ class ShortcutDictContent : public SparseTableDictContent {
: SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
// Reads the shortcut entry that starts at shortcutEntryPos into the out parameters.
// This is the non-advancing counterpart of getShortcutEntryAndAdvancePosition(): the position
// is copied into a scratch variable, so the caller's value is never modified.
// NOTE(review): the delegate is declared const, so this wrapper could probably be const as
// well, which would make it usable through const pointers -- confirm before changing.
void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
        int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
        const int shortcutEntryPos) {
    int scratchPos = shortcutEntryPos;
    getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, outCodePointCount,
            outProbability, outhasNext, &scratchPos);
}
// Reads the shortcut entry found at *shortcutEntryPos into outCodePoint, outCodePointCount,
// outProbability and outhasNext. maxCodePointCount bounds the code points to read.
// NOTE(review): judging by the name, *shortcutEntryPos is moved past the entry on return;
// the definition is not in this header -- confirm there.
void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
bool *const outhasNext, int *const shortcutEntryPos) const;

View file

@ -217,6 +217,13 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
return true;
}
// Registers a shortcut target for the PtNode described by ptNodeParams.
// The PtNode's terminal id is used to address the shortcut list, and the actual write is
// delegated to mShortcutPolicy. Its result is returned as is.
bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
        const int *const targetCodePoints, const int targetCodePointCount,
        const int shortcutProbability) {
    const int terminalId = ptNodeParams->getTerminalId();
    return mShortcutPolicy->addNewShortcut(terminalId, targetCodePoints, targetCodePointCount,
            shortcutProbability);
}
bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
const PtNodeParams *const ptNodeParams, int *const outTerminalId,
int *const ptNodeWritingPos) {

View file

@ -83,6 +83,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const DictPositionRelocationMap *const dictPositionRelocationMap,
int *const outBigramEntryCount);
// Implements PtNodeWriter::addShortcutTarget. Adds the shortcut target given by
// targetCodePoints / targetCodePointCount with shortcutProbability to the PtNode described
// by ptNodeParams.
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
const int *const targetCodePoints, const int targetCodePointCount,
const int shortcutProbability);
private:
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);

View file

@ -155,12 +155,26 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
DynamicPatriciaTrieReadingHelper readingHelper(mDictBuffer, &mNodeReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false;
// TODO: Add shortcut.
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord,
isBlacklisted, timestamp, &addedNewUnigram)) {
if (addedNewUnigram) {
mUnigramCount++;
}
if (shortcutLength > 0) {
// Add shortcut target.
const int wordPos = getTerminalPtNodePositionOfWord(word, length,
false /* forceLowerCaseSearch */);
if (wordPos == NOT_A_DICT_POS) {
AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
return false;
}
if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcutTargetCodePoints,
shortcutLength, shortcutProbability)) {
AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, probability: %d",
wordPos, shortcutLength, shortcutProbability);
return false;
}
}
return true;
} else {
return false;
@ -309,12 +323,12 @@ const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *cons
// Fetch shortcut information.
std::vector<std::vector<int> > shortcutTargets;
std::vector<int> shortcutProbabilities;
if (ptNodeParams.hasShortcutTargets()) {
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
if (shortcutPos != NOT_A_DICT_POS) {
int shortcutTarget[MAX_WORD_LENGTH];
const ShortcutDictContent *const shortcutDictContent =
mBuffers.get()->getShortcutDictContent();
bool hasNext = true;
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
while (hasNext) {
int shortcutTargetLength = 0;
int shortcutProbability = NOT_A_PROBABILITY;

View file

@ -43,7 +43,7 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
*pos -= mOriginalBufferSize;
}
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePointCount, pos);
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBufferSize;
}

View file

@ -44,6 +44,7 @@ bool SparseTable::set(const int id, const uint32_t value) {
int tailPos = mIndexTableBuffer->getTailPosition();
while(tailPos < posInIndexTable) {
if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) {
AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable);
return false;
}
}
@ -51,12 +52,19 @@ bool SparseTable::set(const int id, const uint32_t value) {
if (contains(id)) {
// The entry is already in the content table.
const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable);
return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index));
if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) {
AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value,
getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(),
mDataSize);
return false;
}
return true;
}
// The entry is not in the content table.
// Create new entry in the content table.
const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition());
if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) {
AKLOGE("cannot write index %d. pos %d", index, posInIndexTable);
return false;
}
// Write a new block that containing the entry to be set.
@ -64,6 +72,8 @@ bool SparseTable::set(const int id, const uint32_t value) {
for (int i = 0; i < mBlockSize; ++i) {
if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_A_DICT_POS, mDataSize,
&writingPos)) {
AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, "
"mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize);
return false;
}
}
@ -80,7 +90,7 @@ int SparseTable::getPosInIndexTable(const int id) const {
// Returns the position in the content table of the entry that belongs to id.
// index is the block index stored in the index table for this id. A block consists of
// mBlockSize entries and every entry occupies mDataSize, so the entry sits
// (index * mBlockSize + offset) entries from the start of the table.
// The previous formula, (index * mDataSize + offset) * mBlockSize, swapped the two sizes and
// addressed the wrong location whenever mBlockSize != mDataSize; it also left the correct
// return statement unreachable.
int SparseTable::getPosInContentTable(const int id, const int index) const {
    // Slot of this id inside its block.
    const int offset = id % mBlockSize;
    return (index * mBlockSize + offset) * mDataSize;
}
} // namespace latinime

View file

@ -24,6 +24,7 @@ import android.util.Pair;
import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
import com.android.inputmethod.latin.makedict.CodePointUtils;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.UnigramProperty;
import java.io.File;
@ -856,7 +857,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
final int unigramProbability = random.nextInt(0xFF);
final boolean isNotAWord = random.nextBoolean();
final boolean isBlacklisted = random.nextBoolean();
// TODO: Add tests for shortcut.
// TODO: Add tests for historical info.
binaryDictionary.addUnigramWord(word, unigramProbability,
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
@ -873,4 +873,133 @@ public class BinaryDictionaryTests extends AndroidTestCase {
assertTrue(unigramProperty.mShortcutTargets.isEmpty());
}
}
/** Runs the shortcut addition scenario against a format version 4 dictionary. */
public void testAddShortcuts() {
    final int formatVersion = 4;
    testAddShortcuts(formatVersion);
}
/**
 * Checks shortcut handling for the single word "aaa" on a freshly created dictionary of the
 * given format version: adding a first target, re-adding the same target with a different
 * probability, adding a second target, and reading both targets back after flushWithGC().
 */
private void testAddShortcuts(final int formatVersion) {
    File emptyDictFile = null;
    try {
        emptyDictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary dict = new BinaryDictionary(emptyDictFile.getAbsolutePath(),
            0 /* offset */, emptyDictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    final String word = "aaa";
    final String firstTarget = "zzz";
    final String secondTarget = "yyy";
    final int wordProbability = 100;
    final int initialTargetProbability = 10;
    final int revisedTargetProbability = 2;

    // A first shortcut target is stored together with its probability.
    dict.addUnigramWord(word, wordProbability, firstTarget, initialTargetProbability,
            false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
    UnigramProperty property = dict.getUnigramProperty(word);
    assertEquals(1, property.mShortcutTargets.size());
    assertEquals(firstTarget, property.mShortcutTargets.get(0).mWord);
    assertEquals(initialTargetProbability, property.mShortcutTargets.get(0).mFrequency);

    // Adding the same target again must update the probability, not add a second entry.
    dict.addUnigramWord(word, wordProbability, firstTarget, revisedTargetProbability,
            false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
    property = dict.getUnigramProperty(word);
    assertEquals(1, property.mShortcutTargets.size());
    assertEquals(firstTarget, property.mShortcutTargets.get(0).mWord);
    assertEquals(revisedTargetProbability, property.mShortcutTargets.get(0).mFrequency);

    // A different target is appended to the shortcut list of the word.
    dict.addUnigramWord(word, wordProbability, secondTarget, initialTargetProbability,
            false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
    final HashMap<String, Integer> expected = new HashMap<String, Integer>();
    expected.put(firstTarget, revisedTargetProbability);
    expected.put(secondTarget, initialTargetProbability);
    property = dict.getUnigramProperty(word);
    assertEquals(2, property.mShortcutTargets.size());
    // Each reported target is removed once matched, so a duplicate would fail the lookup.
    for (WeightedString reported : property.mShortcutTargets) {
        assertTrue(expected.containsKey(reported.mWord));
        assertEquals((int)expected.get(reported.mWord), reported.mFrequency);
        expected.remove(reported.mWord);
    }

    // Both targets must still be present after a flush with GC.
    expected.put(firstTarget, revisedTargetProbability);
    expected.put(secondTarget, initialTargetProbability);
    dict.flushWithGC();
    property = dict.getUnigramProperty(word);
    assertEquals(2, property.mShortcutTargets.size());
    for (WeightedString reported : property.mShortcutTargets) {
        assertTrue(expected.containsKey(reported.mWord));
        assertEquals((int)expected.get(reported.mWord), reported.mFrequency);
        expected.remove(reported.mWord);
    }
}
/** Runs the randomized many-shortcuts scenario against a format version 4 dictionary. */
public void testAddManyShortcuts() {
    final int formatVersion = 4;
    testAddManyShortcuts(formatVersion);
}
/**
 * Randomized test: adds UNIGRAM_COUNT random words, then SHORTCUT_COUNT random shortcut
 * targets attached to randomly chosen words, running flushWithGC() whenever the dictionary
 * asks for it. Afterwards every word must report its unigram probability and exactly the
 * shortcut targets (with their latest probabilities) that were added for it.
 *
 * The random seed is included in every assertion message so that a failure can be replayed.
 */
private void testAddManyShortcuts(final int formatVersion) {
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    final String seedInfo = " (seed: " + seed + ")";
    final int UNIGRAM_COUNT = 1000;
    final int SHORTCUT_COUNT = 10000;
    final int codePointSetSize = 20;
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

    final ArrayList<String> words = new ArrayList<String>();
    final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
    final HashMap<String, HashMap<String, Integer>> shortcutTargets =
            new HashMap<String, HashMap<String, Integer>>();

    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);

    for (int i = 0; i < UNIGRAM_COUNT; i++) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        final int unigramProbability = random.nextInt(0xFF);
        addUnigramWord(binaryDictionary, word, unigramProbability);
        words.add(word);
        // A word generated twice keeps the probability of its latest addition.
        unigramProbabilities.put(word, unigramProbability);
        if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
    }

    for (int i = 0; i < SHORTCUT_COUNT; i++) {
        final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
        final int shortcutProbability = random.nextInt(0xF);
        final String word = words.get(random.nextInt(words.size()));
        final int unigramProbability = unigramProbabilities.get(word);
        binaryDictionary.addUnigramWord(word, unigramProbability, shortcutTarget,
                shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
                0 /* timestamp */);
        HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
        if (shortcutTargetsOfWord == null) {
            shortcutTargetsOfWord = new HashMap<String, Integer>();
            shortcutTargets.put(word, shortcutTargetsOfWord);
        }
        // Re-adding an existing target replaces its probability.
        shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
        if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
            binaryDictionary.flushWithGC();
        }
    }

    for (final String word : words) {
        final UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty(word);
        assertEquals("Unigram probability of \"" + word + "\"" + seedInfo,
                (int)unigramProbabilities.get(word), unigramProperty.mProbability);
        final HashMap<String, Integer> expectedTargets = shortcutTargets.get(word);
        if (expectedTargets == null) {
            // Shortcuts go to randomly chosen words, so some words may never receive one.
            // Such a word has no map entry and must not report any shortcut target.
            assertTrue("Unexpected shortcut targets for \"" + word + "\"" + seedInfo,
                    unigramProperty.mShortcutTargets.isEmpty());
            continue;
        }
        assertEquals("Shortcut target count of \"" + word + "\"" + seedInfo,
                expectedTargets.size(), unigramProperty.mShortcutTargets.size());
        for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
            final String targetCodePoints = shortcutTarget.mWord;
            // Check presence first so an unknown target fails as an assertion, not as a
            // NullPointerException while unboxing the expected probability.
            assertTrue("Unknown shortcut target \"" + targetCodePoints + "\" for \"" + word
                    + "\"" + seedInfo, expectedTargets.containsKey(targetCodePoints));
            assertEquals("Probability of shortcut target \"" + targetCodePoints + "\" for \""
                    + word + "\"" + seedInfo, (int)expectedTargets.get(targetCodePoints),
                    shortcutTarget.mFrequency);
        }
    }
}
}