am 0a213fd6
: Merge "Start to support adding shortcuts."
* commit '0a213fd625e72a8d7d4ba7ed3cb995bf4888d9fb': Start to support adding shortcuts.
This commit is contained in:
commit
7b840bc41d
11 changed files with 201 additions and 11 deletions
|
@ -88,11 +88,13 @@ class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
|
|||
return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
|
||||
}
|
||||
// Overwrite existing entry.
|
||||
int writingPos = entryPos;
|
||||
if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
|
||||
codePointCount, probability, true /* hasNext */, &writingPos)) {
|
||||
bool hasNext = false;
|
||||
mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
|
||||
0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
|
||||
if (!mShortcutDictContent->writeShortcutEntry(codePoints,
|
||||
codePointCount, probability, hasNext, entryPos)) {
|
||||
AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
|
||||
writingPos);
|
||||
entryPos);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -74,6 +74,10 @@ class PtNodeWriter {
|
|||
const DictPositionRelocationMap *const dictPositionRelocationMap,
|
||||
int *const outBigramEntryCount) = 0;
|
||||
|
||||
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
|
||||
const int *const targetCodePoints, const int targetCodePointCount,
|
||||
const int shortcutProbability) = 0;
|
||||
|
||||
protected:
|
||||
PtNodeWriter() {};
|
||||
|
||||
|
|
|
@ -105,6 +105,14 @@ bool DynamicPatriciaTrieUpdatingHelper::removeBigramWords(const int word0Pos, co
|
|||
return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams);
|
||||
}
|
||||
|
||||
bool DynamicPatriciaTrieUpdatingHelper::addShortcutTarget(const int wordPos,
|
||||
const int *const targetCodePoints, const int targetCodePointCount,
|
||||
const int shortcutProbability) {
|
||||
const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos));
|
||||
return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
|
||||
shortcutProbability);
|
||||
}
|
||||
|
||||
bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
|
||||
const int *const nodeCodePoints, const int nodeCodePointCount,
|
||||
const bool isNotAWord, const bool isBlacklisted, const int probability,
|
||||
|
|
|
@ -52,6 +52,10 @@ class DynamicPatriciaTrieUpdatingHelper {
|
|||
// Remove a bigram relation from word0Pos to word1Pos.
|
||||
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
||||
|
||||
// Add a shortcut target.
|
||||
bool addShortcutTarget(const int wordPos, const int *const targetCodePoints,
|
||||
const int targetCodePointCount, const int shortcutProbability);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper);
|
||||
|
||||
|
|
|
@ -38,6 +38,14 @@ class ShortcutDictContent : public SparseTableDictContent {
|
|||
: SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
|
||||
Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
|
||||
|
||||
void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
|
||||
int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
|
||||
const int shortcutEntryPos) {
|
||||
int readingPos = shortcutEntryPos;
|
||||
return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint,
|
||||
outCodePointCount, outProbability, outhasNext, &readingPos);
|
||||
}
|
||||
|
||||
void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
|
||||
int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
|
||||
bool *const outhasNext, int *const shortcutEntryPos) const;
|
||||
|
|
|
@ -217,6 +217,13 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
|
||||
const int *const targetCodePoints, const int targetCodePointCount,
|
||||
const int shortcutProbability) {
|
||||
return mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
|
||||
targetCodePoints, targetCodePointCount, shortcutProbability);
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
|
||||
const PtNodeParams *const ptNodeParams, int *const outTerminalId,
|
||||
int *const ptNodeWritingPos) {
|
||||
|
|
|
@ -83,6 +83,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
|||
const DictPositionRelocationMap *const dictPositionRelocationMap,
|
||||
int *const outBigramEntryCount);
|
||||
|
||||
virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
|
||||
const int *const targetCodePoints, const int targetCodePointCount,
|
||||
const int shortcutProbability);
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
|
||||
|
||||
|
|
|
@ -155,12 +155,26 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
|
|||
DynamicPatriciaTrieReadingHelper readingHelper(mDictBuffer, &mNodeReader);
|
||||
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||
bool addedNewUnigram = false;
|
||||
// TODO: Add shortcut.
|
||||
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord,
|
||||
isBlacklisted, timestamp, &addedNewUnigram)) {
|
||||
if (addedNewUnigram) {
|
||||
mUnigramCount++;
|
||||
}
|
||||
if (shortcutLength > 0) {
|
||||
// Add shortcut target.
|
||||
const int wordPos = getTerminalPtNodePositionOfWord(word, length,
|
||||
false /* forceLowerCaseSearch */);
|
||||
if (wordPos == NOT_A_DICT_POS) {
|
||||
AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
|
||||
return false;
|
||||
}
|
||||
if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcutTargetCodePoints,
|
||||
shortcutLength, shortcutProbability)) {
|
||||
AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, probability: %d",
|
||||
wordPos, shortcutLength, shortcutProbability);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
|
@ -309,12 +323,12 @@ const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *cons
|
|||
// Fetch shortcut information.
|
||||
std::vector<std::vector<int> > shortcutTargets;
|
||||
std::vector<int> shortcutProbabilities;
|
||||
if (ptNodeParams.hasShortcutTargets()) {
|
||||
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
|
||||
if (shortcutPos != NOT_A_DICT_POS) {
|
||||
int shortcutTarget[MAX_WORD_LENGTH];
|
||||
const ShortcutDictContent *const shortcutDictContent =
|
||||
mBuffers.get()->getShortcutDictContent();
|
||||
bool hasNext = true;
|
||||
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
|
||||
while (hasNext) {
|
||||
int shortcutTargetLength = 0;
|
||||
int shortcutProbability = NOT_A_PROBABILITY;
|
||||
|
|
|
@ -43,7 +43,7 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
|
|||
*pos -= mOriginalBufferSize;
|
||||
}
|
||||
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
|
||||
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePointCount, pos);
|
||||
getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
|
||||
if (readingPosIsInAdditionalBuffer) {
|
||||
*pos += mOriginalBufferSize;
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ bool SparseTable::set(const int id, const uint32_t value) {
|
|||
int tailPos = mIndexTableBuffer->getTailPosition();
|
||||
while(tailPos < posInIndexTable) {
|
||||
if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) {
|
||||
AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -51,12 +52,19 @@ bool SparseTable::set(const int id, const uint32_t value) {
|
|||
if (contains(id)) {
|
||||
// The entry is already in the content table.
|
||||
const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable);
|
||||
return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index));
|
||||
if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) {
|
||||
AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value,
|
||||
getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(),
|
||||
mDataSize);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// The entry is not in the content table.
|
||||
// Create new entry in the content table.
|
||||
const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition());
|
||||
if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) {
|
||||
AKLOGE("cannot write index %d. pos %d", index, posInIndexTable);
|
||||
return false;
|
||||
}
|
||||
// Write a new block that contains the entry to be set.
|
||||
|
@ -64,6 +72,8 @@ bool SparseTable::set(const int id, const uint32_t value) {
|
|||
for (int i = 0; i < mBlockSize; ++i) {
|
||||
if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_A_DICT_POS, mDataSize,
|
||||
&writingPos)) {
|
||||
AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, "
|
||||
"mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -80,7 +90,7 @@ int SparseTable::getPosInIndexTable(const int id) const {
|
|||
|
||||
int SparseTable::getPosInContentTable(const int id, const int index) const {
|
||||
const int offset = id % mBlockSize;
|
||||
return (index * mDataSize + offset) * mBlockSize;
|
||||
return (index * mBlockSize + offset) * mDataSize;
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -24,6 +24,7 @@ import android.util.Pair;
|
|||
import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
|
||||
import com.android.inputmethod.latin.makedict.CodePointUtils;
|
||||
import com.android.inputmethod.latin.makedict.FormatSpec;
|
||||
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
|
||||
import com.android.inputmethod.latin.utils.UnigramProperty;
|
||||
|
||||
import java.io.File;
|
||||
|
@ -856,7 +857,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int unigramProbability = random.nextInt(0xFF);
|
||||
final boolean isNotAWord = random.nextBoolean();
|
||||
final boolean isBlacklisted = random.nextBoolean();
|
||||
// TODO: Add tests for shortcut.
|
||||
// TODO: Add tests for historical info.
|
||||
binaryDictionary.addUnigramWord(word, unigramProbability,
|
||||
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
|
||||
|
@ -873,4 +873,133 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertTrue(unigramProperty.mShortcutTargets.isEmpty());
|
||||
}
|
||||
}
|
||||
|
||||
public void testAddShortcuts() {
|
||||
testAddShortcuts(4 /* formatVersion */);
|
||||
}
|
||||
|
||||
private void testAddShortcuts(final int formatVersion) {
|
||||
File dictFile = null;
|
||||
try {
|
||||
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
|
||||
final int unigramProbability = 100;
|
||||
final int shortcutProbability = 10;
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
|
||||
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* timestamp */);
|
||||
UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty("aaa");
|
||||
assertEquals(1, unigramProperty.mShortcutTargets.size());
|
||||
assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord);
|
||||
assertEquals(shortcutProbability, unigramProperty.mShortcutTargets.get(0).mFrequency);
|
||||
final int updatedShortcutProbability = 2;
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
|
||||
updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* timestamp */);
|
||||
unigramProperty = binaryDictionary.getUnigramProperty("aaa");
|
||||
assertEquals(1, unigramProperty.mShortcutTargets.size());
|
||||
assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord);
|
||||
assertEquals(updatedShortcutProbability,
|
||||
unigramProperty.mShortcutTargets.get(0).mFrequency);
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy",
|
||||
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* timestamp */);
|
||||
final HashMap<String, Integer> shortcutTargets = new HashMap<String, Integer>();
|
||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||
shortcutTargets.put("yyy", shortcutProbability);
|
||||
unigramProperty = binaryDictionary.getUnigramProperty("aaa");
|
||||
assertEquals(2, unigramProperty.mShortcutTargets.size());
|
||||
for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
|
||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
|
||||
shortcutTargets.remove(shortcutTarget.mWord);
|
||||
}
|
||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||
shortcutTargets.put("yyy", shortcutProbability);
|
||||
binaryDictionary.flushWithGC();
|
||||
unigramProperty = binaryDictionary.getUnigramProperty("aaa");
|
||||
assertEquals(2, unigramProperty.mShortcutTargets.size());
|
||||
for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
|
||||
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
|
||||
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
|
||||
shortcutTargets.remove(shortcutTarget.mWord);
|
||||
}
|
||||
}
|
||||
|
||||
public void testAddManyShortcuts() {
|
||||
testAddManyShortcuts(4 /* formatVersion */);
|
||||
}
|
||||
|
||||
private void testAddManyShortcuts(final int formatVersion) {
|
||||
final long seed = System.currentTimeMillis();
|
||||
final Random random = new Random(seed);
|
||||
final int UNIGRAM_COUNT = 1000;
|
||||
final int SHORTCUT_COUNT = 10000;
|
||||
final int codePointSetSize = 20;
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
|
||||
final ArrayList<String> words = new ArrayList<String>();
|
||||
final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
|
||||
final HashMap<String, HashMap<String, Integer>> shortcutTargets =
|
||||
new HashMap<String, HashMap<String, Integer>>();
|
||||
|
||||
File dictFile = null;
|
||||
try {
|
||||
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
|
||||
for (int i = 0; i < UNIGRAM_COUNT; i++) {
|
||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||
final int unigramProbability = random.nextInt(0xFF);
|
||||
addUnigramWord(binaryDictionary, word, unigramProbability);
|
||||
words.add(word);
|
||||
unigramProbabilities.put(word, unigramProbability);
|
||||
if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < SHORTCUT_COUNT; i++) {
|
||||
final String shortcutTarget = CodePointUtils.generateWord(random, codePointSet);
|
||||
final int shortcutProbability = random.nextInt(0xF);
|
||||
final String word = words.get(random.nextInt(words.size()));
|
||||
final int unigramProbability = unigramProbabilities.get(word);
|
||||
binaryDictionary.addUnigramWord(word, unigramProbability, shortcutTarget,
|
||||
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* timestamp */);
|
||||
if (shortcutTargets.containsKey(word)) {
|
||||
final HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
|
||||
shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
|
||||
} else {
|
||||
final HashMap<String, Integer> shortcutTargetsOfWord =
|
||||
new HashMap<String, Integer>();
|
||||
shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
|
||||
shortcutTargets.put(word, shortcutTargetsOfWord);
|
||||
}
|
||||
if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
}
|
||||
|
||||
for (final String word : words) {
|
||||
final UnigramProperty unigramProperty = binaryDictionary.getUnigramProperty(word);
|
||||
assertEquals((int)unigramProbabilities.get(word), unigramProperty.mProbability);
|
||||
assertEquals(shortcutTargets.get(word).size(), unigramProperty.mShortcutTargets.size());
|
||||
for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
|
||||
final String targetCodePonts = shortcutTarget.mWord;
|
||||
assertEquals((int)shortcutTargets.get(word).get(targetCodePonts),
|
||||
shortcutTarget.mFrequency);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue