Switch the AOSP word lists to the combined format.

This will help with managing the word lists. Bug: 7388859 Change-Id: I89f049569b177d3027fe56d6c67eaca27d44dc7d
2012-10-31 17:34:47 +09:00 · 2012-10-31 17:34:47 +09:00 · a424ff06ec
parent a8058d169d
commit a424ff06ec
51 changed files with 38 additions and 17 deletions
--- a/dictionaries/cs_wordlist.combined.gz
+++ b/dictionaries/cs_wordlist.combined.gz
--- a/dictionaries/cs_wordlist.xml.gz
+++ b/dictionaries/cs_wordlist.xml.gz
--- a/dictionaries/da_wordlist.combined.gz
+++ b/dictionaries/da_wordlist.combined.gz
--- a/dictionaries/da_wordlist.xml.gz
+++ b/dictionaries/da_wordlist.xml.gz
--- a/dictionaries/de_wordlist.combined.gz
+++ b/dictionaries/de_wordlist.combined.gz
--- a/dictionaries/de_wordlist.xml.gz
+++ b/dictionaries/de_wordlist.xml.gz
--- a/dictionaries/el_wordlist.combined.gz
+++ b/dictionaries/el_wordlist.combined.gz
--- a/dictionaries/el_wordlist.xml.gz
+++ b/dictionaries/el_wordlist.xml.gz
--- a/dictionaries/en_GB_wordlist.combined.gz
+++ b/dictionaries/en_GB_wordlist.combined.gz
--- a/dictionaries/en_US_wordlist.combined.gz
+++ b/dictionaries/en_US_wordlist.combined.gz
--- a/dictionaries/en_gb_wordlist.xml.gz
+++ b/dictionaries/en_gb_wordlist.xml.gz
--- a/dictionaries/en_us_wordlist.xml.gz
+++ b/dictionaries/en_us_wordlist.xml.gz
--- a/dictionaries/en_whitelist.xml.gz
+++ b/dictionaries/en_whitelist.xml.gz
--- a/dictionaries/en_wordlist.combined.gz
+++ b/dictionaries/en_wordlist.combined.gz
--- a/dictionaries/en_wordlist.xml.gz
+++ b/dictionaries/en_wordlist.xml.gz
--- a/dictionaries/es_wordlist.combined.gz
+++ b/dictionaries/es_wordlist.combined.gz
--- a/dictionaries/es_wordlist.xml.gz
+++ b/dictionaries/es_wordlist.xml.gz
--- a/dictionaries/fi_wordlist.combined.gz
+++ b/dictionaries/fi_wordlist.combined.gz
--- a/dictionaries/fi_wordlist.xml.gz
+++ b/dictionaries/fi_wordlist.xml.gz
--- a/dictionaries/fr_wordlist.combined.gz
+++ b/dictionaries/fr_wordlist.combined.gz
--- a/dictionaries/fr_wordlist.xml.gz
+++ b/dictionaries/fr_wordlist.xml.gz
--- a/dictionaries/hr_wordlist.combined.gz
+++ b/dictionaries/hr_wordlist.combined.gz
--- a/dictionaries/hr_wordlist.xml.gz
+++ b/dictionaries/hr_wordlist.xml.gz
--- a/dictionaries/it_wordlist.combined.gz
+++ b/dictionaries/it_wordlist.combined.gz
--- a/dictionaries/it_wordlist.xml.gz
+++ b/dictionaries/it_wordlist.xml.gz
--- a/dictionaries/lt_wordlist.combined.gz
+++ b/dictionaries/lt_wordlist.combined.gz
--- a/dictionaries/lt_wordlist.xml.gz
+++ b/dictionaries/lt_wordlist.xml.gz
--- a/dictionaries/lv_wordlist.combined.gz
+++ b/dictionaries/lv_wordlist.combined.gz
--- a/dictionaries/lv_wordlist.xml.gz
+++ b/dictionaries/lv_wordlist.xml.gz
--- a/dictionaries/nb_wordlist.combined.gz
+++ b/dictionaries/nb_wordlist.combined.gz
--- a/dictionaries/nb_wordlist.xml.gz
+++ b/dictionaries/nb_wordlist.xml.gz
--- a/dictionaries/nl_wordlist.combined.gz
+++ b/dictionaries/nl_wordlist.combined.gz
--- a/dictionaries/nl_wordlist.xml.gz
+++ b/dictionaries/nl_wordlist.xml.gz
--- a/dictionaries/pl_wordlist.combined.gz
+++ b/dictionaries/pl_wordlist.combined.gz
--- a/dictionaries/pl_wordlist.xml.gz
+++ b/dictionaries/pl_wordlist.xml.gz
--- a/dictionaries/pt_BR_wordlist.combined.gz
+++ b/dictionaries/pt_BR_wordlist.combined.gz
--- a/dictionaries/pt_PT_wordlist.combined.gz
+++ b/dictionaries/pt_PT_wordlist.combined.gz
--- a/dictionaries/pt_br_wordlist.xml.gz
+++ b/dictionaries/pt_br_wordlist.xml.gz
--- a/dictionaries/pt_pt_wordlist.xml.gz
+++ b/dictionaries/pt_pt_wordlist.xml.gz
--- a/dictionaries/ru_wordlist.combined.gz
+++ b/dictionaries/ru_wordlist.combined.gz
--- a/dictionaries/ru_wordlist.xml.gz
+++ b/dictionaries/ru_wordlist.xml.gz
--- a/dictionaries/sample.combined
+++ b/dictionaries/sample.combined
@@ -0,0 +1,38 @@
+# This is a sample wordlist that can be converted to a binary dictionary
+# for use by the Latin IME.
+# The file is essentially a CSV file, with indent level denoting nesting.
+#
+# The file starts with a single CSV line with the header attributes. Whatever
+# the content, these are included as is in the binary file. The first attribute
+# of the file should be `dictionary'. Usual fields are `locale', `description',
+# `date', `version', `options'.
+#
+# Each word has a `word' entry and at least an `f' argument denoting its
+# probability, as an integer between 0 and 255 on a logarithmic scale, with
+# 255 meaning 1 and each decrement of 1 dividing the probability by 1.15.
+# As a special case, a weight of 0 is taken to mean profanity - words that
+# should not be considered a typo, but that should never be suggested
+# explicitly. An entry may be marked as not being a word by adding a
+# `not_a_word' field with a value of `true'. The main reason for putting
+# such entries into the dictionary is to attach shortcut targets and
+# possibly a whitelist replacement.
+#
+# Each word may have any number of shortcut target lines, each starting
+# with a `shortcut' entry and having at least an `f' frequency value
+# between 0 and 14, or the special value `whitelist', which becomes 15
+# and marks that shortcut as the whitelist target of this word.
+#
+# Each word may also have any number of bigram lines, each starting with
+# a `bigram' entry naming a word that can follow it. The frequency given
+# on the bigram line should override that following word's unigram
+# frequency when it comes right after the word this bigram belongs to.
+#
+dictionary=main:en,locale=en,description=Sample wordlist,date=1351495318,version=1
+ word=sample,f=200
+  bigram=wordlist,f=243
+ word=wordlist,f=180
+ word=shortcut,f=176
+  shortcut=target,f=10
+ word=witelisted,f=10,not_a_word=true
+  shortcut=whitelisted,f=whitelist
+ word=profanity,f=0
--- a/dictionaries/sample.xml
+++ b/dictionaries/sample.xml
@@ -1,17 +0,0 @@
-<!-- This is a sample wordlist that can be converted to a binary dictionary
-     for use by the Latin IME.
-     The format of the word list is a flat list of word entries.
-     Each entry has a frequency between 255 and 0.
-     Highest frequency words get more weight in the prediction algorithm. As a
-     special case, a weight of 0 is taken to mean profanity - words that should
-     not be considered a typo, but that should never be suggested explicitly.
-     You can capitalize words that must always be capitalized, such as "January".
-     You can have a capitalized and a non-capitalized word as separate entries,
-     such as "robin" and "Robin".
-->
-<wordlist>
-  <w f="255">this</w>
-  <w f="255">is</w>
-  <w f="128">sample</w>
-  <w f="1">wordlist</w>
-</wordlist>
--- a/dictionaries/sl_wordlist.combined.gz
+++ b/dictionaries/sl_wordlist.combined.gz
--- a/dictionaries/sl_wordlist.xml.gz
+++ b/dictionaries/sl_wordlist.xml.gz
--- a/dictionaries/sr_wordlist.combined.gz
+++ b/dictionaries/sr_wordlist.combined.gz
--- a/dictionaries/sr_wordlist.xml.gz
+++ b/dictionaries/sr_wordlist.xml.gz
--- a/dictionaries/sv_wordlist.combined.gz
+++ b/dictionaries/sv_wordlist.combined.gz
--- a/dictionaries/sv_wordlist.xml.gz
+++ b/dictionaries/sv_wordlist.xml.gz
--- a/dictionaries/tr_wordlist.combined.gz
+++ b/dictionaries/tr_wordlist.combined.gz
--- a/dictionaries/tr_wordlist.xml.gz
+++ b/dictionaries/tr_wordlist.xml.gz