Compare commits

...

10 Commits

Author SHA1 Message Date
~erin e42e87121e
Bump up crawler version 2023-07-25 20:54:26 -04:00
~erin b5aa0c2171
Initialize core code 2023-07-25 20:53:18 -04:00
~erin 0c9f66277e
Rename table, add search_index 2023-07-25 20:47:31 -04:00
~erin f3422a4949
Filetype filtering 2023-07-25 20:25:49 -04:00
~erin ed53ec320e
Fix panic, preventing full crawling 2023-07-25 20:11:29 -04:00
~erin c141135847
Sanitize HTML? 2023-07-25 19:54:05 -04:00
~erin 758c16b78b
Justfile 2023-07-25 19:47:39 -04:00
~erin 561ef2dfb4
Test database 2023-07-25 19:28:17 -04:00
~erin 541deca5f6
Add README 2023-07-25 19:27:56 -04:00
~erin 326a6b8042
Should crawl pages after a certain age 2023-07-25 18:33:38 -04:00
11 changed files with 839 additions and 20 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
/target
test.db*

708
Cargo.lock generated
View File

@ -64,6 +64,25 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "ammonia"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e6d1c7838db705c9b756557ee27c384ce695a1c51a6fe528784cb1c6840170"
dependencies = [
"html5ever",
"maplit",
"once_cell",
"tendril",
"url",
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
@ -104,6 +123,15 @@ dependencies = [
"syn 2.0.27",
]
[[package]]
name = "atoi"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528"
dependencies = [
"num-traits",
]
[[package]]
name = "atty"
version = "0.2.14"
@ -148,6 +176,12 @@ version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
[[package]]
name = "base64ct"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "bincode"
version = "1.3.3"
@ -168,6 +202,9 @@ name = "bitflags"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
dependencies = [
"serde",
]
[[package]]
name = "block-buffer"
@ -291,9 +328,24 @@ dependencies = [
"yaml-rust",
]
[[package]]
name = "const-oid"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "795bc6e66a8e340f075fcf6227e417a2dc976b92b91f3cdc778bb858778b6747"
[[package]]
name = "core"
version = "0.1.0"
dependencies = [
"chrono",
"config",
"dirs",
"env_logger",
"log",
"sqlx",
"tokio",
]
[[package]]
name = "core-foundation-sys"
@ -314,6 +366,7 @@ dependencies = [
name = "crawler"
version = "0.1.0"
dependencies = [
"ammonia",
"bincode",
"cacache",
"chrono",
@ -324,10 +377,26 @@ dependencies = [
"reqwest",
"scraper",
"serde",
"sqlx",
"tokio",
"url",
]
[[package]]
name = "crc"
version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484"
[[package]]
name = "crc32fast"
version = "1.3.2"
@ -337,6 +406,25 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-queue"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
dependencies = [
"cfg-if",
]
[[package]]
name = "crypto-common"
version = "0.1.6"
@ -370,6 +458,17 @@ dependencies = [
"syn 2.0.27",
]
[[package]]
name = "der"
version = "0.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7ed52955ce76b1554f509074bb357d3fb8ac9b51288a65a3fd480d1dfba946"
dependencies = [
"const-oid",
"pem-rfc7468",
"zeroize",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -388,7 +487,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
"subtle",
]
[[package]]
@ -418,6 +519,12 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0688c2a7f92e427f44895cd63841bff7b29f8d7a1648b9e7e07a4a365b2e1257"
[[package]]
name = "dotenvy"
version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
[[package]]
name = "dtoa"
version = "1.0.9"
@ -444,6 +551,9 @@ name = "either"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
dependencies = [
"serde",
]
[[package]]
name = "encoding_rs"
@ -467,6 +577,12 @@ dependencies = [
"termcolor",
]
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.1"
@ -488,6 +604,23 @@ dependencies = [
"libc",
]
[[package]]
name = "etcetera"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943"
dependencies = [
"cfg-if",
"home",
"windows-sys",
]
[[package]]
name = "event-listener"
version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "fastrand"
version = "2.0.0"
@ -504,6 +637,18 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "flume"
version = "0.10.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577"
dependencies = [
"futures-core",
"futures-sink",
"pin-project",
"spin 0.9.8",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -575,6 +720,17 @@ dependencies = [
"futures-util",
]
[[package]]
name = "futures-intrusive"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f"
dependencies = [
"futures-core",
"lock_api",
"parking_lot",
]
[[package]]
name = "futures-io"
version = "0.3.28"
@ -679,7 +835,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http",
"indexmap",
"indexmap 1.9.3",
"slab",
"tokio",
"tokio-util",
@ -695,6 +851,34 @@ dependencies = [
"ahash 0.7.6",
]
[[package]]
name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
dependencies = [
"ahash 0.8.3",
"allocator-api2",
]
[[package]]
name = "hashlink"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "312f66718a2d7789ffef4f4b7b213138ed9f1eb3aa1d0d82fc99f88fb3ffd26f"
dependencies = [
"hashbrown 0.14.0",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "hermit-abi"
version = "0.1.19"
@ -716,6 +900,33 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "hkdf"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "791a029f6b9fc27657f6f188ec6e5e43f6911f6f878e0dc5501396e09809d437"
dependencies = [
"hmac",
]
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]]
name = "home"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb"
dependencies = [
"windows-sys",
]
[[package]]
name = "html5ever"
version = "0.26.0"
@ -848,7 +1059,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown",
"hashbrown 0.12.3",
]
[[package]]
name = "indexmap"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d"
dependencies = [
"equivalent",
"hashbrown 0.14.0",
]
[[package]]
@ -857,6 +1078,15 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.9"
@ -888,6 +1118,9 @@ name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin 0.5.2",
]
[[package]]
name = "libc"
@ -895,6 +1128,23 @@ version = "0.2.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
[[package]]
name = "libm"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
[[package]]
name = "libsqlite3-sys"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afc22eff61b133b115c6e8c74e818c628d6d5e7a502afea6f64dee076dd94326"
dependencies = [
"cc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
@ -929,6 +1179,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "markup5ever"
version = "0.11.0"
@ -943,6 +1199,15 @@ dependencies = [
"tendril",
]
[[package]]
name = "md-5"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca"
dependencies = [
"digest",
]
[[package]]
name = "memchr"
version = "2.5.0"
@ -1029,6 +1294,44 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
dependencies = [
"byteorder",
"lazy_static",
"libm",
"num-integer",
"num-iter",
"num-traits",
"rand",
"smallvec",
"zeroize",
]
[[package]]
name = "num-integer"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
dependencies = [
"autocfg",
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.16"
@ -1036,6 +1339,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@ -1076,7 +1380,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccd746e37177e1711c20dd619a1620f34f5c8b569c53590a72dedd5344d8924a"
dependencies = [
"dlv-list",
"hashbrown",
"hashbrown 0.12.3",
]
[[package]]
@ -1102,12 +1406,27 @@ dependencies = [
"windows-targets",
]
[[package]]
name = "paste"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "pathdiff"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
"base64ct",
]
[[package]]
name = "percent-encoding"
version = "2.3.0"
@ -1238,6 +1557,26 @@ dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "030ad2bc4db10a8944cb0d837f158bdfec4d4a4873ab701a95046770d11f8842"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec2e072ecce94ec471b13398d5402c188e76ac03cf74dd1a975161b23a3f6d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.27",
]
[[package]]
name = "pin-project-lite"
version = "0.2.10"
@ -1250,6 +1589,33 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkcs1"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
dependencies = [
"der",
"pkcs8",
"spki",
]
[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der",
"spki",
]
[[package]]
name = "pkg-config"
version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@ -1415,7 +1781,7 @@ dependencies = [
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
"webpki-roots 0.22.6",
"winreg",
]
@ -1428,7 +1794,7 @@ dependencies = [
"cc",
"libc",
"once_cell",
"spin",
"spin 0.5.2",
"untrusted",
"web-sys",
"winapi",
@ -1445,6 +1811,28 @@ dependencies = [
"serde",
]
[[package]]
name = "rsa"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ab43bb47d23c1a631b4b680199a45255dce26fa9ab2fa902581f624ff13e6a8"
dependencies = [
"byteorder",
"const-oid",
"digest",
"num-bigint-dig",
"num-integer",
"num-iter",
"num-traits",
"pkcs1",
"pkcs8",
"rand_core",
"signature",
"spki",
"subtle",
"zeroize",
]
[[package]]
name = "rust-ini"
version = "0.18.0"
@ -1666,6 +2054,16 @@ dependencies = [
"libc",
]
[[package]]
name = "signature"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e1788eed21689f9cf370582dfc467ef36ed9c707f073528ddafa8d83e3b8500"
dependencies = [
"digest",
"rand_core",
]
[[package]]
name = "siphasher"
version = "0.3.10"
@ -1703,6 +2101,238 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
dependencies = [
"lock_api",
]
[[package]]
name = "spki"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d1e996ef02c474957d681f1b05213dfb0abab947b446a62d37770b23500184a"
dependencies = [
"base64ct",
"der",
]
[[package]]
name = "sqlformat"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c12bc9199d1db8234678b7051747c07f517cdcf019262d1847b94ec8b1aee3e"
dependencies = [
"itertools",
"nom",
"unicode_categories",
]
[[package]]
name = "sqlx"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e58421b6bc416714d5115a2ca953718f6c621a51b68e4f4922aea5a4391a721"
dependencies = [
"sqlx-core",
"sqlx-macros",
"sqlx-mysql",
"sqlx-postgres",
"sqlx-sqlite",
]
[[package]]
name = "sqlx-core"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd4cef4251aabbae751a3710927945901ee1d97ee96d757f6880ebb9a79bfd53"
dependencies = [
"ahash 0.8.3",
"atoi",
"byteorder",
"bytes",
"chrono",
"crc",
"crossbeam-queue",
"dotenvy",
"either",
"event-listener",
"futures-channel",
"futures-core",
"futures-intrusive",
"futures-io",
"futures-util",
"hashlink",
"hex",
"indexmap 2.0.0",
"log",
"memchr",
"once_cell",
"paste",
"percent-encoding",
"rustls",
"rustls-pemfile",
"serde",
"serde_json",
"sha2",
"smallvec",
"sqlformat",
"thiserror",
"tokio",
"tokio-stream",
"tracing",
"url",
"webpki-roots 0.24.0",
]
[[package]]
name = "sqlx-macros"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "208e3165167afd7f3881b16c1ef3f2af69fa75980897aac8874a0696516d12c2"
dependencies = [
"proc-macro2",
"quote",
"sqlx-core",
"sqlx-macros-core",
"syn 1.0.109",
]
[[package]]
name = "sqlx-macros-core"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a4a8336d278c62231d87f24e8a7a74898156e34c1c18942857be2acb29c7dfc"
dependencies = [
"dotenvy",
"either",
"heck",
"hex",
"once_cell",
"proc-macro2",
"quote",
"serde",
"serde_json",
"sha2",
"sqlx-core",
"sqlx-mysql",
"sqlx-postgres",
"sqlx-sqlite",
"syn 1.0.109",
"tempfile",
"tokio",
"url",
]
[[package]]
name = "sqlx-mysql"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ca69bf415b93b60b80dc8fda3cb4ef52b2336614d8da2de5456cc942a110482"
dependencies = [
"atoi",
"base64 0.21.2",
"bitflags 2.3.3",
"byteorder",
"bytes",
"chrono",
"crc",
"digest",
"dotenvy",
"either",
"futures-channel",
"futures-core",
"futures-io",
"futures-util",
"generic-array",
"hex",
"hkdf",
"hmac",
"itoa",
"log",
"md-5",
"memchr",
"once_cell",
"percent-encoding",
"rand",
"rsa",
"serde",
"sha1",
"sha2",
"smallvec",
"sqlx-core",
"stringprep",
"thiserror",
"tracing",
"whoami",
]
[[package]]
name = "sqlx-postgres"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0db2df1b8731c3651e204629dd55e52adbae0462fa1bdcbed56a2302c18181e"
dependencies = [
"atoi",
"base64 0.21.2",
"bitflags 2.3.3",
"byteorder",
"chrono",
"crc",
"dotenvy",
"etcetera",
"futures-channel",
"futures-core",
"futures-io",
"futures-util",
"hex",
"hkdf",
"hmac",
"home",
"itoa",
"log",
"md-5",
"memchr",
"once_cell",
"rand",
"serde",
"serde_json",
"sha1",
"sha2",
"smallvec",
"sqlx-core",
"stringprep",
"thiserror",
"tracing",
"whoami",
]
[[package]]
name = "sqlx-sqlite"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4c21bf34c7cae5b283efb3ac1bcc7670df7561124dc2f8bdc0b59be40f79a2"
dependencies = [
"atoi",
"chrono",
"flume",
"futures-channel",
"futures-core",
"futures-executor",
"futures-intrusive",
"futures-util",
"libsqlite3-sys",
"log",
"percent-encoding",
"serde",
"sqlx-core",
"tracing",
"url",
]
[[package]]
name = "ssri"
version = "9.2.0"
@ -1752,6 +2382,22 @@ dependencies = [
"quote",
]
[[package]]
name = "stringprep"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3737bde7edce97102e0e2b15365bf7a20bfdb5f60f4f9e8d7004258a51a8da"
dependencies = [
"unicode-bidi",
"unicode-normalization",
]
[[package]]
name = "subtle"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "syn"
version = "1.0.109"
@ -1941,10 +2587,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"log",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.27",
]
[[package]]
name = "tracing-core"
version = "0.1.31"
@ -1993,12 +2652,24 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "unicode_categories"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
[[package]]
name = "untrusted"
version = "0.7.1"
@ -2023,6 +2694,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
@ -2155,6 +2832,21 @@ dependencies = [
"webpki",
]
[[package]]
name = "webpki-roots"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b291546d5d9d1eab74f069c77749f2cb8504a12caa20f0f2de93ddbf6f411888"
dependencies = [
"rustls-webpki",
]
[[package]]
name = "whoami"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50"
[[package]]
name = "winapi"
version = "0.3.9"
@ -2284,3 +2976,9 @@ checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85"
dependencies = [
"linked-hash-map",
]
[[package]]
name = "zeroize"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"

View File

@ -15,6 +15,7 @@ url = { version = "2.4.0", features = [ "serde" ] }
log = "0.4.0"
env_logger = "0.9.0"
chrono = { version = "0.4.26", features = [ "serde" ] }
sqlx = { version = "0.7", features = [ "runtime-tokio", "tls-rustls", "chrono", "macros", "sqlite" ] }
[profile.release]
strip = true

6
Justfile Normal file
View File

@ -0,0 +1,6 @@
db:
rm test.db*
sqlx database create
sqlx migrate run
crawl: db
RUST_LOG=info cargo run -p crawler

20
README.md Normal file
View File

@ -0,0 +1,20 @@
# Ferret
**Ferret** is an attempt at a new, self-hosted, independent search engine.
## Goals
- Simple and accessible pure-HTML user interface
- No tracking or bloat
- Only indexes smaller actually cool sites
## Technology
- Written in **Rust**
- Uses the [Tokio](https://tokio.rs/) async framwework
- Uses [Axum](https://lib.rs/crates/axum) as a web framework
- [SQLx](https://lib.rs/crates/sqlx) compile-time checked SQL queries
## Development
1. Install **Rust** via `rustup`
2. Clone this repository
3. Install [sqlx-cli](https://lib.rs/crates/sqlx-cli)
4. Set the `DATABASE_URL` environment variable to `"sqlite:todos.db"`
5. Install the [just](https://just.systems/) command runner

View File

@ -5,3 +5,12 @@ edition.workspace = true
authors.workspace = true
homepage.workspace = true
license.workspace = true
[dependencies]
log.workspace = true
env_logger.workspace = true
tokio.workspace = true
sqlx.workspace = true
chrono.workspace = true
config.workspace = true
dirs.workspace = true

View File

@ -1,3 +1,9 @@
fn main() {
println!("Hello, world!");
#[macro_use]
extern crate log;
#[tokio::main]
async fn main() {
env_logger::init();
info!("Hello, world!");
}

View File

@ -1,6 +1,6 @@
[package]
name = "crawler"
version = "0.1.0"
version = "0.2.0"
edition.workspace = true
authors.workspace = true
homepage.workspace = true
@ -14,8 +14,10 @@ url.workspace = true
log.workspace = true
env_logger.workspace = true
chrono.workspace = true
sqlx.workspace = true
serde = { version = "1.0.175", features = [ "derive" ] }
reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }
bincode = "1.3.3"
scraper = "0.17.1"
ammonia = "3"

View File

@ -2,10 +2,15 @@ mod settings;
#[macro_use]
extern crate log;
use ammonia::clean;
use chrono::prelude::*;
use chrono::Duration;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use settings::Settings;
use sqlx::sqlite::SqlitePool;
use std::env;
use std::fs::File;
use url::Url;
#[derive(Debug, Deserialize, Serialize)]
@ -20,6 +25,9 @@ struct Page {
async fn main() {
env_logger::init();
let settings = Settings::new().unwrap();
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
.await
.unwrap();
let mut to_crawl = settings.sitemap;
let mut crawled = 0;
@ -40,6 +48,7 @@ async fn main() {
for res in results {
for t in res {
for url in t {
info!("pushing {}", &url.as_str());
to_crawl.push(url);
}
}
@ -58,11 +67,7 @@ async fn main() {
.await
.unwrap();
let decoded_page: Page = bincode::deserialize(&data).unwrap();
info!(
"Found page: {} {}",
&decoded_page.url.as_str(),
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
);
insert_db(&pool, &decoded_page).await;
}
}
@ -72,9 +77,17 @@ async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
for i in cacache::list_sync(cache_dir.as_path()) {
match i {
Ok(_) => {
if i.unwrap().key == url.clone().into_string() {
error!("Already crawled {}", &url.as_str());
return vec![];
if i.as_ref().unwrap().key == url.clone().into_string() {
let now = Utc::now();
let timestamp = DateTime::<Utc>::from_utc(
NaiveDateTime::from_timestamp_opt(i.unwrap().time as i64, 0).unwrap(),
Utc,
);
let diff = now - timestamp;
if diff <= Duration::hours(1) {
error!("Already crawled {}", &url.as_str());
return vec![];
}
}
}
Err(e) => error!("{}", e),
@ -123,15 +136,60 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
for element in document.select(&selector) {
let href = element.value().attr("href").unwrap();
let url = base.join(href).unwrap();
// if &url == base {
match &url.fragment() {
Some(_) => break,
None => {}
}
let path = url.path().clone();
let ignore_ends = vec![
".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
".css", ".json",
];
let mut br = false;
for i in ignore_ends {
if path.ends_with(i) {
br = true;
}
}
if br {
break;
}
// if !path.ends_with(".html") || !path.ends_with("/") {
// break;
// }
for x in &allow {
if &x.domain().unwrap() == &url.domain().unwrap() {
links.push(url);
break;
match &url.domain() {
Some(d) => {
if &x.domain().unwrap() == d {
links.push(url);
break;
}
}
None => {}
}
}
}
return links;
}
async fn insert_db(pool: &SqlitePool, page: &Page) {
let mut conn = pool.acquire().await.unwrap();
let url = page.url.clone().into_string();
let timestamp = page.last_fetched.clone().timestamp();
let body = page.body.clone();
let safe_html = clean(&*body);
let id = sqlx::query!(
r#"
REPLACE INTO crawled_urls ( last_fetched, url, body )
VALUES ( ?1, ?2, ?3 )
"#,
timestamp,
url,
safe_html,
)
.execute(&mut *conn)
.await
.unwrap()
.last_insert_rowid();
}

View File

@ -0,0 +1,6 @@
CREATE TABLE IF NOT EXISTS crawled_urls
(
last_fetched INTEGER PRIMARY KEY NOT NULL,
url TEXT NOT NULL,
body TEXT NOT NULL
);

View File

@ -0,0 +1,12 @@
CREATE TABLE IF NOT EXISTS search_index
(
id INTEGER PRIMARY KEY NOT NULL,
url TEXT NOT NULL,
clicks INTEGER NOT NULL,
size INTEGER NOT NULL,
language TEXT NOT NULL,
title TEXT NOT NULL,
summary TEXT NOT NULL,
content TEXT NOT NULL,
last_updated INTEGER NOT NULL
);