Compare commits

...

6 Commits

Author SHA1 Message Date
~erin c75f9f7de9
Bump up core version, update README 2023-07-25 23:17:40 -04:00
~erin 3ccbb09c26
Refactor search 2023-07-25 23:11:15 -04:00
~erin 88381113cd
Rank by score in BTreeMap 2023-07-25 22:50:23 -04:00
~erin 450e6092a0
Basic fuzzy search 2023-07-25 22:39:46 -04:00
~erin 7165765c5c
Axum server 2023-07-25 21:47:03 -04:00
~erin e5cc3e7a31
Core engine - setup, create search index 2023-07-25 21:34:03 -04:00
8 changed files with 435 additions and 13 deletions

200
Cargo.lock generated
View File

@ -149,6 +149,57 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "axum"
version = "0.6.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6a1de45611fdb535bfde7b7de4fd54f4fd2b17b1737c0a59b69bf9b92074b8c"
dependencies = [
"async-trait",
"axum-core",
"bitflags 1.3.2",
"bytes",
"futures-util",
"http",
"http-body",
"hyper",
"itoa",
"matchit",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tower",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "axum-core"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http",
"http-body",
"mime",
"rustversion",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "backtrace"
version = "0.3.68"
@ -338,13 +389,23 @@ checksum = "795bc6e66a8e340f075fcf6227e417a2dc976b92b91f3cdc778bb858778b6747"
name = "core"
version = "0.1.0"
dependencies = [
"ammonia",
"axum",
"chrono",
"config",
"dirs",
"env_logger",
"fuzzy-matcher",
"log",
"scraper",
"serde",
"serde_json",
"sqlx",
"tokio",
"tracing",
"tracing-subscriber",
"url",
"whatlang",
]
[[package]]
@ -364,9 +425,8 @@ dependencies = [
[[package]]
name = "crawler"
version = "0.1.0"
version = "0.2.0"
dependencies = [
"ammonia",
"bincode",
"cacache",
"chrono",
@ -778,6 +838,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fuzzy-matcher"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94"
dependencies = [
"thread_local",
]
[[package]]
name = "fxhash"
version = "0.2.1"
@ -1199,6 +1268,12 @@ dependencies = [
"tendril",
]
[[package]]
name = "matchit"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
[[package]]
name = "md-5"
version = "0.10.5"
@ -1294,6 +1369,16 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
@ -1383,6 +1468,12 @@ dependencies = [
"hashbrown 0.12.3",
]
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -1893,6 +1984,12 @@ dependencies = [
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4"
[[package]]
name = "ryu"
version = "1.0.15"
@ -1991,6 +2088,16 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_path_to_error"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
dependencies = [
"itoa",
"serde",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@ -2045,6 +2152,15 @@ dependencies = [
"digest",
]
[[package]]
name = "sharded-slab"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
dependencies = [
"lazy_static",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.1"
@ -2420,6 +2536,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "sync_wrapper"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]]
name = "tempfile"
version = "3.7.0"
@ -2473,6 +2595,16 @@ dependencies = [
"syn 2.0.27",
]
[[package]]
name = "thread_local"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "time"
version = "0.1.45"
@ -2574,6 +2706,28 @@ dependencies = [
"serde",
]
[[package]]
name = "tower"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"pin-project",
"pin-project-lite",
"tokio",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"
[[package]]
name = "tower-service"
version = "0.3.2"
@ -2611,6 +2765,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"nu-ansi-term",
"sharded-slab",
"smallvec",
"thread_local",
"tracing-core",
"tracing-log",
]
[[package]]
@ -2694,6 +2874,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "valuable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "vcpkg"
version = "0.2.15"
@ -2841,6 +3027,16 @@ dependencies = [
"rustls-webpki",
]
[[package]]
name = "whatlang"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
dependencies = [
"hashbrown 0.12.3",
"once_cell",
]
[[package]]
name = "whoami"
version = "1.4.1"

View File

@ -16,6 +16,8 @@ log = "0.4.0"
env_logger = "0.9.0"
chrono = { version = "0.4.26", features = [ "serde" ] }
sqlx = { version = "0.7", features = [ "runtime-tokio", "tls-rustls", "chrono", "macros", "sqlite" ] }
scraper = "0.17.1"
axum = { version = "0.6.19", features = [ "http2", "tracing" ] }
[profile.release]
strip = true

View File

@ -18,3 +18,15 @@
3. Install [sqlx-cli](https://lib.rs/crates/sqlx-cli)
4. Set the `DATABASE_URL` environment variable to `"sqlite:todos.db"`
5. Install the [just](https://just.systems/) command runner
### Parts
The [crawler](/crawler) is a self-contained executable that, when run, crawls through the URLs and inserts them into an **SQL** database.
The engine [core](/core) is a server that hosts an *API* to perform search queries on the database.
It creates a separate indexed table in the database, and has a search endpoint at `/api/search`.
It takes a `POST` request with a *JSON* body, and returns a *JSON* response.
The variables it takes are:
- `language`: an *ISO 639-3* lang code
- `include`: the primary search query
- `ignore`: keywords to ignore (optional)
- `option`: a `SearchType` (`Fuzzy`, `Regex`, `Sql`)

View File

@ -1,6 +1,6 @@
[package]
name = "core"
version = "0.1.0"
version = "0.2.0"
edition.workspace = true
authors.workspace = true
homepage.workspace = true
@ -14,3 +14,13 @@ sqlx.workspace = true
chrono.workspace = true
config.workspace = true
dirs.workspace = true
scraper.workspace = true
axum.workspace = true
url.workspace = true
whatlang = "0.16.2"
ammonia = "3"
tracing-subscriber = "0.3.17"
tracing = "0.1.37"
serde = "1.0.175"
serde_json = "1.0.103"
fuzzy-matcher = "0.3.7"

View File

@ -1,9 +1,215 @@
#[macro_use]
extern crate log;
use ammonia::clean;
use axum::{
body::Bytes,
extract::State,
http::StatusCode,
response::IntoResponse,
routing::{get, post},
Json, Router,
};
use chrono::{DateTime, NaiveDateTime, Utc};
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use sqlx::sqlite::SqlitePool;
use std::collections::BTreeMap;
use std::env;
use std::net::SocketAddr;
use std::sync::Arc;
use url::Url;
use whatlang::{detect_lang, Lang};
/// State shared with the request handlers.
///
/// `main` wraps one instance in an `Arc` and registers it on the router with
/// `with_state`; the `search` handler receives it through the `State` extractor.
struct AppState {
/// SQLite connection pool from which handlers acquire connections.
pool: SqlitePool,
}
#[tokio::main]
async fn main() {
env_logger::init();
tracing_subscriber::fmt::init();
info!("Hello, world!");
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
.await
.unwrap();
// update_index(&pool).await;
let shared_state = Arc::new(AppState { pool: pool });
let app = Router::new()
// `GET /` goes to `root`
.route("/", get(root))
.route("/api/search", get(search))
.with_state(shared_state);
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
tracing::debug!("listening on {}", addr);
axum::Server::bind(&addr)
.serve(app.into_make_service())
.await
.unwrap();
}
/// Handler for `GET /`; replies with a fixed plain-text greeting.
async fn root() -> &'static str {
    const GREETING: &str = "Hello, World!";
    GREETING
}
/// JSON request body accepted by the `search` handler.
#[derive(Deserialize)]
struct SearchQuery {
/// Language code; compared for equality against the `language` column of
/// `search_index` (the README describes it as an ISO 639-3 code).
language: String,
/// Primary search text; used as the pattern for fuzzy matching.
include: String,
/// Optional keywords to ignore. Not read by any code in this file yet.
ignore: Option<Vec<String>>,
/// Which search strategy to run.
option: SearchType,
}
/// Search strategy requested by the client.
///
/// Only `Fuzzy` is acted upon by the `search` handler in this file; the other
/// variants currently produce no results.
#[derive(Deserialize)]
enum SearchType {
/// Fuzzy-match the query against title, summary and URL.
Fuzzy,
/// Not implemented in this file.
Regex,
/// Not implemented in this file.
Sql,
}
/// A single search hit as serialized into the JSON response.
#[derive(Serialize)]
struct SearchResult {
/// Parsed URL of the indexed page.
url: Url,
/// Value of the `size` column for the page.
size: i64,
/// Value of the `title` column for the page.
title: String,
/// Value of the `summary` column for the page.
summary: String,
/// Time taken from the `last_updated` column, interpreted as Unix seconds in UTC.
last_updated: DateTime<Utc>,
}
/// Scores one indexed page against the query using fuzzy matching.
///
/// `query.include` is matched against the title, the summary and the URL.
/// The title score counts in full; the summary and URL scores each count
/// half (integer division). A field that does not match contributes nothing.
///
/// # Parameters
/// - `title`, `summary`, `url`: text fields of the indexed page.
/// - `last_updated`: Unix timestamp in seconds.
/// - `size`: passed through unchanged into the result.
/// - `query`: the search request; only `include` is read here.
///
/// # Returns
/// `Some((score, result))` when the combined score is greater than 5.
/// `None` when the score is too low, or when the row cannot be represented
/// as a [`SearchResult`] (timestamp out of range or unparsable URL) -- such a
/// row is skipped rather than aborting the whole request with a panic.
async fn fuzzy_search(
    title: &str,
    summary: &str,
    url: &str,
    last_updated: i64,
    size: i64,
    query: &SearchQuery,
) -> Option<(i64, SearchResult)> {
    // Combined scores at or below this value are treated as "no match".
    const MIN_SCORE: i64 = 5;

    let matcher = SkimMatcherV2::default();
    let pattern = query.include.as_str();

    let title_score = matcher.fuzzy_match(title, pattern).unwrap_or(0);
    let summary_score = matcher.fuzzy_match(summary, pattern).unwrap_or(0) / 2;
    let url_score = matcher.fuzzy_match(url, pattern).unwrap_or(0) / 2;
    let score = title_score + summary_score + url_score;

    if score <= MIN_SCORE {
        return None;
    }

    // Both conversions can fail on bad stored data; bail out with `None`.
    let naive = NaiveDateTime::from_timestamp_opt(last_updated, 0)?;
    let last_updated = DateTime::<Utc>::from_utc(naive, Utc);
    let url = Url::parse(url).ok()?;

    Some((
        score,
        SearchResult {
            url,
            size,
            title: title.to_string(),
            summary: summary.to_string(),
            last_updated,
        },
    ))
}
/// Handler for the `/api/search` endpoint.
///
/// Loads every `search_index` row whose `language` equals `query.language`,
/// ordered by `last_updated` ascending. For `SearchType::Fuzzy`, each row is
/// scored with [`fuzzy_search`] and the hits are returned as a JSON object
/// keyed by score. `SearchType::Regex` and `SearchType::Sql` are not
/// implemented yet and yield an empty object.
///
/// NOTE(review): the map is keyed by score, so rows with equal scores
/// overwrite one another and only the last one inserted is returned. Fixing
/// this requires changing the response shape (e.g. a sorted list).
///
/// NOTE(review): the query also selects `content` and `clicks`, which are not
/// used here; dropping them would avoid loading every page body per request.
///
/// # Panics
///
/// Panics if a database connection cannot be acquired or the query fails.
async fn search(
    State(state): State<Arc<AppState>>,
    Json(query): Json<SearchQuery>,
) -> Json<BTreeMap<i64, SearchResult>> {
    let mut conn = state
        .pool
        .acquire()
        .await
        .expect("failed to acquire a database connection");

    let rows = sqlx::query!(
        r#"
        SELECT title, summary, url, content, last_updated, clicks, size
        FROM search_index
        WHERE language = ?1
        ORDER BY last_updated
        "#,
        query.language
    )
    .fetch_all(&mut *conn)
    .await
    .expect("failed to query the search index");

    let mut results = BTreeMap::new();
    for row in rows {
        match query.option {
            SearchType::Fuzzy => {
                let hit = fuzzy_search(
                    &row.title,
                    &row.summary,
                    &row.url,
                    row.last_updated,
                    row.size,
                    &query,
                )
                .await;
                if let Some((score, result)) = hit {
                    results.insert(score, result);
                }
            }
            // Listed explicitly so that adding a variant forces a decision here.
            SearchType::Regex | SearchType::Sql => {}
        }
    }

    Json(results)
}
async fn update_index(pool: &SqlitePool) {
let mut conn = pool.acquire().await.unwrap();
let crawled = sqlx::query!(
r#"
SELECT last_fetched, url, body
FROM crawled_urls
ORDER BY last_fetched
"#
)
.fetch_all(&mut *conn)
.await
.unwrap();
for res in crawled {
let size = std::mem::size_of_val(&res.body) as u32;
let lang = detect_lang(&res.body).unwrap().code();
let document = Html::parse_document(&res.body);
let title_selector = Selector::parse("title").unwrap();
let title = match document.select(&title_selector).next() {
Some(v) => v.inner_html(),
None => res.url.clone(),
};
let desc_selector = Selector::parse("p").unwrap();
let summary = match document.select(&desc_selector).next() {
Some(v) => v.inner_html(),
None => String::new(),
};
let id = sqlx::query!(
r#"
REPLACE INTO search_index ( url, size, language, title, summary, content, last_updated )
VALUES ( ?1, ?2, ?3, ?4, ?5, ?6, ?7 )
"#,
res.url,
size,
lang,
title,
summary,
res.body,
res.last_fetched,
)
.execute(&mut *conn)
.await
.unwrap()
.last_insert_rowid();
}
}

View File

@ -15,9 +15,8 @@ log.workspace = true
env_logger.workspace = true
chrono.workspace = true
sqlx.workspace = true
scraper.workspace = true
serde = { version = "1.0.175", features = [ "derive" ] }
reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }
bincode = "1.3.3"
scraper = "0.17.1"
ammonia = "3"

View File

@ -2,7 +2,6 @@ mod settings;
#[macro_use]
extern crate log;
use ammonia::clean;
use chrono::prelude::*;
use chrono::Duration;
use scraper::{Html, Selector};
@ -177,7 +176,6 @@ async fn insert_db(pool: &SqlitePool, page: &Page) {
let url = page.url.clone().into_string();
let timestamp = page.last_fetched.clone().timestamp();
let body = page.body.clone();
let safe_html = clean(&*body);
let id = sqlx::query!(
r#"
@ -186,7 +184,7 @@ async fn insert_db(pool: &SqlitePool, page: &Page) {
"#,
timestamp,
url,
safe_html,
body,
)
.execute(&mut *conn)
.await

View File

@ -1,8 +1,7 @@
CREATE TABLE IF NOT EXISTS search_index
(
id INTEGER PRIMARY KEY NOT NULL,
url TEXT NOT NULL,
clicks INTEGER NOT NULL,
url TEXT PRIMARY KEY NOT NULL,
clicks INTEGER NOT NULL DEFAULT 0,
size INTEGER NOT NULL,
language TEXT NOT NULL,
title TEXT NOT NULL,