Basic fuzzy search

branch main
author ~erin 2023-07-25 22:39:46 -04:00
parent 7165765c5c
commit 450e6092a0
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
4 changed files with 96 additions and 4 deletions

Cargo.lock (generated)

@@ -395,13 +395,16 @@ dependencies = [
"config",
"dirs",
"env_logger",
"fuzzy-matcher",
"log",
"scraper",
"serde",
"serde_json",
"sqlx",
"tokio",
"tracing",
"tracing-subscriber",
"url",
"whatlang",
]
@@ -835,6 +838,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fuzzy-matcher"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94"
dependencies = [
"thread_local",
]
[[package]]
name = "fxhash"
version = "0.2.1"

Cargo.toml

@@ -16,8 +16,11 @@ config.workspace = true
dirs.workspace = true
scraper.workspace = true
axum.workspace = true
url.workspace = true
whatlang = "0.16.2"
ammonia = "3"
tracing-subscriber = "0.3.17"
tracing = "0.1.37"
serde = "1.0.175"
serde_json = "1.0.103"
fuzzy-matcher = "0.3.7"
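
The only new dependency here is fuzzy-matcher 0.3.7; its SkimMatcherV2 scorer is what the search handler below calls. A minimal standalone sketch of the API this commit relies on, with made-up candidate and pattern strings:

use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;

fn main() {
    let matcher = SkimMatcherV2::default();

    // fuzzy_match returns Some(score) when every pattern character appears,
    // in order but not necessarily contiguously, in the candidate; None otherwise.
    assert!(matcher.fuzzy_match("Basic fuzzy search", "fzsrch").is_some());
    assert!(matcher.fuzzy_match("Basic fuzzy search", "zzz").is_none());

    // fuzzy_indices additionally reports which character positions matched,
    // which is useful for highlighting hits.
    if let Some((score, positions)) = matcher.fuzzy_indices("search_index", "srchidx") {
        println!("score {score}, matched positions {positions:?}");
    }
}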

src/main.rs

@@ -3,18 +3,29 @@ extern crate log;
use ammonia::clean;
use axum::{
body::Bytes,
extract::State,
http::StatusCode,
response::IntoResponse,
routing::{get, post},
Json, Router,
};
use chrono::{DateTime, NaiveDateTime, Utc};
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use sqlx::sqlite::SqlitePool;
use std::env;
use std::net::SocketAddr;
use std::sync::Arc;
use url::Url;
use whatlang::{detect_lang, Lang};
struct AppState {
pool: SqlitePool,
}
#[tokio::main]
async fn main() {
tracing_subscriber::fmt::init();
@@ -22,12 +33,17 @@ async fn main() {
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
.await
.unwrap();
// update_index(&pool).await;
let shared_state = Arc::new(AppState { pool: pool });
let app = Router::new()
// `GET /` goes to `root`
.route("/", get(root));
.route("/", get(root))
.route("/api/search", get(search))
.with_state(shared_state);
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
tracing::debug!("listening on {}", addr);
axum::Server::bind(&addr)
.serve(app.into_make_service())
.await
@@ -38,6 +54,68 @@ async fn root() -> &'static str {
"Hello, World!"
}
#[derive(Deserialize)]
struct SearchQuery {
language: String,
include: String,
ignore: Option<Vec<String>>,
}
#[derive(Serialize)]
struct SearchResult {
url: Url,
size: i64,
title: String,
summary: String,
last_updated: DateTime<Utc>,
}
async fn search(
State(state): State<Arc<AppState>>,
Json(query): Json<SearchQuery>,
) -> Json<Vec<SearchResult>> {
let mut conn = state.pool.acquire().await.unwrap();
let list = sqlx::query!(
r#"
SELECT title, summary, url, content, last_updated, clicks, size
FROM search_index
WHERE language = ?1
ORDER BY last_updated
"#,
query.language
)
.fetch_all(&mut *conn)
.await
.unwrap();
let mut results = Vec::new();
let matcher = SkimMatcherV2::default();
for res in list {
let mut is_match = false;
if matcher.fuzzy_match(&res.title, &query.include).is_some() {
is_match = true;
} else if matcher.fuzzy_match(&res.summary, &query.include).is_some() {
is_match = true;
} else if matcher.fuzzy_match(&res.url, &query.include).is_some() {
is_match = true;
}
if is_match {
let timestamp = DateTime::<Utc>::from_utc(
NaiveDateTime::from_timestamp_opt(res.last_updated, 0).unwrap(),
Utc,
);
results.push(SearchResult {
url: Url::parse(&res.url).unwrap(),
size: res.size,
title: res.title,
summary: res.summary,
last_updated: timestamp,
});
}
}
return Json(results);
}
async fn update_index(pool: &SqlitePool) {
let mut conn = pool.acquire().await.unwrap();
let crawled = sqlx::query!(
@@ -47,7 +125,7 @@ async fn update_index(pool: &SqlitePool) {
ORDER BY last_fetched
"#
)
.fetch_all(pool)
.fetch_all(&mut *conn)
.await
.unwrap();
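
For reference: the new search handler above is registered with get but reads its parameters from a JSON body through the Json<SearchQuery> extractor, so a client must send a body even on GET /api/search. A hedged sketch of that body, built with serde_json (already a dependency); the field values are illustrative, not taken from the project:

use serde_json::json;

fn main() {
    // Shape expected by Json<SearchQuery>: "language" is compared against
    // search_index.language, "include" is the pattern handed to
    // SkimMatcherV2::fuzzy_match, and "ignore" is accepted as Option<Vec<String>>
    // but not referenced by the handler shown in this diff.
    let body = json!({
        "language": "eng",   // illustrative; depends on how languages are stored by update_index
        "include": "fuzzy search",
        "ignore": null
    });
    println!("{body}");

    // Each element of the Json<Vec<SearchResult>> response serializes roughly as:
    // {"url": "...", "size": 123, "title": "...", "summary": "...",
    //  "last_updated": "2023-07-25T22:39:46Z"}
}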

search_index schema (SQL)

@@ -1,7 +1,6 @@
CREATE TABLE IF NOT EXISTS search_index
(
id INTEGER PRIMARY KEY NOT NULL,
url TEXT NOT NULL,
url TEXT PRIMARY KEY NOT NULL,
clicks INTEGER NOT NULL DEFAULT 0,
size INTEGER NOT NULL,
language TEXT NOT NULL,