Basic fuzzy search

main
~erin 2023-07-25 22:39:46 -04:00
parent 7165765c5c
commit 450e6092a0
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
4 changed files with 96 additions and 4 deletions

12
Cargo.lock generated
View File

@ -395,13 +395,16 @@ dependencies = [
"config", "config",
"dirs", "dirs",
"env_logger", "env_logger",
"fuzzy-matcher",
"log", "log",
"scraper", "scraper",
"serde", "serde",
"serde_json",
"sqlx", "sqlx",
"tokio", "tokio",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"url",
"whatlang", "whatlang",
] ]
@ -835,6 +838,15 @@ dependencies = [
"slab", "slab",
] ]
[[package]]
name = "fuzzy-matcher"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94"
dependencies = [
"thread_local",
]
[[package]] [[package]]
name = "fxhash" name = "fxhash"
version = "0.2.1" version = "0.2.1"

View File

@ -16,8 +16,11 @@ config.workspace = true
dirs.workspace = true dirs.workspace = true
scraper.workspace = true scraper.workspace = true
axum.workspace = true axum.workspace = true
url.workspace = true
whatlang = "0.16.2" whatlang = "0.16.2"
ammonia = "3" ammonia = "3"
tracing-subscriber = "0.3.17" tracing-subscriber = "0.3.17"
tracing = "0.1.37" tracing = "0.1.37"
serde = "1.0.175" serde = "1.0.175"
serde_json = "1.0.103"
fuzzy-matcher = "0.3.7"

View File

@ -3,18 +3,29 @@ extern crate log;
use ammonia::clean; use ammonia::clean;
use axum::{ use axum::{
body::Bytes,
extract::State,
http::StatusCode, http::StatusCode,
response::IntoResponse, response::IntoResponse,
routing::{get, post}, routing::{get, post},
Json, Router, Json, Router,
}; };
use chrono::{DateTime, NaiveDateTime, Utc};
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sqlx::sqlite::SqlitePool; use sqlx::sqlite::SqlitePool;
use std::env; use std::env;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc;
use url::Url;
use whatlang::{detect_lang, Lang}; use whatlang::{detect_lang, Lang};
/// Application state shared with the request handlers.
///
/// `main` wraps one instance in an `Arc` and registers it on the router via
/// `with_state`; handlers receive it through the `State` extractor.
struct AppState {
/// SQLite connection pool that handlers acquire connections from.
pool: SqlitePool,
}
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
tracing_subscriber::fmt::init(); tracing_subscriber::fmt::init();
@ -22,12 +33,17 @@ async fn main() {
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap()) let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
.await .await
.unwrap(); .unwrap();
// update_index(&pool).await;
let shared_state = Arc::new(AppState { pool: pool });
let app = Router::new() let app = Router::new()
// `GET /` goes to `root` // `GET /` goes to `root`
.route("/", get(root)); .route("/", get(root))
.route("/api/search", get(search))
.with_state(shared_state);
let addr = SocketAddr::from(([127, 0, 0, 1], 3000)); let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
tracing::debug!("listening on {}", addr); tracing::debug!("listening on {}", addr);
axum::Server::bind(&addr) axum::Server::bind(&addr)
.serve(app.into_make_service()) .serve(app.into_make_service())
.await .await
@ -38,6 +54,68 @@ async fn root() -> &'static str {
"Hello, World!" "Hello, World!"
} }
/// Request payload deserialized by the `search` handler.
#[derive(Deserialize)]
struct SearchQuery {
/// Value compared for equality against the `language` column of `search_index`.
language: String,
/// Fuzzy pattern that `search` matches against each row's title, summary and url.
include: String,
/// Optional list of strings; deserialized but not read by `search` as written.
/// NOTE(review): presumably terms to exclude from the results — confirm intent.
ignore: Option<Vec<String>>,
}
/// One matching page, serialized as an element of the `search` response array.
#[derive(Serialize)]
struct SearchResult {
/// Page address, parsed from the row's `url` column.
url: Url,
/// Copied from the row's `size` column.
/// NOTE(review): unit is not visible here (presumably bytes) — confirm.
size: i64,
/// Copied from the row's `title` column.
title: String,
/// Copied from the row's `summary` column.
summary: String,
/// The row's `last_updated` value, interpreted by `search` as whole seconds
/// since the Unix epoch and converted to a UTC date-time.
last_updated: DateTime<Utc>,
}
async fn search(
State(state): State<Arc<AppState>>,
Json(query): Json<SearchQuery>,
) -> Json<Vec<SearchResult>> {
let mut conn = state.pool.acquire().await.unwrap();
let list = sqlx::query!(
r#"
SELECT title, summary, url, content, last_updated, clicks, size
FROM search_index
WHERE language = ?1
ORDER BY last_updated
"#,
query.language
)
.fetch_all(&mut *conn)
.await
.unwrap();
let mut results = Vec::new();
let matcher = SkimMatcherV2::default();
for res in list {
let mut is_match = false;
if matcher.fuzzy_match(&res.title, &query.include).is_some() {
is_match = true;
} else if matcher.fuzzy_match(&res.summary, &query.include).is_some() {
is_match = true;
} else if matcher.fuzzy_match(&res.url, &query.include).is_some() {
is_match = true;
}
if is_match {
let timestamp = DateTime::<Utc>::from_utc(
NaiveDateTime::from_timestamp_opt(res.last_updated, 0).unwrap(),
Utc,
);
results.push(SearchResult {
url: Url::parse(&res.url).unwrap(),
size: res.size,
title: res.title,
summary: res.summary,
last_updated: timestamp,
});
}
}
return Json(results);
}
async fn update_index(pool: &SqlitePool) { async fn update_index(pool: &SqlitePool) {
let mut conn = pool.acquire().await.unwrap(); let mut conn = pool.acquire().await.unwrap();
let crawled = sqlx::query!( let crawled = sqlx::query!(
@ -47,7 +125,7 @@ async fn update_index(pool: &SqlitePool) {
ORDER BY last_fetched ORDER BY last_fetched
"# "#
) )
.fetch_all(pool) .fetch_all(&mut *conn)
.await .await
.unwrap(); .unwrap();

View File

@ -1,7 +1,6 @@
CREATE TABLE IF NOT EXISTS search_index CREATE TABLE IF NOT EXISTS search_index
( (
id INTEGER PRIMARY KEY NOT NULL, url TEXT PRIMARY KEY NOT NULL,
url TEXT NOT NULL,
clicks INTEGER NOT NULL DEFAULT 0, clicks INTEGER NOT NULL DEFAULT 0,
size INTEGER NOT NULL, size INTEGER NOT NULL,
language TEXT NOT NULL, language TEXT NOT NULL,