Basic fuzzy search
parent
7165765c5c
commit
450e6092a0
|
@ -395,13 +395,16 @@ dependencies = [
|
|||
"config",
|
||||
"dirs",
|
||||
"env_logger",
|
||||
"fuzzy-matcher",
|
||||
"log",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sqlx",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"whatlang",
|
||||
]
|
||||
|
||||
|
@ -835,6 +838,15 @@ dependencies = [
|
|||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fuzzy-matcher"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94"
|
||||
dependencies = [
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
|
|
|
@ -16,8 +16,11 @@ config.workspace = true
|
|||
dirs.workspace = true
|
||||
scraper.workspace = true
|
||||
axum.workspace = true
|
||||
url.workspace = true
|
||||
whatlang = "0.16.2"
|
||||
ammonia = "3"
|
||||
tracing-subscriber = "0.3.17"
|
||||
tracing = "0.1.37"
|
||||
serde = "1.0.175"
|
||||
serde_json = "1.0.103"
|
||||
fuzzy-matcher = "0.3.7"
|
||||
|
|
|
@ -3,18 +3,29 @@ extern crate log;
|
|||
|
||||
use ammonia::clean;
|
||||
use axum::{
|
||||
body::Bytes,
|
||||
extract::State,
|
||||
http::StatusCode,
|
||||
response::IntoResponse,
|
||||
routing::{get, post},
|
||||
Json, Router,
|
||||
};
|
||||
use chrono::{DateTime, NaiveDateTime, Utc};
|
||||
use fuzzy_matcher::skim::SkimMatcherV2;
|
||||
use fuzzy_matcher::FuzzyMatcher;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::sqlite::SqlitePool;
|
||||
use std::env;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use url::Url;
|
||||
use whatlang::{detect_lang, Lang};
|
||||
|
||||
struct AppState {
|
||||
pool: SqlitePool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
@ -22,12 +33,17 @@ async fn main() {
|
|||
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
// update_index(&pool).await;
|
||||
|
||||
let shared_state = Arc::new(AppState { pool: pool });
|
||||
let app = Router::new()
|
||||
// `GET /` goes to `root`
|
||||
.route("/", get(root));
|
||||
.route("/", get(root))
|
||||
.route("/api/search", get(search))
|
||||
.with_state(shared_state);
|
||||
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
|
||||
tracing::debug!("listening on {}", addr);
|
||||
|
||||
axum::Server::bind(&addr)
|
||||
.serve(app.into_make_service())
|
||||
.await
|
||||
|
@ -38,6 +54,68 @@ async fn root() -> &'static str {
|
|||
"Hello, World!"
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct SearchQuery {
|
||||
language: String,
|
||||
include: String,
|
||||
ignore: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SearchResult {
|
||||
url: Url,
|
||||
size: i64,
|
||||
title: String,
|
||||
summary: String,
|
||||
last_updated: DateTime<Utc>,
|
||||
}
|
||||
|
||||
async fn search(
|
||||
State(state): State<Arc<AppState>>,
|
||||
Json(query): Json<SearchQuery>,
|
||||
) -> Json<Vec<SearchResult>> {
|
||||
let mut conn = state.pool.acquire().await.unwrap();
|
||||
let list = sqlx::query!(
|
||||
r#"
|
||||
SELECT title, summary, url, content, last_updated, clicks, size
|
||||
FROM search_index
|
||||
WHERE language = ?1
|
||||
ORDER BY last_updated
|
||||
"#,
|
||||
query.language
|
||||
)
|
||||
.fetch_all(&mut *conn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut results = Vec::new();
|
||||
let matcher = SkimMatcherV2::default();
|
||||
for res in list {
|
||||
let mut is_match = false;
|
||||
if matcher.fuzzy_match(&res.title, &query.include).is_some() {
|
||||
is_match = true;
|
||||
} else if matcher.fuzzy_match(&res.summary, &query.include).is_some() {
|
||||
is_match = true;
|
||||
} else if matcher.fuzzy_match(&res.url, &query.include).is_some() {
|
||||
is_match = true;
|
||||
}
|
||||
if is_match {
|
||||
let timestamp = DateTime::<Utc>::from_utc(
|
||||
NaiveDateTime::from_timestamp_opt(res.last_updated, 0).unwrap(),
|
||||
Utc,
|
||||
);
|
||||
results.push(SearchResult {
|
||||
url: Url::parse(&res.url).unwrap(),
|
||||
size: res.size,
|
||||
title: res.title,
|
||||
summary: res.summary,
|
||||
last_updated: timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
return Json(results);
|
||||
}
|
||||
|
||||
async fn update_index(pool: &SqlitePool) {
|
||||
let mut conn = pool.acquire().await.unwrap();
|
||||
let crawled = sqlx::query!(
|
||||
|
@ -47,7 +125,7 @@ async fn update_index(pool: &SqlitePool) {
|
|||
ORDER BY last_fetched
|
||||
"#
|
||||
)
|
||||
.fetch_all(pool)
|
||||
.fetch_all(&mut *conn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
CREATE TABLE IF NOT EXISTS search_index
|
||||
(
|
||||
id INTEGER PRIMARY KEY NOT NULL,
|
||||
url TEXT NOT NULL,
|
||||
url TEXT PRIMARY KEY NOT NULL,
|
||||
clicks INTEGER NOT NULL DEFAULT 0,
|
||||
size INTEGER NOT NULL,
|
||||
language TEXT NOT NULL,
|
||||
|
|
Loading…
Reference in New Issue