168 lines
4.2 KiB
Rust
168 lines
4.2 KiB
Rust
#[macro_use]
|
|
extern crate log;
|
|
|
|
use ammonia::clean;
|
|
use axum::{
|
|
body::Bytes,
|
|
extract::State,
|
|
http::StatusCode,
|
|
response::IntoResponse,
|
|
routing::{get, post},
|
|
Json, Router,
|
|
};
|
|
use chrono::{DateTime, NaiveDateTime, Utc};
|
|
use fuzzy_matcher::skim::SkimMatcherV2;
|
|
use fuzzy_matcher::FuzzyMatcher;
|
|
use scraper::{Html, Selector};
|
|
use serde::{Deserialize, Serialize};
|
|
use sqlx::sqlite::SqlitePool;
|
|
use std::env;
|
|
use std::net::SocketAddr;
|
|
use std::sync::Arc;
|
|
use url::Url;
|
|
use whatlang::{detect_lang, Lang};
|
|
|
|
struct AppState {
|
|
pool: SqlitePool,
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() {
|
|
tracing_subscriber::fmt::init();
|
|
|
|
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
|
|
.await
|
|
.unwrap();
|
|
// update_index(&pool).await;
|
|
|
|
let shared_state = Arc::new(AppState { pool: pool });
|
|
let app = Router::new()
|
|
// `GET /` goes to `root`
|
|
.route("/", get(root))
|
|
.route("/api/search", get(search))
|
|
.with_state(shared_state);
|
|
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
|
|
tracing::debug!("listening on {}", addr);
|
|
|
|
axum::Server::bind(&addr)
|
|
.serve(app.into_make_service())
|
|
.await
|
|
.unwrap();
|
|
}
|
|
|
|
/// Handler for `GET /`: replies with a fixed plain-text greeting.
async fn root() -> &'static str {
    const GREETING: &str = "Hello, World!";
    GREETING
}
|
|
|
|
#[derive(Deserialize)]
|
|
struct SearchQuery {
|
|
language: String,
|
|
include: String,
|
|
ignore: Option<Vec<String>>,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct SearchResult {
|
|
url: Url,
|
|
size: i64,
|
|
title: String,
|
|
summary: String,
|
|
last_updated: DateTime<Utc>,
|
|
}
|
|
|
|
async fn search(
|
|
State(state): State<Arc<AppState>>,
|
|
Json(query): Json<SearchQuery>,
|
|
) -> Json<Vec<SearchResult>> {
|
|
let mut conn = state.pool.acquire().await.unwrap();
|
|
let list = sqlx::query!(
|
|
r#"
|
|
SELECT title, summary, url, content, last_updated, clicks, size
|
|
FROM search_index
|
|
WHERE language = ?1
|
|
ORDER BY last_updated
|
|
"#,
|
|
query.language
|
|
)
|
|
.fetch_all(&mut *conn)
|
|
.await
|
|
.unwrap();
|
|
|
|
let mut results = Vec::new();
|
|
let matcher = SkimMatcherV2::default();
|
|
for res in list {
|
|
let mut is_match = false;
|
|
if matcher.fuzzy_match(&res.title, &query.include).is_some() {
|
|
is_match = true;
|
|
} else if matcher.fuzzy_match(&res.summary, &query.include).is_some() {
|
|
is_match = true;
|
|
} else if matcher.fuzzy_match(&res.url, &query.include).is_some() {
|
|
is_match = true;
|
|
}
|
|
if is_match {
|
|
let timestamp = DateTime::<Utc>::from_utc(
|
|
NaiveDateTime::from_timestamp_opt(res.last_updated, 0).unwrap(),
|
|
Utc,
|
|
);
|
|
results.push(SearchResult {
|
|
url: Url::parse(&res.url).unwrap(),
|
|
size: res.size,
|
|
title: res.title,
|
|
summary: res.summary,
|
|
last_updated: timestamp,
|
|
});
|
|
}
|
|
}
|
|
return Json(results);
|
|
}
|
|
|
|
async fn update_index(pool: &SqlitePool) {
|
|
let mut conn = pool.acquire().await.unwrap();
|
|
let crawled = sqlx::query!(
|
|
r#"
|
|
SELECT last_fetched, url, body
|
|
FROM crawled_urls
|
|
ORDER BY last_fetched
|
|
"#
|
|
)
|
|
.fetch_all(&mut *conn)
|
|
.await
|
|
.unwrap();
|
|
|
|
for res in crawled {
|
|
let size = std::mem::size_of_val(&res.body) as u32;
|
|
let lang = detect_lang(&res.body).unwrap().code();
|
|
let document = Html::parse_document(&res.body);
|
|
|
|
let title_selector = Selector::parse("title").unwrap();
|
|
let title = match document.select(&title_selector).next() {
|
|
Some(v) => v.inner_html(),
|
|
None => res.url.clone(),
|
|
};
|
|
|
|
let desc_selector = Selector::parse("p").unwrap();
|
|
let summary = match document.select(&desc_selector).next() {
|
|
Some(v) => v.inner_html(),
|
|
None => String::new(),
|
|
};
|
|
|
|
let id = sqlx::query!(
|
|
r#"
|
|
REPLACE INTO search_index ( url, size, language, title, summary, content, last_updated )
|
|
VALUES ( ?1, ?2, ?3, ?4, ?5, ?6, ?7 )
|
|
"#,
|
|
res.url,
|
|
size,
|
|
lang,
|
|
title,
|
|
summary,
|
|
res.body,
|
|
res.last_fetched,
|
|
)
|
|
.execute(&mut *conn)
|
|
.await
|
|
.unwrap()
|
|
.last_insert_rowid();
|
|
}
|
|
}
|