ferret/crawler/src/main.rs

mod settings;
#[macro_use]
extern crate log;
use ammonia::clean;
use chrono::prelude::*;
use chrono::Duration;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use settings::Settings;
use sqlx::sqlite::SqlitePool;
use std::env;
use url::Url;

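/// A crawled page as it is serialized into the local cache.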
#[derive(Debug, Deserialize, Serialize)]
struct Page {
    url: Url,
    status: u16,
    last_fetched: DateTime<Utc>,
    body: String,
}

#[tokio::main]
async fn main() {
    env_logger::init();
    let settings = Settings::new().unwrap();
    let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
        .await
        .unwrap();

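    // Breadth-first crawl: seed the frontier with the sitemap URLs and repeat
    // until a pass discovers no new links.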
    let mut to_crawl = settings.sitemap;
    let mut crawled = 0;
    loop {
        let mut handles = Vec::new();
        for url in to_crawl {
            let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
            handles.push(job);
            crawled += 1;
        }
        let mut results = Vec::new();
        for job in handles {
            results.push(job.await);
        }
        to_crawl = Vec::new();
        for res in results {
            // Each result is a Result<Vec<Url>, JoinError> from tokio::spawn.
            match res {
                Ok(urls) => {
                    for url in urls {
                        info!("pushing {}", url.as_str());
                        to_crawl.push(url);
                    }
                }
                Err(e) => error!("crawl task failed: {}", e),
            }
        }
        if to_crawl.is_empty() {
            break;
        }
    }
    info!("Successfully crawled {} pages!", crawled);
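    // Push everything that ended up in the on-disk cache into the database.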
    let mut cache_dir = dirs::cache_dir().unwrap();
    cache_dir.push("ferret");
    for entry in cacache::list_sync(cache_dir.as_path()) {
        let data = cacache::read(cache_dir.as_path(), &entry.unwrap().key)
            .await
            .unwrap();
        let decoded_page: Page = bincode::deserialize(&data).unwrap();
        insert_db(&pool, &decoded_page).await;
    }
}

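/// Fetch a single URL, cache the response, and return the allowlisted links
/// found on the page. Returns an empty list if the URL was already fetched
/// within the last hour, the request fails, or the server returns an error
/// status.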
async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
    let mut cache_dir = dirs::cache_dir().unwrap();
    cache_dir.push("ferret");
    for entry in cacache::list_sync(cache_dir.as_path()) {
        match entry {
            Ok(entry) => {
                if entry.key == url.as_str() {
                    let now = Utc::now();
                    // cacache records entry times in milliseconds since the Unix epoch.
                    let timestamp = DateTime::<Utc>::from_utc(
                        NaiveDateTime::from_timestamp_millis(entry.time as i64).unwrap(),
                        Utc,
                    );
                    if now - timestamp <= Duration::hours(1) {
                        info!("Already crawled {}", url.as_str());
                        return vec![];
                    }
                }
            }
            Err(e) => error!("{}", e),
        }
    }
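    // Not cached (or the cached copy is stale): fetch the page.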
    let resp = reqwest::get(url).await;
    match resp {
        Ok(v) => {
            let page = Page {
                url: v.url().to_owned(),
                status: v.status().as_u16(),
                last_fetched: Utc::now(),
                body: v.text().await.unwrap(),
            };
            if page.status >= 400 {
                error!("{} Error for {}", page.status, page.url);
                return vec![];
            }
            info!("Crawled {:?}: {:?}", page.url.as_str(), page.status);
            // Cache the serialized page under its URL for later passes.
            let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
            cacache::write(cache_dir.as_path(), page.url.as_str(), encoded_page)
                .await
                .unwrap();
            find_links(&page.body, &page.url, allow).await
        }
        Err(e) => {
            error!("Could not get url: {}", e);
            vec![]
        }
    }
}

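/// Parse `html`, resolve every anchor href against `base`, and return the
/// links whose domain appears on the allowlist. Links carrying a URL fragment
/// and common static assets are skipped.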
async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
    let document = Html::parse_document(html);
    let selector = Selector::parse("a").unwrap();
    let mut links: Vec<Url> = Vec::default();
    for element in document.select(&selector) {
        // Anchors without an href attribute are skipped.
        let href = match element.value().attr("href") {
            Some(h) => h,
            None => continue,
        };
        let url = match base.join(href) {
            Ok(u) => u,
            Err(_) => continue,
        };
        // Skip links that carry a URL fragment.
        if url.fragment().is_some() {
            continue;
        }
        // Skip common non-HTML assets.
        let path = url.path();
        let ignore_ends = [
            ".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
            ".css", ".json",
        ];
        if ignore_ends.iter().any(|ext| path.ends_with(ext)) {
            continue;
        }
        // Only keep links whose domain is on the allowlist.
        let allowed = url
            .domain()
            .map(|d| allow.iter().any(|a| a.domain() == Some(d)))
            .unwrap_or(false);
        if allowed {
            links.push(url);
        }
    }
    links
}

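/// Upsert a page into the `cached_urls` table, sanitizing its HTML body first.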
async fn insert_db(pool: &SqlitePool, page: &Page) {
    let mut conn = pool.acquire().await.unwrap();
    let url = page.url.to_string();
    let timestamp = page.last_fetched.timestamp();
    // Strip unsafe markup from the fetched HTML before it goes into the database.
    let safe_html = clean(&page.body);
    sqlx::query!(
        r#"
        REPLACE INTO cached_urls ( last_fetched, url, body )
        VALUES ( ?1, ?2, ?3 )
        "#,
        timestamp,
        url,
        safe_html,
    )
    .execute(&mut *conn)
    .await
    .unwrap();
}