2023-07-25 19:24:14 +00:00
|
|
|
mod settings;
|
|
|
|
|
2023-07-25 20:11:09 +00:00
|
|
|
#[macro_use]
|
|
|
|
extern crate log;
|
2023-07-25 22:16:45 +00:00
|
|
|
use chrono::prelude::*;
|
2023-07-25 22:33:38 +00:00
|
|
|
use chrono::Duration;
|
2023-07-25 21:36:41 +00:00
|
|
|
use scraper::{Html, Selector};
|
2023-07-25 20:58:25 +00:00
|
|
|
use serde::{Deserialize, Serialize};
|
2023-07-25 19:24:14 +00:00
|
|
|
use settings::Settings;
|
2023-07-25 23:28:17 +00:00
|
|
|
use sqlx::sqlite::SqlitePool;
|
|
|
|
use std::env;
|
|
|
|
use std::fs::File;
|
2023-07-25 20:11:09 +00:00
|
|
|
use url::Url;
|
2023-07-25 19:24:14 +00:00
|
|
|
|
2023-07-25 20:58:25 +00:00
|
|
|
/// A single fetched web page, persisted to the on-disk cache as bincode.
#[derive(Debug, Deserialize, Serialize)]
struct Page {
    // Final URL of the response (after any redirects), taken from reqwest.
    url: Url,
    // HTTP status code of the response.
    status: u16,
    // When this page was fetched (UTC); displayed when listing the cache.
    last_fetched: DateTime<Utc>,
    // Raw response body text; parsed later for link extraction.
    body: String,
}
|
|
|
|
|
2023-07-25 18:52:19 +00:00
|
|
|
#[tokio::main]
|
|
|
|
async fn main() {
|
2023-07-25 20:11:09 +00:00
|
|
|
env_logger::init();
|
|
|
|
let settings = Settings::new().unwrap();
|
2023-07-25 19:24:14 +00:00
|
|
|
|
2023-07-25 21:36:41 +00:00
|
|
|
let mut to_crawl = settings.sitemap;
|
2023-07-25 22:16:45 +00:00
|
|
|
let mut crawled = 0;
|
2023-07-25 21:36:41 +00:00
|
|
|
|
|
|
|
loop {
|
|
|
|
let mut handles = Vec::new();
|
|
|
|
for url in to_crawl {
|
2023-07-25 22:16:45 +00:00
|
|
|
let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
|
2023-07-25 21:36:41 +00:00
|
|
|
handles.push(job);
|
2023-07-25 22:16:45 +00:00
|
|
|
crawled += 1;
|
2023-07-25 21:36:41 +00:00
|
|
|
}
|
|
|
|
let mut results = Vec::new();
|
|
|
|
for job in handles {
|
|
|
|
results.push(job.await);
|
|
|
|
}
|
|
|
|
|
|
|
|
to_crawl = Vec::new();
|
|
|
|
for res in results {
|
|
|
|
for t in res {
|
|
|
|
for url in t {
|
2023-07-25 22:16:45 +00:00
|
|
|
to_crawl.push(url);
|
2023-07-25 21:36:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if to_crawl == vec![] {
|
|
|
|
break;
|
|
|
|
}
|
2023-07-25 20:11:09 +00:00
|
|
|
}
|
2023-07-25 22:16:45 +00:00
|
|
|
|
|
|
|
info!("Succesfully crawled {} pages!", crawled);
|
|
|
|
|
2023-07-25 23:28:17 +00:00
|
|
|
insert_db().await.unwrap();
|
|
|
|
|
2023-07-25 22:16:45 +00:00
|
|
|
let mut cache_dir = dirs::cache_dir().unwrap();
|
|
|
|
cache_dir.push("ferret");
|
|
|
|
for i in cacache::list_sync(cache_dir.as_path()) {
|
|
|
|
let data = cacache::read(cache_dir.as_path(), &i.unwrap().key)
|
|
|
|
.await
|
|
|
|
.unwrap();
|
|
|
|
let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
|
|
|
info!(
|
|
|
|
"Found page: {} {}",
|
|
|
|
&decoded_page.url.as_str(),
|
|
|
|
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
|
|
|
|
);
|
|
|
|
}
|
2023-07-25 20:11:09 +00:00
|
|
|
}
|
|
|
|
|
2023-07-25 22:16:45 +00:00
|
|
|
async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
|
|
|
|
let mut cache_dir = dirs::cache_dir().unwrap();
|
|
|
|
cache_dir.push("ferret");
|
|
|
|
for i in cacache::list_sync(cache_dir.as_path()) {
|
|
|
|
match i {
|
|
|
|
Ok(_) => {
|
2023-07-25 22:33:38 +00:00
|
|
|
if i.as_ref().unwrap().key == url.clone().into_string() {
|
|
|
|
let now = Utc::now();
|
|
|
|
let timestamp = DateTime::<Utc>::from_utc(
|
|
|
|
NaiveDateTime::from_timestamp_opt(i.unwrap().time as i64, 0).unwrap(),
|
|
|
|
Utc,
|
|
|
|
);
|
|
|
|
let diff = now - timestamp;
|
|
|
|
if diff <= Duration::hours(1) {
|
|
|
|
error!("Already crawled {}", &url.as_str());
|
|
|
|
return vec![];
|
|
|
|
}
|
2023-07-25 22:16:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Err(e) => error!("{}", e),
|
|
|
|
}
|
2023-07-25 21:36:41 +00:00
|
|
|
}
|
|
|
|
|
2023-07-25 20:11:09 +00:00
|
|
|
let mut resp = reqwest::get(url).await;
|
|
|
|
match resp {
|
|
|
|
Ok(v) => {
|
2023-07-25 20:58:25 +00:00
|
|
|
let page = Page {
|
|
|
|
url: v.url().to_owned(),
|
|
|
|
status: v.status().as_u16(),
|
2023-07-25 22:16:45 +00:00
|
|
|
last_fetched: Utc::now(),
|
2023-07-25 20:58:25 +00:00
|
|
|
body: v.text().await.unwrap(),
|
|
|
|
};
|
|
|
|
|
|
|
|
if page.status >= 400 {
|
|
|
|
error!("{:?} Error for {}", &page.status, &page.url);
|
2023-07-25 21:36:41 +00:00
|
|
|
return vec![];
|
2023-07-25 20:58:25 +00:00
|
|
|
} else {
|
|
|
|
info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status);
|
|
|
|
|
|
|
|
let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
|
|
|
|
cacache::write(cache_dir.as_path(), &page.url.as_str(), encoded_page)
|
|
|
|
.await
|
|
|
|
.unwrap();
|
2023-07-25 20:11:09 +00:00
|
|
|
|
2023-07-25 22:16:45 +00:00
|
|
|
return find_links(&page.body, &page.url, allow).await;
|
2023-07-25 21:36:41 +00:00
|
|
|
|
|
|
|
// let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
|
|
|
|
// let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
2023-07-25 20:58:25 +00:00
|
|
|
}
|
2023-07-25 20:11:09 +00:00
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
error!("Could not get url: {}", e);
|
2023-07-25 21:36:41 +00:00
|
|
|
return vec![];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-25 22:16:45 +00:00
|
|
|
async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
|
2023-07-25 21:36:41 +00:00
|
|
|
let document = Html::parse_document(html);
|
|
|
|
let selector = Selector::parse("a").unwrap();
|
|
|
|
|
|
|
|
let mut links: Vec<Url> = Vec::default();
|
|
|
|
for element in document.select(&selector) {
|
|
|
|
let href = element.value().attr("href").unwrap();
|
|
|
|
let url = base.join(href).unwrap();
|
2023-07-25 22:16:45 +00:00
|
|
|
// if &url == base {
|
|
|
|
// break;
|
|
|
|
// }
|
2023-07-25 21:36:41 +00:00
|
|
|
for x in &allow {
|
|
|
|
if &x.domain().unwrap() == &url.domain().unwrap() {
|
|
|
|
links.push(url);
|
|
|
|
break;
|
|
|
|
}
|
2023-07-25 20:11:09 +00:00
|
|
|
}
|
|
|
|
}
|
2023-07-25 21:36:41 +00:00
|
|
|
return links;
|
2023-07-25 18:52:19 +00:00
|
|
|
}
|
2023-07-25 23:28:17 +00:00
|
|
|
|
|
|
|
async fn insert_db() -> Result<(), sqlx::Error> {
|
|
|
|
let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap()).await?;
|
|
|
|
let row: (i64,) = sqlx::query_as("SELECT $1")
|
|
|
|
.bind(150_i64)
|
|
|
|
.fetch_one(&pool)
|
|
|
|
.await?;
|
|
|
|
Ok(())
|
|
|
|
}
|