ferret/crawler/src/main.rs

mod settings;
#[macro_use]
extern crate log;
use ammonia::clean;
use chrono::prelude::*;
use chrono::Duration;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use settings::Settings;
use sqlx::sqlite::SqlitePool;
use std::env;
use url::Url;
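
/// A crawled page as it is serialized (via bincode) into the on-disk cache and
/// later written to the SQLite database.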
#[derive(Debug, Deserialize, Serialize)]
struct Page {
    url: Url,
    status: u16,
    last_fetched: DateTime<Utc>,
    body: String,
}
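
// Entry point: crawls the current frontier of URLs (seeded from the configured
// sitemap) concurrently, feeds newly discovered links back in as the next
// frontier, and stops once a pass turns up nothing new. The cached pages are
// then read back and inserted into the database.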
#[tokio::main]
async fn main() {
    env_logger::init();
    let settings = Settings::new().unwrap();
    let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
        .await
        .unwrap();
    // Seed the crawl frontier with the sitemap URLs from the settings file.
    let mut to_crawl = settings.sitemap;
    let mut crawled = 0;
    loop {
        // Spawn one crawl task per URL in the current frontier.
        let mut handles = Vec::new();
        for url in to_crawl {
            let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
            handles.push(job);
            crawled += 1;
        }
        let mut results = Vec::new();
        for job in handles {
            results.push(job.await);
        }
        // Gather the links each task discovered into the next frontier;
        // iterating over a Result skips tasks that failed to join.
        to_crawl = Vec::new();
        for res in results {
            for urls in res {
                for url in urls {
                    info!("pushing {}", url.as_str());
                    to_crawl.push(url);
                }
            }
        }
        if to_crawl.is_empty() {
            break;
        }
    }
info!("Succesfully crawled {} pages!", crawled);
let mut cache_dir = dirs::cache_dir().unwrap();
cache_dir.push("ferret");
for i in cacache::list_sync(cache_dir.as_path()) {
let data = cacache::read(cache_dir.as_path(), &i.unwrap().key)
.await
.unwrap();
let decoded_page: Page = bincode::deserialize(&data).unwrap();
insert_db(&pool, &decoded_page).await;
}
}
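
/// Fetches a single URL, unless the cache shows it was already fetched within
/// the last hour, caches the response body, and returns the allowlisted links
/// found on the page. Returns an empty Vec for cache hits, HTTP errors
/// (status >= 400), and request failures.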
async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
    let mut cache_dir = dirs::cache_dir().unwrap();
    cache_dir.push("ferret");
    // Skip this URL if the cache index shows it was fetched less than an hour ago.
    for i in cacache::list_sync(cache_dir.as_path()) {
        match i {
            Ok(entry) => {
                if entry.key == url.as_str() {
                    let now = Utc::now();
                    let timestamp = DateTime::<Utc>::from_utc(
                        NaiveDateTime::from_timestamp_opt(entry.time as i64, 0).unwrap(),
                        Utc,
                    );
                    let diff = now - timestamp;
                    if diff <= Duration::hours(1) {
                        error!("Already crawled {}", url.as_str());
                        return vec![];
                    }
                }
            }
            Err(e) => error!("{}", e),
        }
    }
    let resp = reqwest::get(url).await;
    match resp {
        Ok(v) => {
            let page = Page {
                url: v.url().to_owned(),
                status: v.status().as_u16(),
                last_fetched: Utc::now(),
                body: v.text().await.unwrap(),
            };
            if page.status >= 400 {
                error!("{:?} Error for {}", &page.status, &page.url);
                return vec![];
            } else {
                info!("Crawled {:?}: {:?}", page.url.as_str(), page.status);
                // Persist the page to the on-disk cache, then queue its outgoing links.
                let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
                cacache::write(cache_dir.as_path(), page.url.as_str(), encoded_page)
                    .await
                    .unwrap();
                return find_links(&page.body, &page.url, allow).await;
                // let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
                // let decoded_page: Page = bincode::deserialize(&data).unwrap();
            }
        }
        Err(e) => {
            error!("Could not get url: {}", e);
            return vec![];
        }
    }
}
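
/// Extracts `<a href>` targets from `html`, resolves them against `base`, and
/// keeps only links that have no fragment, do not point at static assets, and
/// whose domain appears in the allowlist.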
async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
    let document = Html::parse_document(html);
    let selector = Selector::parse("a").unwrap();
    let mut links: Vec<Url> = Vec::default();
    for element in document.select(&selector) {
        // Anchors without an href attribute carry nothing to follow.
        let href = match element.value().attr("href") {
            Some(h) => h,
            None => continue,
        };
        let url = base.join(href).unwrap();
        // Skip in-page fragment links.
        if url.fragment().is_some() {
            continue;
        }
        // Skip links to static assets and downloads that are not worth indexing.
        let path = url.path();
        let ignore_ends = vec![
            ".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
            ".css", ".json",
        ];
        let mut skip = false;
        for ext in ignore_ends {
            if path.ends_with(ext) {
                skip = true;
            }
        }
        if skip {
            continue;
        }
        // if !path.ends_with(".html") || !path.ends_with("/") {
        //     continue;
        // }
        // Keep only links whose domain appears in the allowlist.
        for allowed in &allow {
            if let Some(domain) = url.domain() {
                if allowed.domain().unwrap() == domain {
                    links.push(url);
                    break;
                }
            }
        }
    }
    links
}
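
/// Sanitizes the page body with ammonia and upserts it into the `cached_urls`
/// table using SQLite's REPLACE.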
async fn insert_db(pool: &SqlitePool, page: &Page) {
    let mut conn = pool.acquire().await.unwrap();
    let url = page.url.to_string();
    let timestamp = page.last_fetched.timestamp();
    // Sanitize the raw HTML with ammonia before it is stored.
    let safe_html = clean(&page.body);
    sqlx::query!(
        r#"
        REPLACE INTO cached_urls ( last_fetched, url, body )
        VALUES ( ?1, ?2, ?3 )
        "#,
        timestamp,
        url,
        safe_html,
    )
    .execute(&mut *conn)
    .await
    .unwrap();
}
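
// Note: the query above assumes a `cached_urls` table with at least the columns
// ( last_fetched, url, body ) and a uniqueness constraint on `url`, so that
// REPLACE acts as an upsert rather than inserting duplicates; the actual schema
// lives in the project's SQL migrations, which are not part of this file.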