ferret/crawler/src/main.rs

mod settings;
#[macro_use]
extern crate log;
use ammonia::clean;
use chrono::prelude::*;
use chrono::Duration;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use settings::Settings;
use sqlx::sqlite::SqlitePool;
use std::env;
use url::Url;

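/// A crawled page as it is serialized into the local cache.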
#[derive(Debug, Deserialize, Serialize)]
struct Page {
    url: Url,
    status: u16,
    last_fetched: DateTime<Utc>,
    body: String,
}

#[tokio::main]
async fn main() {
    env_logger::init();
    let settings = Settings::new().unwrap();
    let pool = SqlitePool::connect(&env::var("DATABASE_URL").unwrap())
        .await
        .unwrap();

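    // Breadth-first crawl: seed the frontier with the sitemap URLs and repeat
    // until a pass discovers no new links.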
    let mut to_crawl = settings.sitemap;
    let mut crawled = 0;
    loop {
        let mut handles = Vec::new();
        for url in to_crawl {
            let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
            handles.push(job);
            crawled += 1;
        }
        let mut results = Vec::new();
        for job in handles {
            results.push(job.await);
        }
        to_crawl = Vec::new();
        for res in results {
            // Each result is a Result<Vec<Url>, JoinError> from tokio::spawn.
            match res {
                Ok(urls) => {
                    for url in urls {
                        info!("pushing {}", url.as_str());
                        to_crawl.push(url);
                    }
                }
                Err(e) => error!("crawl task failed: {}", e),
            }
        }
        if to_crawl.is_empty() {
            break;
        }
    }
    info!("Successfully crawled {} pages!", crawled);
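    // Push everything that ended up in the on-disk cache into the database.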
    let mut cache_dir = dirs::cache_dir().unwrap();
    cache_dir.push("ferret");
    for entry in cacache::list_sync(cache_dir.as_path()) {
        let data = cacache::read(cache_dir.as_path(), &entry.unwrap().key)
            .await
            .unwrap();
        let decoded_page: Page = bincode::deserialize(&data).unwrap();
        insert_db(&pool, &decoded_page).await;
    }
}

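/// Fetch a single URL, cache the response, and return the allowlisted links
/// found on the page. Returns an empty list if the URL was already fetched
/// within the last hour, the request fails, or the server returns an error
/// status.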
async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
    let mut cache_dir = dirs::cache_dir().unwrap();
    cache_dir.push("ferret");
    for entry in cacache::list_sync(cache_dir.as_path()) {
        match entry {
            Ok(entry) => {
                if entry.key == url.as_str() {
                    let now = Utc::now();
                    // cacache records entry times in milliseconds since the Unix epoch.
                    let timestamp = DateTime::<Utc>::from_utc(
                        NaiveDateTime::from_timestamp_millis(entry.time as i64).unwrap(),
                        Utc,
                    );
                    if now - timestamp <= Duration::hours(1) {
                        info!("Already crawled {}", url.as_str());
                        return vec![];
                    }
                }
            }
            Err(e) => error!("{}", e),
        }
    }
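    // Not cached (or the cached copy is stale): fetch the page.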
    let resp = reqwest::get(url).await;
    match resp {
        Ok(v) => {
            let page = Page {
                url: v.url().to_owned(),
                status: v.status().as_u16(),
                last_fetched: Utc::now(),
                body: v.text().await.unwrap(),
            };
            if page.status >= 400 {
                error!("{} Error for {}", page.status, page.url);
                return vec![];
            }
            info!("Crawled {:?}: {:?}", page.url.as_str(), page.status);
            // Cache the serialized page under its URL for later passes.
            let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
            cacache::write(cache_dir.as_path(), page.url.as_str(), encoded_page)
                .await
                .unwrap();
            find_links(&page.body, &page.url, allow).await
        }
        Err(e) => {
            error!("Could not get url: {}", e);
            vec![]
        }
    }
}

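/// Parse `html`, resolve every anchor href against `base`, and return the
/// links whose domain appears on the allowlist. Links carrying a URL fragment
/// and common static assets are skipped.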
async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
    let document = Html::parse_document(html);
    let selector = Selector::parse("a").unwrap();
    let mut links: Vec<Url> = Vec::default();
    for element in document.select(&selector) {
        // Anchors without an href attribute are skipped.
        let href = match element.value().attr("href") {
            Some(h) => h,
            None => continue,
        };
        let url = match base.join(href) {
            Ok(u) => u,
            Err(_) => continue,
        };
        // Skip links that carry a URL fragment.
        if url.fragment().is_some() {
            continue;
        }
        // Skip common non-HTML assets.
        let path = url.path();
        let ignore_ends = [
            ".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
            ".css", ".json",
        ];
        if ignore_ends.iter().any(|ext| path.ends_with(ext)) {
            continue;
        }
        // Only keep links whose domain is on the allowlist.
        let allowed = url
            .domain()
            .map(|d| allow.iter().any(|a| a.domain() == Some(d)))
            .unwrap_or(false);
        if allowed {
            links.push(url);
        }
    }
    links
}

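/// Upsert a page into the `cached_urls` table, sanitizing its HTML body first.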
async fn insert_db(pool: &SqlitePool, page: &Page) {
    let mut conn = pool.acquire().await.unwrap();
    let url = page.url.to_string();
    let timestamp = page.last_fetched.timestamp();
    // Strip unsafe markup from the fetched HTML before it goes into the database.
    let safe_html = clean(&page.body);
    sqlx::query!(
        r#"
        REPLACE INTO cached_urls ( last_fetched, url, body )
        VALUES ( ?1, ?2, ?3 )
        "#,
        timestamp,
        url,
        safe_html,
    )
    .execute(&mut *conn)
    .await
    .unwrap();
}