From 326a6b8042d61430b56d217965f0e8f92e5f881f Mon Sep 17 00:00:00 2001
From: Erin Nova
Date: Tue, 25 Jul 2023 18:33:38 -0400
Subject: [PATCH] Should crawl pages after a certain age

---
 crawler/src/main.rs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index c7c3178..9b559d2 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -3,6 +3,7 @@ mod settings;
 #[macro_use]
 extern crate log;
 use chrono::prelude::*;
+use chrono::Duration;
 use scraper::{Html, Selector};
 use serde::{Deserialize, Serialize};
 use settings::Settings;
@@ -72,9 +73,17 @@ async fn crawl_url(url: Url, allow: Vec) -> Vec {
     for i in cacache::list_sync(cache_dir.as_path()) {
         match i {
             Ok(_) => {
-                if i.unwrap().key == url.clone().into_string() {
-                    error!("Already crawled {}", &url.as_str());
-                    return vec![];
+                if i.as_ref().unwrap().key == url.clone().into_string() {
+                    let now = Utc::now();
+                    let timestamp = DateTime::<Utc>::from_utc(
+                        NaiveDateTime::from_timestamp_opt(i.unwrap().time as i64, 0).unwrap(),
+                        Utc,
+                    );
+                    let diff = now - timestamp;
+                    if diff <= Duration::hours(1) {
+                        error!("Already crawled {}", &url.as_str());
+                        return vec![];
+                    }
                 }
             }
             Err(e) => error!("{}", e),
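
The check the patch adds boils down to: skip a URL only when it is already in the cache and its cache entry is less than an hour old. Below is a minimal standalone sketch of that age gate, using the same chrono calls as the diff. Two things in it are ours, not the crawler's: the helper name is_fresh is hypothetical, and we assume cacache's Metadata::time field is a unix epoch timestamp in milliseconds, so the sketch divides by 1000 before calling from_timestamp_opt, which takes whole seconds.

    use chrono::{DateTime, Duration, NaiveDateTime, Utc};

    // Hypothetical helper (not part of the crawler) showing the age gate the
    // patch introduces: an entry is "fresh" while less than `max_age` has
    // elapsed since it was written to the cache.
    //
    // Assumption: `entry_time_ms` is a unix epoch time in milliseconds, which
    // is our reading of cacache's `Metadata::time`; hence the `/ 1000` before
    // `from_timestamp_opt`, which expects seconds.
    fn is_fresh(entry_time_ms: u128, max_age: Duration) -> bool {
        let created = DateTime::<Utc>::from_utc(
            NaiveDateTime::from_timestamp_opt((entry_time_ms / 1000) as i64, 0)
                .expect("timestamp out of range"),
            Utc,
        );
        Utc::now() - created <= max_age
    }

    fn main() {
        let now_ms = Utc::now().timestamp_millis() as u128;
        // Cached 30 minutes ago: fresh, so the crawler would skip it.
        assert!(is_fresh(now_ms - 30 * 60 * 1000, Duration::hours(1)));
        // Cached two hours ago: stale, so the crawler would fetch it again.
        assert!(!is_fresh(now_ms - 2 * 60 * 60 * 1000, Duration::hours(1)));
        println!("freshness checks pass");
    }

If the millisecond reading is right, one caveat applies to the diff itself: passing i.unwrap().time as i64 straight into from_timestamp_opt as seconds would place the entry far in the future, making diff negative and the one-hour test always pass; dividing by 1000 first, as in the sketch, avoids that.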