Should re-crawl pages once their cache entry is older than a certain age
parent
57684c037e
commit
326a6b8042
|
@ -3,6 +3,7 @@ mod settings;
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate log;
|
extern crate log;
|
||||||
use chrono::prelude::*;
|
use chrono::prelude::*;
|
||||||
|
use chrono::Duration;
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use settings::Settings;
|
use settings::Settings;
|
||||||
|
@ -72,9 +73,17 @@ async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
|
||||||
for i in cacache::list_sync(cache_dir.as_path()) {
|
for i in cacache::list_sync(cache_dir.as_path()) {
|
||||||
match i {
|
match i {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
if i.unwrap().key == url.clone().into_string() {
|
if i.as_ref().unwrap().key == url.clone().into_string() {
|
||||||
error!("Already crawled {}", &url.as_str());
|
let now = Utc::now();
|
||||||
return vec![];
|
let timestamp = DateTime::<Utc>::from_utc(
|
||||||
|
NaiveDateTime::from_timestamp_opt(i.unwrap().time as i64, 0).unwrap(),
|
||||||
|
Utc,
|
||||||
|
);
|
||||||
|
let diff = now - timestamp;
|
||||||
|
if diff <= Duration::hours(1) {
|
||||||
|
error!("Already crawled {}", &url.as_str());
|
||||||
|
return vec![];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => error!("{}", e),
|
Err(e) => error!("{}", e),
|
||||||
|
|
Loading…
Reference in New Issue