Fix recursive crawling

main
~erin 2023-07-25 18:16:45 -04:00
parent 159164674e
commit 57684c037e
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
4 changed files with 141 additions and 23 deletions

Cargo.lock (generated)

@@ -64,6 +64,21 @@ dependencies = [
  "alloc-no-stdlib",
 ]
 
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "async-compression"
 version = "0.4.1"
@@ -241,6 +256,22 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chrono"
+version = "0.4.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "serde",
+ "time",
+ "wasm-bindgen",
+ "winapi",
+]
+
 [[package]]
 name = "config"
 version = "0.13.3"
@@ -264,6 +295,12 @@ dependencies = [
 name = "core"
 version = "0.1.0"
 
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.9"
@@ -279,6 +316,7 @@ version = "0.1.0"
 dependencies = [
  "bincode",
  "cacache",
+ "chrono",
  "config",
  "dirs",
  "env_logger",
@@ -620,7 +658,7 @@ checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
 dependencies = [
  "cfg-if",
  "libc",
- "wasi",
+ "wasi 0.11.0+wasi-snapshot-preview1",
 ]
 
 [[package]]
@@ -770,6 +808,29 @@ dependencies = [
  "tokio-rustls",
 ]
 
+[[package]]
+name = "iana-time-zone"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "idna"
 version = "0.4.0"
@@ -948,7 +1009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2"
 dependencies = [
  "libc",
- "wasi",
+ "wasi 0.11.0+wasi-snapshot-preview1",
  "windows-sys",
 ]
@@ -968,6 +1029,15 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "num-traits"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "num_cpus"
 version = "1.16.0"
@@ -1757,6 +1827,17 @@ dependencies = [
  "syn 2.0.27",
 ]
 
+[[package]]
+name = "time"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+dependencies = [
+ "libc",
+ "wasi 0.10.0+wasi-snapshot-preview1",
+ "winapi",
+]
+
 [[package]]
 name = "tinyvec"
 version = "1.6.0"
@@ -1967,6 +2048,12 @@ dependencies = [
  "try-lock",
 ]
 
+[[package]]
+name = "wasi"
+version = "0.10.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+
 [[package]]
 name = "wasi"
 version = "0.11.0+wasi-snapshot-preview1"
@@ -2099,6 +2186,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"


@@ -10,11 +10,11 @@ license = "AGPL-3.0-or-later"
 
 [workspace.dependencies]
 config = "0.13.3"
 dirs = "5.0.1"
-chrono = "0.4.26"
 tokio = { version = "1.29.1", features = ["full"] }
 url = { version = "2.4.0", features = [ "serde" ] }
 log = "0.4.0"
 env_logger = "0.9.0"
+chrono = { version = "0.4.26", features = [ "serde" ] }
 
 [profile.release]
 strip = true
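
The "serde" feature flag on chrono is what makes the new last_fetched field serializable, since Page is round-tripped through bincode later in this commit. A minimal sketch of that requirement, assuming chrono 0.4 and bincode 1 (Stamped is an illustrative stand-in for the crawler's Page struct):

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};

// Without chrono's "serde" feature, DateTime<Utc> has no Serialize/
// Deserialize impls and this derive fails to compile.
#[derive(Serialize, Deserialize, PartialEq, Debug)]
struct Stamped {
    last_fetched: DateTime<Utc>,
}

fn main() {
    let page = Stamped { last_fetched: Utc::now() };
    let bytes: Vec<u8> = bincode::serialize(&page).unwrap();
    let back: Stamped = bincode::deserialize(&bytes).unwrap();
    assert_eq!(page, back);
}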


@@ -13,6 +13,7 @@ dirs.workspace = true
 url.workspace = true
 log.workspace = true
 env_logger.workspace = true
+chrono.workspace = true
 serde = { version = "1.0.175", features = [ "derive" ] }
 reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
 cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }


@@ -2,6 +2,7 @@ mod settings;
 
 #[macro_use]
 extern crate log;
+use chrono::prelude::*;
 use scraper::{Html, Selector};
 use serde::{Deserialize, Serialize};
 use settings::Settings;
@@ -11,6 +12,7 @@ use url::Url;
 struct Page {
     url: Url,
     status: u16,
+    last_fetched: DateTime<Utc>,
     body: String,
 }
@@ -20,14 +22,14 @@ async fn main() {
     let settings = Settings::new().unwrap();
     let mut to_crawl = settings.sitemap;
-    let mut crawled = Vec::new();
-    let mut crawl_count = 0;
+    let mut crawled = 0;
 
     loop {
         let mut handles = Vec::new();
         for url in to_crawl {
-            let job = tokio::spawn(crawl_url(url, settings.allowlist.clone(), crawled.clone()));
+            let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
             handles.push(job);
+            crawled += 1;
         }
 
         let mut results = Vec::new();
         for job in handles {
@@ -38,9 +40,7 @@ async fn main() {
         for res in results {
             for t in res {
                 for url in t {
-                    crawl_count += 1;
-                    to_crawl.push(url.clone());
-                    crawled.push(url);
+                    to_crawl.push(url);
                 }
             }
         }
@@ -48,13 +48,37 @@ async fn main() {
             break;
         }
     }
-    info!("Succesfully crawled {} pages!", crawl_count);
+    info!("Successfully crawled {} pages!", crawled);
+
+    let mut cache_dir = dirs::cache_dir().unwrap();
+    cache_dir.push("ferret");
+    for i in cacache::list_sync(cache_dir.as_path()) {
+        let data = cacache::read(cache_dir.as_path(), &i.unwrap().key)
+            .await
+            .unwrap();
+        let decoded_page: Page = bincode::deserialize(&data).unwrap();
+        info!(
+            "Found page: {} {}",
+            &decoded_page.url.as_str(),
+            &decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
+        );
+    }
 }
 
-async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
-    if crawled.contains(&url) {
-        warn!("{} already crawled", &url.as_str());
-        return vec![];
-    }
+async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
+    let mut cache_dir = dirs::cache_dir().unwrap();
+    cache_dir.push("ferret");
+    for i in cacache::list_sync(cache_dir.as_path()) {
+        match i {
+            Ok(_) => {
+                if i.unwrap().key == url.clone().into_string() {
+                    error!("Already crawled {}", &url.as_str());
+                    return vec![];
+                }
+            }
+            Err(e) => error!("{}", e),
+        }
+    }
+
     let mut resp = reqwest::get(url).await;
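
One note on the new dedup check: cacache::list_sync walks the whole cache index for every URL, so the cost of each crawl grows with the size of the cache. A hedged alternative sketch using cacache's keyed metadata lookup (already_crawled is a hypothetical helper, not part of this commit; the key matches the url.as_str() key that cacache::write uses below):

use std::path::Path;
use url::Url;

// Hypothetical helper: true if this URL is already in the cache index,
// using a single keyed lookup instead of scanning every entry.
async fn already_crawled(cache_dir: &Path, url: &Url) -> bool {
    match cacache::metadata(cache_dir, url.as_str()).await {
        Ok(Some(_)) => true, // key present: page was fetched before
        Ok(None) => false,   // never crawled
        Err(e) => {
            eprintln!("cache lookup failed: {}", e);
            false
        }
    }
}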
@@ -63,6 +87,7 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
             let page = Page {
                 url: v.url().to_owned(),
                 status: v.status().as_u16(),
+                last_fetched: Utc::now(),
                 body: v.text().await.unwrap(),
             };
@@ -72,15 +97,12 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
         } else {
             info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status);
-            let mut cache_dir = dirs::cache_dir().unwrap();
-            cache_dir.push("ferret");
-
             let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
             cacache::write(cache_dir.as_path(), &page.url.as_str(), encoded_page)
                 .await
                 .unwrap();
-            return find_links(&page.body, &page.url, allow, crawled).await;
+            return find_links(&page.body, &page.url, allow).await;
 
             // let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
             // let decoded_page: Page = bincode::deserialize(&data).unwrap();
@@ -93,7 +115,7 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
     }
 }
 
-async fn find_links(html: &str, base: &Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
+async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
     let document = Html::parse_document(html);
     let selector = Selector::parse("a").unwrap();
@@ -101,10 +123,9 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>, crawled: Vec<Url>)
     for element in document.select(&selector) {
         let href = element.value().attr("href").unwrap();
         let url = base.join(href).unwrap();
-        if crawled.contains(&url) {
-            warn!("{} already crawled", &url.as_str());
-            break;
-        }
+        // if &url == base {
+        //     break;
+        // }
         for x in &allow {
             if &x.domain().unwrap() == &url.domain().unwrap() {
                 links.push(url);
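
As committed, find_links still unwraps every href and every join, so an <a> tag with no href attribute, an unparseable relative URL, or a URL without a host (Url::domain returns None for mailto: and friends) will panic the spawned task. A defensive sketch of the same extraction loop under the same scraper and url APIs (extract_links is an illustrative name, not the committed function):

use scraper::{Html, Selector};
use url::Url;

fn extract_links(html: &str, base: &Url, allow: &[Url]) -> Vec<Url> {
    let document = Html::parse_document(html);
    // The selector is a static literal, so this unwrap cannot fail.
    let selector = Selector::parse("a").unwrap();

    let mut links = Vec::new();
    for element in document.select(&selector) {
        // Skip anchors without an href instead of panicking.
        let Some(href) = element.value().attr("href") else { continue };
        // base.join resolves relative links; drop unparseable ones.
        let Ok(url) = base.join(href) else { continue };
        // domain() is None for mailto:, data:, and other host-less URLs.
        let Some(domain) = url.domain() else { continue };
        if allow.iter().any(|a| a.domain() == Some(domain)) {
            links.push(url);
        }
    }
    links
}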