Fix recursive crawling
parent
159164674e
commit
57684c037e
|
@ -64,6 +64,21 @@ dependencies = [
|
|||
"alloc-no-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.1"
|
||||
|
@ -241,6 +256,22 @@ version = "1.0.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"js-sys",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"time",
|
||||
"wasm-bindgen",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "config"
|
||||
version = "0.13.3"
|
||||
|
@ -264,6 +295,12 @@ dependencies = [
|
|||
name = "core"
|
||||
version = "0.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.9"
|
||||
|
@ -279,6 +316,7 @@ version = "0.1.0"
|
|||
dependencies = [
|
||||
"bincode",
|
||||
"cacache",
|
||||
"chrono",
|
||||
"config",
|
||||
"dirs",
|
||||
"env_logger",
|
||||
|
@ -620,7 +658,7 @@ checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
|
|||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -770,6 +808,29 @@ dependencies = [
|
|||
"tokio-rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.57"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone-haiku"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.4.0"
|
||||
|
@ -948,7 +1009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"wasi",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
|
@ -968,6 +1029,15 @@ dependencies = [
|
|||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
|
@ -1757,6 +1827,17 @@ dependencies = [
|
|||
"syn 2.0.27",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.1.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"wasi 0.10.0+wasi-snapshot-preview1",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1967,6 +2048,12 @@ dependencies = [
|
|||
"try-lock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.10.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
|
@ -2099,6 +2186,15 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
|
|
|
@ -10,11 +10,11 @@ license = "AGPL-3.0-or-later"
|
|||
[workspace.dependencies]
|
||||
config = "0.13.3"
|
||||
dirs = "5.0.1"
|
||||
chrono = "0.4.26"
|
||||
tokio = { version = "1.29.1", features = ["full"] }
|
||||
url = { version = "2.4.0", features = [ "serde" ] }
|
||||
log = "0.4.0"
|
||||
env_logger = "0.9.0"
|
||||
chrono = { version = "0.4.26", features = [ "serde" ] }
|
||||
|
||||
[profile.release]
|
||||
strip = true
|
||||
|
|
|
@ -13,6 +13,7 @@ dirs.workspace = true
|
|||
url.workspace = true
|
||||
log.workspace = true
|
||||
env_logger.workspace = true
|
||||
chrono.workspace = true
|
||||
serde = { version = "1.0.175", features = [ "derive" ] }
|
||||
reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
|
||||
cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }
|
||||
|
|
|
@ -2,6 +2,7 @@ mod settings;
|
|||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
use chrono::prelude::*;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use settings::Settings;
|
||||
|
@ -11,6 +12,7 @@ use url::Url;
|
|||
struct Page {
|
||||
url: Url,
|
||||
status: u16,
|
||||
last_fetched: DateTime<Utc>,
|
||||
body: String,
|
||||
}
|
||||
|
||||
|
@ -20,14 +22,14 @@ async fn main() {
|
|||
let settings = Settings::new().unwrap();
|
||||
|
||||
let mut to_crawl = settings.sitemap;
|
||||
let mut crawled = Vec::new();
|
||||
let mut crawl_count = 0;
|
||||
let mut crawled = 0;
|
||||
|
||||
loop {
|
||||
let mut handles = Vec::new();
|
||||
for url in to_crawl {
|
||||
let job = tokio::spawn(crawl_url(url, settings.allowlist.clone(), crawled.clone()));
|
||||
let job = tokio::spawn(crawl_url(url, settings.allowlist.clone()));
|
||||
handles.push(job);
|
||||
crawled += 1;
|
||||
}
|
||||
let mut results = Vec::new();
|
||||
for job in handles {
|
||||
|
@ -38,9 +40,7 @@ async fn main() {
|
|||
for res in results {
|
||||
for t in res {
|
||||
for url in t {
|
||||
crawl_count += 1;
|
||||
to_crawl.push(url.clone());
|
||||
crawled.push(url);
|
||||
to_crawl.push(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -48,13 +48,37 @@ async fn main() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
info!("Succesfully crawled {} pages!", crawl_count);
|
||||
|
||||
info!("Succesfully crawled {} pages!", crawled);
|
||||
|
||||
let mut cache_dir = dirs::cache_dir().unwrap();
|
||||
cache_dir.push("ferret");
|
||||
for i in cacache::list_sync(cache_dir.as_path()) {
|
||||
let data = cacache::read(cache_dir.as_path(), &i.unwrap().key)
|
||||
.await
|
||||
.unwrap();
|
||||
let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
||||
info!(
|
||||
"Found page: {} {}",
|
||||
&decoded_page.url.as_str(),
|
||||
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
|
||||
if crawled.contains(&url) {
|
||||
warn!("{} already crawled", &url.as_str());
|
||||
return vec![];
|
||||
async fn crawl_url(url: Url, allow: Vec<Url>) -> Vec<Url> {
|
||||
let mut cache_dir = dirs::cache_dir().unwrap();
|
||||
cache_dir.push("ferret");
|
||||
for i in cacache::list_sync(cache_dir.as_path()) {
|
||||
match i {
|
||||
Ok(_) => {
|
||||
if i.unwrap().key == url.clone().into_string() {
|
||||
error!("Already crawled {}", &url.as_str());
|
||||
return vec![];
|
||||
}
|
||||
}
|
||||
Err(e) => error!("{}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let mut resp = reqwest::get(url).await;
|
||||
|
@ -63,6 +87,7 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
|
|||
let page = Page {
|
||||
url: v.url().to_owned(),
|
||||
status: v.status().as_u16(),
|
||||
last_fetched: Utc::now(),
|
||||
body: v.text().await.unwrap(),
|
||||
};
|
||||
|
||||
|
@ -72,15 +97,12 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
|
|||
} else {
|
||||
info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status);
|
||||
|
||||
let mut cache_dir = dirs::cache_dir().unwrap();
|
||||
cache_dir.push("ferret");
|
||||
|
||||
let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
|
||||
cacache::write(cache_dir.as_path(), &page.url.as_str(), encoded_page)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
return find_links(&page.body, &page.url, allow, crawled).await;
|
||||
return find_links(&page.body, &page.url, allow).await;
|
||||
|
||||
// let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
|
||||
// let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
||||
|
@ -93,7 +115,7 @@ async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
|
|||
}
|
||||
}
|
||||
|
||||
async fn find_links(html: &str, base: &Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
|
||||
async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
|
||||
let document = Html::parse_document(html);
|
||||
let selector = Selector::parse("a").unwrap();
|
||||
|
||||
|
@ -101,10 +123,9 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>, crawled: Vec<Url>)
|
|||
for element in document.select(&selector) {
|
||||
let href = element.value().attr("href").unwrap();
|
||||
let url = base.join(href).unwrap();
|
||||
if crawled.contains(&url) {
|
||||
warn!("{} already crawled", &url.as_str());
|
||||
break;
|
||||
}
|
||||
// if &url == base {
|
||||
// break;
|
||||
// }
|
||||
for x in &allow {
|
||||
if &x.domain().unwrap() == &url.domain().unwrap() {
|
||||
links.push(url);
|
||||
|
|
Loading…
Reference in New Issue