Crawl through URLs via allowlist

main
~erin 2023-07-25 17:36:41 -04:00
parent 892dc35f74
commit 159164674e
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
4 changed files with 451 additions and 21 deletions

395
Cargo.lock generated
View File

@ -28,6 +28,18 @@ dependencies = [
"version_check",
]
[[package]]
name = "ahash"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
dependencies = [
"cfg-if",
"getrandom",
"once_cell",
"version_check",
]
[[package]]
name = "aho-corasick"
version = "1.0.2"
@ -74,7 +86,7 @@ checksum = "cc6dde6e4ed435a4c1ee4e73592f5ba9da2151af10076cc04858746af9352d09"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -178,6 +190,12 @@ version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "bytes"
version = "1.4.0"
@ -266,6 +284,7 @@ dependencies = [
"env_logger",
"log",
"reqwest",
"scraper",
"serde",
"tokio",
"url",
@ -290,6 +309,40 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.31.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.2",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.27",
]
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "digest"
version = "0.10.7"
@ -327,6 +380,27 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0688c2a7f92e427f44895cd63841bff7b29f8d7a1648b9e7e07a4a365b2e1257"
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "either"
version = "1.9.0"
@ -411,6 +485,16 @@ dependencies = [
name = "frontend"
version = "0.1.0"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.28"
@ -467,7 +551,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -500,6 +584,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.14.7"
@ -510,6 +603,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.10"
@ -552,7 +654,7 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
dependencies = [
"ahash",
"ahash 0.7.6",
]
[[package]]
@ -576,6 +678,20 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "0.2.9"
@ -746,6 +862,26 @@ version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "memchr"
version = "2.5.0"
@ -781,7 +917,7 @@ checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -816,6 +952,12 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "nom"
version = "7.1.3"
@ -932,7 +1074,7 @@ dependencies = [
"pest_meta",
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -946,6 +1088,86 @@ dependencies = [
"sha2",
]
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared 0.11.2",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.27",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.10"
@ -958,6 +1180,18 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.66"
@ -976,6 +1210,36 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "redox_syscall"
version = "0.2.16"
@ -1192,6 +1456,23 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c"
dependencies = [
"ahash 0.8.3",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"smallvec",
"tendril",
]
[[package]]
name = "sct"
version = "0.7.0"
@ -1202,6 +1483,25 @@ dependencies = [
"untrusted",
]
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags 2.3.3",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.175"
@ -1219,7 +1519,7 @@ checksum = "b23f7ade6f110613c0d63858ddb8b94c1041f550eab58a16b371bdf2c9c80ab4"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -1245,6 +1545,15 @@ dependencies = [
"serde",
]
[[package]]
name = "servo_arc"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sha-1"
version = "0.10.1"
@ -1287,6 +1596,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "slab"
version = "0.4.8"
@ -1335,6 +1650,49 @@ dependencies = [
"xxhash-rust",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.27"
@ -1359,6 +1717,17 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "termcolor"
version = "1.2.0"
@ -1385,7 +1754,7 @@ checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -1431,7 +1800,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
]
[[package]]
@ -1567,6 +1936,12 @@ dependencies = [
"serde",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "version_check"
version = "0.9.4"
@ -1619,7 +1994,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
"wasm-bindgen-shared",
]
@ -1653,7 +2028,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.27",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]

View File

@ -17,3 +17,4 @@ serde = { version = "1.0.175", features = [ "derive" ] }
reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }
bincode = "1.3.3"
scraper = "0.17.1"

View File

@ -2,6 +2,7 @@ mod settings;
#[macro_use]
extern crate log;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use settings::Settings;
use url::Url;
@ -18,19 +19,45 @@ async fn main() {
env_logger::init();
let settings = Settings::new().unwrap();
let mut handles = Vec::new();
for url in settings.sitemap {
let job = tokio::spawn(crawl_url(url));
handles.push(job);
}
for job in handles {
job.await;
let mut to_crawl = settings.sitemap;
let mut crawled = Vec::new();
let mut crawl_count = 0;
loop {
let mut handles = Vec::new();
for url in to_crawl {
let job = tokio::spawn(crawl_url(url, settings.allowlist.clone(), crawled.clone()));
handles.push(job);
}
let mut results = Vec::new();
for job in handles {
results.push(job.await);
}
to_crawl = Vec::new();
for res in results {
for t in res {
for url in t {
crawl_count += 1;
to_crawl.push(url.clone());
crawled.push(url);
}
}
}
if to_crawl == vec![] {
break;
}
}
info!("Succesfully crawled {} pages!", crawl_count);
}
async fn crawl_url(url: Url) {
async fn crawl_url(url: Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
if crawled.contains(&url) {
warn!("{} already crawled", &url.as_str());
return vec![];
}
let mut resp = reqwest::get(url).await;
tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
match resp {
Ok(v) => {
let page = Page {
@ -41,6 +68,7 @@ async fn crawl_url(url: Url) {
if page.status >= 400 {
error!("{:?} Error for {}", &page.status, &page.url);
return vec![];
} else {
info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status);
@ -52,12 +80,37 @@ async fn crawl_url(url: Url) {
.await
.unwrap();
let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
let decoded_page: Page = bincode::deserialize(&data).unwrap();
return find_links(&page.body, &page.url, allow, crawled).await;
// let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
// let decoded_page: Page = bincode::deserialize(&data).unwrap();
}
}
Err(e) => {
error!("Could not get url: {}", e);
return vec![];
}
}
}
async fn find_links(html: &str, base: &Url, allow: Vec<Url>, crawled: Vec<Url>) -> Vec<Url> {
let document = Html::parse_document(html);
let selector = Selector::parse("a").unwrap();
let mut links: Vec<Url> = Vec::default();
for element in document.select(&selector) {
let href = element.value().attr("href").unwrap();
let url = base.join(href).unwrap();
if crawled.contains(&url) {
warn!("{} already crawled", &url.as_str());
break;
}
for x in &allow {
if &x.domain().unwrap() == &url.domain().unwrap() {
links.push(url);
break;
}
}
}
return links;
}

View File

@ -9,6 +9,7 @@ pub struct Settings {
pub strict: bool,
pub timeout: u8,
pub sitemap: Vec<Url>,
pub allowlist: Vec<Url>,
}
impl Settings {