Crawl all at once, test cacache

main
~erin 2023-07-25 16:58:25 -04:00
parent 86f6d1a631
commit 892dc35f74
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
3 changed files with 316 additions and 9 deletions

281
Cargo.lock generated
View File

@ -121,12 +121,27 @@ version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
[[package]]
name = "block-buffer"
version = "0.10.4"
@ -169,6 +184,33 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
[[package]]
name = "cacache"
version = "11.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11e3f7fcc57143528b55ff07ce71b4608d674d2929c2365f768c6c5bcaaa7a17"
dependencies = [
"digest",
"either",
"futures",
"hex",
"libc",
"memmap2",
"miette",
"reflink",
"serde",
"serde_derive",
"serde_json",
"sha1",
"sha2",
"ssri",
"tempfile",
"thiserror",
"tokio",
"tokio-stream",
"walkdir",
]
[[package]]
name = "cc"
version = "1.0.79"
@ -217,6 +259,8 @@ dependencies = [
name = "crawler"
version = "0.1.0"
dependencies = [
"bincode",
"cacache",
"config",
"dirs",
"env_logger",
@ -283,6 +327,12 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0688c2a7f92e427f44895cd63841bff7b29f8d7a1648b9e7e07a4a365b2e1257"
[[package]]
name = "either"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "encoding_rs"
version = "0.8.32"
@ -305,6 +355,33 @@ dependencies = [
"termcolor",
]
[[package]]
name = "errno"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "fastrand"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
[[package]]
name = "flate2"
version = "1.0.26"
@ -334,6 +411,21 @@ dependencies = [
name = "frontend"
version = "0.1.0"
[[package]]
name = "futures"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.28"
@ -341,6 +433,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@ -349,6 +442,23 @@ version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
[[package]]
name = "futures-executor"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
[[package]]
name = "futures-macro"
version = "0.3.28"
@ -378,9 +488,13 @@ version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
@ -456,6 +570,12 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "http"
version = "0.2.9"
@ -604,6 +724,12 @@ version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0"
[[package]]
name = "lock_api"
version = "0.4.10"
@ -626,6 +752,38 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memmap2"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
dependencies = [
"libc",
]
[[package]]
name = "miette"
version = "5.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e"
dependencies = [
"miette-derive",
"once_cell",
"thiserror",
"unicode-width",
]
[[package]]
name = "miette-derive"
version = "5.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "mime"
version = "0.3.17"
@ -824,7 +982,7 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
"bitflags 1.3.2",
]
[[package]]
@ -833,7 +991,7 @@ version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
dependencies = [
"bitflags",
"bitflags 1.3.2",
]
[[package]]
@ -847,6 +1005,16 @@ dependencies = [
"thiserror",
]
[[package]]
name = "reflink"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc585ec28b565b4c28977ce8363a6636cedc280351ba25a7915f6c9f37f68cbe"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "regex"
version = "1.9.1"
@ -939,7 +1107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88073939a61e5b7680558e6be56b419e208420c2adb92be54921fa6b72283f1a"
dependencies = [
"base64 0.13.1",
"bitflags",
"bitflags 1.3.2",
"serde",
]
@ -959,6 +1127,19 @@ version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "rustix"
version = "0.38.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
dependencies = [
"bitflags 2.3.3",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "rustls"
version = "0.21.5"
@ -996,6 +1177,15 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@ -1055,6 +1245,28 @@ dependencies = [
"serde",
]
[[package]]
name = "sha-1"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sha1"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "sha2"
version = "0.10.7"
@ -1106,6 +1318,23 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "ssri"
version = "9.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da7a2b3c2bc9693bcb40870c4e9b5bf0d79f9cb46273321bf855ec513e919082"
dependencies = [
"base64 0.21.2",
"digest",
"hex",
"miette",
"serde",
"sha-1",
"sha2",
"thiserror",
"xxhash-rust",
]
[[package]]
name = "syn"
version = "2.0.27"
@ -1117,6 +1346,19 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5486094ee78b2e5038a6382ed7645bc084dc2ec433426ca4c3cb61e2007b8998"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall 0.3.5",
"rustix",
"windows-sys",
]
[[package]]
name = "termcolor"
version = "1.2.0"
@ -1202,6 +1444,17 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-stream"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.8"
@ -1290,6 +1543,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "untrusted"
version = "0.7.1"
@ -1314,6 +1573,16 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "walkdir"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "want"
version = "0.3.1"
@ -1530,6 +1799,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "xxhash-rust"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70"
[[package]]
name = "yaml-rust"
version = "0.4.5"

View File

@ -15,3 +15,5 @@ log.workspace = true
env_logger.workspace = true
serde = { version = "1.0.175", features = [ "derive" ] }
reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] }
cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] }
bincode = "1.3.3"

View File

@ -2,29 +2,59 @@ mod settings;
#[macro_use]
extern crate log;
use serde::{Deserialize, Serialize};
use settings::Settings;
use url::Url;
#[derive(Debug, Deserialize, Serialize)]
struct Page {
url: Url,
status: u16,
body: String,
}
#[tokio::main]
async fn main() {
env_logger::init();
let settings = Settings::new().unwrap();
let mut handles = Vec::new();
for url in settings.sitemap {
crawl_url(url).await;
let job = tokio::spawn(crawl_url(url));
handles.push(job);
}
for job in handles {
job.await;
}
}
async fn crawl_url(url: Url) {
let mut resp = reqwest::get(url).await;
tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
match resp {
Ok(v) => {
let status = v.status();
let ver = v.version();
let fin_url = v.url().as_str();
// let body = resp.text().await?;
let page = Page {
url: v.url().to_owned(),
status: v.status().as_u16(),
body: v.text().await.unwrap(),
};
info!("Crawled {:?}: {:?} {:?}", fin_url, ver, status);
if page.status >= 400 {
error!("{:?} Error for {}", &page.status, &page.url);
} else {
info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status);
let mut cache_dir = dirs::cache_dir().unwrap();
cache_dir.push("ferret");
let encoded_page: Vec<u8> = bincode::serialize(&page).unwrap();
cacache::write(cache_dir.as_path(), &page.url.as_str(), encoded_page)
.await
.unwrap();
let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap();
let decoded_page: Page = bincode::deserialize(&data).unwrap();
}
}
Err(e) => {
error!("Could not get url: {}", e);