From 892dc35f74e4a56234f5825822fe5fe478bfa66f Mon Sep 17 00:00:00 2001 From: Erin Nova Date: Tue, 25 Jul 2023 16:58:25 -0400 Subject: [PATCH] Crawl all at once, test cacache --- Cargo.lock | 281 +++++++++++++++++++++++++++++++++++++++++++- crawler/Cargo.toml | 2 + crawler/src/main.rs | 42 ++++++- 3 files changed, 316 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45bc684..c71e6ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,12 +121,27 @@ version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + [[package]] name = "block-buffer" version = "0.10.4" @@ -169,6 +184,33 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +[[package]] +name = "cacache" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11e3f7fcc57143528b55ff07ce71b4608d674d2929c2365f768c6c5bcaaa7a17" +dependencies = [ + "digest", + "either", + "futures", + "hex", + "libc", + "memmap2", + "miette", + "reflink", + "serde", + "serde_derive", + "serde_json", + "sha1", + "sha2", + "ssri", + "tempfile", + "thiserror", + "tokio", + "tokio-stream", + "walkdir", +] + [[package]] name = "cc" version = "1.0.79" @@ -217,6 +259,8 @@ dependencies = [ name = "crawler" version = "0.1.0" dependencies = [ + "bincode", + "cacache", "config", "dirs", "env_logger", @@ -283,6 +327,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0688c2a7f92e427f44895cd63841bff7b29f8d7a1648b9e7e07a4a365b2e1257" +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + [[package]] name = "encoding_rs" version = "0.8.32" @@ -305,6 +355,33 @@ dependencies = [ "termcolor", ] +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" + [[package]] name = "flate2" version = "1.0.26" @@ -334,6 +411,21 @@ dependencies = [ name = "frontend" version = "0.1.0" +[[package]] +name = "futures" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.28" @@ -341,6 +433,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -349,6 +442,23 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +[[package]] +name = "futures-executor" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" + [[package]] name = "futures-macro" version = "0.3.28" @@ -378,9 +488,13 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ + "futures-channel", "futures-core", + "futures-io", "futures-macro", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", "slab", @@ -456,6 +570,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "0.2.9" @@ -604,6 +724,12 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" +[[package]] +name = "linux-raw-sys" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" + [[package]] name = "lock_api" version = "0.4.10" @@ -626,6 +752,38 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + +[[package]] +name = "miette" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" +dependencies = [ + "miette-derive", + "once_cell", + "thiserror", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "mime" version = "0.3.17" @@ -824,7 +982,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -833,7 +991,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -847,6 +1005,16 @@ dependencies = [ "thiserror", ] +[[package]] +name = "reflink" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc585ec28b565b4c28977ce8363a6636cedc280351ba25a7915f6c9f37f68cbe" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "regex" version = "1.9.1" @@ -939,7 +1107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88073939a61e5b7680558e6be56b419e208420c2adb92be54921fa6b72283f1a" dependencies = [ "base64 0.13.1", - "bitflags", + "bitflags 1.3.2", "serde", ] @@ -959,6 +1127,19 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustix" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +dependencies = [ + "bitflags 2.3.3", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustls" version = "0.21.5" @@ -996,6 +1177,15 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1055,6 +1245,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha-1" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha2" version = "0.10.7" @@ -1106,6 +1318,23 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "ssri" +version = "9.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da7a2b3c2bc9693bcb40870c4e9b5bf0d79f9cb46273321bf855ec513e919082" +dependencies = [ + "base64 0.21.2", + "digest", + "hex", + "miette", + "serde", + "sha-1", + "sha2", + "thiserror", + "xxhash-rust", +] + [[package]] name = "syn" version = "2.0.27" @@ -1117,6 +1346,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5486094ee78b2e5038a6382ed7645bc084dc2ec433426ca4c3cb61e2007b8998" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall 0.3.5", + "rustix", + "windows-sys", +] + [[package]] name = "termcolor" version = "1.2.0" @@ -1202,6 +1444,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.8" @@ -1290,6 +1543,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + [[package]] name = "untrusted" version = "0.7.1" @@ -1314,6 +1573,16 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -1530,6 +1799,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "xxhash-rust" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70" + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml index 95d1291..2c96d36 100644 --- a/crawler/Cargo.toml +++ b/crawler/Cargo.toml @@ -15,3 +15,5 @@ log.workspace = true env_logger.workspace = true serde = { version = "1.0.175", features = [ "derive" ] } reqwest = { version = "0.11", default-features = false, features = [ "rustls-tls", "gzip", "brotli", "deflate" ] } +cacache = { version = "11.6.0", default-features = false, features = ["tokio-runtime", "mmap"] } +bincode = "1.3.3" diff --git a/crawler/src/main.rs b/crawler/src/main.rs index f6a4095..8c0f304 100644 --- a/crawler/src/main.rs +++ b/crawler/src/main.rs @@ -2,29 +2,59 @@ mod settings; #[macro_use] extern crate log; +use serde::{Deserialize, Serialize}; use settings::Settings; use url::Url; +#[derive(Debug, Deserialize, Serialize)] +struct Page { + url: Url, + status: u16, + body: String, +} + #[tokio::main] async fn main() { env_logger::init(); let settings = Settings::new().unwrap(); + let mut handles = Vec::new(); for url in settings.sitemap { - crawl_url(url).await; + let job = tokio::spawn(crawl_url(url)); + handles.push(job); + } + for job in handles { + job.await; } } async fn crawl_url(url: Url) { let mut resp = reqwest::get(url).await; + tokio::time::sleep(tokio::time::Duration::from_secs(10)).await; match resp { Ok(v) => { - let status = v.status(); - let ver = v.version(); - let fin_url = v.url().as_str(); - // let body = resp.text().await?; + let page = Page { + url: v.url().to_owned(), + status: v.status().as_u16(), + body: v.text().await.unwrap(), + }; - info!("Crawled {:?}: {:?} {:?}", fin_url, ver, status); + if page.status >= 400 { + error!("{:?} Error for {}", &page.status, &page.url); + } else { + info!("Crawled {:?}: {:?}", &page.url.as_str(), &page.status); + + let mut cache_dir = dirs::cache_dir().unwrap(); + cache_dir.push("ferret"); + + let encoded_page: Vec = bincode::serialize(&page).unwrap(); + cacache::write(cache_dir.as_path(), &page.url.as_str(), encoded_page) + .await + .unwrap(); + + let data = cacache::read(cache_dir, &page.url.as_str()).await.unwrap(); + let decoded_page: Page = bincode::deserialize(&data).unwrap(); + } } Err(e) => { error!("Could not get url: {}", e);