diff --git a/crawler/src/main.rs b/crawler/src/main.rs
index 69ee7f3..f88f2f1 100644
--- a/crawler/src/main.rs
+++ b/crawler/src/main.rs
@@ -68,11 +68,6 @@ async fn main() {
             .unwrap();
         let decoded_page: Page = bincode::deserialize(&data).unwrap();
         insert_db(&pool, &decoded_page).await;
-        info!(
-            "Found page: {} {}",
-            &decoded_page.url.as_str(),
-            &decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
-        );
     }
 }
 
@@ -141,10 +136,27 @@ async fn find_links(html: &str, base: &Url, allow: Vec<String>) -> Vec<Url> {
     for element in document.select(&selector) {
         let href = element.value().attr("href").unwrap();
         let url = base.join(href).unwrap();
-        // if &url == base {
+        match &url.fragment() {
+            Some(_) => break,
+            None => {}
+        }
+        let path = url.path().clone();
+        let ignore_ends = vec![
+            ".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
+            ".css", ".json",
+        ];
+        let mut br = false;
+        for i in ignore_ends {
+            if path.ends_with(i) {
+                br = true;
+            }
+        }
+        if br {
+            break;
+        }
+        // if !path.ends_with(".html") || !path.ends_with("/") {
         //     break;
         // }
-        info!("Found url: {}", &url.as_str());
         for x in &allow {
             match &url.domain() {
                 Some(d) => {
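Two details in the added filtering code are worth flagging. First, `break` exits the whole `for element` loop at the first fragment URL or ignored extension, so every link after it on the page is dropped; `continue` (skip just that one link) is presumably the intent. Second, `url.path()` returns a `&str`, so the `.clone()` only copies the reference. Below is a minimal sketch of the same filter pulled into a helper, assuming skip-one-link semantics and the `url` crate's `Url` type; the names `should_skip` and `IGNORE_ENDS` are invented here for illustration, not part of the patch:

```rust
use url::Url;

/// File extensions the crawler should not follow (same list as the patch).
const IGNORE_ENDS: &[&str] = &[
    ".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg",
    ".odt", ".css", ".json",
];

/// True if a link should be skipped: it carries a `#fragment` (likely an
/// anchor into a page already crawled), or its path ends in one of the
/// ignored extensions.
fn should_skip(url: &Url) -> bool {
    url.fragment().is_some()
        || IGNORE_ENDS.iter().any(|ext| url.path().ends_with(ext))
}
```

With this helper, the fragment `match`, the `br` flag, and the extension loop inside `find_links` collapse to a single `if should_skip(&url) { continue; }` per link.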