Filetype filtering

main
~erin 2023-07-25 20:25:49 -04:00
parent ed53ec320e
commit f3422a4949
Signed by: erin
GPG Key ID: 0FEDEAFF1C14847E
1 changed file with 19 additions and 7 deletions

View File

@@ -68,11 +68,6 @@ async fn main() {
.unwrap();
let decoded_page: Page = bincode::deserialize(&data).unwrap();
insert_db(&pool, &decoded_page).await;
info!(
"Found page: {} {}",
&decoded_page.url.as_str(),
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
);
}
}
@@ -141,10 +136,27 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
for element in document.select(&selector) {
let href = element.value().attr("href").unwrap();
let url = base.join(href).unwrap();
// if &url == base {
match &url.fragment() {
Some(_) => break,
None => {}
}
let path = url.path().clone();
let ignore_ends = vec![
".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
".css", ".json",
];
let mut br = false;
for i in ignore_ends {
if path.ends_with(i) {
br = true;
}
}
if br {
break;
}
// if !path.ends_with(".html") || !path.ends_with("/") {
// break;
// }
info!("Found url: {}", &url.as_str());
for x in &allow {
match &url.domain() {
Some(d) => {