Filetype filtering
parent
ed53ec320e
commit
f3422a4949
|
@ -68,11 +68,6 @@ async fn main() {
|
|||
.unwrap();
|
||||
let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
||||
insert_db(&pool, &decoded_page).await;
|
||||
info!(
|
||||
"Found page: {} {}",
|
||||
&decoded_page.url.as_str(),
|
||||
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -141,10 +136,27 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
|
|||
for element in document.select(&selector) {
|
||||
let href = element.value().attr("href").unwrap();
|
||||
let url = base.join(href).unwrap();
|
||||
// if &url == base {
|
||||
match &url.fragment() {
|
||||
Some(_) => break,
|
||||
None => {}
|
||||
}
|
||||
let path = url.path().clone();
|
||||
let ignore_ends = vec![
|
||||
".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
|
||||
".css", ".json",
|
||||
];
|
||||
let mut br = false;
|
||||
for i in ignore_ends {
|
||||
if path.ends_with(i) {
|
||||
br = true;
|
||||
}
|
||||
}
|
||||
if br {
|
||||
break;
|
||||
}
|
||||
// if !path.ends_with(".html") || !path.ends_with("/") {
|
||||
// break;
|
||||
// }
|
||||
info!("Found url: {}", &url.as_str());
|
||||
for x in &allow {
|
||||
match &url.domain() {
|
||||
Some(d) => {
|
||||
|
|
Loading…
Reference in New Issue