Filetype filtering
parent
ed53ec320e
commit
f3422a4949
|
@ -68,11 +68,6 @@ async fn main() {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
let decoded_page: Page = bincode::deserialize(&data).unwrap();
|
||||||
insert_db(&pool, &decoded_page).await;
|
insert_db(&pool, &decoded_page).await;
|
||||||
info!(
|
|
||||||
"Found page: {} {}",
|
|
||||||
&decoded_page.url.as_str(),
|
|
||||||
&decoded_page.last_fetched.format("%Y-%m-%d %H:%M:%S")
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -141,10 +136,27 @@ async fn find_links(html: &str, base: &Url, allow: Vec<Url>) -> Vec<Url> {
|
||||||
for element in document.select(&selector) {
|
for element in document.select(&selector) {
|
||||||
let href = element.value().attr("href").unwrap();
|
let href = element.value().attr("href").unwrap();
|
||||||
let url = base.join(href).unwrap();
|
let url = base.join(href).unwrap();
|
||||||
// if &url == base {
|
match &url.fragment() {
|
||||||
|
Some(_) => break,
|
||||||
|
None => {}
|
||||||
|
}
|
||||||
|
let path = url.path().clone();
|
||||||
|
let ignore_ends = vec![
|
||||||
|
".js", ".gz", ".zip", ".7zip", ".pdf", ".png", ".jpg", ".webp", ".jpeg", ".odt",
|
||||||
|
".css", ".json",
|
||||||
|
];
|
||||||
|
let mut br = false;
|
||||||
|
for i in ignore_ends {
|
||||||
|
if path.ends_with(i) {
|
||||||
|
br = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if br {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// if !path.ends_with(".html") || !path.ends_with("/") {
|
||||||
// break;
|
// break;
|
||||||
// }
|
// }
|
||||||
info!("Found url: {}", &url.as_str());
|
|
||||||
for x in &allow {
|
for x in &allow {
|
||||||
match &url.domain() {
|
match &url.domain() {
|
||||||
Some(d) => {
|
Some(d) => {
|
||||||
|
|
Loading…
Reference in New Issue