2024-04-16 07:04:44 +00:00
|
|
|
use std::borrow::Cow;
|
|
|
|
|
|
|
|
use tracing::{error, warn};
|
2023-12-20 06:18:09 +00:00
|
|
|
use url::Url;
|
|
|
|
|
2024-04-16 07:04:44 +00:00
|
|
|
#[tracing::instrument]
|
2023-12-20 06:18:09 +00:00
|
|
|
pub fn normalize_url(url: &str) -> eyre::Result<String> {
|
2023-12-20 10:23:02 +00:00
|
|
|
let url = url.trim_end_matches('#');
|
2023-12-20 08:03:29 +00:00
|
|
|
if url.is_empty() {
|
2024-04-16 07:04:44 +00:00
|
|
|
warn!("url is empty");
|
2023-12-20 08:03:29 +00:00
|
|
|
return Ok(String::new());
|
|
|
|
}
|
|
|
|
|
2023-12-20 23:17:46 +00:00
|
|
|
let Ok(mut url) = Url::parse(url) else {
|
2024-04-16 07:04:44 +00:00
|
|
|
error!("failed to parse url");
|
2023-12-20 23:17:46 +00:00
|
|
|
return Ok(url.to_string());
|
|
|
|
};
|
2023-12-20 06:18:09 +00:00
|
|
|
|
|
|
|
// make sure the scheme is https
|
|
|
|
if url.scheme() == "http" {
|
|
|
|
url.set_scheme("https").unwrap();
|
|
|
|
}
|
|
|
|
|
2023-12-20 09:28:38 +00:00
|
|
|
// remove fragment
|
|
|
|
url.set_fragment(None);
|
|
|
|
|
2023-12-20 06:18:09 +00:00
|
|
|
// remove trailing slash
|
|
|
|
let path = url.path().to_string();
|
|
|
|
if let Some(path) = path.strip_suffix('/') {
|
|
|
|
url.set_path(path);
|
|
|
|
}
|
|
|
|
|
2023-12-20 23:17:46 +00:00
|
|
|
// remove tracking params
|
2023-12-20 06:18:09 +00:00
|
|
|
let query_pairs = url.query_pairs().into_owned();
|
|
|
|
let mut new_query_pairs = Vec::new();
|
2023-12-20 23:17:46 +00:00
|
|
|
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
|
2023-12-20 06:18:09 +00:00
|
|
|
for (key, value) in query_pairs {
|
2023-12-20 23:17:46 +00:00
|
|
|
if !TRACKING_PARAMS.contains(&key.as_str()) {
|
2023-12-20 06:18:09 +00:00
|
|
|
new_query_pairs.push((key, value));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if new_query_pairs.is_empty() {
|
|
|
|
url.set_query(None);
|
|
|
|
} else {
|
|
|
|
url.set_query(Some(
|
|
|
|
&url::form_urlencoded::Serializer::new(String::new())
|
|
|
|
.extend_pairs(new_query_pairs)
|
|
|
|
.finish(),
|
|
|
|
));
|
|
|
|
}
|
|
|
|
|
2023-12-21 09:00:18 +00:00
|
|
|
// convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/
|
|
|
|
if url.host_str() == Some("minecraft.fandom.com") {
|
|
|
|
let path = url.path().to_string();
|
|
|
|
if let Some(path) = path.strip_prefix("/wiki/") {
|
|
|
|
url.set_host(Some("minecraft.wiki")).unwrap();
|
|
|
|
url.set_path(&format!("/w/{path}"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-20 08:03:29 +00:00
|
|
|
// url decode and encode path
|
|
|
|
let path = url.path().to_string();
|
2024-04-16 07:04:44 +00:00
|
|
|
let path = match urlencoding::decode(&path) {
|
|
|
|
Ok(path) => path,
|
|
|
|
Err(e) => {
|
|
|
|
warn!("failed to decode path: {e}");
|
|
|
|
Cow::Owned(path)
|
|
|
|
}
|
|
|
|
};
|
2023-12-21 09:00:18 +00:00
|
|
|
url.set_path(path.as_ref());
|
2023-12-20 08:03:29 +00:00
|
|
|
|
|
|
|
let url = url.to_string();
|
|
|
|
// remove trailing slash
|
|
|
|
let url = if let Some(url) = url.strip_suffix('/') {
|
|
|
|
url.to_string()
|
|
|
|
} else {
|
|
|
|
url
|
|
|
|
};
|
|
|
|
|
2023-12-21 09:00:18 +00:00
|
|
|
Ok(url)
|
2023-12-20 06:18:09 +00:00
|
|
|
}
|