244 lines
6.7 KiB
Rust
244 lines
6.7 KiB
Rust
use std::borrow::Cow;
|
|
|
|
use tracing::{error, warn};
|
|
use url::Url;
|
|
|
|
use crate::config::{HostAndPath, UrlsConfig};
|
|
|
|
#[tracing::instrument]
|
|
pub fn normalize_url(url: &str) -> String {
|
|
let url = url.trim_end_matches('#');
|
|
if url.is_empty() {
|
|
warn!("url is empty");
|
|
return String::new();
|
|
}
|
|
|
|
let Ok(mut url) = Url::parse(url) else {
|
|
error!("failed to parse url");
|
|
return url.to_string();
|
|
};
|
|
|
|
// make sure the scheme is https
|
|
if url.scheme() == "http" {
|
|
url.set_scheme("https").unwrap();
|
|
}
|
|
|
|
// remove fragment
|
|
url.set_fragment(None);
|
|
|
|
// remove trailing slash
|
|
let path = url.path().to_string();
|
|
if let Some(path) = path.strip_suffix('/') {
|
|
url.set_path(path);
|
|
}
|
|
|
|
// remove tracking params
|
|
let query_pairs = url.query_pairs().into_owned();
|
|
let mut new_query_pairs = Vec::new();
|
|
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
|
|
for (key, value) in query_pairs {
|
|
if !TRACKING_PARAMS.contains(&key.as_str()) {
|
|
new_query_pairs.push((key, value));
|
|
}
|
|
}
|
|
if new_query_pairs.is_empty() {
|
|
url.set_query(None);
|
|
} else {
|
|
url.set_query(Some(
|
|
&url::form_urlencoded::Serializer::new(String::new())
|
|
.extend_pairs(new_query_pairs)
|
|
.finish(),
|
|
));
|
|
}
|
|
|
|
// url decode and encode path
|
|
let path = url.path().to_string();
|
|
let path = match urlencoding::decode(&path) {
|
|
Ok(path) => path,
|
|
Err(e) => {
|
|
warn!("failed to decode path: {e}");
|
|
Cow::Owned(path)
|
|
}
|
|
};
|
|
url.set_path(path.as_ref());
|
|
|
|
let url = url.to_string();
|
|
// remove trailing slash
|
|
let url = if let Some(url) = url.strip_suffix('/') {
|
|
url.to_string()
|
|
} else {
|
|
url
|
|
};
|
|
|
|
url
|
|
}
|
|
|
|
impl HostAndPath {
    /// Returns `true` when the given `host` and `path` fall under this rule.
    ///
    /// * Host: a rule host that starts with `.` matches any `host` that ends
    ///   with it; any other rule host must be equal to `host`.
    /// * Path: a rule path that is empty or ends with `/` matches any `path`
    ///   that starts with it; any other rule path must be equal to `path`.
    pub fn contains(&self, host: &str, path: &str) -> bool {
        let host_matches = if self.host.starts_with('.') {
            host.ends_with(self.host.as_str())
        } else {
            host == self.host
        };
        if !host_matches {
            return false;
        }

        let path_is_prefix = self.path.is_empty() || self.path.ends_with('/');
        if path_is_prefix {
            path.starts_with(self.path.as_str())
        } else {
            path == self.path
        }
    }

    /// Rewrites `real_url` according to a `replace_from` → `replace_with` rule.
    ///
    /// Returns the new `(host, path)` pair, or `None` when `real_url` does not
    /// match `replace_from`.
    ///
    /// Matching follows the same conventions as [`HostAndPath::contains`]: a
    /// host starting with `.` is a suffix pattern, and a path that is empty or
    /// ends with `/` is a prefix pattern. When both sides of the rule are
    /// patterns, the unmatched remainder of the real host/path is carried over
    /// onto the replacement; otherwise the replacement value is used verbatim.
    pub fn replace(
        replace_from: &HostAndPath,
        replace_with: &HostAndPath,
        real_url: &HostAndPath,
    ) -> Option<(String, String)> {
        let new_host = if replace_from.host.starts_with('.') {
            // Suffix rule: bail out unless the real host ends with it.
            let subdomain = real_url.host.strip_suffix(replace_from.host.as_str())?;
            if replace_with.host.starts_with('.') {
                format!("{subdomain}{}", replace_with.host)
            } else {
                replace_with.host.clone()
            }
        } else if real_url.host == replace_from.host {
            replace_with.host.clone()
        } else {
            return None;
        };

        // host matches, now check path

        let from_is_prefix = replace_from.path.is_empty() || replace_from.path.ends_with('/');
        let new_path = if from_is_prefix {
            // Prefix rule: bail out unless the real path starts with it.
            let remainder = real_url.path.strip_prefix(replace_from.path.as_str())?;
            let with_is_prefix =
                replace_with.path.is_empty() || replace_with.path.ends_with('/');
            if with_is_prefix {
                format!("{}{remainder}", replace_with.path)
            } else {
                replace_with.path.clone()
            }
        } else if real_url.path == replace_from.path {
            replace_with.path.clone()
        } else {
            return None;
        };

        Some((new_host, new_path))
    }
}
|
|
|
|
pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
|
|
let Ok(mut url) = Url::parse(url) else {
|
|
error!("failed to parse url");
|
|
return url.to_string();
|
|
};
|
|
|
|
let host = url.host_str().unwrap_or_default().to_owned();
|
|
|
|
let path = url
|
|
.path()
|
|
.strip_prefix("/")
|
|
.unwrap_or(url.path())
|
|
.to_owned();
|
|
let real_url = HostAndPath { host, path };
|
|
for (replace_from, replace_to) in &urls_config.replace {
|
|
if let Some((new_host, new_path)) =
|
|
HostAndPath::replace(replace_from, replace_to, &real_url)
|
|
{
|
|
let _ = url.set_host(Some(&new_host));
|
|
url.set_path(&new_path);
|
|
break;
|
|
}
|
|
}
|
|
|
|
normalize_url(url.as_ref())
|
|
}
|
|
pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
|
|
let Ok(url) = Url::parse(url) else {
|
|
error!("failed to parse url");
|
|
return 1.;
|
|
};
|
|
|
|
let host = url.host_str().unwrap_or_default().to_owned();
|
|
let path = url.path().strip_prefix("/").unwrap_or_default().to_owned();
|
|
for (check, weight) in &urls_config.weight {
|
|
if check.contains(&host, &path) {
|
|
return *weight;
|
|
}
|
|
}
|
|
|
|
1.
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::config::HostAndPath;

    use super::*;

    /// Runs `apply_url_replacements` on `url` with a single `from` → `to`
    /// replacement rule (built with `HostAndPath::new`) and no weight rules,
    /// then asserts that the output equals `expected`.
    fn assert_rewrites_to(from: &str, to: &str, url: &str, expected: &str) {
        let rule = (HostAndPath::new(from), HostAndPath::new(to));
        let urls_config = UrlsConfig {
            replace: vec![rule],
            weight: vec![],
        };
        assert_eq!(apply_url_replacements(url, &urls_config), expected);
    }

    /// Exact host with a path prefix on both sides of the rule.
    #[test]
    fn test_replace_url() {
        assert_rewrites_to(
            "minecraft.fandom.com/wiki/",
            "minecraft.wiki/w/",
            "https://minecraft.fandom.com/wiki/Java_Edition",
            "https://minecraft.wiki/w/Java_Edition",
        );
    }

    /// A `.`-prefixed source host replaced by a fixed host.
    #[test]
    fn test_replace_wildcard_host_with_absolute() {
        assert_rewrites_to(
            ".medium.com",
            "scribe.rip",
            "https://example.medium.com/asdf",
            "https://scribe.rip/asdf",
        );
    }

    /// A `.`-prefixed source host replaced by a `.`-prefixed host keeps the subdomain.
    #[test]
    fn test_replace_wildcard_host_with_wildcard() {
        assert_rewrites_to(
            ".medium.com",
            ".scribe.rip",
            "https://example.medium.com/asdf",
            "https://example.scribe.rip/asdf",
        );
    }

    /// The bare domain is not matched by its `.`-prefixed rule.
    #[test]
    fn test_non_matching_wildcard() {
        assert_rewrites_to(
            ".medium.com",
            ".scribe.rip",
            "https://medium.com/asdf",
            "https://medium.com/asdf",
        );
    }

    /// An unrelated host is left untouched.
    #[test]
    fn test_non_matching_wildcard_to_absolute() {
        assert_rewrites_to(
            ".medium.com",
            "scribe.rip",
            "https://example.com/asdf",
            "https://example.com/asdf",
        );
    }
}
|