metasearch/src/urls.rs
2024-07-17 05:44:41 +00:00

244 lines
6.7 KiB
Rust

use std::borrow::Cow;
use tracing::{error, warn};
use url::Url;
use crate::config::{HostAndPath, UrlsConfig};
#[tracing::instrument]
pub fn normalize_url(url: &str) -> String {
let url = url.trim_end_matches('#');
if url.is_empty() {
warn!("url is empty");
return String::new();
}
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url");
return url.to_string();
};
// make sure the scheme is https
if url.scheme() == "http" {
url.set_scheme("https").unwrap();
}
// remove fragment
url.set_fragment(None);
// remove trailing slash
let path = url.path().to_string();
if let Some(path) = path.strip_suffix('/') {
url.set_path(path);
}
// remove tracking params
let query_pairs = url.query_pairs().into_owned();
let mut new_query_pairs = Vec::new();
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
for (key, value) in query_pairs {
if !TRACKING_PARAMS.contains(&key.as_str()) {
new_query_pairs.push((key, value));
}
}
if new_query_pairs.is_empty() {
url.set_query(None);
} else {
url.set_query(Some(
&url::form_urlencoded::Serializer::new(String::new())
.extend_pairs(new_query_pairs)
.finish(),
));
}
// url decode and encode path
let path = url.path().to_string();
let path = match urlencoding::decode(&path) {
Ok(path) => path,
Err(e) => {
warn!("failed to decode path: {e}");
Cow::Owned(path)
}
};
url.set_path(path.as_ref());
let url = url.to_string();
// remove trailing slash
let url = if let Some(url) = url.strip_suffix('/') {
url.to_string()
} else {
url
};
url
}
impl HostAndPath {
    /// Returns whether `host` and `path` are covered by this rule.
    ///
    /// Host matching: a rule host starting with `.` matches any `host` that
    /// ends with it (so `.medium.com` matches `blog.medium.com` but not
    /// `medium.com`); any other rule host must equal `host` exactly.
    ///
    /// Path matching: a rule path that is empty or ends with `/` matches any
    /// `path` that starts with it; any other rule path must equal `path`
    /// exactly.
    pub fn contains(&self, host: &str, path: &str) -> bool {
        let host_matches = if self.host.starts_with('.') {
            host.ends_with(self.host.as_str())
        } else {
            host == self.host
        };
        if !host_matches {
            return false;
        }

        let is_prefix_rule = self.path.is_empty() || self.path.ends_with('/');
        if is_prefix_rule {
            path.starts_with(self.path.as_str())
        } else {
            path == self.path
        }
    }

    /// Computes the `(host, path)` that `real_url` becomes under the rule
    /// `replace_from` -> `replace_with`, or `None` if the rule does not match.
    ///
    /// Matching follows the same wildcard-host and prefix-path conventions as
    /// [`HostAndPath::contains`]. When both sides of the rule are wildcard
    /// hosts, the part of the real host in front of the matched suffix is
    /// kept and the suffix is swapped; otherwise the host is replaced
    /// wholesale. Likewise, when both sides are prefix paths the remainder of
    /// the real path is carried over; otherwise the path is replaced
    /// wholesale.
    pub fn replace(
        replace_from: &HostAndPath,
        replace_with: &HostAndPath,
        real_url: &HostAndPath,
    ) -> Option<(String, String)> {
        let is_wildcard_host = |host: &str| host.starts_with('.');
        let is_prefix_path = |path: &str| path.is_empty() || path.ends_with('/');

        let new_host = if is_wildcard_host(&replace_from.host) {
            // `?` bails out when the real host does not end with the wildcard suffix
            let kept_prefix = real_url.host.strip_suffix(replace_from.host.as_str())?;
            if is_wildcard_host(&replace_with.host) {
                format!("{kept_prefix}{}", replace_with.host)
            } else {
                replace_with.host.clone()
            }
        } else if real_url.host == replace_from.host {
            replace_with.host.clone()
        } else {
            return None;
        };

        // host matches, now check path
        let new_path = if is_prefix_path(&replace_from.path) {
            // `?` bails out when the real path does not start with the rule prefix
            let kept_suffix = real_url.path.strip_prefix(replace_from.path.as_str())?;
            if is_prefix_path(&replace_with.path) {
                format!("{}{kept_suffix}", replace_with.path)
            } else {
                replace_with.path.clone()
            }
        } else if real_url.path == replace_from.path {
            replace_with.path.clone()
        } else {
            return None;
        };

        Some((new_host, new_path))
    }
}
pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url");
return url.to_string();
};
let host = url.host_str().unwrap_or_default().to_owned();
let path = url
.path()
.strip_prefix("/")
.unwrap_or(url.path())
.to_owned();
let real_url = HostAndPath { host, path };
for (replace_from, replace_to) in &urls_config.replace {
if let Some((new_host, new_path)) =
HostAndPath::replace(replace_from, replace_to, &real_url)
{
let _ = url.set_host(Some(&new_host));
url.set_path(&new_path);
break;
}
}
normalize_url(url.as_ref())
}
/// Returns the weight configured for `url`, or `1.0` when nothing applies.
///
/// The URL's host and its path (without the leading `/`) are checked against
/// each entry of `urls_config.weight` in order using
/// [`HostAndPath::contains`]; the weight of the first matching entry is
/// returned. If `url` cannot be parsed, an error is logged and `1.0` is
/// returned.
pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
    let Ok(url) = Url::parse(url) else {
        error!("failed to parse url");
        return 1.;
    };

    let host = url.host_str().unwrap_or_default();
    // Fall back to the full path when there is no leading slash, matching how
    // `apply_url_replacements` derives the path it matches rules against.
    let full_path = url.path();
    let path = full_path.strip_prefix('/').unwrap_or(full_path);

    urls_config
        .weight
        .iter()
        .find(|(check, _)| check.contains(host, path))
        .map_or(1., |(_, weight)| *weight)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::HostAndPath;

    /// Runs `url` through a config holding the single rule `from` -> `to` and
    /// checks that the replaced, normalized URL equals `expected`.
    fn assert_replacement(from: &str, to: &str, url: &str, expected: &str) {
        let rule = (HostAndPath::new(from), HostAndPath::new(to));
        let urls_config = UrlsConfig {
            replace: vec![rule],
            weight: Vec::new(),
        };

        let actual = apply_url_replacements(url, &urls_config);

        assert_eq!(actual, expected);
    }

    /// Exact host with a path prefix: the rest of the path is carried over.
    #[test]
    fn test_replace_url() {
        assert_replacement(
            "minecraft.fandom.com/wiki/",
            "minecraft.wiki/w/",
            "https://minecraft.fandom.com/wiki/Java_Edition",
            "https://minecraft.wiki/w/Java_Edition",
        );
    }

    /// Wildcard source host, fixed target host: the subdomain is dropped.
    #[test]
    fn test_replace_wildcard_host_with_absolute() {
        assert_replacement(
            ".medium.com",
            "scribe.rip",
            "https://example.medium.com/asdf",
            "https://scribe.rip/asdf",
        );
    }

    /// Wildcard source and target hosts: the subdomain is preserved.
    #[test]
    fn test_replace_wildcard_host_with_wildcard() {
        assert_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://example.medium.com/asdf",
            "https://example.scribe.rip/asdf",
        );
    }

    /// A wildcard rule does not match the bare domain itself.
    #[test]
    fn test_non_matching_wildcard() {
        assert_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://medium.com/asdf",
            "https://medium.com/asdf",
        );
    }

    /// An unrelated host is left untouched by a wildcard rule.
    #[test]
    fn test_non_matching_wildcard_to_absolute() {
        assert_replacement(
            ".medium.com",
            "scribe.rip",
            "https://example.com/asdf",
            "https://example.com/asdf",
        );
    }
}