From ee1572fab03602e0d07b9f79555d65cf692d5e60 Mon Sep 17 00:00:00 2001 From: mat <27899617+mat-1@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:42:43 -0500 Subject: [PATCH] Add config options to replace parts of urls and to change their ranking weights (#14) * add config options to replace parts of urls and change their weight * improve config-default.toml comments * refactor checking/replacing a bit --- README | 8 +- config-default.toml | 15 ++- src/config.rs | 66 ++++++++++++ src/engines/ranking.rs | 27 ++++- src/main.rs | 2 +- src/normalize.rs | 81 -------------- src/parse.rs | 6 +- src/urls.rs | 232 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 344 insertions(+), 93 deletions(-) delete mode 100644 src/normalize.rs create mode 100644 src/urls.rs diff --git a/README b/README index 2b3a125..28982f3 100644 --- a/README +++ b/README @@ -36,8 +36,8 @@ checked at the following locations: If no config file exists, it'll be created at the first valid path in the list. -By default, metasearch runs on the port 28019. You are recommended to use a -reverse proxy. +By default, metasearch runs on port 28019. You are encouraged to use a reverse +proxy. ------------- CONFIGURATION @@ -46,7 +46,7 @@ CONFIGURATION You can see all the default config options at `src/config.rs`. Some interesting options you may want to change are: - - bind - the host and port that the web server runs on, for example + - bind - the host and port that the web server runs on, defaults to `0.0.0.0:28019`. - api - whether your instance is accessible through a JSON API. See below for more details. @@ -69,4 +69,4 @@ For example: curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json' The structure of the API is not guaranteed to be stable, as it relies on -serializing internal structs. It may break across versions! +serializing internal structs. It may change without warning in the future. 
diff --git a/config-default.toml b/config-default.toml index ded0b7e..b982726 100644 --- a/config-default.toml +++ b/config-default.toml @@ -1,4 +1,5 @@ -# See src/config.rs for all of the possible options +# See src/config.rs for all of the possible options. +# The commented-out lines are examples of values you could set, not the defaults. bind = "0.0.0.0:28019" api = false @@ -14,3 +15,15 @@ api = false [engines] # numbat = false # fend = true + +[urls.replace] +# "www.reddit.com" = "old.reddit.com" +# "medium.com" = "scribe.rip" +# ".medium.com" = "scribe.rip" + +[urls.weight] +# These are checked after applying replacements. Setting the weight to 0 (or any +# negative number) completely hides the result. Longer matches have a higher +# priority. +# "quora.com" = 0.1 +# ".quora.com" = 0.1 diff --git a/src/config.rs b/src/config.rs index 9471dee..af89ab5 100644 --- a/src/config.rs +++ b/src/config.rs @@ -33,6 +33,13 @@ impl Default for Config { }, }, engines: Arc::new(EnginesConfig::default()), + urls: UrlsConfig { + replace: vec![( + HostAndPath::new("minecraft.fandom.com/wiki/"), + HostAndPath::new("minecraft.wiki/w/"), + )], + weight: vec![], + }, } } } @@ -148,6 +155,7 @@ pub struct Config { pub image_search: ImageSearchConfig, // wrapped in an arc to make Config cheaper to clone pub engines: Arc, + pub urls: UrlsConfig, } #[derive(Deserialize, Debug)] @@ -157,6 +165,7 @@ pub struct PartialConfig { pub ui: Option, pub image_search: Option, pub engines: Option, + pub urls: Option, } impl Config { @@ -171,6 +180,7 @@ impl Config { engines.overlay(partial_engines); self.engines = Arc::new(engines); } + self.urls.overlay(partial.urls.unwrap_or_default()); } } @@ -345,3 +355,59 @@ impl Config { Ok(config) } } + +#[derive(Debug, Clone, PartialEq)] +pub struct HostAndPath { + pub host: String, + pub path: String, +} +impl HostAndPath { + pub fn new(s: &str) -> Self { + let (host, path) = s.split_once('/').unwrap_or((s, "")); + Self { + host: host.to_owned(), + path: 
path.to_owned(), + } + } +} + +#[derive(Debug, Clone)] +pub struct UrlsConfig { + pub replace: Vec<(HostAndPath, HostAndPath)>, + pub weight: Vec<(HostAndPath, f64)>, +} +#[derive(Deserialize, Debug, Default)] +pub struct PartialUrlsConfig { + #[serde(default)] + pub replace: HashMap<String, String>, + #[serde(default)] + pub weight: HashMap<String, f64>, +} +impl UrlsConfig { + pub fn overlay(&mut self, partial: PartialUrlsConfig) { + for (from, to) in partial.replace { + let from = HostAndPath::new(&from); + if to.is_empty() { + // setting the value to an empty string removes it + let index = self.replace.iter().position(|(u, _)| u == &from); + // swap_remove is fine because the order of this vec doesn't matter + self.replace.swap_remove(index.unwrap()); + } else { + let to = HostAndPath::new(&to); + self.replace.push((from, to)); + } + } + + for (url, weight) in partial.weight { + let url = HostAndPath::new(&url); + self.weight.push((url, weight)); + } + + // sort by length so that more specific checks are done first + self.weight.sort_by(|(a, _), (b, _)| { + let a_len = a.path.len() + a.host.len(); + let b_len = b.path.len() + b.host.len(); + b_len.cmp(&a_len) + }); + } +} diff --git a/src/engines/ranking.rs b/src/engines/ranking.rs index 40d6dc5..3c6a3c6 100644 --- a/src/engines/ranking.rs +++ b/src/engines/ranking.rs @@ -1,6 +1,9 @@ use std::{collections::HashMap, sync::Arc}; -use crate::config::Config; +use crate::{ + config::Config, + urls::{apply_url_replacements, get_url_weight}, +}; use super::{ Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse, @@ -19,12 +22,20 @@ pub fn merge_engine_responses( for (engine, response) in responses { let engine_config = config.engines.get(engine); - for (result_index, search_result) in response.search_results.into_iter().enumerate() { + for (result_index, mut search_result) in response.search_results.into_iter().enumerate() { // position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a //
score of 0.33, etc. let base_result_score = 1. / (result_index + 1) as f64; let result_score = base_result_score * engine_config.weight; + // apply url config here + search_result.url = apply_url_replacements(&search_result.url, &config.urls); + let url_weight = get_url_weight(&search_result.url, &config.urls); + if url_weight <= 0. { + continue; + } + let result_score = result_score * url_weight; + if let Some(existing_result) = search_results .iter_mut() .find(|r| r.result.url == search_result.url) @@ -57,12 +68,22 @@ pub fn merge_engine_responses( } } - if let Some(engine_featured_snippet) = response.featured_snippet { + if let Some(mut engine_featured_snippet) = response.featured_snippet { // if it has a higher weight than the current featured snippet let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| { let other_engine_config = config.engines.get(s.engine); other_engine_config.weight }); + + // url config applies to featured snippets too + engine_featured_snippet.url = + apply_url_replacements(&engine_featured_snippet.url, &config.urls); + let url_weight = get_url_weight(&engine_featured_snippet.url, &config.urls); + if url_weight <= 0. 
{ + continue; + } + let featured_snippet_weight = featured_snippet_weight * url_weight; + if engine_config.weight > featured_snippet_weight { featured_snippet = Some(FeaturedSnippet { url: engine_featured_snippet.url, diff --git a/src/main.rs b/src/main.rs index f545759..2ba5a22 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,8 +8,8 @@ use tracing::error; pub mod config; pub mod engines; -pub mod normalize; pub mod parse; +pub mod urls; pub mod web; #[tokio::main(flavor = "current_thread")] diff --git a/src/normalize.rs b/src/normalize.rs deleted file mode 100644 index 4318dbf..0000000 --- a/src/normalize.rs +++ /dev/null @@ -1,81 +0,0 @@ -use std::borrow::Cow; - -use tracing::{error, warn}; -use url::Url; - -#[tracing::instrument] -pub fn normalize_url(url: &str) -> eyre::Result<String> { - let url = url.trim_end_matches('#'); - if url.is_empty() { - warn!("url is empty"); - return Ok(String::new()); - } - - let Ok(mut url) = Url::parse(url) else { - error!("failed to parse url"); - return Ok(url.to_string()); - }; - - // make sure the scheme is https - if url.scheme() == "http" { - url.set_scheme("https").unwrap(); - } - - // remove fragment - url.set_fragment(None); - - // remove trailing slash - let path = url.path().to_string(); - if let Some(path) = path.strip_suffix('/') { - url.set_path(path); - } - - // remove tracking params - let query_pairs = url.query_pairs().into_owned(); - let mut new_query_pairs = Vec::new(); - const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"]; - for (key, value) in query_pairs { - if !TRACKING_PARAMS.contains(&key.as_str()) { - new_query_pairs.push((key, value)); - } - } - if new_query_pairs.is_empty() { - url.set_query(None); - } else { - url.set_query(Some( - &url::form_urlencoded::Serializer::new(String::new()) - .extend_pairs(new_query_pairs) - .finish(), - )); - } - - // convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/ - if url.host_str() == Some("minecraft.fandom.com") { - let path = url.path().to_string(); - if let
Some(path) = path.strip_prefix("/wiki/") { - url.set_host(Some("minecraft.wiki")).unwrap(); - url.set_path(&format!("/w/{path}")); - } - } - - // url decode and encode path - let path = url.path().to_string(); - let path = match urlencoding::decode(&path) { - Ok(path) => path, - Err(e) => { - warn!("failed to decode path: {e}"); - Cow::Owned(path) - } - }; - url.set_path(path.as_ref()); - - let url = url.to_string(); - // remove trailing slash - let url = if let Some(url) = url.strip_suffix('/') { - url.to_string() - } else { - url - }; - - Ok(url) -} diff --git a/src/parse.rs b/src/parse.rs index b6f9992..ffbd280 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -2,7 +2,7 @@ use crate::{ engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult}, - normalize::normalize_url, + urls::normalize_url, }; use scraper::{Html, Selector}; @@ -169,7 +169,7 @@ pub(super) fn parse_html_response_with_opts( continue; } - let url = normalize_url(&url)?; + let url = normalize_url(&url); search_results.push(EngineSearchResult { url, @@ -186,7 +186,7 @@ pub(super) fn parse_html_response_with_opts( { let title = featured_snippet_title_query_method.call(&featured_snippet)?; let url = featured_snippet_href_query_method.call(&featured_snippet)?; - let url = normalize_url(&url)?; + let url = normalize_url(&url); let description = featured_snippet_description_query_method.call(&featured_snippet)?; // this can happen on google if you search "what's my user agent" diff --git a/src/urls.rs b/src/urls.rs new file mode 100644 index 0000000..bc4ecfd --- /dev/null +++ b/src/urls.rs @@ -0,0 +1,232 @@ +use std::borrow::Cow; + +use tracing::{error, warn}; +use url::Url; + +use crate::config::{HostAndPath, UrlsConfig}; + +#[tracing::instrument] +pub fn normalize_url(url: &str) -> String { + let url = url.trim_end_matches('#'); + if url.is_empty() { + warn!("url is empty"); + return String::new(); + } + + let Ok(mut url) = Url::parse(url) else { + error!("failed to parse url"); + return 
url.to_string(); + }; + + // make sure the scheme is https + if url.scheme() == "http" { + url.set_scheme("https").unwrap(); + } + + // remove fragment + url.set_fragment(None); + + // remove trailing slash + let path = url.path().to_string(); + if let Some(path) = path.strip_suffix('/') { + url.set_path(path); + } + + // remove tracking params + let query_pairs = url.query_pairs().into_owned(); + let mut new_query_pairs = Vec::new(); + const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"]; + for (key, value) in query_pairs { + if !TRACKING_PARAMS.contains(&key.as_str()) { + new_query_pairs.push((key, value)); + } + } + if new_query_pairs.is_empty() { + url.set_query(None); + } else { + url.set_query(Some( + &url::form_urlencoded::Serializer::new(String::new()) + .extend_pairs(new_query_pairs) + .finish(), + )); + } + + // url decode and encode path + let path = url.path().to_string(); + let path = match urlencoding::decode(&path) { + Ok(path) => path, + Err(e) => { + warn!("failed to decode path: {e}"); + Cow::Owned(path) + } + }; + url.set_path(path.as_ref()); + + let url = url.to_string(); + // remove trailing slash + let url = if let Some(url) = url.strip_suffix('/') { + url.to_string() + } else { + url + }; + + url +} + +impl HostAndPath { + pub fn contains(&self, host: &str, path: &str) -> bool { + if self.host.starts_with('.') { + if !host.ends_with(&self.host) { + return false; + } + } else if host != self.host { + return false; + } + + if self.path.ends_with('/') || self.path.is_empty() { + path.starts_with(&self.path) + } else { + path == self.path + } + } + + pub fn replace( + replace_from: &HostAndPath, + replace_with: &HostAndPath, + real_url: &HostAndPath, + ) -> Option<(String, String)> { + let new_host = if replace_from.host.starts_with(".") { + if replace_with.host.starts_with(".") { + if let Some(host_without_suffix) = real_url.host.strip_suffix(&replace_from.host) { + format!("{host_without_suffix}{}", replace_with.host) + } else { + return 
None; + } + } else { + replace_with.host.clone() + } + } else if real_url.host == replace_from.host { + replace_with.host.clone() + } else { + return None; + }; + + // host matches, now check path + + let new_path = if replace_from.path.ends_with('/') || replace_with.path.is_empty() { + if replace_with.path.ends_with('/') { + if let Some(path_without_prefix) = real_url.path.strip_prefix(&replace_from.path) { + format!("{}{path_without_prefix}", replace_with.path) + } else { + return None; + } + } else if replace_with.path.is_empty() { + real_url.path.clone() + } else { + replace_with.path.clone() + } + } else if real_url.path == replace_from.path { + replace_with.path.clone() + } else { + return None; + }; + + Some((new_host, new_path)) + } +} + +pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String { + let Ok(mut url) = Url::parse(url) else { + error!("failed to parse url"); + return url.to_string(); + }; + + let host = url.host_str().unwrap_or_default().to_owned(); + + let path = url + .path() + .strip_prefix("/") + .unwrap_or(url.path()) + .to_owned(); + let real_url = HostAndPath { host, path }; + for (replace_from, replace_to) in &urls_config.replace { + if let Some((new_host, new_path)) = + HostAndPath::replace(replace_from, replace_to, &real_url) + { + let _ = url.set_host(Some(&new_host)); + url.set_path(&new_path); + break; + } + } + + normalize_url(url.as_ref()) +} +pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 { + let Ok(url) = Url::parse(url) else { + error!("failed to parse url"); + return 1.; + }; + + let host = url.host_str().unwrap_or_default().to_owned(); + let path = url.path().strip_prefix("/").unwrap_or_default().to_owned(); + for (check, weight) in &urls_config.weight { + if check.contains(&host, &path) { + return *weight; + } + } + + 1. 
+} + +#[cfg(test)] +mod tests { + use crate::config::HostAndPath; + + use super::*; + + fn test_replacement(from: &str, to: &str, url: &str, expected: &str) { + let urls_config = UrlsConfig { + replace: vec![(HostAndPath::new(from), HostAndPath::new(to))], + weight: vec![], + }; + let normalized_url = apply_url_replacements(url, &urls_config); + assert_eq!(normalized_url, expected); + } + + #[test] + fn test_replace_url() { + test_replacement( + "minecraft.fandom.com/wiki/", + "minecraft.wiki/w/", + "https://minecraft.fandom.com/wiki/Java_Edition", + "https://minecraft.wiki/w/Java_Edition", + ); + } + #[test] + fn test_replace_wildcard_host_with_absolute() { + test_replacement( + ".medium.com", + "scribe.rip", + "https://example.medium.com/asdf", + "https://scribe.rip/asdf", + ); + } + #[test] + fn test_replace_wildcard_host_with_wildcard() { + test_replacement( + ".medium.com", + ".scribe.rip", + "https://example.medium.com/asdf", + "https://example.scribe.rip/asdf", + ); + } + #[test] + fn test_non_matching_wildcard() { + test_replacement( + ".medium.com", + ".scribe.rip", + "https://medium.com/asdf", + "https://medium.com/asdf", + ); + } +}